From 5ad2d889f67ef6ee3ee10c049d573e8649679200 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 11:31:47 +0200
Subject: [PATCH 001/101] Renamed file and fixed references.

---
 .../src/{Optimizer.md => optimizers/optimizer_framework.md} | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename docs/src/{Optimizer.md => optimizers/optimizer_framework.md} (81%)

diff --git a/docs/src/Optimizer.md b/docs/src/optimizers/optimizer_framework.md
similarity index 81%
rename from docs/src/Optimizer.md
rename to docs/src/optimizers/optimizer_framework.md
index 94473b24d..4278660f1 100644
--- a/docs/src/Optimizer.md
+++ b/docs/src/optimizers/optimizer_framework.md
@@ -7,14 +7,14 @@ Starting from an element of the tangent space $T_Y\mathcal{M}$[^1], we need to p
 [^1]: In practice this is obtained by first using an AD routine on a loss function $L$, and then computing the Riemannian gradient based on this. See the section of the [Stiefel manifold](@ref "The Stiefel Manifold") for an example of this.
 
 ```@example
-Main.include_graphics("tikz/general_optimization_with_boundary") # hide
+Main.include_graphics("../tikz/general_optimization_with_boundary") # hide
 ```
 
-Here the mapping $\Omega$ is a [horizontal lift](optimizers/manifold_related/horizontal_lift.md) from the tangent space onto the **horizontal component of the Lie algebra at $Y$**. 
+Here the mapping $\Omega$ is a [horizontal lift](manifold_related/horizontal_lift.md) from the tangent space onto the **horizontal component of the Lie algebra at $Y$**. 
 
 The red line maps the horizontal component at $Y$, i.e. $\mathfrak{g}^{\mathrm{hor},Y}$, to the horizontal component at $\mathfrak{g}^\mathrm{hor}$.
 
-The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this the optimer ([Adam](optimizers/adam_optimizer.md) in this case) computes a final velocity, which is the input of a [retraction](optimizers/manifold_related/retractions.md). Because this *update* is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called `apply_section` here, that then finally updates the network parameters. The two red lines are described in [global sections](optimizers/manifold_related/global_sections.md).
+The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this the optimer ([Adam](optimizers/adam_optimizer.md) in this case) computes a final velocity, which is the input of a [retraction](optimizers/manifold_related/retractions.md). Because this *update* is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called `apply_section` here, that then finally updates the network parameters. The two red lines are described in [global sections](@ref "Global Sections").
 
 ## References 
 

From 8139ae3318b8133925d5689f9d259fe82d7d76a3 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 11:33:07 +0200
Subject: [PATCH 002/101] Moved contents of file to optimizer_framework.

---
 docs/src/optimizers/general_optimization.md | 14 --------------
 docs/src/optimizers/optimizer_framework.md  | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)
 delete mode 100644 docs/src/optimizers/general_optimization.md

diff --git a/docs/src/optimizers/general_optimization.md b/docs/src/optimizers/general_optimization.md
deleted file mode 100644
index 60c9da538..000000000
--- a/docs/src/optimizers/general_optimization.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Optimization for Neural Networks 
-
-Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the *gradient flow equation*:
-```math
-\dot{\theta} = -\nabla_\theta{}L,
-```
-by means of a Euler time-stepping scheme: 
-```math
-\theta^{t+1} = \theta^{t} - h\nabla_{\theta^{t}}L,
-```
-where $\eta$ (the time step of the Euler scheme) is referred to as the *learning rate*
-
-This equation can easily be generalized to [manifolds](../manifolds/manifolds.md) by replacing the *Euclidean gradient* $\nabla_{\theta^{t}L}$ by a *Riemannian gradient* $-h\mathrm{grad}_{\theta^{t}}L$ and addition by $-h\nabla_{\theta^{t}}L$ with a [retraction](../optimizers/manifold_related/retractions.md) by $-h\mathrm{grad}_{\theta^{t}}L$.
-
diff --git a/docs/src/optimizers/optimizer_framework.md b/docs/src/optimizers/optimizer_framework.md
index 4278660f1..7856d8021 100644
--- a/docs/src/optimizers/optimizer_framework.md
+++ b/docs/src/optimizers/optimizer_framework.md
@@ -1,5 +1,19 @@
 # Optimizer
 
+Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the *gradient flow equation*:
+```math
+\dot{\theta} = -\nabla_\theta{}L,
+```
+by means of a Euler time-stepping scheme: 
+```math
+\theta^{t+1} = \theta^{t} - h\nabla_{\theta^{t}}L,
+```
+where $\eta$ (the time step of the Euler scheme) is referred to as the *learning rate*
+
+This equation can easily be generalized to [manifolds](../manifolds/manifolds.md) by replacing the *Euclidean gradient* $\nabla_{\theta^{t}L}$ by a *Riemannian gradient* $-h\mathrm{grad}_{\theta^{t}}L$ and addition by $-h\nabla_{\theta^{t}}L$ with a [retraction](../optimizers/manifold_related/retractions.md) by $-h\mathrm{grad}_{\theta^{t}}L$.
+
+## Generalization
+
 In order to generalize neural network optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"), a class of manifolds we often encounter in machine learning, we have to find a [global tangent space representation](@ref "Global Tangent Spaces") which we call $\mathfrak{g}^\mathrm{hor}$ here. 
 
 Starting from an element of the tangent space $T_Y\mathcal{M}$[^1], we need to perform two mappings to arrive at $\mathfrak{g}^\mathrm{hor}$, which we refer to by $\Omega$ and a red horizontal arrow:

From 394bfb0809bcd16b3bcd1aab6e34bfe96c6b1ee0 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 13:04:51 +0200
Subject: [PATCH 003/101] Fixed references to optimizer.

---
 docs/src/architectures/sympnet.md             |  2 +-
 .../manifold_related/global_sections.md       |  2 +-
 .../manifold_related/retractions.md           |  2 +-
 docs/src/optimizers/optimizer_framework.md    | 29 +++++++++++++++----
 .../symplectic_autoencoder.md                 |  2 +-
 docs/src/tutorials/sympnet_tutorial.md        |  2 +-
 6 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/docs/src/architectures/sympnet.md b/docs/src/architectures/sympnet.md
index 4e70fe139..4fb08c193 100644
--- a/docs/src/architectures/sympnet.md
+++ b/docs/src/architectures/sympnet.md
@@ -190,7 +190,7 @@ There are many $r$-finite activation functions commonly used in neural networks,
 - sigmoid $\sigma(x)=\frac{1}{1+e^{-x}}$ for any positive integer $r$, 
 - tanh $\tanh(x)=\frac{e^x-e^{-x}}{e^x+e^{-x}}$ for any positive integer $r$. 
 
-The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on $\mathbb{R}^{2d}$. But this does not tell us anything about how to optimize the network. This is can be done with any common [neural network optimizer](../Optimizer.md) and these neural network optimizers always rely on a corresponding loss function.  
+The universal approximation theorems state that we can, in principle, get arbitrarily close to any symplectomorphism defined on $\mathbb{R}^{2d}$. But this does not tell us anything about how to optimize the network. This is can be done with any common [neural network optimizer](@ref "Neural Network Optimizers") and these neural network optimizers always rely on a corresponding loss function.  
 
 ## Loss function
 
diff --git a/docs/src/optimizers/manifold_related/global_sections.md b/docs/src/optimizers/manifold_related/global_sections.md
index 95bba26dc..974bf4b2e 100644
--- a/docs/src/optimizers/manifold_related/global_sections.md
+++ b/docs/src/optimizers/manifold_related/global_sections.md
@@ -41,7 +41,7 @@ meaning that for an element of the [horizontal component of the Lie algebra](@re
 
 ## Optimization
 
-The output of `global_rep` is then used for all the [optimization steps](../../Optimizer.md).
+The output of `global_rep` is then used for all the [optimization steps](@ref "Neural Network Optimizers").
 
 ## References 
 
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 70efd822c..24a1170de 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -36,7 +36,7 @@ Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for
 
 3. **General tangent space to a homogeneous space** $T_Y\mathcal{M}$ with $Y = AE$: For $\Delta=ABE\in{}T_Y\mathcal{M}$ the exponential map is simply $A\exp(B)E$. This is the general case which we deal with.  
 
-The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function `retraction` in `GeometricMachineLearning` performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with `apply_section` and represented through the red vertical line in the diagram on the [general optimizer framework](../../Optimizer.md).
+The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function `retraction` in `GeometricMachineLearning` performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with `apply_section` and represented through the red vertical line in the diagram on the [general optimizer framework](@ref "Neural Network Optimizers").
 
 
 ### Word of caution
diff --git a/docs/src/optimizers/optimizer_framework.md b/docs/src/optimizers/optimizer_framework.md
index 7856d8021..d70e76bc6 100644
--- a/docs/src/optimizers/optimizer_framework.md
+++ b/docs/src/optimizers/optimizer_framework.md
@@ -1,16 +1,29 @@
-# Optimizer
+# Neural Network Optimizers
+
+During *optimization* we aim at changing the neural network parameters in such a way to minimize the loss function. So if we express the loss function ``L`` as a function of the neural network weights ``\Theta`` in a parameter space ``\mathbb{P}`` we can phrase the task as: 
+
+```@eval
+Main.definition(raw"Given a neural network ``\mathcal{NN}`` parametrized by ``\Theta`` and a loss function ``L:\mathbb{P}\to\mathbb{R}`` we call an algorithm an **iterative optimizer** (or simply **optimizer**) if it performs the following task:
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\Theta \leftarrow \mathtt{Optimizer}(\Theta, \mathrm{past history}, t),
+" * Main.indentation * raw"```
+" * Main.indentation * raw"with the aim of decreasing the value ``L(\Theta)`` in each optimization step.")
+```
+
+The past history of the optimization is stored in a cache ([`AdamCache`](@ref), [`MomentumCache`](@ref), [`GradientCache`](@ref) etc.) in `GeometricMachineLearning`.
 
 Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the *gradient flow equation*:
+
 ```math
-\dot{\theta} = -\nabla_\theta{}L,
+\dot{\theta} = -\nabla_\Theta{}L,
 ```
 by means of a Euler time-stepping scheme: 
 ```math
-\theta^{t+1} = \theta^{t} - h\nabla_{\theta^{t}}L,
+\Theta^{t+1} = \Theta^{t} - h\nabla_{\Theta^{t}}L,
 ```
-where $\eta$ (the time step of the Euler scheme) is referred to as the *learning rate*
+where ``\eta`` (the time step of the Euler scheme) is referred to as the *learning rate*. 
 
-This equation can easily be generalized to [manifolds](../manifolds/manifolds.md) by replacing the *Euclidean gradient* $\nabla_{\theta^{t}L}$ by a *Riemannian gradient* $-h\mathrm{grad}_{\theta^{t}}L$ and addition by $-h\nabla_{\theta^{t}}L$ with a [retraction](../optimizers/manifold_related/retractions.md) by $-h\mathrm{grad}_{\theta^{t}}L$.
+This equation can easily be generalized to [manifolds](@ref "(Matrix) Manifolds") by replacing the *Euclidean gradient* ``\nabla_{\Theta^{t}}L`` by a *Riemannian gradient* $-h\mathrm{grad}_{\theta^{t}}L$ and addition by $-h\nabla_{\theta^{t}}L$ with a [retraction](../optimizers/manifold_related/retractions.md) by $-h\mathrm{grad}_{\theta^{t}}L$.
 
 ## Generalization
 
@@ -30,6 +43,12 @@ The red line maps the horizontal component at $Y$, i.e. $\mathfrak{g}^{\mathrm{h
 
 The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this the optimer ([Adam](optimizers/adam_optimizer.md) in this case) computes a final velocity, which is the input of a [retraction](optimizers/manifold_related/retractions.md). Because this *update* is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called `apply_section` here, that then finally updates the network parameters. The two red lines are described in [global sections](@ref "Global Sections").
 
+## Library Functions
+
+```@docs; canonical = false
+Optimizer
+```
+
 ## References 
 
 ```@bibliography
diff --git a/docs/src/reduced_order_modeling/symplectic_autoencoder.md b/docs/src/reduced_order_modeling/symplectic_autoencoder.md
index ffa8cf352..5ac6b7734 100644
--- a/docs/src/reduced_order_modeling/symplectic_autoencoder.md
+++ b/docs/src/reduced_order_modeling/symplectic_autoencoder.md
@@ -81,7 +81,7 @@ In order to overcome this difficulty we use neural networks, more specifically [
 Main.include_graphics("../tikz/symplectic_autoencoder") # hide
 ```
 
-So we alternate between SympNet and PSD layers. Because all the PSD layers are based on matrices $\Phi\in{}St(n,N)$ we have to [optimize on the Stiefel manifold](../Optimizer.md).
+So we alternate between SympNet and PSD layers. Because all the PSD layers are based on matrices $\Phi\in{}St(n,N)$ we have to [optimize on the Stiefel manifold](@ref "Neural Network Optimizers").
 
 
 ## References 
diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md
index c9a523923..438ffec5f 100644
--- a/docs/src/tutorials/sympnet_tutorial.md
+++ b/docs/src/tutorials/sympnet_tutorial.md
@@ -121,7 +121,7 @@ parameterlength(g_nn.model)
 
 *Remark*: We can also specify whether we would like to start with a layer that changes the $q$-component or one that changes the $p$-component. This can be done via the keywords `init_upper` for `GSympNet`, and `init_upper_linear` and `init_upper_act` for `LASympNet`.
 
-We have to define an optimizer which will be use in the training of the SympNet. For more details on optimizer, please see the [corresponding documentation](../Optimizer.md). In this example we use [Adam](../optimizers/adam_optimizer.md):
+We have to define an optimizer which will be use in the training of the SympNet. For more details on optimizer, please see the [corresponding documentation](@ref "Neural Network Optimizers"). In this example we use [Adam](../optimizers/adam_optimizer.md):
 
 ```@example sympnet
 # set up optimizer; for this we first need to specify the optimization method (argue for why we need the optimizer method)

From d819680a6f45644af02f886c6fe23bb4a04701ba Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 13:47:50 +0200
Subject: [PATCH 004/101] Replaced Adam -> update

---
 docs/src/tikz/general_optimization.tex                    | 4 ++--
 docs/src/tikz/general_optimization_with_boundary.tex      | 4 ++--
 docs/src/tikz/general_optimization_with_boundary_dark.tex | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/src/tikz/general_optimization.tex b/docs/src/tikz/general_optimization.tex
index 4b68d8f04..c50bfac05 100644
--- a/docs/src/tikz/general_optimization.tex
+++ b/docs/src/tikz/general_optimization.tex
@@ -18,8 +18,8 @@
 
     \draw[->] (TYM) -- (ghorY) node[pos=.5, left] {$\Omega$};
     \draw[->, mred] (ghorY) -- (ghor);
-    \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {Adam};
-    \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {Adam};
+    \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {\texttt{update!}};
+    \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {\texttt{update!}};
     \draw[->] (ghor2) -- (M) node[pos=.5, right] {Retraction};
 \end{tikzpicture}
 \end{document}
\ No newline at end of file
diff --git a/docs/src/tikz/general_optimization_with_boundary.tex b/docs/src/tikz/general_optimization_with_boundary.tex
index a576864d9..3be28bcbd 100644
--- a/docs/src/tikz/general_optimization_with_boundary.tex
+++ b/docs/src/tikz/general_optimization_with_boundary.tex
@@ -31,8 +31,8 @@
 
     \draw[->] (TYM) -- (ghorY) node[pos=.5, left] {$\Omega$};
     \draw[->, mred] (ghorY) -- (ghor);
-    \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {Adam};
-    \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {Adam};
+    \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {\texttt{update!}};
+    \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {\texttt{update!}};
     \draw[->] (ghor2) -- (M) node[pos=.5, right] {Retraction};
     \draw[->, mgreen] (M2) -- (Euc) node[pos=.5, left] {\color{mgreen}AD};
     \draw[->] (Euc) -- (TYM) node[pos=.5, above] {$\mathtt{rgrad}$};
diff --git a/docs/src/tikz/general_optimization_with_boundary_dark.tex b/docs/src/tikz/general_optimization_with_boundary_dark.tex
index afb2564a3..5a188c63d 100644
--- a/docs/src/tikz/general_optimization_with_boundary_dark.tex
+++ b/docs/src/tikz/general_optimization_with_boundary_dark.tex
@@ -31,8 +31,8 @@
     \coordinate[below of=Euc, yshift=.7cm, xshift=-.3cm] (below_Euc);
     \draw[->, white] (TYM) -- (ghorY) node[pos=.5, left] {$\Omega$};
     \draw[->, mred] (ghorY) -- (ghor);
-    \draw[->, white] (ghor) -- (cache) node[pos=.5, sloped, below] {Adam};
-    \draw[->, white] (cache) -- (ghor2) node[pos=.5, sloped, above] {Adam};
+    \draw[->, white] (ghor) -- (cache) node[pos=.5, sloped, below] {\texttt{update!}};
+    \draw[->, white] (cache) -- (ghor2) node[pos=.5, sloped, above] {\texttt{update!}};
     \draw[->, white] (ghor2) -- (M) node[pos=.5, right] {Retraction};
     \draw[->, mgreen] (M2) -- (below_Euc) node[pos=.5, left] {\color{mgreen}AD};
     \draw[->, white] (Euc) -- (TYM) node[pos=.5, above] {$\mathtt{rgrad}$};

From f93c685abbeb9ad900d9194578ed725d8f68d398 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 14:45:27 +0200
Subject: [PATCH 005/101] Removed the horizontal lift. This is already
 discussed in global_tangent_spaces.

---
 docs/make.jl                                   | 10 +++-------
 .../manifold_related/horizontal_lift.md        | 18 ------------------
 2 files changed, 3 insertions(+), 25 deletions(-)
 delete mode 100644 docs/src/optimizers/manifold_related/horizontal_lift.md

diff --git a/docs/make.jl b/docs/make.jl
index f2f622e12..248e0b8aa 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -50,7 +50,7 @@ end
 function latex_graphics(path::String; label = nothing, caption = nothing, width = .5)
     figure_width = "$(width)\\textwidth"
     latex_label = isnothing(label) ? "" : "\\label{" * label * "}" 
-    latex_caption = isnothing(caption) ? "" : "\\caption{" * Markdown.parse(caption) * "}"
+    latex_caption = isnothing(caption) ? "" : "\\caption{" * string(Markdown.parse(caption)) * "}"
     latex_string = """\\begin{figure}
             \\includegraphics[width = """ * figure_width * "]{" * path * ".png}" *
             latex_caption *
@@ -150,13 +150,9 @@ makedocs(;
             "Symmetric and Skew-Symmetric Matrices" => "arrays/skew_symmetric_matrix.md",
             "Global Tangent Spaces" => "arrays/global_tangent_spaces.md",
         ],
-        "Optimizer Framework" => [
-            "Optimizers" => "Optimizer.md",
-            "General Optimization" => "optimizers/general_optimization.md",
+        "Optimizers" => [
+            "Optimizers" => "optimizers/optimizer_framework.md",
             "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
-        ],
-        "Optimizer Functions" => [
-            "Horizontal Lift" => "optimizers/manifold_related/horizontal_lift.md",
             "Global Sections" => "optimizers/manifold_related/global_sections.md",
             "Retractions" => "optimizers/manifold_related/retractions.md",
             "Geodesic Retraction" => "optimizers/manifold_related/geodesic.md",
diff --git a/docs/src/optimizers/manifold_related/horizontal_lift.md b/docs/src/optimizers/manifold_related/horizontal_lift.md
deleted file mode 100644
index 0646c6f47..000000000
--- a/docs/src/optimizers/manifold_related/horizontal_lift.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# The Horizontal Lift 
-
-For each element $Y\in\mathcal{M}$ we can perform a splitting $\mathfrak{g} = \mathfrak{g}^{\mathrm{hor}, Y}\oplus\mathfrak{g}^{\mathrm{ver}, Y}$, where the two subspaces are the **horizontal** and the **vertical** component of $\mathfrak{g}$ at $Y$ respectively. For homogeneous spaces: $T_Y\mathcal{M} = \mathfrak{g}\cdot{}Y$, i.e. every tangent space to $\mathcal{M}$ can be expressed through the application of the Lie algebra to the relevant element. The vertical component consists of those elements of $\mathfrak{g}$ which are mapped to the zero element of $T_Y\mathcal{M}$, i.e. 
-
-```math
-\mathfrak{g}^{\mathrm{ver}, Y} := \mathrm{ker}(\mathfrak{g}\to{}T_Y\mathcal{M}).
-```
-
-The orthogonal complement[^1] of $\mathfrak{g}^{\mathrm{ver}, Y}$ is the horizontal component and is referred to by $\mathfrak{g}^{\mathrm{hor}, Y}$. This is naturally isomorphic to $T_Y\mathcal{M}$. For the Stiefel manifold the horizontal lift has the simple form: 
-
-```math
-\Omega(Y, V) = \left(\mathbb{I} - \frac{1}{2}\right)VY^T - YV^T(\mathbb{I} - \frac{1}{2}YY^T).
-```
-
-If the element $Y$ is the distinct element $E$, then the elements of $\mathfrak{g}^{\mathrm{hor},E}$ take a particularly simple form, see [Global Tangent Space](@ref "Global Tangent Spaces") for a description of this. 
-
-
-[^1]: The orthogonal complement is taken with respect to a metric defined on $\mathfrak{g}$. For the case of $G=SO(N)$ and $\mathfrak{g}=\mathfrak{so}(N) = \{A:A+A^T =0\}$ this metric can be chosen as $(A_1,A_2)\mapsto{}\frac{1}{2}A_1^TA_2$.

From 604ed4a952d100819ff864e47a08989071ac67b8 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:41:31 +0200
Subject: [PATCH 006/101] Fixed paths to new figures (in an ugly way).

---
 docs/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/Makefile b/docs/Makefile
index d6404243e..d5a9a4db9 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -75,6 +75,8 @@ put_figures_outside_of_minted_environment:
 	sed -i'' -e 's/\\\\texttt/\\texttt/g' build/G*.tex;
 	sed -i'' -e 's/\\\\_/\\_/g' build/G*.tex;
 	sed -i'' -e 's/tangent_space.png/manifolds\/tangent_space.png/g' build/G*.tex;
+	sed -i'' -e 's/retraction_comparison.png/optimizers\/manifold_related\/retraction_comparison.png/g' build/G*.tex;
+	sed -i'' -e 's/retraction_discrepancy.png/optimizers\/manifold_related\/retraction_discrepancy.png/g' build/G*.tex;
 
 make_correct_thrm_and_dfntn_and_xmpl_and_proof_environment:
 	sed -i'' -e 's/{\\textbackslash}begin\\{thrm\\}/\\begin{thrm}/g' build/G*.tex;

From a7be6e0531690422d8b9322a8223cf6f0f1feb05 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:41:56 +0200
Subject: [PATCH 007/101] Fixed additional \n at end of string.

---
 docs/make.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/make.jl b/docs/make.jl
index 248e0b8aa..b718158f7 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -3,6 +3,7 @@ using Documenter
 using DocumenterCitations
 using Markdown
 using Bibliography
+using LaTeXStrings
 # using Weave
 
 # this is necessary to avoid warnings. See https://documenter.juliadocs.org/dev/man/syntax/
@@ -50,7 +51,7 @@ end
 function latex_graphics(path::String; label = nothing, caption = nothing, width = .5)
     figure_width = "$(width)\\textwidth"
     latex_label = isnothing(label) ? "" : "\\label{" * label * "}" 
-    latex_caption = isnothing(caption) ? "" : "\\caption{" * string(Markdown.parse(caption)) * "}"
+    latex_caption = isnothing(caption) ? "" : "\\caption{" * string(Markdown.parse(caption))[1:end-2] * "}"
     latex_string = """\\begin{figure}
             \\includegraphics[width = """ * figure_width * "]{" * path * ".png}" *
             latex_caption *

From 7eec00a34c21b6745b90a8fac4e909813f8bec34 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:42:17 +0200
Subject: [PATCH 008/101] Added O'Neill book.

---
 docs/src/GeometricMachineLearning.bib | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index b50dac1e3..593c047cf 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -61,7 +61,7 @@ @book{lipschutz1965general
     author={Seymour Lipschutz},
     year={1965},
     publisher={McGraw-Hill Book Company},
-    location={New York City, New York}
+    address={New York City, New York}
 }
 
 @book{bishop1980tensor,
@@ -69,7 +69,15 @@ @book{bishop1980tensor
     author={Richard L. Bishop, Samuel I. Goldberg},
     year={1980},
     publisher={Dover Publications},
-    location={Mineola, New York}
+    address={Mineola, New York}
+}
+
+@book{o1983semi,
+  title={Semi-Riemannian geometry with applications to relativity},
+  author={O'neill, Barrett},
+  year={1983},
+  publisher={Academic press},
+  address={New York City, New York}
 }
 
 @book{do1992riemannian,
@@ -85,7 +93,7 @@ @book{wright2006numerical
   author={Stephen J. Wright, Jorge Nocedal},
   year={2006},
   publisher={Springer Science+Business Media},
-  location={New York, NY}
+  address={New York, NY}
 }
 
 @article{fresca2021comprehensive,

From 3c1645468716baecac885c4e6703333649b29563 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:44:25 +0200
Subject: [PATCH 009/101] Fixed missing backlash.

---
 docs/src/manifolds/homogeneous_spaces.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/manifolds/homogeneous_spaces.md b/docs/src/manifolds/homogeneous_spaces.md
index 825c7008f..eb6949629 100644
--- a/docs/src/manifolds/homogeneous_spaces.md
+++ b/docs/src/manifolds/homogeneous_spaces.md
@@ -22,7 +22,7 @@ The tangent spaces of ``\mathcal{M}`` are of the form ``T_Y\mathcal{M} = \mathfr
 Based on this we can perform a splitting of ``\mathfrak{g}`` into two parts:
 
 ```@eval
-Main.definition(raw"A **splitting of the Lie algebra** ``mathfrak{g}`` at an element of a homogeneous space ``Y`` is a decomposition into a **vertical** and a **horizontal** component, denoted by ``\mathfrak{g} = \mathfrak{g}^{\mathrm{ver},Y} \oplus \mathfrak{g}^{\mathrm{hor},Y}`` such that
+Main.definition(raw"A **splitting of the Lie algebra** ``\mathfrak{g}`` at an element of a homogeneous space ``Y`` is a decomposition into a **vertical** and a **horizontal** component, denoted by ``\mathfrak{g} = \mathfrak{g}^{\mathrm{ver},Y} \oplus \mathfrak{g}^{\mathrm{hor},Y}`` such that
 " * Main.indentation * raw"1. The *vertical component* ``\mathfrak{g}^{\mathrm{ver},Y}`` is the kernel of the map ``\mathfrak{g}\to{}T_Y\mathcal{M}, V \mapsto VY``, i.e. ``\mathfrak{g}^{\mathrm{ver},Y} = \{V\in\mathfrak{g}:VY = 0\}.``
 " * Main.indentation * raw"2. The *horizontal component* ``\mathfrak{g}^{\mathrm{hor},Y}`` is the orthogonal complement of ``\mathfrak{g}^{\mathrm{ver},Y}`` in ``\mathfrak{g}``. It is isomorphic to ``T_Y\mathcal{M}``.
 ")

From fb72208b316f07a660eb4eb9256f3515508e3316 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:44:55 +0200
Subject: [PATCH 010/101] Put 'such that' outside of equation environment.

---
 docs/src/manifolds/metric_and_vector_spaces.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/manifolds/metric_and_vector_spaces.md b/docs/src/manifolds/metric_and_vector_spaces.md
index 99d438d13..75a050a83 100644
--- a/docs/src/manifolds/metric_and_vector_spaces.md
+++ b/docs/src/manifolds/metric_and_vector_spaces.md
@@ -82,7 +82,7 @@ Main.indentation * raw"2. ``x + y = y + x,``
 " * 
 Main.indentation * raw"3. ``\exists 0 \in \mathcal{V}`` such that ``x + 0 = x,``
 " * 
-Main.indentation * raw"4. ``\exists -x \in \mathcal{V} such that ``x + (-x) = 0,``
+Main.indentation * raw"4. ``\exists -x \in \mathcal{V}`` such that ``x + (-x) = 0,``
 " * 
 Main.indentation * raw"5. ``a(ax) = (ab)x,``
 " * 

From 580db2af5df3317217365ed448cada985b5fafc5 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:45:16 +0200
Subject: [PATCH 011/101] Fixed some references.

---
 docs/src/optimizers/manifold_related/global_sections.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/global_sections.md b/docs/src/optimizers/manifold_related/global_sections.md
index 974bf4b2e..ab8422709 100644
--- a/docs/src/optimizers/manifold_related/global_sections.md
+++ b/docs/src/optimizers/manifold_related/global_sections.md
@@ -1,6 +1,6 @@
-# Global Sections
+# Global Sections for Homogeneous Spaces
 
-**Global sections** are needed needed for the generalization of [Adam](../adam_optimizer.md) and other optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"). They are necessary to perform the two mappings represented represented by horizontal and vertical red lines in the section on the general [optimizer framework](../../Optimizer.md).
+**Global sections** are needed needed for the generalization of [Adam](../adam_optimizer.md) and other optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"). They are necessary to perform the two mappings represented represented by horizontal and vertical red lines in the section on the general [optimizer framework](../optimizer_framework.md).
 
 ## Computing the global section
 In differential geometry a **section** is always associated to some **bundle**, in our case this bundle is $\pi:G\to\mathcal{M},A\mapsto{}AE$. A section is a mapping $\mathcal{M}\to{}G$ for which $\pi$ is a left inverse, i.e. $\pi\circ\lambda = \mathrm{id}$. 
@@ -17,7 +17,7 @@ In `GeometricMachineLearning`, `GlobalSection` takes an element of $Y\in{}St(n,N
 
 ## Computing the global tangent space representation based on a global section
 
-The output of the [horizontal lift](horizontal_lift.md) $\Omega$ is an element of $\mathfrak{g}^{\mathrm{hor},Y}$. For this mapping $\Omega(Y, B{}Y) = B$ if $B\in\mathfrak{g}^{\mathrm{hor},Y}$, i.e. there is **no information loss** and no projection is performed. We can map the $B\in\mathfrak{g}^{\mathrm{hor},Y}$ to $\mathfrak{g}^\mathrm{hor}$ with $B\mapsto{}\lambda(Y)^{-1}B\lambda(Y)$.
+The output of the [horizontal lift](@ref "Homogeneous Spaces") $\Omega$ is an element of $\mathfrak{g}^{\mathrm{hor},Y}$. For this mapping $\Omega(Y, B{}Y) = B$ if $B\in\mathfrak{g}^{\mathrm{hor},Y}$, i.e. there is **no information loss** and no projection is performed. We can map the $B\in\mathfrak{g}^{\mathrm{hor},Y}$ to $\mathfrak{g}^\mathrm{hor}$ with $B\mapsto{}\lambda(Y)^{-1}B\lambda(Y)$.
 
 The function `global_rep` performs both mappings at once[^1], i.e. it takes an instance of `GlobalSection` and an element of $T_YSt(n,N)$, and then returns an element of $\frak{g}^\mathrm{hor}\equiv$`StiefelLieAlgHorMatrix`.
 

From 8bfe0a5e0735c8ce226730f25d193e8ce46c0520 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:45:45 +0200
Subject: [PATCH 012/101] Added pictures and information to retraction section.

---
 .../manifold_related/retractions.md           | 206 ++++++++++++++++--
 1 file changed, 186 insertions(+), 20 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 24a1170de..3e22eb326 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -1,33 +1,189 @@
 # Retractions
 
-## Classical Definition
-Classically, retractions are defined as maps smooth maps 
+In practice we usually do not solve the geodesic equation exactly in each optimization step (even though this is possible and computationally feasible), but prefer approximations that are called "retractions" [absil2008optimization](@cite) for stability. The definition of a retraction in `GeometricMachineLearning` is slightly different from how it is usually defined in textbooks [absil2008optimization, hairer2006geometric](@cite). We discuss the differences here.
 
-```math
-R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v)
+## Classical Retractions
+
+By "classical retraction" we here mean the textbook definition. 
+
+```@eval
+Main.theorem(raw"A **classical retraction** is a smooth map
+" * Main.indentation * raw"```math 
+" * Main.indentation * raw"R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v),
+" * Main.indentation * raw"```
+" * Main.indentation * raw"such that each curve ``c(t) := R_x(tv)`` locally approximates the geodesic, i.e. the following two conditions hold:
+" * Main.indentation * raw"1. ``c(0) = x`` and 
+" * Main.indentation * raw"2. ``c'(0) = v.``
+")
 ```
 
-such that each curve $c(t) := R_x(tv)$ satisfies $c(0) = x$ and $c'(0) = v$.
+Perhaps the most common example for matrix manifolds is the *Cayley retraction*:
 
-## In `GeometricMachineLearning`
+```@eval
+Main.example(raw"The **Cayley retraction** is defined as
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\mathrm{Cayley}(V_x) = \left(\mathbb{I} - \frac{1}{2}V_x\right)^{-1}\left(\mathbb{I} +\frac{1}{2}V_x\right).
+" * Main.indentation * raw"```")
+```
+
+We should mention that the factor ``\frac{1}{2}`` is sometimes left out in the definition of the Cayley transform. But if we leave this out we do not have a retraction. 
+
+We want to compare the [`geodesic`](@ref) retraction with the [`cayley`](@ref) retraction for the example we already introduced when talking about the [exponential map](@ref "Geodesic Sprays and the Exponential Map"):
+
+```@setup s2_retraction
+using GeometricMachineLearning
+using CairoMakie # hide
+import Random # hide
+Random.seed!(123) # hide
+
+Y = rand(StiefelManifold, 3, 1)
+
+v = 5 * rand(3, 1)
+Δ = v - Y * (v' * Y)
+
+function do_setup()
+    fig = Figure(; backgroundcolor = :transparent) # hide
+    text_color = Main.output_type == :html ? :white : :black # hide
+    ax = Axis3(fig[1, 1]; # hide
+            backgroundcolor = :transparent, # hide
+            aspect = (1., 1., 1.), # hide
+            azimuth = π / 6, # hide
+            elevation = π / 8, # hide
+            xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
+            ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
+            zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
+            ) # hide
+
+    # plot a sphere with radius one and origin 0
+    surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)
+
+    morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
+    point_vec = ([Y[1]], [Y[2]], [Y[3]])
+    scatter!(ax, point_vec...; color = morange, marker = :star5)
 
-Retractions are a map from the **horizontal component** of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ to the respective manifold.
+    fig, ax, point_vec
+end
 
-For optimization in neural networks (almost always first order) we solve a gradient flow equation 
+mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
+mblue = RGBf(31 / 256, 119 / 256, 180 / 256)
 
-```math
-\dot{W} = -\mathrm{grad}_WL, 
+nothing
 ```
-where $\mathrm{grad}_WL$ is the **Riemannian gradient** of the loss function $L$ evaluated at position $W$.
 
-If we deal with Euclidean spaces (vector spaces), then the Riemannian gradient is just the result of an AD routine and the solution of the equation above can be approximated with $W^{t+1} \gets W^t - \eta\nabla_{W^t}L$, where $\eta$ is the **learning rate**. 
+```@example s2_retraction
+Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]
 
-For manifolds, after we obtained the Riemannian gradient (see e.g. the section on [Stiefel manifold](@ref "The Stiefel Manifold")), we have to solve a **geodesic equation**. This is a canonical ODE associated with any Riemannian manifold. 
+Y_increments_geodesic = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]
+Y_increments_cayley = [cayley(Y, Δ_increment) for Δ_increment in Δ_increments]
+ # hide
+function make_plot(; theme=:light) # hide
+
+fig, ax, point_vec = do_setup() # hide
+
+Y_zeros = zeros(length(Y_increments_geodesic))
+Y_geodesic_reshaped = [copy(Y_zeros), copy(Y_zeros), copy(Y_zeros)]
+Y_cayley_reshaped = [copy(Y_zeros), copy(Y_zeros), copy(Y_zeros)]
+
+zip_ob = zip(Y_increments_geodesic, Y_increments_cayley, axes(Y_increments_geodesic, 1))
+
+for (Y_increment_geodesic, Y_increment_cayley, i) in zip_ob
+    Y_geodesic_reshaped[1][i] = Y_increment_geodesic[1]
+    Y_geodesic_reshaped[2][i] = Y_increment_geodesic[2]
+    Y_geodesic_reshaped[3][i] = Y_increment_geodesic[3]
+
+    Y_cayley_reshaped[1][i] = Y_increment_cayley[1]
+    Y_cayley_reshaped[2][i] = Y_increment_cayley[2]
+    Y_cayley_reshaped[3][i] = Y_increment_cayley[3]
+end
+
+scatter!(ax, Y_geodesic_reshaped...; 
+        color = mred, markersize = 5, label = "geodesic retraction")
+
+scatter!(ax, Y_cayley_reshaped...; 
+        color = mblue, markersize = 5, label = "Cayley retraction")
+
+arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]]) # hide
+arrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02) # hide
+text_color = theme == :light ? :black : :white
+axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) # hide
+
+fig, ax, zip_ob, Y_increments_geodesic, Y_increments_cayley # hide
+end # hide
+
+if Main.output_type == :html # hide
+    save("retraction_comparison.png",        make_plot(; theme = :light)[1]; px_per_unit = 1.5) # hide
+    save("retraction_comparison_dark.png",   make_plot(; theme = :dark )[1]; px_per_unit = 1.5) # hide
+elseif Main.output_type == :latex # hide
+    save("retraction_comparison.png",       make_plot(; theme = :light)[1]; px_per_unit = 2.0) # hide
+end # hide
+
+Main.include_graphics("retraction_comparison"; caption = raw"Comparison between the geodesic and the Cayley retraction.", width = .8) # hide
+```
 
-The general theory of Riemannian manifolds is rather complicated, but for the neural networks treated in `GeometricMachineLearning`, we only rely on optimization of matrix Lie groups and [homogeneous spaces](../../manifolds/homogeneous_spaces.md), which is much simpler. 
+We see that for small ``\Delta`` increments the Cayley retraction seems to match the geodesic retraction very well, but for larger values there is a notable discrepancy:
 
-For Lie groups each tangent space is isomorphic to its Lie algebra $\mathfrak{g}\equiv{}T_\mathbb{I}G$. The geodesic map from $\mathfrak{g}$ to $G$, for matrix Lie groups with bi-invariant Riemannian metric like $SO(N)$, is simply the application of the matrix exponential $\exp$. Alternatively this can be replaced by the Cayley transform (see (Absil et al, 2008).)
+```@setup s2_retraction
+function plot_discrepancies(discrepancies; theme = :light)
+    fig = Figure(; backgroundcolor = :transparent) # hide
+    text_color = theme == :dark ? :white : :black # hide
+    ax = Axis(fig[1, 1]; # hide
+            backgroundcolor = :transparent, # hide
+            xlabel = rich("η", font = :italic, color = text_color), # hide
+            ylabel = rich("discrepancy", color = text_color), # hide
+            ) # hide
+    lines!(discrepancies; label = "Discrepancies between geodesic and Cayley retraction.", 
+        linewidth = 2, color = mblue)
+
+    axislegend(; position = (.22, .9), backgroundcolor = :transparent, color = text_color) # hide
+
+    fig, ax
+end
+```
+
+```@example s2_retraction
+using LinearAlgebra: norm
+
+_, __, zip_ob, Y_increments_geodesic, Y_increments_cayley = make_plot() # hide
+discrepancies = [norm(Y_geo_inc - Y_cay_inc) for (Y_geo_inc, Y_cay_inc, _) in zip_ob]
+
+if Main.output_type == :html # hide
+    save("retraction_discrepancy.png",        plot_discrepancies(discrepancies; theme = :light)[1]; px_per_unit = 1.5) # hide
+    save("retraction_discrepancy_dark.png",   plot_discrepancies(discrepancies; theme = :dark )[1]; px_per_unit = 1.5) # hide
+elseif Main.output_type == :latex # hide
+    save("retraction_discrepancy.png",        plot_discrepancies(discrepancies; theme = :light)[1]; px_per_unit = 2.0) # hide
+end # hide
+
+Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .8) # hide
+```
+
+## In `GeometricMachineLearning`
+
+The way we use *retractions*[^1] in `GeometricMachineLearning` is slightly different from their classical definition:
+
+[^1]: Classical retractions are also defined in `GeometricMachineLearning` under the same name, i.e. there is e.g. a method [`cayley(::StiefelLieAlgHorMatrix)`](@ref) and a method [`cayley(::StiefelManifold, ::AbstractMatrix)`](@ref) (the latter being the classical retraction); but the user is *strongly discouraged* from using classical retractions as these are computational inefficient.
+
+```@eval
+Main.definition(raw"A **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}\mathrm{hor}\to\mathcal{M}`` such that 
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\Delta \mapsto \lambda(Y)\mathrm{Retraction}(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"is a classical retraction.")
+```
+
+We now discuss how two of these retractions, the geodesic retraction (exponential map) and the Cayley retraction, are implemented in `GeometricMachineLearning`.
+
+## The Geodesic Retraction
+
+The *geodesic retraction* is a retraction whose associated curve is also the unique geodesic. For many matrix Lie groups (including ``SO(N)``) geodesics are obtained by simply evaluating the exponential map [absil2008optimization, o1983semi](@cite):
  
+```@eval
+Main.theorem(raw"The geodesic on a matrix Lie group ``G`` with bi-invariant metric for ``B\in{}T_AG`` is simply
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\gamma(t) = \exp(t\cdotBA^-1)A,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"where ``\exp:\mathcal{g}\to{}G`` is the matrix exponential map.")
+```
+
 Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for more complicated cases: 
 
 1. **General tangent space to a Lie group** $T_AG$: The geodesic map for an element $V\in{}T_AG$ is simply $A\exp(A^{-1}V)$.
@@ -36,7 +192,7 @@ Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for
 
 3. **General tangent space to a homogeneous space** $T_Y\mathcal{M}$ with $Y = AE$: For $\Delta=ABE\in{}T_Y\mathcal{M}$ the exponential map is simply $A\exp(B)E$. This is the general case which we deal with.  
 
-The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function `retraction` in `GeometricMachineLearning` performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with `apply_section` and represented through the red vertical line in the diagram on the [general optimizer framework](@ref "Neural Network Optimizers").
+The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function `retraction` in `GeometricMachineLearning` performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with `apply_section` and represented through the red vertical line in the diagram describing [general optimizer framework](@ref "Neural Network Optimizers").
 
 
 ### Word of caution
@@ -44,10 +200,20 @@ The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neil
 The Lie group corresponding to the Stiefel manifold $SO(N)$ has a bi-invariant Riemannian metric associated with it: $(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)$.
 For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021).)
 
-## References 
+## Library Functions
 
-- Absil P A, Mahony R, Sepulchre R. Optimization algorithms on matrix manifolds[M]. Princeton University Press, 2008.
+```@docs; canonical=false
+geodesic
+cayley
+```
+
+## References 
 
-- Bendokat T, Zimmermann R. The real symplectic Stiefel and Grassmann manifolds: metrics, geodesics and applications[J]. arXiv preprint arXiv:2108.12447, 2021.
+```@bibliography
+Pages = []
+Canonical = false
 
-- O'Neill, Barrett. Semi-Riemannian geometry with applications to relativity. Academic press, 1983.
\ No newline at end of file
+absil2008optimization
+bendokat2021real
+o1983semi
+```
\ No newline at end of file

From 47fb7517ea67b59fe9f9759e4b7be4e2c548965c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:46:25 +0200
Subject: [PATCH 013/101] Added description for all the steps involved in the
 optimizer framework.

---
 docs/src/optimizers/optimizer_framework.md | 32 ++++++++++++++--------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/docs/src/optimizers/optimizer_framework.md b/docs/src/optimizers/optimizer_framework.md
index d70e76bc6..d2ecebbb3 100644
--- a/docs/src/optimizers/optimizer_framework.md
+++ b/docs/src/optimizers/optimizer_framework.md
@@ -1,11 +1,13 @@
 # Neural Network Optimizers
 
+In this section we present the general Optimizer framework used in `GeometricMachineLearning`. For more information on the particular steps involved in this consult the documentation on the various optimizer methods such as the *momentum optimizer* and the [Adam optimizer](@ref "The Adam Optimizer"), and the documentation on [retractions](@ref "Retractions").
+
 During *optimization* we aim at changing the neural network parameters in such a way to minimize the loss function. So if we express the loss function ``L`` as a function of the neural network weights ``\Theta`` in a parameter space ``\mathbb{P}`` we can phrase the task as: 
 
 ```@eval
 Main.definition(raw"Given a neural network ``\mathcal{NN}`` parametrized by ``\Theta`` and a loss function ``L:\mathbb{P}\to\mathbb{R}`` we call an algorithm an **iterative optimizer** (or simply **optimizer**) if it performs the following task:
 " * Main.indentation * raw"```math
-" * Main.indentation * raw"\Theta \leftarrow \mathtt{Optimizer}(\Theta, \mathrm{past history}, t),
+" * Main.indentation * raw"\Theta \leftarrow \mathtt{Optimizer}(\Theta, \text{past history}, t),
 " * Main.indentation * raw"```
 " * Main.indentation * raw"with the aim of decreasing the value ``L(\Theta)`` in each optimization step.")
 ```
@@ -15,38 +17,46 @@ The past history of the optimization is stored in a cache ([`AdamCache`](@ref),
 Optimization for neural networks is (almost always) some variation on gradient descent. The most basic form of gradient descent is a discretization of the *gradient flow equation*:
 
 ```math
-\dot{\theta} = -\nabla_\Theta{}L,
+\dot{\Theta} = -\nabla_\Theta{}L,
 ```
-by means of a Euler time-stepping scheme: 
+by means of an Euler time-stepping scheme: 
 ```math
 \Theta^{t+1} = \Theta^{t} - h\nabla_{\Theta^{t}}L,
 ```
 where ``\eta`` (the time step of the Euler scheme) is referred to as the *learning rate*. 
 
-This equation can easily be generalized to [manifolds](@ref "(Matrix) Manifolds") by replacing the *Euclidean gradient* ``\nabla_{\Theta^{t}}L`` by a *Riemannian gradient* $-h\mathrm{grad}_{\theta^{t}}L$ and addition by $-h\nabla_{\theta^{t}}L$ with a [retraction](../optimizers/manifold_related/retractions.md) by $-h\mathrm{grad}_{\theta^{t}}L$.
+This equation can easily be generalized to [manifolds](@ref "(Matrix) Manifolds") by replacing the *Euclidean gradient* ``\nabla_{\Theta^{t}}L`` by a *Riemannian gradient* ``-h\mathrm{grad}_{\Theta^{t}}L`` and addition by ``-h\nabla_{\Theta^{t}}L`` with the [exponential map](@ref "Geodesic Sprays and the Exponential Map") of ``-h\mathrm{grad}_{\theta^{t}}L``. In practice we often use approximations ot the exponential map however. These are called [retractions](@ref "Retractions").
 
-## Generalization
+## Generalization to Homogeneous Spaces
 
-In order to generalize neural network optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"), a class of manifolds we often encounter in machine learning, we have to find a [global tangent space representation](@ref "Global Tangent Spaces") which we call $\mathfrak{g}^\mathrm{hor}$ here. 
+In order to generalize neural network optimizers to [homogeneous spaces](@ref "Homogeneous Spaces") we utilize their corresponding [global tangent space representation](@ref "Global Tangent Spaces") ``\mathfrak{g}^\mathrm{hor}``. 
 
-Starting from an element of the tangent space $T_Y\mathcal{M}$[^1], we need to perform two mappings to arrive at $\mathfrak{g}^\mathrm{hor}$, which we refer to by $\Omega$ and a red horizontal arrow:
+When introducing the notion of a [global tangent space](@ref "Global Tangent Spaces") we discussed how an element of the tangent space ``T_Y\mathcal{M}`` can be represented in ``\mathfrak{g}^\mathrm{hor}`` by performing two mappings: the first one is the horizontal lift ``\Omega`` (see the docstring for [`GeometricMachineLearning.Ω`](@ref)) and the second one is the adjoint operation[^1] with the lift of ``Y`` called ``\lambda(Y)``. We can visualize the steps required in performing this generalization:
 
-[^1]: In practice this is obtained by first using an AD routine on a loss function $L$, and then computing the Riemannian gradient based on this. See the section of the [Stiefel manifold](@ref "The Stiefel Manifold") for an example of this.
+[^1]: By the *adjoint operation* ``\mathrm{ad}_A:\mathfrak{g}\to\mathfrak{g}`` for an element ``A\in{}G`` we mean ``B \mapsto A^{-1}BA``.
 
 ```@example
 Main.include_graphics("../tikz/general_optimization_with_boundary") # hide
 ```
 
-Here the mapping $\Omega$ is a [horizontal lift](manifold_related/horizontal_lift.md) from the tangent space onto the **horizontal component of the Lie algebra at $Y$**. 
+The `cache` stores information about previous optimization steps and is dependent on the optimizer. In general the cache is represented as one or more elements in ``\mathfrak{g}^\mathrm{hor}``. Based on this the optimizer method (represented by [`update!`](@ref) in the figure) computes a *final velocity*. This final velocity is again an element of ``\mathfrak{g}^\mathrm{hor}``.
+
+The final velocity is then fed into a [retraction](@ref "Retractions")[^2]. For computational reasons we split the retraction into two steps, referred to as "Retraction" and [`apply_section`](@ref) above. These two mappings together are equivalent to: 
+
+[^2]: A retraction is an approximation of the [exponential map](@ref "Geodesic Sprays and the Exponential Map")
+
+```math
+\mathrm{retraction}(\Delta) = \mathrm{retraction}(\lambda(Y)B^\Delta{}E) = \lambda(Y)\mathrm{Retraction}(B^\Delta), 
+```
 
-The red line maps the horizontal component at $Y$, i.e. $\mathfrak{g}^{\mathrm{hor},Y}$, to the horizontal component at $\mathfrak{g}^\mathrm{hor}$.
+where ``\Delta\in{}T_\mathcal{M}`` and ``B^\Delta`` is its representation in ``\mathfrak{g}^\mathrm{hor}`` as ``B^\Delta = \lambda(Y)^{-1}\Omega(\Delta)\lambda(Y).``
 
-The $\mathrm{cache}$ stores information about previous optimization steps and is dependent on the optimizer. The elements of the $\mathrm{cache}$ are also in $\mathfrak{g}^\mathrm{hor}$. Based on this the optimer ([Adam](optimizers/adam_optimizer.md) in this case) computes a final velocity, which is the input of a [retraction](optimizers/manifold_related/retractions.md). Because this *update* is done for $\mathfrak{g}^{\mathrm{hor}}\equiv{}T_Y\mathcal{M}$, we still need to perform a mapping, called `apply_section` here, that then finally updates the network parameters. The two red lines are described in [global sections](@ref "Global Sections").
 
 ## Library Functions
 
 ```@docs; canonical = false
 Optimizer
+update!
 ```
 
 ## References 

From a618d5d23c308b0045e2edfd8f766e86f8d83b41 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:46:52 +0200
Subject: [PATCH 014/101] Exporting cayley.

---
 src/GeometricMachineLearning.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl
index ecbe956ff..867a9c662 100644
--- a/src/GeometricMachineLearning.jl
+++ b/src/GeometricMachineLearning.jl
@@ -186,7 +186,7 @@ module GeometricMachineLearning
     export GlobalSection, apply_section
     export global_rep
     export Geodesic, Cayley
-    export geodesic
+    export geodesic, cayley
     export retraction
     # export ⊙², √ᵉˡᵉ, /ᵉˡᵉ, scalar_add
     export update!

From f7e38d7f9570188872481778e8605a8f2acf5684 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:47:28 +0200
Subject: [PATCH 015/101] Fixed bmatrix -> pmatrix.

---
 src/arrays/stiefel_projection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arrays/stiefel_projection.jl b/src/arrays/stiefel_projection.jl
index 09e8a8e39..be3c207e6 100644
--- a/src/arrays/stiefel_projection.jl
+++ b/src/arrays/stiefel_projection.jl
@@ -1,7 +1,7 @@
 @doc raw"""
     StiefelProjection(backend, T, N, n)
 
-Make a matrix of the form ``\begin{bmatrix} \mathbb{I} & \mathbb{O} \end{pmatrix}^T`` for a specific backend and data type.
+Make a matrix of the form ``\begin{pmatrix} \mathbb{I} & \mathbb{O} \end{pmatrix}^T`` for a specific backend and data type.
 
 An array that essentially does `vcat(I(n), zeros(N-n, n))` with GPU support. 
 

From e445abb5241eadad062345cf2e313b93786dc3b6 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:48:08 +0200
Subject: [PATCH 016/101] Added cayley as classical retraction.

---
 .../manifold_related/retractions.jl           | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/optimizers/manifold_related/retractions.jl b/src/optimizers/manifold_related/retractions.jl
index 823d27ef0..92e4217ee 100644
--- a/src/optimizers/manifold_related/retractions.jl
+++ b/src/optimizers/manifold_related/retractions.jl
@@ -80,6 +80,28 @@ end
 
 cayley(B::NamedTuple) = apply_toNT(cayley, B)
 
+@doc raw"""
+    cayley(Y::Manifold, Δ)
+
+Take as input an element of a manifold `Y` and a tangent vector in `Δ` in the corresponding tangent space and compute the Cayley retraction.
+
+In different notation: take as input an element ``x`` of ``\mathcal{M}`` and an element of ``T_x\mathcal{M}`` and return ``\mathrm{Cayley}(v_x).`` For example: 
+
+```julia 
+Y = rand(StiefelManifold{Float64}, N, n)
+Δ = rgrad(Y, rand(N, n))
+cayley(Y, Δ)
+```
+
+See the docstring for [`rgrad`](@ref) for details on this function.
+"""
+function cayley(Y::Manifold{T}, Δ::AbstractMatrix{T}) where T
+    λY = GlobalSection(Y)
+    B = global_rep(λY, Δ)
+    cayleyB = cayley(B)
+    apply_section(λY, cayleyB)
+end
+
 function cayley(B::StiefelLieAlgHorMatrix{T}) where T
     E = StiefelProjection(B)
     unit = one(B.A)

From 497468fa8f0ec0e2ddc5227580371654110cbb86 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:48:36 +0200
Subject: [PATCH 017/101] Added various docstrings.

---
 src/optimizers/optimizer.jl | 68 ++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/optimizers/optimizer.jl b/src/optimizers/optimizer.jl
index 5d8743bcc..24739cf53 100644
--- a/src/optimizers/optimizer.jl
+++ b/src/optimizers/optimizer.jl
@@ -1,9 +1,11 @@
 @doc raw"""
-Optimizer struct that stores the 'method' (i.e. Adam with corresponding hyperparameters), the cache and the optimization step.
+    Optimizer(method, cache, step)
+
+Store the `method` (e.g. [`AdamOptimizer`](@ref) with corresponding hyperparameters), the `cache` (e.g. [`AdamCache`](@ref)) and the optimization step.
 
 It takes as input an optimization method and the parameters of a network. 
 
-For *technical reasons* we first specify an OptimizerMethod that stores all the hyperparameters of the optimizer. 
+For *technical reasons* we first specify an [`OptimizerMethod`](@ref) that stores all the hyperparameters of the optimizer. 
 """
 mutable struct Optimizer{MT<:OptimizerMethod, CT}
     method::MT
@@ -11,51 +13,66 @@ mutable struct Optimizer{MT<:OptimizerMethod, CT}
     step::Int
 end
 
-function Optimizer(m::OptimizerMethod, x::Union{Tuple, NamedTuple})
-    Optimizer(m, init_optimizer_cache(m, x), 0)
+@doc raw"""
+    Optimizer(method, nn_params)
+
+Allocate the cache for a specific `method` and `nn_params` for an instance of `Optimizer`.
+
+Internally this calls [`init_optimizer_cache`](@ref).
+"""
+function Optimizer(method::OptimizerMethod, nn_params::Union{Tuple, NamedTuple})
+    Optimizer(method, init_optimizer_cache(method, nn_params), 0)
 end
 
 """
+    Optimizer(method, nn::NeuralNetwork)
+
+Allocate the cache for a specific `method` and a `NeuralNetwork` for an instance of `Optimizer`.
+
+Internally this calls `Optimizer(method, nn.params)`.
+
 Typically the Optimizer is not initialized with the network parameters, but instead with a NeuralNetwork struct.
 """
-function Optimizer(m::OptimizerMethod, nn::NeuralNetwork)
-    Optimizer(m, nn.params)
+function Optimizer(method::OptimizerMethod, nn::NeuralNetwork)
+    Optimizer(method, nn.params)
 end
 
 Optimizer(nn::NeuralNetwork, m::OptimizerMethod) = Optimizer(m, nn)
 
+@doc raw"""
+    update!(o, cache, B)
+
+First update the `cache` and then update the array `B` based on the optimizer `o`. 
+
+Note that ``B\in\mathfrak{g}^\mathrm{hor}`` in general.
+"""
+function update!(::Optimizer, ::AbstractCache, ::AbstractArray) end
+
 #######################################################################################
 # optimization step function
 
 @doc raw"""
-Optimization for a single layer. 
+    optimization_step!(o, layer, ps, cache, dx)
 
-inputs: 
-- `o::Optimizer`
-- `d::Union{AbstractExplicitLayer, AbstractExplicitCell}`
-- `ps::NamedTuple`: the parameters 
-- `C::NamedTuple`: NamedTuple of the caches 
-- `dx::NamedTuple`: NamedTuple of the derivatives (output of AD routine)
+Update the weights `ps` of a `layer` based on an [`Optimizer`](@ref), a `cache` and first-order derivatives `dx`.
 
-`ps`, `C` and `dx` must have the same keys. 
+The derivatives `dx` here are usually obtained via an AD routine by differentiating a loss function, i.e. `dx` is ``\nabla_xL``.
+
+It is calling the function [`update!`](@ref) internally which has to be implemented for every [`OptimizerMethod`](@ref).
 """
-function optimization_step!(o::Optimizer, d::Union{AbstractExplicitLayer, AbstractExplicitCell}, ps::NamedTuple, C::NamedTuple, dx::NamedTuple)
+function optimization_step!(o::Optimizer, layer::Union{AbstractExplicitLayer, AbstractExplicitCell}, ps::NamedTuple, cache::NamedTuple, dx::NamedTuple)
     gx = rgrad(ps, dx)
     λY = GlobalSection(ps)
     B = global_rep(λY, gx)
-    update!(o, C, B)
-    ps₂ = retraction(d, B)
+    update!(o, cache, B)
+    ps₂ = retraction(layer, B)
     apply_section!(ps, λY, ps₂)
 end
 
 @doc raw"""
-Optimization for an entire neural network, the way this function should be called. 
+    optimization_step!(o::Optimizer, model::Chain, ps::Tuple, dx::Tuple)
 
-inputs: 
-- `o::Optimizer`
-- `model::Chain`
-- `ps::Tuple`
-- `dx::Tuple`
+Optimize a neural network built with `Chain`.
 """
 function optimization_step!(o::Optimizer, model::Chain, ps::Tuple, dx::Tuple)
     o.step += 1
@@ -64,6 +81,11 @@ function optimization_step!(o::Optimizer, model::Chain, ps::Tuple, dx::Tuple)
     end
 end
 
+@doc raw"""
+    optimization_step!(o::Optimizer, model::AbstractExplicitLayer, ps::NamedTuple, dx::NamedTuple)
+
+Optimize a neural network consisting of a single `AbstractExplicitLayer`.
+"""
 function optimization_step!(o::Optimizer, model::AbstractExplicitLayer, ps::NamedTuple, dx::NamedTuple)
     o.step += 1
 

From cb97d60fda31f2a35c52168e64e8327d61491897 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:48:59 +0200
Subject: [PATCH 018/101] Added docstrings.

---
 src/optimizers/optimizer_caches.jl | 55 +++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/src/optimizers/optimizer_caches.jl b/src/optimizers/optimizer_caches.jl
index 398eaa562..0bd84518c 100644
--- a/src/optimizers/optimizer_caches.jl
+++ b/src/optimizers/optimizer_caches.jl
@@ -1,9 +1,9 @@
 @doc raw"""
-AbstractCache has subtypes: 
-- AdamCache
-- MomentumCache
-- GradientCache
-- BFGSCache
+AbstractCache has subtypes:
+- [`AdamCache`](@ref)
+- [`MomentumCache`](@ref)
+- [`GradientCache`](@ref)
+- [`BFGSCache`](@ref)
 
 All of them can be initialized with providing an array (also supporting manifold types).
 """
@@ -12,6 +12,33 @@ abstract type AbstractCache{T} end
 #############################################################################
 # All the definitions of the caches
 
+@doc raw"""
+    AdamCache(Y)
+
+Store the first and second moment for `Y` (initialized as zeros).
+
+First and second moments are called `B₁` and `B₂`.
+
+If the cache is called with an instance of a homogeneous space, e.g. the [`StiefelManifold`](@ref) ``St(n,N)`` it initializes the moments as elements of ``\mathfrak{g}^\mathrm{hor}`` ([`StiefelLieAlgHorMatrix`](@ref)).
+
+# Examples
+
+```jldoctest
+using GeometricMachineLearning
+
+Y = rand(StiefelManifold, 5, 3)
+AdamCache(Y).B₁
+
+# output
+
+5×5 StiefelLieAlgHorMatrix{Float64, SkewSymMatrix{Float64, Vector{Float64}}, Matrix{Float64}}:
+ 0.0  -0.0  -0.0  -0.0  -0.0
+ 0.0   0.0  -0.0  -0.0  -0.0
+ 0.0   0.0   0.0  -0.0  -0.0
+ 0.0   0.0   0.0   0.0   0.0
+ 0.0   0.0   0.0   0.0   0.0
+```
+"""
 struct AdamCache{T, AT <: AbstractArray{T}} <: AbstractCache{T}
     B₁::AT
     B₂::AT 
@@ -20,6 +47,17 @@ struct AdamCache{T, AT <: AbstractArray{T}} <: AbstractCache{T}
     end
 end
 
+@doc raw"""
+    MomentumCache(Y)
+
+Store the moment for `Y` (initialized as zeros).
+
+The moment is called `B`.
+
+If the cache is called with an instance of a homogeneous space, e.g. the [`StiefelManifold`](@ref) ``St(n,N)`` it initializes the moments as elements of ``\mathfrak{g}^\mathrm{hor}`` ([`StiefelLieAlgHorMatrix`](@ref)).
+
+See [`AdamCache`](@ref).
+"""
 struct MomentumCache{T, AT <: AbstractArray{T}} <:AbstractCache{T}
     B::AT
     function MomentumCache(Y::AbstractArray)
@@ -27,6 +65,13 @@ struct MomentumCache{T, AT <: AbstractArray{T}} <:AbstractCache{T}
     end
 end
 
+@doc raw"""
+    GradientCache(Y)
+
+Do not store anything.
+
+The cache for the [`GradientOptimizer`](@ref) does not consider past information.
+"""
 struct GradientCache{T} <: AbstractCache{T} end
 GradientCache(::AbstractArray{T}) where T = GradientCache{T}()
 

From 27fa99106d750c87794d0a85d490197566a38bee Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 28 May 2024 17:49:18 +0200
Subject: [PATCH 019/101] Added docstrings.

---
 src/optimizers/optimizer_method.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/optimizers/optimizer_method.jl b/src/optimizers/optimizer_method.jl
index 6f99294cb..307726021 100644
--- a/src/optimizers/optimizer_method.jl
+++ b/src/optimizers/optimizer_method.jl
@@ -1,3 +1,11 @@
+@doc raw"""
+Each `Optimizer` has to be called with an `OptimizerMethod`. This specifies how the neural network weights are updated in each optimization step.
+"""
 abstract type OptimizerMethod end
 
+@doc raw"""
+    init_optimizer_cache(method, x)
+
+Initialize= the optimizer cache based on input `x` for the given `method`.
+"""
 function init_optimizer_cache(::OptimizerMethod, x) end
\ No newline at end of file

From 856e5fc14be04615539eec9a43272616c5940504 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:15:37 +0200
Subject: [PATCH 020/101] Added docstring and made code more legible.

---
 .../manifold_related/modified_exponential.jl  | 77 +++++++++++++++----
 1 file changed, 63 insertions(+), 14 deletions(-)

diff --git a/src/optimizers/manifold_related/modified_exponential.jl b/src/optimizers/manifold_related/modified_exponential.jl
index 1306e73d2..35522b703 100644
--- a/src/optimizers/manifold_related/modified_exponential.jl
+++ b/src/optimizers/manifold_related/modified_exponential.jl
@@ -1,25 +1,74 @@
-"""
-This implements exponential and inverse mappings.
-"""
+@doc raw"""
+    𝔄(A)
+
+Compute ``\mathfrak{A}(A) := \sum_{n=1}^\infty \frac{1}{n!} (A)^{n-1}.``
+
+# Implementation
+
+This uses a Taylor expansion that iteratively adds terms with
+
+```julia
+while norm(Aⁿ) > ε
+    mul!(A_temp, Aⁿ, A)
+    Aⁿ .= A_temp
+    rmul!(Aⁿ, T(inv(i)))
 
-#computes A^-1(exp(A) - I)
+    𝔄 += B
+    n += 1 
+end
+```
+
+until the norm of `Aⁿ` becomes smaller than machine precision. 
+The counter `n` in the above algorithm is initialized as `2`
+The matrices `Aⁿ` and `𝔄` are initialized as the identity matrix.
+"""
 function 𝔄(A::AbstractMatrix{T}) where T
-    B = one(A)
+    Aⁿ = one(A)
     C = one(A)
-    B_temp = zero(A)
-    i = 2
-    while norm(B) > eps(T)
-        mul!(B_temp, B, A)
-        B .= B_temp
-        rmul!(B, T(inv(i)))
-
-        C += B
-        i += 1 
+    A_temp = zero(A)
+    n = 2
+    while norm(Aⁿ) > eps(T)
+        mul!(A_temp, Aⁿ, A)
+        Aⁿ .= A_temp
+        rmul!(Aⁿ, T(inv(n)))
+
+        C += Aⁿ
+        n += 1 
     end
     #print("\nNumber of iterations is: ", i, "\n")
     C
 end
 
+@doc raw"""
+    𝔄(B̂, B̄)
+
+Compute ``\mathfrak{A}(B', B'') := \sum_{n=1}^\infty \frac{1}{n!} ((B'')^TB')^{n-1}.``
+
+This expression has the property ``\mathbb{I} +  B'\mathfrak{A}(B', B'')(B'')^T = \exp(B'(B'')^T).``
+
+# Examples
+
+```jldoctest
+using GeometricMachineLearning
+using GeometricMachineLearning: 𝔄
+import Random
+Random.seed!(123)
+
+B = rand(StiefelLieAlgHorMatrix, 10, 2)
+B̂ = hcat(vcat(.5 * B.A, B.B), vcat(one(B.A), zero(B.B)))
+B̄ = hcat(vcat(one(B.A), zero(B.B)), vcat(-.5 * B.A, -B.B))
+
+one(B̂ * B̄') + B̂ * 𝔄(B̂, B̄) * B̄' ≈ exp(B)
+
+# output
+
+true
+```
+"""
+function 𝔄(B̂::AbstractMatrix{T}, B̄::AbstractMatrix{T}) where T
+    𝔄(B̄' * B̂)
+end
+
 function 𝔄exp(X::AbstractMatrix{T}, Y::AbstractMatrix{T}) where T
     I + X*𝔄(Y*X)*Y
 end
\ No newline at end of file

From 2f05a356dce9dd694e63608cd7af41a0a213e742 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:15:55 +0200
Subject: [PATCH 021/101] Added rmrk environment.

---
 docs/Makefile |  5 ++++-
 docs/make.jl  | 10 ++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/docs/Makefile b/docs/Makefile
index d5a9a4db9..7ee53a4af 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -78,7 +78,7 @@ put_figures_outside_of_minted_environment:
 	sed -i'' -e 's/retraction_comparison.png/optimizers\/manifold_related\/retraction_comparison.png/g' build/G*.tex;
 	sed -i'' -e 's/retraction_discrepancy.png/optimizers\/manifold_related\/retraction_discrepancy.png/g' build/G*.tex;
 
-make_correct_thrm_and_dfntn_and_xmpl_and_proof_environment:
+make_correct_thrm_and_dfntn_and_xmpl_and_rmrk_and_proof_environment:
 	sed -i'' -e 's/{\\textbackslash}begin\\{thrm\\}/\\begin{thrm}/g' build/G*.tex;
 	sed -i'' -e 's/{\\textbackslash}end\\{thrm\\}/\\end{thrm}/g' build/G*.tex;
 	sed -i'' -e 's/{\\textbackslash}label\\{th:\([a-zA-Z]*\)\\}/\\label{th:\1}/g' build/G*.tex;
@@ -90,6 +90,9 @@ make_correct_thrm_and_dfntn_and_xmpl_and_proof_environment:
 	sed -i'' -e 's/{\\textbackslash}label\\{xmpl:\([a-zA-Z]*\)\\}/\\label{xmpl:\1}/g' build/G*.tex;
 	sed -i'' -e 's/{\\textbackslash}begin\\{proof\\}/\\begin{proof}/g' build/G*.tex;
 	sed -i'' -e 's/{\\textbackslash}end\\{proof\\}/\\end{proof}/g' build/G*.tex;
+	sed -i'' -e 's/{\\textbackslash}begin\\{rmrk\\}/\\begin{rmrk}/g' build/G*.tex;
+	sed -i'' -e 's/{\\textbackslash}end\\{rmrk\\}/\\end{rmrk}/g' build/G*.tex;
+	sed -i'' -e 's/{\\textbackslash}label\\{rmrk:\([a-zA-Z]*\)\\}/\\label{rmrk:\1}/g' build/G*.tex;
 
 do_correct_quotation_marks:
 	sed -i'' -e 's/{\\textquotedbl}/"/g' build/G*.tex;
diff --git a/docs/make.jl b/docs/make.jl
index b718158f7..be5db5dd5 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -107,6 +107,16 @@ function example(statement::String; label::Union{Nothing, String} = nothing)
     end
 end
 
+function remark(statement::String; label::Union{Nothing, String} = nothing)
+    if Main.output_type == :html
+        Markdown.parse("""!!! info "Remark" 
+            \t $(statement)""")
+    else
+        theorem_label = isnothing(label) ? "" : raw"\label{rmrk:" * label * raw"}"
+        Markdown.parse(raw"\begin{rmrk}" * statement * theorem_label * raw"\end{rmrk}")
+    end
+end
+
 function proof(statement::String)
     if Main.output_type == :html
         Markdown.parse("""!!! details "Proof" 

From b7b51623cccde47a0dacb69bee71e8898eb886ac Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:16:26 +0200
Subject: [PATCH 022/101] Added references for efficient computation of
 exponential.

---
 docs/src/GeometricMachineLearning.bib | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index 593c047cf..5bd714daf 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -393,4 +393,25 @@ @article{hochreiter1997long
   pages={1735--1780},
   year={1997},
   publisher={MIT press}
+}
+
+@article{celledoni2000approximating,
+  title={Approximating the exponential from a Lie algebra to a Lie group},
+  author={Celledoni, Elena and Iserles, Arieh},
+  journal={Mathematics of Computation},
+  volume={69},
+  number={232},
+  pages={1457--1480},
+  year={2000}
+}
+
+@inproceedings{fraikin2007optimization,
+  title={Optimization over the Stiefel manifold},
+  author={Fraikin, Catherine and H{\"u}per, K and Dooren, P Van},
+  booktitle={PAMM: Proceedings in Applied Mathematics and Mechanics},
+  volume={7},
+  number={1},
+  pages={1062205--1062206},
+  year={2007},
+  organization={Wiley Online Library}
 }
\ No newline at end of file

From 533c25ecef780272401cfd9d2b5f31bea390e7b8 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:16:54 +0200
Subject: [PATCH 023/101] Correction: theorem -> definition.

---
 docs/src/arrays/global_tangent_spaces.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/arrays/global_tangent_spaces.md b/docs/src/arrays/global_tangent_spaces.md
index 9ea12dcc8..67a95bb48 100644
--- a/docs/src/arrays/global_tangent_spaces.md
+++ b/docs/src/arrays/global_tangent_spaces.md
@@ -29,7 +29,7 @@ We should note that we have written all Lie group and Lie algebra actions as sim
 Note that the theorem above requires us to find an element ``A\in{}G`` such that ``AE = Y``. If we can find a mapping ``\lambda:\mathcal{M}\to{}G`` we call such a mapping a *global section*. 
 
 ```@eval
-Main.theorem(raw"We call a mapping from ``\lambda:\mathcal{M} \to G`` a homogeneous space to its associated Lie group a **global section** if it satisfies:
+Main.definition(raw"We call a mapping from ``\lambda:\mathcal{M} \to G`` a homogeneous space to its associated Lie group a **global section** if it satisfies:
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"\lambda(Y)E = Y,
 " * Main.indentation * raw"```

From a7bcf920e29c595a9beb7a02fb1a995ecf582dfa Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:17:28 +0200
Subject: [PATCH 024/101] Expanded docs on retraction (including examples).

---
 .../manifold_related/retractions.md           | 184 +++++++++++++++---
 1 file changed, 155 insertions(+), 29 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 3e22eb326..34e4bb633 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -1,6 +1,6 @@
 # Retractions
 
-In practice we usually do not solve the geodesic equation exactly in each optimization step (even though this is possible and computationally feasible), but prefer approximations that are called "retractions" [absil2008optimization](@cite) for stability. The definition of a retraction in `GeometricMachineLearning` is slightly different from how it is usually defined in textbooks [absil2008optimization, hairer2006geometric](@cite). We discuss the differences here.
+In practice we usually do not solve the geodesic equation exactly in each optimization step (even though this is possible and computationally feasible), but prefer approximations that are called "retractions" [absil2008optimization](@cite) for stability. The definition of a retraction in `GeometricMachineLearning` is slightly different from how it is usually defined in textbooks [absil2008optimization, hairer2006geometric](@cite). We discuss these differences here.
 
 ## Classical Retractions
 
@@ -11,7 +11,7 @@ Main.theorem(raw"A **classical retraction** is a smooth map
 " * Main.indentation * raw"```math 
 " * Main.indentation * raw"R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v),
 " * Main.indentation * raw"```
-" * Main.indentation * raw"such that each curve ``c(t) := R_x(tv)`` locally approximates the geodesic, i.e. the following two conditions hold:
+" * Main.indentation * raw"such that each curve ``c(t) := R_x(tv)`` is a local approximation of a geodesic, i.e. the following two conditions hold:
 " * Main.indentation * raw"1. ``c(0) = x`` and 
 " * Main.indentation * raw"2. ``c'(0) = v.``
 ")
@@ -26,7 +26,7 @@ Main.example(raw"The **Cayley retraction** is defined as
 " * Main.indentation * raw"```")
 ```
 
-We should mention that the factor ``\frac{1}{2}`` is sometimes left out in the definition of the Cayley transform. But if we leave this out we do not have a retraction. 
+We should mention that the factor ``\frac{1}{2}`` is sometimes left out in the definition of the Cayley transform when used in different contexts. But it is necessary for defining a retraction. 
 
 We want to compare the [`geodesic`](@ref) retraction with the [`cayley`](@ref) retraction for the example we already introduced when talking about the [exponential map](@ref "Geodesic Sprays and the Exponential Map"):
 
@@ -41,9 +41,9 @@ Y = rand(StiefelManifold, 3, 1)
 v = 5 * rand(3, 1)
 Δ = v - Y * (v' * Y)
 
-function do_setup()
+function do_setup(; theme=:light)
     fig = Figure(; backgroundcolor = :transparent) # hide
-    text_color = Main.output_type == :html ? :white : :black # hide
+    text_color = theme == :dark ? :white : :black # hide
     ax = Axis3(fig[1, 1]; # hide
             backgroundcolor = :transparent, # hide
             aspect = (1., 1., 1.), # hide
@@ -71,14 +71,17 @@ nothing
 ```
 
 ```@example s2_retraction
-Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]
+η_increments = 0.1 : 0.1 : 2.5
+Δ_increments = [Δ * η for η in η_increments]
 
 Y_increments_geodesic = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]
 Y_increments_cayley = [cayley(Y, Δ_increment) for Δ_increment in Δ_increments]
  # hide
 function make_plot(; theme=:light) # hide
 
-fig, ax, point_vec = do_setup() # hide
+text_color = theme == :light ? :black : :white # hide
+
+fig, ax, point_vec = do_setup(; theme = theme) # hide
 
 Y_zeros = zeros(length(Y_increments_geodesic))
 Y_geodesic_reshaped = [copy(Y_zeros), copy(Y_zeros), copy(Y_zeros)]
@@ -87,24 +90,21 @@ Y_cayley_reshaped = [copy(Y_zeros), copy(Y_zeros), copy(Y_zeros)]
 zip_ob = zip(Y_increments_geodesic, Y_increments_cayley, axes(Y_increments_geodesic, 1))
 
 for (Y_increment_geodesic, Y_increment_cayley, i) in zip_ob
-    Y_geodesic_reshaped[1][i] = Y_increment_geodesic[1]
-    Y_geodesic_reshaped[2][i] = Y_increment_geodesic[2]
-    Y_geodesic_reshaped[3][i] = Y_increment_geodesic[3]
+    for d in (1, 2, 3)
+        Y_geodesic_reshaped[d][i] = Y_increment_geodesic[d]
 
-    Y_cayley_reshaped[1][i] = Y_increment_cayley[1]
-    Y_cayley_reshaped[2][i] = Y_increment_cayley[2]
-    Y_cayley_reshaped[3][i] = Y_increment_cayley[3]
+        Y_cayley_reshaped[d][i] = Y_increment_cayley[d]
+    end
 end
 
 scatter!(ax, Y_geodesic_reshaped...; 
-        color = mred, markersize = 5, label = "geodesic retraction")
+        color = mred, markersize = 5, label = rich("geodesic retraction"; color = text_color))
 
 scatter!(ax, Y_cayley_reshaped...; 
-        color = mblue, markersize = 5, label = "Cayley retraction")
+        color = mblue, markersize = 5, label = rich("Cayley retraction"; color = text_color))
 
 arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]]) # hide
 arrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02) # hide
-text_color = theme == :light ? :black : :white
 axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) # hide
 
 fig, ax, zip_ob, Y_increments_geodesic, Y_increments_cayley # hide
@@ -131,7 +131,7 @@ function plot_discrepancies(discrepancies; theme = :light)
             xlabel = rich("η", font = :italic, color = text_color), # hide
             ylabel = rich("discrepancy", color = text_color), # hide
             ) # hide
-    lines!(discrepancies; label = "Discrepancies between geodesic and Cayley retraction.", 
+    lines!(η_increments, discrepancies; label = rich("Discrepancies between geodesic and Cayley retraction", color = text_color), 
         linewidth = 2, color = mblue)
 
     axislegend(; position = (.22, .9), backgroundcolor = :transparent, color = text_color) # hide
@@ -153,7 +153,7 @@ elseif Main.output_type == :latex # hide
     save("retraction_discrepancy.png",        plot_discrepancies(discrepancies; theme = :light)[1]; px_per_unit = 2.0) # hide
 end # hide
 
-Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .8) # hide
+Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .6) # hide
 ```
 
 ## In `GeometricMachineLearning`
@@ -163,7 +163,7 @@ The way we use *retractions*[^1] in `GeometricMachineLearning` is slightly diffe
 [^1]: Classical retractions are also defined in `GeometricMachineLearning` under the same name, i.e. there is e.g. a method [`cayley(::StiefelLieAlgHorMatrix)`](@ref) and a method [`cayley(::StiefelManifold, ::AbstractMatrix)`](@ref) (the latter being the classical retraction); but the user is *strongly discouraged* from using classical retractions as these are computational inefficient.
 
 ```@eval
-Main.definition(raw"A **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}\mathrm{hor}\to\mathcal{M}`` such that 
+Main.definition(raw"Given a section ``\lambda:\mathcal{M}\to{}G`` a **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}^\mathrm{hor}\to\mathcal{M}`` such that 
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"\Delta \mapsto \lambda(Y)\mathrm{Retraction}(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E,
 " * Main.indentation * raw"```
@@ -172,39 +172,165 @@ Main.definition(raw"A **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}
 
 We now discuss how two of these retractions, the geodesic retraction (exponential map) and the Cayley retraction, are implemented in `GeometricMachineLearning`.
 
-## The Geodesic Retraction
+## Retractions for Homogeneous Spaces
+
+Here we harness special properties of homogeneous spaces to obtain computationally efficient retractions for the [Stiefel manifold](@ref "The Stiefel Manifold") and the [Grassmann manifold](@ref "The Grassmann Manifold"). This is also discussed in e.g. [bendokat2020grassmann, bendokat2021real](@cite).
 
 The *geodesic retraction* is a retraction whose associated curve is also the unique geodesic. For many matrix Lie groups (including ``SO(N)``) geodesics are obtained by simply evaluating the exponential map [absil2008optimization, o1983semi](@cite):
  
 ```@eval
-Main.theorem(raw"The geodesic on a matrix Lie group ``G`` with bi-invariant metric for ``B\in{}T_AG`` is simply
+Main.theorem(raw"The geodesic on a compact matrix Lie group ``G`` with bi-invariant metric for ``B\in{}T_AG`` is simply
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"\gamma(t) = \exp(t\cdotBA^-1)A,
 " * Main.indentation * raw"```
 " * Main.indentation * raw"where ``\exp:\mathcal{g}\to{}G`` is the matrix exponential map.")
 ```
 
-Starting from this basic map $\exp:\mathfrak{g}\to{}G$ we can build mappings for more complicated cases: 
+Because ``SO(N)`` is compact and we furnish it with the canonical metric, i.e. 
 
-1. **General tangent space to a Lie group** $T_AG$: The geodesic map for an element $V\in{}T_AG$ is simply $A\exp(A^{-1}V)$.
+```math
+    g:T_AG\times{}T_AG \to \mathbb{R}, (B_1, B_2) \mapsto \mathrm{Tr}(B_1^TB_2) = \mathrm{Tr}((B_1A^{-1})^T(B_2A^{-1})),
+```
 
-2. **Special tangent space to a homogeneous space** $T_E\mathcal{M}$: For $V=BE\in{}T_E\mathcal{M}$ the exponential map is simply $\exp(B)E$. 
+its geodesics are thus equivalent to the exponential maps. We now use this observation to obtain expression for the geodesics on the [Stiefel manifold](@ref "The Stiefel Manifold") ``St(n, N)``. We use the following theorem from [o1983semi; Proposition 25.7](@cite):
 
-3. **General tangent space to a homogeneous space** $T_Y\mathcal{M}$ with $Y = AE$: For $\Delta=ABE\in{}T_Y\mathcal{M}$ the exponential map is simply $A\exp(B)E$. This is the general case which we deal with.  
+```@eval
+Main.theorem(raw"The geodesics for a naturally-reductive homogeneous space ``\mathcal{M}`` starting at ``Y`` are given by:
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\gamma_{\Delta}(t) = \exp(t\cdot\Omega(\Delta))Y,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"where the ``\exp`` is the exponential map for the Lie group ``G`` corresponding to ``\mathcal{M}``.")
+```
 
-The general theory behind points 2. and 3. is discussed in chapter 11 of (O'Neill, 1983). The function `retraction` in `GeometricMachineLearning` performs $\mathfrak{g}^\mathrm{hor}\to\mathcal{M}$, which is the second of the above points. To get the third from the second point, we simply have to multiply with a matrix from the left. This step is done with `apply_section` and represented through the red vertical line in the diagram describing [general optimizer framework](@ref "Neural Network Optimizers").
+The theorem requires the homogeneous space to be naturally reductive: 
 
+```@eval
+Main.definition(raw"A homogeneous space is called **naturally-reductive** if the following two conditions hold:
+" * Main.indentation * raw"1. ``A^{-1}BA\in\mathfrak{g}^\mathrm{hor}`` for every ``B\in\mathfrak{g}^\mathrm{hor}`` and ``A\in\exp(\mathfrak{g}^\mathrm{ver}),
+" * Main.indentation * raw"2. ``g([X, Y]^\mathrm{hor}, Z) = g(X, [Y, Z]^\mathrm{hor})`` for all ``X, Y, Z \in \mathfrak{g}^\mathrm{hor}``,
+" * Main.indentation * raw"where ``[X, Y]^\mathrm{hor} = \Omega(XYE - YXE)``. If only the first condition holds the homogeneous space is called **reductive** but not **naturally-reductive**.")
+```
+
+We state here without proof that the [Stiefel manifold](@ref "The Stiefel Manifold") and the [Grassmann manifold](@ref "The Grassmann Manifold") are naturally-reductive. An empirical verification of this is very easy:
+
+```@example naturally_reductive
+using GeometricMachineLearning
+import Random # hide
+Random.seed!(123) # hide
 
-### Word of caution
+B = rand(SkewSymMatrix, 6) # ∈ 𝔤
+A = exp(B - StiefelLieAlgHorMatrix(B, 3)) # ∈ exp(𝔤ᵛᵉʳ)
+
+X = rand(StiefelLieAlgHorMatrix, 6, 3) # ∈ 𝔤ʰᵒʳ
+Y = rand(StiefelLieAlgHorMatrix, 6, 3) # ∈ 𝔤ʰᵒʳ
+Z = rand(StiefelLieAlgHorMatrix, 6, 3) # ∈ 𝔤ʰᵒʳ
+
+@assert StiefelLieAlgHorMatrix(A' * X * A, 3) ≈ A' * X * A # hide
+A' * X * A # this has to be in 𝔤ʰᵒʳ for St(3, 6) to be reductive
+```
+
+verifies the first property and
+
+```@example naturally_reductive
+using LinearAlgebra: tr
+adʰᵒʳ(X, Y) = StiefelLieAlgHorMatrix(X * Y - Y * X, 3)
+
+@assert tr(adʰᵒʳ(X, Y)' * Z) ≈ tr(X' * adʰᵒʳ(Y, Z)) # hide
+tr(adʰᵒʳ(X, Y)' * Z) ≈ tr(X' * adʰᵒʳ(Y, Z))
+```
+
+verifies the second.
+
+In `GeometricMachineLearning` we always work with elements in ``\mathfrak{g}^\mathrm{hor}`` and the Lie group ``G`` is always ``SO(N)``. We hence use:
+
+```math
+    \gamma_\Delta(t) = \exp(\lambda(Y)\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y)\lambda(Y)^{-1})Y = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E,
+```
+
+where we have used that 
+
+```math
+ \exp(\Lambda{}B\Lambda^{-1}) = \sum_{n = 0}^\infty \frac{1}{n!}(\Lambda{}B\Lambda^{-1})^n = \sum_{n = 0}^\infty \frac{1}{n!}\underbrace{(\Lambda{}B\Lambda^{-1})}_{\text{$n$ times}} = \sum_{n = 0}^\infty \Lambda\frac{1}{n!}B^n\Lambda^{-1}.
+```
 
-The Lie group corresponding to the Stiefel manifold $SO(N)$ has a bi-invariant Riemannian metric associated with it: $(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)$.
-For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult (see (Bendokat et al, 2021).)
+Based on this we define the maps: 
+
+```math
+\mathtt{geodesic}: \mathfrak{g}^\mathrm{hor} \to \mathcal{M}, B \mapsto \exp(B)E,
+```
+
+and
+
+```math
+\mathtt{cayley}: \mathfrak{g}^\mathrm{hor} \to \mathcal{M}, B \mapsto \mathrm{Cayley}(B)E,
+```
+
+where ``B = \lambda(Y)^{-1}\Omega(\Delta)\lambda(Y)``. These expressions for `geodesic` and `cayley` are the ones that we typically use in `GeometricMachineLearning` for computational reasons. We show how we can utilize the sparse structure of ``\mathfrak{g}^\mathrm{hor}`` for computing the geodesic retraction and the Cayley retraction (i.e. the expressions ``\exp(B)`` and ``\mathrm{Cayley}(B)`` for ``B\in\mathfrak{g}^\mathrm{hor}``). Similar derivations can be found in [celledoni2000approximating, fraikin2007optimization, bendokat2021real](@cite).
+
+### The Geodesic Retraction
+
+An element of ``\mathfrak{g}^\mathrm{hor}`` can be written as:
+
+```math
+\begin{bmatrix}
+    A & -B^T \\ 
+    B & \mathbb{O}
+\end{bmatrix} = \begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} =: B'(B'')^Ts,
+```
+
+where we exploit the sparse structure of the array, i.e. it is a multiplication of a ``N\times2n`` with a ``2n\times{}N`` matrix.
+
+We further use the following: 
+
+```math
+    \begin{aligned}
+    \exp(B'(B'')^T) & = \sum_{n=0}^\infty \frac{1}{n!} (B'(B'')^T)^n = \mathbb{I} + \sum_{n=0}^\infty \frac{1}{n!} B'\sum_{n=1}^\infty B'((B'')^TB')(B'')^T \\
+    & = \mathbb{I} + B'\left( \sum_{n=1}^\infty \frac{1}{n!} ((B'')^TB')^{n-1} \right)B'' =: \mathbb{I} + B'\mathfrak{A}(B', B'')B'',
+    \end{aligned}
+```
+
+where we defined ``\mathfrak{A}(B', B'') := \sum_{n=1}^\infty \frac{1}{n!} ((B'')^TB')^{n-1}.`` Note that evaluating ``\mathfrak{A}`` relies on computing products of *small* matrices of size ``2n\times2n.`` We do this by relying on a simple Taylor expansion (see the docstring for [`GeometricMachineLearning.𝔄`](@ref)).
+
+### The Cayley Retraction
+
+For the Cayley retraction we leverage the decomposition of ``B = B'(B'')^T\in\mathfrak{g}^\mathrm{hor}`` through the *Sherman-Morrison-Woodbury formula*:
+
+```math
+(\mathbb{I} - \frac{1}{2}B'(B'')^T)^{-1} = \mathbb{I} + \frac{1}{2}B'(\mathbb{I} - \frac{1}{2}B'(B'')^T)^{-1}(B'')^T
+```
+
+So what we have to compute the inverse of:
+
+```math
+\mathbb{I} - \frac{1}{2}\begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} = 
+\begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}.
+```
+
+By leveraging the sparse structure of the matrices in ``\mathfrak{g}^\mathrm{hor}`` we arrive at the following expression for the Cayley retraction (similar to the case of the geodesic retraction):
+
+```math
+\left(\mathbb{I} + \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}  \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} \right)\left( E +  \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}\ \right) = \\
+E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O}  \end{bmatrix}\left(
+    \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}  + 
+    \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}\left(
+        \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix} + 
+        \begin{bmatrix} \frac{1}{2}A \\ \frac{1}{4}A^2 - \frac{1}{2}B^TB \end{bmatrix}
+    \right)
+    \right)
+```
+
+
+```@eval
+Main.remark(raw"As mentioned previously the Lie group ``SO(N)``, i.e. the one corresponding to the Stiefel manifold and the Grassmann manifold, has a bi-invariant Riemannian metric associated with it: ``(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)``.
+For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult [bendokat2021real](@cite).")
+```
 
 ## Library Functions
 
 ```@docs; canonical=false
 geodesic
 cayley
+GeometricMachineLearning.𝔄
 ```
 
 ## References 

From 770bf7f04a8559f7057cb3967189fd76bb11277c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 16:44:50 +0200
Subject: [PATCH 025/101] Moved geodesic retraction and Cayley retraction into
 retractions.md.

---
 docs/make.jl                                  |  2 -
 .../src/optimizers/manifold_related/cayley.md | 49 -------------------
 .../optimizers/manifold_related/geodesic.md   |  3 --
 3 files changed, 54 deletions(-)
 delete mode 100644 docs/src/optimizers/manifold_related/cayley.md
 delete mode 100644 docs/src/optimizers/manifold_related/geodesic.md

diff --git a/docs/make.jl b/docs/make.jl
index be5db5dd5..acb5a9b1a 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -166,8 +166,6 @@ makedocs(;
             "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
             "Global Sections" => "optimizers/manifold_related/global_sections.md",
             "Retractions" => "optimizers/manifold_related/retractions.md",
-            "Geodesic Retraction" => "optimizers/manifold_related/geodesic.md",
-            "Cayley Retraction" => "optimizers/manifold_related/cayley.md",
             "Adam Optimizer" => "optimizers/adam_optimizer.md",
             "BFGS Optimizer" => "optimizers/bfgs_optimizer.md",
             ],
diff --git a/docs/src/optimizers/manifold_related/cayley.md b/docs/src/optimizers/manifold_related/cayley.md
deleted file mode 100644
index ff39b30ee..000000000
--- a/docs/src/optimizers/manifold_related/cayley.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# The Cayley Retraction 
-
-The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra $\mathfrak{g}$ onto the Lie group $G$. 
-They Cayley retraction reads: 
-
-```math
-    \mathrm{Cayley}(C) = \left(\mathbb{I} -\frac{1}{2}C\right)^{-1}\left(\mathbb{I} +\frac{1}{2}C\right).
-```
-This is easily checked to be a retraction, i.e. $\mathrm{Cayley}(\mathbb{O}) = \mathbb{I}$ and $\frac{\partial}{\partial{}t}\mathrm{Cayley}(tC) = C$.
-
-What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of $\mathfrak{g}^\mathrm{hor}$, the [global tangent space representation](@ref "Global Tangent Spaces"). 
-
-The elements of $\mathfrak{g}^\mathrm{hor}$ can be written as: 
-
-```math
-C = \begin{bmatrix}
-    A & -B^T \\ 
-    B & \mathbb{O}
-\end{bmatrix} = \begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix},
-```
-
-where the second expression exploits the sparse structure of the array, i.e. it is a multiplication of a $N\times2n$ with a $2n\times{}N$ matrix. We can hence use the **Sherman-Morrison-Woodbury formula** to obtain:
-
-```math
-(\mathbb{I} - \frac{1}{2}UV)^{-1} = \mathbb{I} + \frac{1}{2}U(\mathbb{I} - \frac{1}{2}VU)^{-1}V
-```
-
-So what we have to invert is the term 
-
-```math
-\mathbb{I} - \frac{1}{2}\begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} = 
-\begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}.
-```
-
-The whole Cayley transform is then: 
-
-```math
-\left(\mathbb{I} + \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}  \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} \right)\left( E +  \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}\ \right) = \\
-E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O}  \end{bmatrix}\left(
-    \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}  + 
-    \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}\left(
-        \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix} + 
-        \begin{bmatrix} \frac{1}{2}A \\ \frac{1}{4}A^2 - \frac{1}{2}B^TB \end{bmatrix}
-    \right)
-    \right)
-```
-
-
-Note that for computational reason we compute $\mathrm{Cayley}(C)E$ instead of just the Cayley transform (see the section on [retractions](retractions.md)).
\ No newline at end of file
diff --git a/docs/src/optimizers/manifold_related/geodesic.md b/docs/src/optimizers/manifold_related/geodesic.md
deleted file mode 100644
index 9ff9d645c..000000000
--- a/docs/src/optimizers/manifold_related/geodesic.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Geodesic Retraction
-
-General **retractions** are approximations of the exponential map. In `GeometricMachineLearning` we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying `Geodesic()` as the argument of layers that have manifold weights. 
\ No newline at end of file

From fa2eceacaffeb74375df8a190fe2d9d5cd76ebad Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 17:37:13 +0200
Subject: [PATCH 026/101] Started addign docs on parallel transport.

---
 docs/make.jl                                  |  1 +
 .../manifold_related/parallel_transport.md    | 74 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 docs/src/optimizers/manifold_related/parallel_transport.md

diff --git a/docs/make.jl b/docs/make.jl
index acb5a9b1a..dde0f8fae 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -166,6 +166,7 @@ makedocs(;
             "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
             "Global Sections" => "optimizers/manifold_related/global_sections.md",
             "Retractions" => "optimizers/manifold_related/retractions.md",
+            "Parallel transport" => "optimizers/manifol_related/parallel_transport.md",
             "Adam Optimizer" => "optimizers/adam_optimizer.md",
             "BFGS Optimizer" => "optimizers/bfgs_optimizer.md",
             ],
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
new file mode 100644
index 000000000..4ab528e57
--- /dev/null
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -0,0 +1,74 @@
+# Parallel Transport
+
+The concept of *parallel transport along a geodesic* ``\gamma:[0, T]\to\mathcal{M}`` describes moving a tangent vector from ``T_x\mathcal{M}`` to ``T_{\gamma(t)}\mathcal{M}`` such that its orientation with respect to the geodesic is preserved.
+
+```math
+\Pi_{A \to \exp(V_A)}\tilde{V}_A = \exp(V_AA^{-1})\tilde{V}_A
+```
+
+```math
+\Pi_{Y \to \gamma_\Delta(\eta)}\Delta_2 = \exp(\Omega(Y, \Delta))\Delta_2
+```
+
+We again use the example from when we introduced the concept of [geodesics](@ref "Geodesic Sprays and the Exponential Map").
+
+```@example s2_setup
+using GeometricMachineLearning
+using CairoMakie # hide
+import Random # hide
+Random.seed!(123) # hide
+
+Y = rand(StiefelManifold, 3, 1)
+
+v = 5 * rand(3, 1)
+v₂ = 5 * rand(3, 1)
+Δ = rgrad(Y, v)
+Δ₂ = rgrad(Y, v₂)
+
+fig = Figure(; backgroundcolor = :transparent) # hide
+text_color = Main.output_type == :html ? :white : :black # hide
+ax = Axis3(fig[1, 1]; # hide
+        backgroundcolor = :transparent, # hide
+        aspect = (1., 1., 1.), # hide
+        azimuth = π / 6, # hide
+        elevation = π / 8, # hide
+        xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
+        ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
+        zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
+        ) # hide
+
+# plot a sphere with radius one and origin 0
+surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)
+
+morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
+point_vec = ([Y[1]], [Y[2]], [Y[3]])
+scatter!(ax, point_vec...; color = morange, marker = :star5)
+
+mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
+arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])
+arrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02)
+
+mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
+arrow_vec2 = ([Δ₂[1]], [Δ₂[2]], [Δ₂[3]])
+arrows!(ax, point_vec..., arrow_vec2...; color = mpurple, linewidth = .02)
+
+fig
+```
+
+```@example s2_retraction
+Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]
+λY = GlobalSection(Y)
+
+B_increments = [global_rep(λY, Δ_increment) for Δ_increment in Δ_increments]
+
+... define parallel transport!!!
+A_increments = [geodesic(B_increment) for B_increment in B_increments]
+Y_increments = [apply_section(λY, A_increment) for A_increment in ]
+
+for Y_increment in Y_increments
+    scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; 
+        color = mred, markersize = 5)
+end
+
+fig
+```
\ No newline at end of file

From f4c77f12a322ebf14d1161e212eb2814bf842793 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 17:38:14 +0200
Subject: [PATCH 027/101] Fixed a problem with an equation environemnt and put
 the final remark into a remark environemnt.

---
 .../manifold_related/retractions.md           | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 34e4bb633..7ff39c4bb 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -206,7 +206,7 @@ The theorem requires the homogeneous space to be naturally reductive:
 
 ```@eval
 Main.definition(raw"A homogeneous space is called **naturally-reductive** if the following two conditions hold:
-" * Main.indentation * raw"1. ``A^{-1}BA\in\mathfrak{g}^\mathrm{hor}`` for every ``B\in\mathfrak{g}^\mathrm{hor}`` and ``A\in\exp(\mathfrak{g}^\mathrm{ver}),
+" * Main.indentation * raw"1. ``A^{-1}BA\in\mathfrak{g}^\mathrm{hor}`` for every ``B\in\mathfrak{g}^\mathrm{hor}`` and ``A\in\exp(\mathfrak{g}^\mathrm{ver}``),
 " * Main.indentation * raw"2. ``g([X, Y]^\mathrm{hor}, Z) = g(X, [Y, Z]^\mathrm{hor})`` for all ``X, Y, Z \in \mathfrak{g}^\mathrm{hor}``,
 " * Main.indentation * raw"where ``[X, Y]^\mathrm{hor} = \Omega(XYE - YXE)``. If only the first condition holds the homogeneous space is called **reductive** but not **naturally-reductive**.")
 ```
@@ -267,6 +267,11 @@ and
 
 where ``B = \lambda(Y)^{-1}\Omega(\Delta)\lambda(Y)``. These expressions for `geodesic` and `cayley` are the ones that we typically use in `GeometricMachineLearning` for computational reasons. We show how we can utilize the sparse structure of ``\mathfrak{g}^\mathrm{hor}`` for computing the geodesic retraction and the Cayley retraction (i.e. the expressions ``\exp(B)`` and ``\mathrm{Cayley}(B)`` for ``B\in\mathfrak{g}^\mathrm{hor}``). Similar derivations can be found in [celledoni2000approximating, fraikin2007optimization, bendokat2021real](@cite).
 
+```@eval
+Main.remark(raw"Further note that, even though the global section ``\lambda:\mathcal{M} \to G`` is not unique, the final geodesic ``
+\gamma_\Delta(t) = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E`` does not depend on the particular section we chose.")
+```
+
 ### The Geodesic Retraction
 
 An element of ``\mathfrak{g}^\mathrm{hor}`` can be written as:
@@ -275,7 +280,7 @@ An element of ``\mathfrak{g}^\mathrm{hor}`` can be written as:
 \begin{bmatrix}
     A & -B^T \\ 
     B & \mathbb{O}
-\end{bmatrix} = \begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} =: B'(B'')^Ts,
+\end{bmatrix} = \begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} =: B'(B'')^T,
 ```
 
 where we exploit the sparse structure of the array, i.e. it is a multiplication of a ``N\times2n`` with a ``2n\times{}N`` matrix.
@@ -289,7 +294,13 @@ We further use the following:
     \end{aligned}
 ```
 
-where we defined ``\mathfrak{A}(B', B'') := \sum_{n=1}^\infty \frac{1}{n!} ((B'')^TB')^{n-1}.`` Note that evaluating ``\mathfrak{A}`` relies on computing products of *small* matrices of size ``2n\times2n.`` We do this by relying on a simple Taylor expansion (see the docstring for [`GeometricMachineLearning.𝔄`](@ref)).
+where we defined ``\mathfrak{A}(B', B'') := \sum_{n=1}^\infty \frac{1}{n!} ((B'')^TB')^{n-1}.`` Note that evaluating ``\mathfrak{A}`` relies on computing products of *small* matrices of size ``2n\times2n.`` We do this by relying on a simple Taylor expansion (see the docstring for [`GeometricMachineLearning.𝔄`](@ref)). 
+
+The final expression we obtain is: 
+
+```math
+E + \begin{pmatrix} \frac{1}{2} A & \mathbb{I} \\ B & \mathbb{O} \end{pmatrix} 𝔄(B', B'') \begin{pmatrix} \mathbb{I} \\ \frac{1}{2} A \end{pmatrix}.
+```
 
 ### The Cayley Retraction
 
@@ -319,17 +330,20 @@ E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O}  \end
     \right)
 ```
 
+We conclude with a remark:
 
 ```@eval
-Main.remark(raw"As mentioned previously the Lie group ``SO(N)``, i.e. the one corresponding to the Stiefel manifold and the Grassmann manifold, has a bi-invariant Riemannian metric associated with it: ``(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)``.
-For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult [bendokat2021real](@cite).")
+Main.remark(raw"As mentioned previously the Lie group ``SO(N)``, i.e. the one corresponding to the Stiefel manifold and the Grassmann manifold, has a bi-invariant Riemannian metric associated with it: ``(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)``. For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult [bendokat2021real](@cite).")
 ```
 
 ## Library Functions
 
 ```@docs; canonical=false
-geodesic
-cayley
+geodesic(::StiefelLieAlgHorMatrix)
+geodesic(::GrassmannLieAlgHorMatrix)
+cayley(::StiefelLieAlgHorMatrix)
+cayley(::GrassmannLieAlgHorMatrix)
+cayley(::Manifold{T}, ::AbstractMatrix{T}) where T
 GeometricMachineLearning.𝔄
 ```
 

From 9579afbf8c33e74920b9acaa8532084f50b99989 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 17:38:41 +0200
Subject: [PATCH 028/101] Fixed typo.

---
 src/optimizers/manifold_related/modified_exponential.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimizers/manifold_related/modified_exponential.jl b/src/optimizers/manifold_related/modified_exponential.jl
index 35522b703..e69cc8477 100644
--- a/src/optimizers/manifold_related/modified_exponential.jl
+++ b/src/optimizers/manifold_related/modified_exponential.jl
@@ -11,7 +11,7 @@ This uses a Taylor expansion that iteratively adds terms with
 while norm(Aⁿ) > ε
     mul!(A_temp, Aⁿ, A)
     Aⁿ .= A_temp
-    rmul!(Aⁿ, T(inv(i)))
+    rmul!(Aⁿ, inv(n))
 
     𝔄 += B
     n += 1 

From 1a42c2360305283d047feaca073a804a6fcc76ee Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 29 May 2024 17:38:53 +0200
Subject: [PATCH 029/101] Added docstrings.

---
 .../manifold_related/retractions.jl           | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/optimizers/manifold_related/retractions.jl b/src/optimizers/manifold_related/retractions.jl
index 92e4217ee..1dbd213fc 100644
--- a/src/optimizers/manifold_related/retractions.jl
+++ b/src/optimizers/manifold_related/retractions.jl
@@ -57,6 +57,15 @@ function geodesic(Y::Manifold{T}, Δ::AbstractMatrix{T}) where T
     apply_section(λY, expB)
 end
 
+@doc raw"""
+    geodesic(B::StiefelLieAlgHorMatrix)
+
+Compute the geodesic of `B*E` where `E` is the distinct element of the StiefelManifold.
+
+# Implementation
+
+This is using a computationally efficient version of the matrix exponential. See [`GeometricMachineLearning.𝔄`](@ref).
+"""
 function geodesic(B::StiefelLieAlgHorMatrix{T}) where T
     E = StiefelProjection(B)
     unit = one(B.A)
@@ -67,6 +76,13 @@ function geodesic(B::StiefelLieAlgHorMatrix{T}) where T
     )
 end
 
+@doc raw"""
+    geodesic(B::GrassmannLieAlgHorMatrix)
+
+Compute the geodesic of `B*E` where `E` is the distinct element of the StiefelManifold.
+
+See [`geodesic(::StiefelLieAlgHorMatrix)`](@ref).
+"""
 function geodesic(B::GrassmannLieAlgHorMatrix{T}) where T
     N, n = B.N, B.n
     E = typeof(B.B)(StiefelProjection(N, n, T))
@@ -102,6 +118,11 @@ function cayley(Y::Manifold{T}, Δ::AbstractMatrix{T}) where T
     apply_section(λY, cayleyB)
 end
 
+@doc raw"""
+    cayley(B::StiefelLieAlgHorMatrix)
+
+Compute the Cayley retraction of `B` and multiply it with `E` (the distinct element of the Stiefel manifold).
+"""
 function cayley(B::StiefelLieAlgHorMatrix{T}) where T
     E = StiefelProjection(B)
     unit = one(B.A)
@@ -117,4 +138,15 @@ function cayley(B::StiefelLieAlgHorMatrix{T}) where T
             vcat(unit, T(0.5) * A_mat) + exponent \ (vcat(unit, T(0.5) * A_mat) + vcat(T(0.5) * A_mat, T(0.25) * A_mat2 - T(0.5) * BB))
             )
     )
+end
+
+@doc raw"""
+    cayley(B::GrassmannLieAlgHorMatrix)
+
+Compute the Cayley retraction of `B` and multiply it with `E` (the distinct element of the Stiefel manifold).
+
+See [`cayley(::StiefelLieAlgHorMatrix)`](@ref).
+"""
+function cayley(B::GrassmannLieAlgHorMatrix{T}) where T
+    error("Missing implementation!")
 end
\ No newline at end of file

From 7736ac2f1e7a878ceb289631bfcd2439b95f8b54 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Thu, 30 May 2024 16:15:34 +0200
Subject: [PATCH 030/101] Also working for GrassmannLieAlgHorMatrix now.

---
 src/arrays/stiefel_projection.jl | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/arrays/stiefel_projection.jl b/src/arrays/stiefel_projection.jl
index be3c207e6..29673ba95 100644
--- a/src/arrays/stiefel_projection.jl
+++ b/src/arrays/stiefel_projection.jl
@@ -33,15 +33,31 @@ function StiefelProjection(A::AbstractMatrix{T}) where T
 end
 
 @doc raw"""
-    StiefelProjection(B::StiefelLieAlgHorMatrix)
+    StiefelProjection(B::AbstractLieAlgHorMatrix)
 
 Extract necessary information from `B` and build an instance of `StiefelProjection`. 
 
 Necessary information here referes to the backend, the data type and the size of the matrix.
 
 The size is queried through `B.N` and `B.n`.
+
+# Examples
+
+```jldoctest
+using GeometricMachineLearning
+
+B₁ = rand(StiefelLieAlgHorMatrix, 5, 2)
+B₂ = rand(GrassmannLieAlgHorMatrix, 5, 2)
+E = [1. 0.; 0. 1.; 0. 0.; 0. 0.; 0. 0.]
+
+StiefelProjection(B₁) ≈ StiefelProjection(B₂) ≈ E 
+
+# output
+
+true
+```
 """
-function StiefelProjection(B::StiefelLieAlgHorMatrix{T}) where T 
+function StiefelProjection(B::AbstractLieAlgHorMatrix{T}) where T 
     StiefelProjection(KernelAbstractions.get_backend(B), T, B.N, B.n)
 end
 

From 8d76b13e47b59fd7e20b6ba0d51372951b755a80 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Thu, 30 May 2024 16:16:21 +0200
Subject: [PATCH 031/101] Changed how optimization is done. We now compute the
 random lift only once

---
 .../manifold_related/global_sections.jl       | 70 ++++++++++-----
 .../manifold_related/retractions.jl           | 90 ++++++++-----------
 2 files changed, 84 insertions(+), 76 deletions(-)

diff --git a/src/optimizers/manifold_related/global_sections.jl b/src/optimizers/manifold_related/global_sections.jl
index 4065dccbb..2b57b67f1 100644
--- a/src/optimizers/manifold_related/global_sections.jl
+++ b/src/optimizers/manifold_related/global_sections.jl
@@ -15,14 +15,14 @@ Also see [`apply_section`](@ref) and [`global_rep`](@ref).
 
 For an implementation of `GlobalSection` for a custom array (especially manifolds), the function [`global_section`](@ref) has to be generalized.
 """
-struct GlobalSection{T, AT} 
+struct GlobalSection{T, AT, λT} 
     Y::AT
     # for now the only lift that is implemented is the Stiefel one - these types will have to be expanded!
-    λ::Union{LinearAlgebra.QRCompactWYQ, LinearAlgebra.QRPackedQ, Nothing}
+    λ::λT
 
     function GlobalSection(Y::AbstractVecOrMat)
         λ = global_section(Y)
-       new{eltype(Y), typeof(Y)}(Y, λ) 
+       new{eltype(Y), typeof(Y), typeof(λ)}(Y, λ) 
     end
 end
 
@@ -30,6 +30,10 @@ function GlobalSection(ps::NamedTuple)
     apply_toNT(GlobalSection, ps)
 end
 
+function GlobalSection(ps::Tuple)
+    [GlobalSection(ps_elem) for ps_elem in ps] |> Tuple
+end
+
 @doc raw"""
     Matrix(λY::GlobalSection)
 
@@ -40,11 +44,18 @@ This is not recommended if speed is important!
 Use [`apply_section`](@ref) and [`global_rep`](@ref) instead!
 """
 function Base.Matrix(λY::GlobalSection)
-    N, n = size(λY.Y)
-
-    hcat(Matrix(λY.Y), Matrix(λY.λ)[:, 1:(N - n)])
+    hcat(Matrix(λY.Y), Matrix(λY.λ))
 end
 
+@doc raw"""
+    *(λY, Y)
+
+Apply the element `λY` onto `Y`.
+
+Here `λY` is an element of a Lie group and `Y` is an element of a homogeneous space.
+"""
+Base.:*(λY::GlobalSection, Y::Manifold) = apply_section(λY, Y)
+
 @doc raw"""
     apply_section(λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT <: StiefelManifold{T}}
 
@@ -55,9 +66,6 @@ Mathematically this is the group action of the element ``\lambda{}Y\in{}G`` on t
 Internally it calls the inplace version [`apply_section!`](@ref).
 """
 function apply_section(λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT<:StiefelManifold{T}}
-    N, n = size(λY.Y)
-    @assert (N, n) == size(Y₂)
-    
     Y = StiefelManifold(zero(Y₂.A))
     apply_section!(Y, λY, Y₂)
 
@@ -73,16 +81,11 @@ The inplace version of [`apply_section`](@ref).
 """
 function apply_section!(Y::AT, λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT<:StiefelManifold{T}}
     N, n = size(λY.Y)
-    @assert (N, n) == size(Y₂) == size(Y)
 
-    backend = KernelAbstractions.get_backend(Y)
-    @views Y.A .= λY.Y * Y₂.A[1:n, :] + λY.λ*vcat(Y₂.A[(n+1):N, :], KernelAbstractions.zeros(backend, T, n, n))
+    @views Y.A .= λY.Y * Y₂.A[1:n, :] + λY.λ * Y₂.A[(n+1):N, :]
 end
 
 function apply_section(λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT<:GrassmannManifold{T}}
-    N, n = size(λY.Y)
-    @assert (N, n) == size(Y₂)
-    
     Y = GrassmannManifold(zero(Y₂.A))
     apply_section!(Y, λY, Y₂)
 
@@ -91,14 +94,13 @@ end
 
 function apply_section!(Y::AT, λY::GlobalSection{T, AT}, Y₂::AT) where {T, AT<:GrassmannManifold{T}}
     N, n = size(λY.Y)
-    @assert (N, n) == size(Y₂)
 
-    backend = KernelAbstractions.get_backend(Y₂)
-    @views Y.A = λY.Y * Y₂.A[1:n, :] + λY.λ*vcat(Y₂.A[(n+1):N, :], KernelAbstractions.zeros(backend, T, n, n))
+    @views Y.A = λY.Y * Y₂.A[1:n, :] + λY.λ * Y₂.A[(n + 1):N, :]
 end
 
 function apply_section(λY::GlobalSection{T}, Y₂::AbstractVecOrMat{T}) where {T}
-    λY.Y + Y₂
+    Y = copy(Y₂)
+    apply_section!(Y, λY, Y₂)
 end
 
 function apply_section!(Y::AT, λY::GlobalSection{T, AT}, Y₂::AbstractVecOrMat{T}) where {T, AT<:AbstractVecOrMat{T}}
@@ -183,8 +185,7 @@ function global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T,
     N, n = size(λY.Y)
     StiefelLieAlgHorMatrix(
         SkewSymMatrix(λY.Y.A' * Δ),
-        typeof(Δ)(@views (λY.λ' * Δ)[1:(N-n), 1:n]), 
-        # (λY.λ' * Δ)[(n+1):N, 1:n],
+        λY.λ' * Δ, 
         N, 
         n
     )
@@ -226,8 +227,33 @@ _round(global_rep(λY, Δ); digits = 3)
 function global_rep(λY::GlobalSection{T, AT}, Δ::AbstractMatrix{T}) where {T, AT<:GrassmannManifold{T}}
     N, n = size(λY.Y)
     GrassmannLieAlgHorMatrix(
-        typeof(Δ)(@views (λY.λ' * Δ)[1:(N-n), 1:n]),
+        λY.λ' * Δ,
         N,
         n
     )
+end
+
+function update_section!(Λ⁽ᵗ⁻¹⁾::GlobalSection{T, MT}, B⁽ᵗ⁻¹⁾::AbstractLieAlgHorMatrix{T}, retraction) where {T, MT <: Manifold}
+    N, n = B⁽ᵗ⁻¹⁾.N, B⁽ᵗ⁻¹⁾.n
+    expB = retraction(B⁽ᵗ⁻¹⁾)
+    apply_section!(expB, Λ⁽ᵗ⁻¹⁾, expB)
+    Λ⁽ᵗ⁻¹⁾.Y.A .= @view expB[:, 1:n]
+    Λ⁽ᵗ⁻¹⁾.λ .= @view expB[:, (n+1):N]
+
+    nothing
+end
+
+function update_section!(Λ⁽ᵗ⁻¹⁾::GlobalSection{T, AT}, B⁽ᵗ⁻¹⁾::AT, retraction) where {T, AT <: AbstractVecOrMat{T}}
+    expB = retraction(B⁽ᵗ⁻¹⁾)
+    apply_section!(expB, Λ⁽ᵗ⁻¹⁾, expB)
+    Λ⁽ᵗ⁻¹⁾.Y .= expB
+
+    nothing
+end
+
+function update_section!(Λ⁽ᵗ⁻¹⁾::NamedTuple, B⁽ᵗ⁻¹⁾::NamedTuple, retraction)
+    update_section_closure!(Λ⁽ᵗ⁻¹⁾, B⁽ᵗ⁻¹⁾) = update_section!(Λ⁽ᵗ⁻¹⁾, B⁽ᵗ⁻¹⁾, retraction)
+    apply_toNT(update_section_closure!, Λ⁽ᵗ⁻¹⁾, B⁽ᵗ⁻¹⁾)
+
+    nothing
 end
\ No newline at end of file
diff --git a/src/optimizers/manifold_related/retractions.jl b/src/optimizers/manifold_related/retractions.jl
index 1dbd213fc..cacad9e8a 100644
--- a/src/optimizers/manifold_related/retractions.jl
+++ b/src/optimizers/manifold_related/retractions.jl
@@ -8,30 +8,8 @@ abstract type LayerWithManifold{M, N, retraction} <: AbstractExplicitLayer{M, N}
 """
 abstract type LayerWithOptionalManifold{M, N, Stiefel, retraction} <: AbstractExplicitLayer{M, N} end
 
-#fallback function -> maybe put into another file!
-function retraction(::AbstractExplicitLayer, gx::NamedTuple)
-    gx
-end
-
-function retraction(::LayerWithManifold{M, N, Geodesic}, B::NamedTuple) where {M,N}
-    geodesic(B)
-end
-  
-function retraction(::AbstractExplicitCell, gx::NamedTuple)
-    gx
-end
-
-function retraction(::LayerWithManifold{M, N, Cayley}, B::NamedTuple) where {M,N}
-    cayley(B)
-end
-
-function retraction(::LayerWithOptionalManifold{M, N, true, Geodesic}, B::NamedTuple) where {M,N}
-    geodesic(B)
-end
-
-function retraction(::LayerWithOptionalManifold{M, N, true, Cayley}, B::NamedTuple) where {M,N}
-    cayley(B)
-end
+geodesic(A::AbstractVecOrMat) = A
+cayley(A::AbstractVecOrMat) = A
 
 geodesic(B::NamedTuple) = apply_toNT(geodesic, B)
 
@@ -53,14 +31,15 @@ See the docstring for [`rgrad`](@ref) for details on this function.
 function geodesic(Y::Manifold{T}, Δ::AbstractMatrix{T}) where T
     λY = GlobalSection(Y)
     B = global_rep(λY, Δ)
+    E = StiefelProjection(B)
     expB = geodesic(B)
-    apply_section(λY, expB)
+    λY * typeof(Y)(expB * E)
 end
 
 @doc raw"""
     geodesic(B::StiefelLieAlgHorMatrix)
 
-Compute the geodesic of `B*E` where `E` is the distinct element of the StiefelManifold.
+Compute the geodesic of an element in [`StiefelLieAlgHorMatrix`](@ref).
 
 # Implementation
 
@@ -70,28 +49,25 @@ function geodesic(B::StiefelLieAlgHorMatrix{T}) where T
     E = StiefelProjection(B)
     unit = one(B.A)
     A_mat = B.A * unit
-    exponent = hcat(vcat(T(.5) * A_mat, T(.25) * B.A * A_mat - B.B' * B.B), vcat(unit, T(.5) * A_mat))
-    StiefelManifold(
-        E + hcat(vcat(T(.5) * A_mat, B.B), E) * 𝔄(exponent) * vcat(unit, T(.5) * A_mat)
-    )
+    B̂ = hcat(vcat(T(.5) * A_mat, B.B), E)
+    B̄ = hcat(vcat(unit, T(.5) * A_mat), vcat(zero(B.B'), -B.B'))'
+    StiefelManifold(one(B) + B̂ * 𝔄(B̂, B̄) * B̄')
 end
 
 @doc raw"""
     geodesic(B::GrassmannLieAlgHorMatrix)
 
-Compute the geodesic of `B*E` where `E` is the distinct element of the StiefelManifold.
+Compute the geodesic of an element in [`GrassmannLieAlgHorMatrix`](@ref).
 
 See [`geodesic(::StiefelLieAlgHorMatrix)`](@ref).
 """
 function geodesic(B::GrassmannLieAlgHorMatrix{T}) where T
-    N, n = B.N, B.n
-    E = typeof(B.B)(StiefelProjection(N, n, T))
-    # expression from which matrix exponential and inverse have to be computed
-    unit = typeof(B.B)(I(n))
-    exponent = hcat(vcat(zeros(T, n, n), - B.B' * B.B), vcat(unit, zeros(T, n, n)))
-    GrassmannManifold(
-        E + (hcat(vcat(zeros(T, n, n), B.B), E) * 𝔄(exponent))[1:N, 1:n]
-    )
+    E = StiefelProjection(B)
+    backend = KernelAbstractions.get_backend(B)
+    zero_mat = KernelAbstractions.zeros(backend, T, B.n, B.n)
+    B̂ = hcat(vcat(zero_mat, B.B), E)
+    B̄ = hcat(vcat(one(zero_mat), zero_mat), vcat(zero(B.B'), -B.B'))'
+    GrassmannManifold(one(B) + B̂ * 𝔄(B̂, B̄) * B̄')
 end
 
 cayley(B::NamedTuple) = apply_toNT(cayley, B)
@@ -114,8 +90,9 @@ See the docstring for [`rgrad`](@ref) for details on this function.
 function cayley(Y::Manifold{T}, Δ::AbstractMatrix{T}) where T
     λY = GlobalSection(Y)
     B = global_rep(λY, Δ)
+    E = StiefelProjection(B)
     cayleyB = cayley(B)
-    apply_section(λY, cayleyB)
+    λY * typeof(Y)(cayleyB * E)
 end
 
 @doc raw"""
@@ -125,19 +102,15 @@ Compute the Cayley retraction of `B` and multiply it with `E` (the distinct elem
 """
 function cayley(B::StiefelLieAlgHorMatrix{T}) where T
     E = StiefelProjection(B)
-    unit = one(B.A)
-    A_mat = B.A * one(B.A)
-    A_mat2 = B.A * B.A 
-    BB = B.B' * B.B
-
-    exponent = hcat(vcat(unit - T(.25) * A_mat, T(.5) * BB - T(.125) * A_mat2), vcat(-T(.5) * unit, unit - T(.25) * A_mat))
-    StiefelManifold(
-        E + 
-        T(.5) * hcat(vcat(T(.5) * A_mat, B.B), vcat(unit, zero(B.B)))*
-        (
-            vcat(unit, T(0.5) * A_mat) + exponent \ (vcat(unit, T(0.5) * A_mat) + vcat(T(0.5) * A_mat, T(0.25) * A_mat2 - T(0.5) * BB))
-            )
-    )
+    𝕀_small = one(B.A)
+    𝕆 = zero(𝕀_small)
+    𝕀_small2 = hcat(vcat(𝕀_small, 𝕆), vcat(𝕆, 𝕀_small))
+    𝕀_big = one(B)
+    A_mat = B.A * 𝕀_small
+    B̂ = hcat(vcat(T(.5) * A_mat, B.B), E)
+    B̄ = hcat(vcat(𝕀_small, T(.5) * A_mat), vcat(zero(B.B'), -B.B'))'
+
+    StiefelManifold((𝕀_big + T(.5) * B̂ * inv(𝕀_small2 - T(.5) * B̄' * B̂) * B̄') * (𝕀_big + T(.5) * B))
 end
 
 @doc raw"""
@@ -148,5 +121,14 @@ Compute the Cayley retraction of `B` and multiply it with `E` (the distinct elem
 See [`cayley(::StiefelLieAlgHorMatrix)`](@ref).
 """
 function cayley(B::GrassmannLieAlgHorMatrix{T}) where T
-    error("Missing implementation!")
+    E = StiefelProjection(B)
+    backend = KernelAbstractions.get_backend(B)
+    𝕆 = KernelAbstractions.zeros(backend, T, B.n, B.n)
+    𝕀_small = one(𝕆)
+    𝕀_small2 = hcat(vcat(𝕀_small, 𝕆), vcat(𝕆, 𝕀_small))
+    𝕀_big = one(B)
+    B̂ = hcat(vcat(𝕆, B.B), E)
+    B̄ = hcat(vcat(𝕀_small, 𝕆), vcat(zero(B.B'), -B.B'))'
+
+    GrassmannManifold((𝕀_big + T(.5) * B̂ * inv(𝕀_small2 - T(.5) * B̄' * B̂) * B̄') * (𝕀_big + T(.5) * B))
 end
\ No newline at end of file

From d13401376dcdbebb12a961c60b648820b5a881a5 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Thu, 30 May 2024 16:16:46 +0200
Subject: [PATCH 032/101] Outputting a 'normal' matrix.

---
 src/manifolds/grassmann_manifold.jl | 2 +-
 src/manifolds/stiefel_manifold.jl   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/manifolds/grassmann_manifold.jl b/src/manifolds/grassmann_manifold.jl
index 6398a95f5..3142c3373 100644
--- a/src/manifolds/grassmann_manifold.jl
+++ b/src/manifolds/grassmann_manifold.jl
@@ -75,7 +75,7 @@ function global_section(Y::GrassmannManifold{T}) where T
     A = KernelAbstractions.allocate(backend, T, N, N-n)
     randn!(A)
     A = A - Y.A * (Y.A' * A)
-    qr!(A).Q
+    typeof(Y.A)(qr!(A).Q)
 end
 
 @doc raw"""
diff --git a/src/manifolds/stiefel_manifold.jl b/src/manifolds/stiefel_manifold.jl
index f393b0641..c6bcaa1c0 100644
--- a/src/manifolds/stiefel_manifold.jl
+++ b/src/manifolds/stiefel_manifold.jl
@@ -124,7 +124,7 @@ function global_section(Y::StiefelManifold{T}) where T
     A = KernelAbstractions.allocate(backend, T, N, N-n)
     randn!(A)
     A = A - Y.A * (Y.A' * A)
-    qr!(A).Q
+    typeof(Y.A)(qr!(A).Q)
 end
 
 @doc raw"""

From 8a32284ed33f5e0b643bef5383b4f0b34773c2b2 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Thu, 30 May 2024 16:17:42 +0200
Subject: [PATCH 033/101] Changed optimizer to now be independent of the
 network layer.

---
 src/data_loader/optimize.jl | 11 ++++++-----
 src/optimizers/optimizer.jl | 33 +++++++++++++++++----------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/data_loader/optimize.jl b/src/data_loader/optimize.jl
index dc270d80f..7c10e7fa5 100644
--- a/src/data_loader/optimize.jl
+++ b/src/data_loader/optimize.jl
@@ -15,7 +15,7 @@ output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoc
 ```
 This is done because any **reverse differentiation** routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: `loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps)` (if the loss only depends on the parameters).
 """
-function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}) where T
+function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}, λY) where T
     count = 0
     total_error = T(0)
     batches = batch(dl)
@@ -26,12 +26,12 @@ function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTu
         loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt, output_nt), ps)
         total_error += loss_value
         dp = pullback(one(loss_value))[1]
-        optimization_step!(opt, model, ps, dp)
+        optimization_step!(opt, λY, ps, dp)
     end
     total_error / count
 end
 
-function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, <:Any, Nothing, :RegularData}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}) where T
+function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, <:Any, Nothing, :RegularData}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}, λY) where T
     count = 0
     total_error = T(0)
     batches = batch(dl)
@@ -42,7 +42,7 @@ function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTu
         loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt), ps)
         total_error += loss_value
         dp = pullback(one(loss_value))[1]
-        optimization_step!(opt, model, ps, dp)
+        optimization_step!(opt, λY, ps, dp)
     end
     total_error / count
 end
@@ -58,10 +58,11 @@ A functor for `Optimizer`. It is called with:
 The last argument is a function through which `Zygote` differentiates. This argument is optional; if it is not supplied `GeometricMachineLearning` defaults to an appropriate loss for the `DataLoader`.
 """
 function (o::Optimizer)(nn::NeuralNetwork, dl::DataLoader, batch::Batch, n_epochs::Int, loss::NetworkLoss)
+    Λ = GlobalSection(nn.params)
     progress_object = ProgressMeter.Progress(n_epochs; enabled=true)
     loss_array = zeros(n_epochs)
     for i in 1:n_epochs
-        loss_array[i] = optimize_for_one_epoch!(o, nn.model, nn.params, dl, batch, loss)
+        loss_array[i] = optimize_for_one_epoch!(o, nn.model, nn.params, dl, batch, loss, Λ)
         ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_array[i])]) 
     end
 
diff --git a/src/optimizers/optimizer.jl b/src/optimizers/optimizer.jl
index 24739cf53..8ea2a5e73 100644
--- a/src/optimizers/optimizer.jl
+++ b/src/optimizers/optimizer.jl
@@ -7,10 +7,11 @@ It takes as input an optimization method and the parameters of a network.
 
 For *technical reasons* we first specify an [`OptimizerMethod`](@ref) that stores all the hyperparameters of the optimizer. 
 """
-mutable struct Optimizer{MT<:OptimizerMethod, CT}
+mutable struct Optimizer{MT<:OptimizerMethod, CT, RT}
     method::MT
     cache::CT
     step::Int
+    retraction::RT
 end
 
 @doc raw"""
@@ -20,8 +21,8 @@ Allocate the cache for a specific `method` and `nn_params` for an instance of `O
 
 Internally this calls [`init_optimizer_cache`](@ref).
 """
-function Optimizer(method::OptimizerMethod, nn_params::Union{Tuple, NamedTuple})
-    Optimizer(method, init_optimizer_cache(method, nn_params), 0)
+function Optimizer(method::OptimizerMethod, nn_params::Union{Tuple, NamedTuple}; retraction = cayley)
+    Optimizer(method, init_optimizer_cache(method, nn_params), 0, retraction)
 end
 
 """
@@ -33,11 +34,11 @@ Internally this calls `Optimizer(method, nn.params)`.
 
 Typically the Optimizer is not initialized with the network parameters, but instead with a NeuralNetwork struct.
 """
-function Optimizer(method::OptimizerMethod, nn::NeuralNetwork)
-    Optimizer(method, nn.params)
+function Optimizer(method::OptimizerMethod, nn::NeuralNetwork; kwargs...)
+    Optimizer(method, nn.params; kwargs...)
 end
 
-Optimizer(nn::NeuralNetwork, m::OptimizerMethod) = Optimizer(m, nn)
+Optimizer(nn::NeuralNetwork, m::OptimizerMethod; kwargs...) = Optimizer(m, nn; kwargs...)
 
 @doc raw"""
     update!(o, cache, B)
@@ -52,7 +53,7 @@ function update!(::Optimizer, ::AbstractCache, ::AbstractArray) end
 # optimization step function
 
 @doc raw"""
-    optimization_step!(o, layer, ps, cache, dx)
+    optimization_step!(o, λY, ps, cache, dx)
 
 Update the weights `ps` of a `layer` based on an [`Optimizer`](@ref), a `cache` and first-order derivatives `dx`.
 
@@ -60,13 +61,13 @@ The derivatives `dx` here are usually obtained via an AD routine by differentiat
 
 It is calling the function [`update!`](@ref) internally which has to be implemented for every [`OptimizerMethod`](@ref).
 """
-function optimization_step!(o::Optimizer, layer::Union{AbstractExplicitLayer, AbstractExplicitCell}, ps::NamedTuple, cache::NamedTuple, dx::NamedTuple)
+function optimization_step!(o::Optimizer, λY::NamedTuple, ps::NamedTuple, cache::NamedTuple, dx::NamedTuple)
     gx = rgrad(ps, dx)
-    λY = GlobalSection(ps)
     B = global_rep(λY, gx)
     update!(o, cache, B)
-    ps₂ = retraction(layer, B)
-    apply_section!(ps, λY, ps₂)
+    update_section!(λY, B, o.retraction)
+
+    nothing
 end
 
 @doc raw"""
@@ -74,10 +75,10 @@ end
 
 Optimize a neural network built with `Chain`.
 """
-function optimization_step!(o::Optimizer, model::Chain, ps::Tuple, dx::Tuple)
+function optimization_step!(o::Optimizer, λY::Tuple, ps::Tuple, dx::Tuple)
     o.step += 1
-    for (index, element) in zip(eachindex(model.layers), model.layers)
-        optimization_step!(o, element, ps[index], o.cache[index], dx[index])
+    for (cache, λY, ps, dx) in zip(o.cache, λY, ps, dx)
+        optimization_step!(o, λY, ps, cache, dx)
     end
 end
 
@@ -86,10 +87,10 @@ end
 
 Optimize a neural network consisting of a single `AbstractExplicitLayer`.
 """
-function optimization_step!(o::Optimizer, model::AbstractExplicitLayer, ps::NamedTuple, dx::NamedTuple)
+function optimization_step!(o::Optimizer, λY::NamedTuple, ps::NamedTuple, dx::NamedTuple)
     o.step += 1
 
-    optimization_step!(o, model, ps, o.cache, dx)
+    optimization_step!(o, λY, ps, o.cache, dx)
 end
 
 #######################################################################################

From 3c35460b248e81a574615668429404fbc3e48818 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Thu, 30 May 2024 16:23:13 +0200
Subject: [PATCH 034/101] Fixed typo in equation.

---
 docs/src/optimizers/manifold_related/retractions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 7ff39c4bb..1cd5ba81a 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -181,7 +181,7 @@ The *geodesic retraction* is a retraction whose associated curve is also the uni
 ```@eval
 Main.theorem(raw"The geodesic on a compact matrix Lie group ``G`` with bi-invariant metric for ``B\in{}T_AG`` is simply
 " * Main.indentation * raw"```math
-" * Main.indentation * raw"\gamma(t) = \exp(t\cdotBA^-1)A,
+" * Main.indentation * raw"\gamma(t) = \exp(t\cdot{}BA^-1)A,
 " * Main.indentation * raw"```
 " * Main.indentation * raw"where ``\exp:\mathcal{g}\to{}G`` is the matrix exponential map.")
 ```

From 6e331fa6712a832f63099e514f9b085e606ed87c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:23:26 +0200
Subject: [PATCH 035/101] Fixed typo.

---
 docs/make.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/make.jl b/docs/make.jl
index dde0f8fae..4f94ae792 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -166,7 +166,7 @@ makedocs(;
             "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
             "Global Sections" => "optimizers/manifold_related/global_sections.md",
             "Retractions" => "optimizers/manifold_related/retractions.md",
-            "Parallel transport" => "optimizers/manifol_related/parallel_transport.md",
+            "Parallel transport" => "optimizers/manifold_related/parallel_transport.md",
             "Adam Optimizer" => "optimizers/adam_optimizer.md",
             "BFGS Optimizer" => "optimizers/bfgs_optimizer.md",
             ],

From 71a91f734c676a2fd011fd5e68eac722b9830920 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:24:07 +0200
Subject: [PATCH 036/101] Added reference for reductive homogeneous spaces.

---
 docs/src/GeometricMachineLearning.bib | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index 5bd714daf..3835d9a7f 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -414,4 +414,15 @@ @inproceedings{fraikin2007optimization
   pages={1062205--1062206},
   year={2007},
   organization={Wiley Online Library}
+}
+
+@article{schlarb2024covariant,
+  title={Covariant Derivatives on Homogeneous Spaces: Horizontal Lifts and Parallel Transport},
+  author={Schlarb, Markus},
+  journal={The Journal of Geometric Analysis},
+  volume={34},
+  number={5},
+  pages={1--43},
+  year={2024},
+  publisher={Springer}
 }
\ No newline at end of file

From 1dc1d27e32fd0fcd7573d77ab223897842c239e6 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:24:34 +0200
Subject: [PATCH 037/101] Finished parallel transport (for now).

---
 .../manifold_related/parallel_transport.md    | 94 ++++++++++++++++---
 1 file changed, 79 insertions(+), 15 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index 4ab528e57..de0ad5f3f 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -2,17 +2,47 @@
 
 The concept of *parallel transport along a geodesic* ``\gamma:[0, T]\to\mathcal{M}`` describes moving a tangent vector from ``T_x\mathcal{M}`` to ``T_{\gamma(t)}\mathcal{M}`` such that its orientation with respect to the geodesic is preserved.
 
-```math
-\Pi_{A \to \exp(V_A)}\tilde{V}_A = \exp(V_AA^{-1})\tilde{V}_A
+A precise definition of parallel transport needs a notion of a *connection* [lang2012fundamentals, bishop1980tensor, bendokat2020grassmann](@cite). Here we simply state how to parallel transport vectors on the Lie group ``SO(N)`` and the homogeneous spaces ``St(n, N)`` and ``Gr(n, N)``.
+
+```@eval
+Main.theorem(raw"Given two elements ``B^A_1, B^A_2\in{}T_AG`` the parallel transport of ``B^A_2`` along the geodesic of ``B^A_1`` is given by
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\Pi_{A\to\gamma_{B^A_1}(t)} = A\exp(t\cdot{}A^{-1}B^A_1)A^{-1}B^A_2 = A\exp(t\cdot{}B_1)B_2,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"where ``B_i := A^{-1}B_i.``")
+```
+
+For the Stiefel manifold this is not much more complicated[^1]:
+
+[^1]: That this expression is sound from the perspective of Riemannian geometry has to be proved [schlarb2024covariant](@cite). For now the evidence that this is correct is largely empirical. 
+
+```@eval
+Main.theorem(raw"Given two elements ``\Delta_1, \Delta_2\in{}T_Y\mathcal{M}``, the parallel transport of ``\Delta_2`` along the geodesic of ``\Delta_1`` is given by
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\Pi_{Y\to\gamma_{\Delta_1}(t)} = \exp(t\cdot\Omega(Y, \Delta_1))\Delta_2 =  \lambda(Y)\exp(B_1)\lambda(Y)^{-1}\Delta_2,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"where ``B_1 = \lambda(Y)^{-1}\Omega(Y, \Delta_1)\lambda(Y).``")
 ```
 
+We can further modify the expression of parallel transport for the Stiefel manifold: 
+
 ```math
-\Pi_{Y \to \gamma_\Delta(\eta)}\Delta_2 = \exp(\Omega(Y, \Delta))\Delta_2
+\Pi_{Y\to\gamma_{\Delta_1}(t)} = \lambda(Y)\exp(B_1)\lambda(Y)\Omega(Y, \Delta_2)Y = \lambda(Y)\exp(B_1)B_2E,
 ```
 
-We again use the example from when we introduced the concept of [geodesics](@ref "Geodesic Sprays and the Exponential Map").
+where ``B_2 = \lambda(Y)^{-1}\Omega(Y, \Delta_2)\lambda(Y).``. We can now define explicit updating rules for the global section ``\Lambda^{(\cdot)}``, the element of the homogeneous space ``Y^{(\cdot)}``, the tangent vector ``\Delta^{(\cdot)}`` and ``D^{(\cdot)}``, its representation in ``\mathfrak{g}^\mathrm{hor}``.
+
+We thus have:
+1. ``\Lambda^{(t)} \leftarrow \Lambda^{(t-1)}\exp(B^{(t-1)}),``
+2. ``Y^{(t)} \leftarrow \Lambda^{(t)}E,``
+3. ``\Delta^{(t)} \leftarrow  \Lambda^{(t-1)}\exp(B^{(t-1)})(\Lambda^{(t-1)})^{-1}\Delta^{(t-1)} = \Lambda^{(t)}D^{(t-1)}E,``
+4. ``D^{(t)} \leftarrow D^{(t-1)}.``
+
+So we conveniently take parallel transport of vectors into account by representing them in ``\mathfrak{g}^\mathrm{hor}``.
 
-```@example s2_setup
+To demonstrate parallel transport we again use the example from when we introduced the concept of [geodesics](@ref "Geodesic Sprays and the Exponential Map"). We first set up the problem:
+
+```@setup s2_parallel_transport
 using GeometricMachineLearning
 using CairoMakie # hide
 import Random # hide
@@ -20,8 +50,8 @@ Random.seed!(123) # hide
 
 Y = rand(StiefelManifold, 3, 1)
 
-v = 5 * rand(3, 1)
-v₂ = 5 * rand(3, 1)
+v = 2 * rand(3, 1)
+v₂ = 1 * rand(3, 1)
 Δ = rgrad(Y, v)
 Δ₂ = rgrad(Y, v₂)
 
@@ -52,23 +82,57 @@ mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 arrow_vec2 = ([Δ₂[1]], [Δ₂[2]], [Δ₂[3]])
 arrows!(ax, point_vec..., arrow_vec2...; color = mpurple, linewidth = .02)
 
-fig
+save("two_vectors.png", fig)
 ```
 
-```@example s2_retraction
-Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]
-λY = GlobalSection(Y)
+![]("two_vectors.png")
 
-B_increments = [global_rep(λY, Δ_increment) for Δ_increment in Δ_increments]
+Note that we have chosen the arrow here to have the same direction as before but only about half the magnitude. We further drew another arrow that we want to parallel transport. 
 
-... define parallel transport!!!
-A_increments = [geodesic(B_increment) for B_increment in B_increments]
-Y_increments = [apply_section(λY, A_increment) for A_increment in ]
+```@example s2_parallel_transport
+using GeometricMachineLearning: update_section!
+λY = GlobalSection(Y)
+B = global_rep(λY, Δ)
+B₂ = global_rep(λY, Δ₂)
+
+E = StiefelProjection(3, 1)
+Y_increments = []
+Δ_transported = []
+Δ₂_transported = []
+
+const n_steps = 8
+const tstep = 2
+
+for _ in 1:n_steps
+    update_section!(λY, tstep * B, geodesic)
+    push!(Y_increments, copy(λY.Y))
+    push!(Δ_transported, Matrix(λY) * B * E)
+    push!(Δ₂_transported, Matrix(λY) * B₂ * E)
+end
 
 for Y_increment in Y_increments
     scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; 
         color = mred, markersize = 5)
 end
 
+for (color, vec_transported) in zip((mred, mpurple), (Δ_transported, Δ₂_transported))
+    for (Y_increment, vec_increment) in zip(Y_increments, vec_transported)
+        point_vec = ([Y_increment[1]], [Y_increment[2]], [Y_increment[3]])
+        arrow_vec = ([vec_increment[1]], [vec_increment[2]], [vec_increment[3]])
+        arrows!(ax, point_vec..., arrow_vec...; color = color, linewidth = .02) 
+    end
+end
+
 fig
+```
+
+## References
+
+```@bibliography
+Pages = []
+Canonical = false
+
+lang2012fundamentals
+bishop1980tensor
+bendokat2020grassmann
 ```
\ No newline at end of file

From c36d4a12df072d648b8744f78b01985715065a92 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:24:55 +0200
Subject: [PATCH 038/101] Fixed typo.

---
 docs/src/optimizers/manifold_related/retractions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 1cd5ba81a..fc5beaf9e 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -181,7 +181,7 @@ The *geodesic retraction* is a retraction whose associated curve is also the uni
 ```@eval
 Main.theorem(raw"The geodesic on a compact matrix Lie group ``G`` with bi-invariant metric for ``B\in{}T_AG`` is simply
 " * Main.indentation * raw"```math
-" * Main.indentation * raw"\gamma(t) = \exp(t\cdot{}BA^-1)A,
+" * Main.indentation * raw"\gamma(t) = \exp(t\cdot{}BA^{-1})A,
 " * Main.indentation * raw"```
 " * Main.indentation * raw"where ``\exp:\mathcal{g}\to{}G`` is the matrix exponential map.")
 ```

From 739bcf86c7d8dd7f5dbdbc1f7c42cabf080d3d55 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:25:43 +0200
Subject: [PATCH 039/101] Adjusted to new optimization_step! interface.

---
 docs/src/tutorials/grassmann_layer.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/src/tutorials/grassmann_layer.md b/docs/src/tutorials/grassmann_layer.md
index 6059b4931..3612fa04d 100644
--- a/docs/src/tutorials/grassmann_layer.md
+++ b/docs/src/tutorials/grassmann_layer.md
@@ -34,12 +34,14 @@ For computing the loss between the two distributions, i.e. ``\Psi(\mathcal{N}(0,
 using GeometricMachineLearning, Zygote, BrenierTwoFluid
 using LinearAlgebra: norm # hide
 import Random # hide 
-Random.seed!(123)
+Random.seed!(1234)
 
 model = Chain(GrassmannLayer(2,3), Dense(3, 8, tanh), Dense(8, 3, identity))
 
 nn = NeuralNetwork(model, CPU(), Float64)
 
+λY = GlobalSection(nn.params)
+
 # this computes the cost that is associated to the Wasserstein distance
 c = (x,y) -> .5 * norm(x - y)^2
 ∇c = (x,y) -> x - y
@@ -48,7 +50,7 @@ const ε = 0.1                 # entropic regularization. √ε is a length.  #
 const q = 1.0                 # annealing parameter                       # hide
 const Δ = 1.0                 # characteristic domain size                # hide
 const s = ε                   # current scale: no annealing -> equals ε   # hide
-const tol = 1e-6              # marginal condition tolerance              # hide 
+const tol = 1e-6              # marginal condition tolerance              # hide
 const crit_it = 20            # acceleration inference                    # hide
 const p_η = 2
 
@@ -84,7 +86,7 @@ loss_array = zeros(training_steps)
 for i in 1:training_steps
     val, dp = compute_gradient(nn.params)
     loss_array[i] = val
-    optimization_step!(optimizer, model, nn.params, dp)
+    optimization_step!(optimizer, λY, nn.params, dp)
 end
 plot(loss_array, xlabel="training step", label="loss")
 ```
@@ -92,6 +94,7 @@ plot(loss_array, xlabel="training step", label="loss")
 Now we plot a few points to check how well they match the graph:
 
 ```@example rosenbrock
+Random.seed!(124)
 const number_of_points = 35
 
 coordinates = nn(randn(2, number_of_points))

From f71fc32f7b6ba406a3e4b692bb5ee290d1076802 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:26:50 +0200
Subject: [PATCH 040/101] Adjusted to new optimization_step! interface.

---
 test/data_loader/data_loader_optimization_step.jl    |  3 ++-
 test/data_loader/mnist_utils.jl                      |  3 ++-
 test/data_loader/optimizer_functor_with_adam.jl      |  3 ++-
 test/layers/gradient_layer_tests.jl                  |  5 +++--
 test/layers/manifold_layers.jl                       |  6 ++++--
 .../optimizer_convergence_tests/psd_optim.jl         |  3 ++-
 .../optimizer_convergence_tests/svd_optim.jl         |  3 ++-
 test/optimizers/utils/modified_exponential.jl        |  4 ++--
 test/optimizers/utils/optimization_step.jl           |  3 ++-
 test/runtests.jl                                     |  4 ++--
 test/transformer_related/transformer_optimizer.jl    | 12 ++++++++----
 11 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/test/data_loader/data_loader_optimization_step.jl b/test/data_loader/data_loader_optimization_step.jl
index 82e972618..f016d98d2 100644
--- a/test/data_loader/data_loader_optimization_step.jl
+++ b/test/data_loader/data_loader_optimization_step.jl
@@ -17,7 +17,8 @@ function test_data_loader(sys_dim, n_time_steps, n_params, T=Float32)
     dx = Zygote.gradient(ps -> loss(model, ps, dl.input, dl.input), ps)[1]
     ps_copy = deepcopy(ps)
     o = Optimizer(GradientOptimizer(), ps)
-    optimization_step!(o, model, ps, dx)
+    λY = GlobalSection(ps)
+    optimization_step!(o, λY, ps, dx)
     @test ps !== ps_copy    
 end
 
diff --git a/test/data_loader/mnist_utils.jl b/test/data_loader/mnist_utils.jl
index bcda1bd72..e598d1f7d 100644
--- a/test/data_loader/mnist_utils.jl
+++ b/test/data_loader/mnist_utils.jl
@@ -71,7 +71,8 @@ function test_optimizer_for_classification_layer(; dim₁=28, dim₂=28, number_
 
     opt = Optimizer(GradientOptimizer(), ps)
     dx = Zygote.gradient(ps -> loss_dl(model, ps, dl), ps)[1]
-    optimization_step!(opt, model, ps, dx)
+    λY = GlobalSection(ps)
+    optimization_step!(opt, λY, ps, dx)
     loss₂ = loss_dl(model, ps, dl)
 
     @test loss₂ < loss₁
diff --git a/test/data_loader/optimizer_functor_with_adam.jl b/test/data_loader/optimizer_functor_with_adam.jl
index 61cc3dbc7..820004e19 100644
--- a/test/data_loader/optimizer_functor_with_adam.jl
+++ b/test/data_loader/optimizer_functor_with_adam.jl
@@ -33,7 +33,8 @@ function test_optimization_with_adam(;T=Float32, dim₁=6, dim₂=6, n_images=10
     loss₁ = loss(model, ps, dl.input, dl.output)
 
     opt = Optimizer(AdamOptimizer(), ps)
-    loss_average = optimize_for_one_epoch!(opt, model, ps, dl, batch, loss)
+    λY = GlobalSection(ps)
+    loss_average = optimize_for_one_epoch!(opt, model, ps, dl, batch, loss, λY)
 
     loss₃ = loss(model, ps, dl.input, dl.output)
 
diff --git a/test/layers/gradient_layer_tests.jl b/test/layers/gradient_layer_tests.jl
index 6567d2e2b..9896f4d53 100644
--- a/test/layers/gradient_layer_tests.jl
+++ b/test/layers/gradient_layer_tests.jl
@@ -28,12 +28,13 @@ function test_gradient_layer_derivative_and_update(T, M, N, batch_size=10)
     # test for vector 
     x = rand(T, M)
     gs = Zygote.gradient(ps -> sum(dummy_model(x, ps)), ps)[1]
-    optimization_step!(o, dummy_model, ps, gs)
+    λY = GlobalSection(ps)
+    optimization_step!(o, λY, ps, gs)
     
     # test for matrix 
     X = rand(T, M, batch_size)
     gs = Zygote.gradient(ps -> sum(dummy_model(X, ps)), ps)[1]
-    optimization_step!(o, dummy_model, ps, gs)
+    optimization_step!(o, λY, ps, gs)
 end
 
 
diff --git a/test/layers/manifold_layers.jl b/test/layers/manifold_layers.jl
index 492a056c5..3871b7235 100644
--- a/test/layers/manifold_layers.jl
+++ b/test/layers/manifold_layers.jl
@@ -10,7 +10,8 @@ function stiefel_layer_test(T, M, N, tol=1f-1)
 
     dx = ((weight=rand(T,N,M),),(weight=rand(T,N,N),))
     ps_copy = deepcopy(ps)
-    optimization_step!(o, model, ps, dx)
+    λY = GlobalSection(ps)
+    optimization_step!(o, λY, ps, dx)
     # check that the new weight is different from the old one
     @test norm(ps_copy[1].weight - ps[1].weight) > T(tol)
     # check that the updated elements are on the Stiefel Manifold 
@@ -25,7 +26,8 @@ function grassmann_layer_test(T, M, N, tol=1f-1)
 
     dx = ((weight=rand(T,N,M),),(weight=rand(T,N,N),))
     ps_copy = deepcopy(ps)
-    for i in 1:4 optimization_step!(o, model, ps, dx) end
+    λY = GlobalSection(ps)
+    for i in 1:4 optimization_step!(o, λY, ps, dx) end
     # check that the new weight is different from the old one
     @test norm(ps_copy[1].weight - ps[1].weight) > T(tol)
     # check that the updated elements are on the Stiefel Manifold 
diff --git a/test/optimizers/optimizer_convergence_tests/psd_optim.jl b/test/optimizers/optimizer_convergence_tests/psd_optim.jl
index 4a73b3c32..c3c292a98 100644
--- a/test/optimizers/optimizer_convergence_tests/psd_optim.jl
+++ b/test/optimizers/optimizer_convergence_tests/psd_optim.jl
@@ -61,7 +61,8 @@ function train_network!(o::Optimizer, model::Chain, ps::Tuple, A::AbstractMatrix
 
     for _ in 1:train_steps
         dx = Zygote.gradient(error, ps)[1]
-        optimization_step!(o, model, ps, dx)
+        λY = GlobalSection(ps)
+        optimization_step!(o, λY, ps, dx)
         #println(error(ps))
     end
     ps[1].weight, ps[2].weight, error(ps)
diff --git a/test/optimizers/optimizer_convergence_tests/svd_optim.jl b/test/optimizers/optimizer_convergence_tests/svd_optim.jl
index 2e39b988d..81488681e 100644
--- a/test/optimizers/optimizer_convergence_tests/svd_optim.jl
+++ b/test/optimizers/optimizer_convergence_tests/svd_optim.jl
@@ -57,7 +57,8 @@ function train_network!(o::Optimizer, model::Chain, ps::Tuple, A::AbstractMatrix
 
     for _ in 1:train_steps
         dx = Zygote.gradient(error, ps)[1]
-        optimization_step!(o, model, ps, dx)
+        λY = GlobalSection(ps)
+        optimization_step!(o, λY, ps, dx)
     end
     ps[1].weight, ps[2].weight, error(ps)
 end
diff --git a/test/optimizers/utils/modified_exponential.jl b/test/optimizers/utils/modified_exponential.jl
index e5c8f1c18..abf1b402f 100644
--- a/test/optimizers/utils/modified_exponential.jl
+++ b/test/optimizers/utils/modified_exponential.jl
@@ -8,9 +8,9 @@ Random.seed!(1234)
 # check if we recover the regular exponential function
 function test(T, N, n)
     A = T(.1)*rand(T, N, n)
-    B = T(.1)*rand(T, n, N)
+    B = T(.1)*rand(T, N, n)
     @test eltype(𝔄exp(A, B)) == T 
-    @test isapprox(exp(A*B), 𝔄exp(A, B))
+    @test isapprox(exp(A * B'), 𝔄exp(A, B))
 end
 
 N_max = 10
diff --git a/test/optimizers/utils/optimization_step.jl b/test/optimizers/utils/optimization_step.jl
index c29ea5433..a9255ce93 100644
--- a/test/optimizers/utils/optimization_step.jl
+++ b/test/optimizers/utils/optimization_step.jl
@@ -15,7 +15,8 @@ function optimization_step_test(N, n, T)
     o = Optimizer(m, ps)
 
     ps2 = deepcopy(ps)
-    optimization_step!(o, model, ps, dx)
+    λY = GlobalSection(ps)
+    optimization_step!(o, λY, ps, dx)
     @test typeof(ps[1].weight) <: StiefelManifold
     for (layers1, layers2) in zip(ps, ps2)
         for key in keys(layers1)
diff --git a/test/runtests.jl b/test/runtests.jl
index 901cfb565..212ec2926 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -52,8 +52,8 @@ using SafeTestsets
 @safetestset "Batch                                                                           " begin include("data/test_batch.jl") end
 @safetestset "Method                                                                          " begin include("train!/test_method.jl") end
 @safetestset "Matching                                                                        " begin include("data/test_matching.jl") end
-@safetestset "TrainingSet                                                                     " begin include("train!/test_trainingSet.jl") end
-@safetestset "Training                                                                        " begin include("train!/test_training.jl") end
+# @safetestset "TrainingSet                                                                     " begin include("train!/test_trainingSet.jl") end
+# @safetestset "Training                                                                        " begin include("train!/test_training.jl") end
 @safetestset "NeuralNetSolution                                                               " begin include("train!/test_neuralnet_solution.jl") end
 @safetestset "Problem & Integrators                                                           " begin include("integrator/test_integrator.jl") end
 
diff --git a/test/transformer_related/transformer_optimizer.jl b/test/transformer_related/transformer_optimizer.jl
index e1734d55d..5160b7766 100644
--- a/test/transformer_related/transformer_optimizer.jl
+++ b/test/transformer_related/transformer_optimizer.jl
@@ -28,10 +28,14 @@ function transformer_gradient_test(T, dim, n_heads, L, seq_length=8, batch_size=
     ps₃ = deepcopy(ps)
     ps₄ = deepcopy(ps)
 
-    optimization_step!(o₁, model, ps₁, dx)
-    optimization_step!(o₂, model, ps₂, dx)
-    optimization_step!(o₃, model, ps₃, dx)
-    optimization_step!(o₄, model, ps₄, dx)
+    λY₁ = GlobalSection(ps₁)
+    λY₂ = GlobalSection(ps₂)
+    λY₃ = GlobalSection(ps₃)
+    λY₄ = GlobalSection(ps₄)
+    optimization_step!(o₁, λY₁, ps₁, dx)
+    optimization_step!(o₂, λY₂, ps₂, dx)
+    optimization_step!(o₃, λY₃, ps₃, dx)
+    optimization_step!(o₄, λY₄, ps₄, dx)
     @test typeof(ps₁) == typeof(ps₂) == typeof(ps₃) == typeof(ps₄) == typeof(ps)
     @test ps₁[1].PQ.head_1 ≉ ps₂[1].PQ.head_1 ≉ ps₃[1].PQ.head_1 ≉ ps₄[1].PQ.head_1 ≉ ps[1].PQ.head_1
 end

From af95834bdcf98bf886484e4f1cb84be422c7a2c9 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:28:13 +0200
Subject: [PATCH 041/101] Made equation more legible.

---
 src/optimizers/manifold_related/modified_exponential.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/optimizers/manifold_related/modified_exponential.jl b/src/optimizers/manifold_related/modified_exponential.jl
index e69cc8477..401a7a2ff 100644
--- a/src/optimizers/manifold_related/modified_exponential.jl
+++ b/src/optimizers/manifold_related/modified_exponential.jl
@@ -70,5 +70,5 @@ function 𝔄(B̂::AbstractMatrix{T}, B̄::AbstractMatrix{T}) where T
 end
 
 function 𝔄exp(X::AbstractMatrix{T}, Y::AbstractMatrix{T}) where T
-    I + X*𝔄(Y*X)*Y
+    I + X * 𝔄(X, Y) * Y'
 end
\ No newline at end of file

From 497f4a4e036b3b236c612fe649cb527c88bca8af Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:49:07 +0200
Subject: [PATCH 042/101] Fixed docstring.

---
 src/optimizers/optimizer.jl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/optimizers/optimizer.jl b/src/optimizers/optimizer.jl
index 8ea2a5e73..62ccfc090 100644
--- a/src/optimizers/optimizer.jl
+++ b/src/optimizers/optimizer.jl
@@ -1,7 +1,7 @@
 @doc raw"""
-    Optimizer(method, cache, step)
+    Optimizer(method, cache, step, retraction)
 
-Store the `method` (e.g. [`AdamOptimizer`](@ref) with corresponding hyperparameters), the `cache` (e.g. [`AdamCache`](@ref)) and the optimization step.
+Store the `method` (e.g. [`AdamOptimizer`](@ref) with corresponding hyperparameters), the `cache` (e.g. [`AdamCache`](@ref)), the optimization step and the retraction.
 
 It takes as input an optimization method and the parameters of a network. 
 
@@ -20,6 +20,10 @@ end
 Allocate the cache for a specific `method` and `nn_params` for an instance of `Optimizer`.
 
 Internally this calls [`init_optimizer_cache`](@ref).
+
+# Arguments
+
+The optional keyword argument is the retraction. By default this is [`cayley`](@ref).
 """
 function Optimizer(method::OptimizerMethod, nn_params::Union{Tuple, NamedTuple}; retraction = cayley)
     Optimizer(method, init_optimizer_cache(method, nn_params), 0, retraction)
@@ -71,7 +75,7 @@ function optimization_step!(o::Optimizer, λY::NamedTuple, ps::NamedTuple, cache
 end
 
 @doc raw"""
-    optimization_step!(o::Optimizer, model::Chain, ps::Tuple, dx::Tuple)
+    optimization_step!(o::Optimizer, λY::Chain, ps::Tuple, dx::Tuple)
 
 Optimize a neural network built with `Chain`.
 """
@@ -83,7 +87,7 @@ function optimization_step!(o::Optimizer, λY::Tuple, ps::Tuple, dx::Tuple)
 end
 
 @doc raw"""
-    optimization_step!(o::Optimizer, model::AbstractExplicitLayer, ps::NamedTuple, dx::NamedTuple)
+    optimization_step!(o::Optimizer, λY::NamedTuple, ps::NamedTuple, dx::NamedTuple)
 
 Optimize a neural network consisting of a single `AbstractExplicitLayer`.
 """

From 64382f67f42340c3a689d671584a1a385cebe50b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:49:31 +0200
Subject: [PATCH 043/101] Added new reference.

---
 docs/src/optimizers/manifold_related/parallel_transport.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index de0ad5f3f..d062a5bf1 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -135,4 +135,5 @@ Canonical = false
 lang2012fundamentals
 bishop1980tensor
 bendokat2020grassmann
+schlarb2024covariant
 ```
\ No newline at end of file

From fe391199d2e37b8d98070a3e6c02b75c5b1a3c5a Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:49:56 +0200
Subject: [PATCH 044/101] Made docs conform with new interface.

---
 .../manifold_related/retractions.md           | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index fc5beaf9e..7ed614df6 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -163,7 +163,7 @@ The way we use *retractions*[^1] in `GeometricMachineLearning` is slightly diffe
 [^1]: Classical retractions are also defined in `GeometricMachineLearning` under the same name, i.e. there is e.g. a method [`cayley(::StiefelLieAlgHorMatrix)`](@ref) and a method [`cayley(::StiefelManifold, ::AbstractMatrix)`](@ref) (the latter being the classical retraction); but the user is *strongly discouraged* from using classical retractions as these are computational inefficient.
 
 ```@eval
-Main.definition(raw"Given a section ``\lambda:\mathcal{M}\to{}G`` a **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}^\mathrm{hor}\to\mathcal{M}`` such that 
+Main.definition(raw"Given a section ``\lambda:\mathcal{M}\to{}G`` a **retraction** is a map ``\mathrm{Retraction}:\mathfrak{g}^\mathrm{hor}\to{}G`` such that 
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"\Delta \mapsto \lambda(Y)\mathrm{Retraction}(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E,
 " * Main.indentation * raw"```
@@ -256,20 +256,20 @@ where we have used that
 Based on this we define the maps: 
 
 ```math
-\mathtt{geodesic}: \mathfrak{g}^\mathrm{hor} \to \mathcal{M}, B \mapsto \exp(B)E,
+\mathtt{geodesic}: \mathfrak{g}^\mathrm{hor} \to G, B \mapsto \exp(B),
 ```
 
 and
 
 ```math
-\mathtt{cayley}: \mathfrak{g}^\mathrm{hor} \to \mathcal{M}, B \mapsto \mathrm{Cayley}(B)E,
+\mathtt{cayley}: \mathfrak{g}^\mathrm{hor} \to G, B \mapsto \mathrm{Cayley}(B),
 ```
 
 where ``B = \lambda(Y)^{-1}\Omega(\Delta)\lambda(Y)``. These expressions for `geodesic` and `cayley` are the ones that we typically use in `GeometricMachineLearning` for computational reasons. We show how we can utilize the sparse structure of ``\mathfrak{g}^\mathrm{hor}`` for computing the geodesic retraction and the Cayley retraction (i.e. the expressions ``\exp(B)`` and ``\mathrm{Cayley}(B)`` for ``B\in\mathfrak{g}^\mathrm{hor}``). Similar derivations can be found in [celledoni2000approximating, fraikin2007optimization, bendokat2021real](@cite).
 
 ```@eval
 Main.remark(raw"Further note that, even though the global section ``\lambda:\mathcal{M} \to G`` is not unique, the final geodesic ``
-\gamma_\Delta(t) = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E`` does not depend on the particular section we chose.")
+\gamma_\Delta(t) = \lambda(Y)\exp(\lambda(Y)^{-1}\Omega(\Delta)\lambda(Y))E`` does not depend on the particular section we choose.")
 ```
 
 ### The Geodesic Retraction
@@ -299,7 +299,7 @@ where we defined ``\mathfrak{A}(B', B'') := \sum_{n=1}^\infty \frac{1}{n!} ((B''
 The final expression we obtain is: 
 
 ```math
-E + \begin{pmatrix} \frac{1}{2} A & \mathbb{I} \\ B & \mathbb{O} \end{pmatrix} 𝔄(B', B'') \begin{pmatrix} \mathbb{I} \\ \frac{1}{2} A \end{pmatrix}.
+\exp(B) = \mathbb{I} + B' \mathfrak{A}(B', B'')  (B'')^T
 ```
 
 ### The Cayley Retraction
@@ -320,17 +320,10 @@ So what we have to compute the inverse of:
 By leveraging the sparse structure of the matrices in ``\mathfrak{g}^\mathrm{hor}`` we arrive at the following expression for the Cayley retraction (similar to the case of the geodesic retraction):
 
 ```math
-\left(\mathbb{I} + \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}  \begin{bmatrix}  \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T  \end{bmatrix} \right)\left( E +  \frac{1}{2}\begin{bmatrix}  \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}\ \right) = \\
-E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O}  \end{bmatrix}\left(
-    \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix}  + 
-    \begin{bmatrix}  \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A  \end{bmatrix}^{-1}\left(
-        \begin{bmatrix}  \mathbb{I} \\ \frac{1}{2}A   \end{bmatrix} + 
-        \begin{bmatrix} \frac{1}{2}A \\ \frac{1}{4}A^2 - \frac{1}{2}B^TB \end{bmatrix}
-    \right)
-    \right)
+\mathrm{Cayley}(B) = \mathbb{I} + \frac{1}{2} B' (\mathbb{I}_{2n} - \frac{1}{2} (B'')^T B')^{-1} (B'')^T (\mathbb{I} + \frac{1}{2} B),
 ```
 
-We conclude with a remark:
+where we have abbreviated ``\mathbb{I} := \mathbb{I}_N.`` We conclude with a remark:
 
 ```@eval
 Main.remark(raw"As mentioned previously the Lie group ``SO(N)``, i.e. the one corresponding to the Stiefel manifold and the Grassmann manifold, has a bi-invariant Riemannian metric associated with it: ``(B_1,B_2)\mapsto \mathrm{Tr}(B_1^TB_2)``. For other Lie groups (e.g. the symplectic group) the situation is slightly more difficult [bendokat2021real](@cite).")

From 2686a4cbb7fede76134983c629be0ddad7819612 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 13:51:19 +0200
Subject: [PATCH 045/101] Fixed reference.

---
 docs/src/optimizers/manifold_related/parallel_transport.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index d062a5bf1..44a03d0fe 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -85,7 +85,7 @@ arrows!(ax, point_vec..., arrow_vec2...; color = mpurple, linewidth = .02)
 save("two_vectors.png", fig)
 ```
 
-![]("two_vectors.png")
+![](two_vectors.png)
 
 Note that we have chosen the arrow here to have the same direction as before but only about half the magnitude. We further drew another arrow that we want to parallel transport. 
 

From 892ebb26d16bae7db0e6b44224e329d25bdf5671 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 16:53:29 +0200
Subject: [PATCH 046/101] Renamed adam_optimizer -> optimizer_methods.

---
 ...adam_optimizer.md => optimizer_methods.md} | 24 ++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)
 rename docs/src/optimizers/{adam_optimizer.md => optimizer_methods.md} (60%)

diff --git a/docs/src/optimizers/adam_optimizer.md b/docs/src/optimizers/optimizer_methods.md
similarity index 60%
rename from docs/src/optimizers/adam_optimizer.md
rename to docs/src/optimizers/optimizer_methods.md
index 0fc3b650a..31246ad75 100644
--- a/docs/src/optimizers/adam_optimizer.md
+++ b/docs/src/optimizers/optimizer_methods.md
@@ -1,6 +1,14 @@
-# The Adam Optimizer 
+# Standard Neural Network Optimizers
 
-The Adam Optimizer is one of the most widely (if not the most widely used) neural network optimizer. Like most modern neural network optimizers it contains a `cache` that is updated based on first-order gradient information and then, in a second step, the `cache` is used to compute a velocity estimate for updating the neural network weights. 
+In this section we discuss optimization methods that are often used in training neural networks. The [BFGS optimizer](@ref "The BFGS Optimizer") may also be viewed as a *standard neural network optimizer* but is treated in a separate section because of its complexity.
+
+## The Gradient Optimizer
+
+The gradient optimizer is the simplest optimization algorithm used to train neural networks. It was already briefly discussed when we introduced [Riemannian manifolds](@ref "Gradient Flows and Riemannian Optimization").
+
+## The Adam Optimizer 
+
+The Adam Optimizer is one of the most widely neural network optimizer. Like most modern neural network optimizers it contains a `cache` that is updated based on first-order gradient information and then, in a second step, the `cache` is used to compute a velocity estimate for updating the neural network weights. 
 
 Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold. 
 
@@ -12,7 +20,7 @@ If all the weights are on a vector space, then we directly compute updates for $
 1. $B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,$
 2. $B_2 \gets ((\rho_2 - \rho_1^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,$
 
-    where $\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n$ is the **Hadamard product**: $[a\odot{}b]_i = a_ib_i$. $\rho_1$ and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301]. After having updated the `cache` (i.e. $B_1$ and $B_2$) we compute a **velocity** (step 3) with which the parameters $Y_t$ are then updated (step 4).
+    where $\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n$ is the **Hadamard product**: $[a\odot{}b]_i = a_ib_i$. $\rho_1$ and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301](@cite). After having updated the `cache` (i.e. $B_1$ and $B_2$) we compute a **velocity** (step 3) with which the parameters $Y_t$ are then updated (step 4).
 
 3. $W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},$
 4. $Y_{t+1} \gets Y_t + W_t,$
@@ -28,6 +36,16 @@ Main.include_graphics("../tikz/adam_optimizer") # hide
 The problem with generalizing Adam to manifolds is that the Hadamard product $\odot$ as well as the other element-wise operations ($/$, $\sqrt{}$ and $+$ in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces").  
 
 
+## Library Functions
+
+```@docs; canonical=false
+OptimizerMethod
+GradientOptimizer
+MomentumOptimizer
+AdamOptimizer
+initialize_cache
+```
+
 ## References
 
 ```@bibliography 

From af34b139b969ed201b01d3d7f172b240dab45164 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 31 May 2024 16:55:49 +0200
Subject: [PATCH 047/101] Renamed Adam optimizer to optimizer methods and moved
 pullbacks to arrays.

---
 docs/make.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 4f94ae792..2c6f284f9 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -157,17 +157,17 @@ makedocs(;
             "Riemannian Manifolds" => "manifolds/riemannian_manifolds.md",
             "Homogeneous Spaces" => "manifolds/homogeneous_spaces.md",
             ],
-        "Special Arrays" => [
+        "Special Arrays and AD" => [
             "Symmetric and Skew-Symmetric Matrices" => "arrays/skew_symmetric_matrix.md",
             "Global Tangent Spaces" => "arrays/global_tangent_spaces.md",
+            "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
         ],
         "Optimizers" => [
             "Optimizers" => "optimizers/optimizer_framework.md",
-            "Pullbacks" => "pullbacks/computation_of_pullbacks.md",
             "Global Sections" => "optimizers/manifold_related/global_sections.md",
             "Retractions" => "optimizers/manifold_related/retractions.md",
-            "Parallel transport" => "optimizers/manifold_related/parallel_transport.md",
-            "Adam Optimizer" => "optimizers/adam_optimizer.md",
+            "Parallel Transport" => "optimizers/manifold_related/parallel_transport.md",
+            "Optimizer Methods" => "optimizers/optimizer_methods.md",
             "BFGS Optimizer" => "optimizers/bfgs_optimizer.md",
             ],
         "Special Neural Network Layers" => [

From b68621bc327a25199044868d88172d24499d866c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Sat, 1 Jun 2024 13:19:21 +0200
Subject: [PATCH 048/101] Using GLMakie now and a hack from
 https://docs.makie.org/v0.21/how-to/save-figure-with-transparency to better
 display 3d images.

---
 docs/gl_makie_transparent_background_hack.jl  | 27 +++++++++
 docs/src/manifolds/riemannian_manifolds.md    | 57 ++++++++++++++++---
 .../manifold_related/global_sections.md       |  2 +-
 .../manifold_related/parallel_transport.md    | 48 +++++++++++++---
 .../manifold_related/retractions.md           | 36 ++++++------
 5 files changed, 135 insertions(+), 35 deletions(-)
 create mode 100644 docs/gl_makie_transparent_background_hack.jl

diff --git a/docs/gl_makie_transparent_background_hack.jl b/docs/gl_makie_transparent_background_hack.jl
new file mode 100644
index 000000000..2718d6f1f
--- /dev/null
+++ b/docs/gl_makie_transparent_background_hack.jl
@@ -0,0 +1,27 @@
+# taken from https://docs.makie.org/stable/how-to/save-figure-with-transparency
+function calculate_rgba(rgb1, rgb2, rgba_bg)::RGBAf
+    rgb1 == rgb2 && return RGBAf(GLMakie.red(rgb1), GLMakie.green(rgb1), GLMakie.blue(rgb1), 1)
+    c1 = Float64.((GLMakie.red(rgb1), GLMakie.green(rgb1), GLMakie.blue(rgb1)))
+    c2 = Float64.((GLMakie.red(rgb2), GLMakie.green(rgb2), GLMakie.blue(rgb2)))
+    alphas_fg = 1 .+ c1 .- c2
+    alpha_fg = clamp(sum(alphas_fg) / 3, 0, 1)
+    alpha_fg == 0 && return rgba_bg
+    rgb_fg = clamp.((c1 ./ alpha_fg), 0, 1)
+    rgb_bg = Float64.((rgba_bg.r, rgba_bg.g, rgba_bg.b))
+    alpha_final = alpha_fg + (1 - alpha_fg) * rgba_bg.alpha
+    rgb_final = @. 1 / alpha_final * (alpha_fg * rgb_fg + (1 - alpha_fg) * rgba_bg.alpha * rgb_bg)
+    return RGBAf(rgb_final..., alpha_final)
+end
+
+function alpha_colorbuffer(figure)
+    scene = figure.scene
+    bg = scene.backgroundcolor[]
+    scene.backgroundcolor[] = RGBAf(0, 0, 0, 1)
+    b1 = copy(colorbuffer(scene))
+    scene.backgroundcolor[] = RGBAf(1, 1, 1, 1)
+    b2 = colorbuffer(scene)
+    scene.backgroundcolor[] = bg
+    return map(b1, b2) do b1, b2
+        calculate_rgba(b1, b2, bg)
+    end
+end
diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index dc8544d05..c40530989 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -60,9 +60,14 @@ and we see that ``\gamma_{v_x}(t) = \exp(t\cdot{}v_x)``. In `GeometricMachineLea
 
 We give an example here:
 
+```@setup s2_retraction
+using GLMakie
+
+include("../../gl_makie_transparent_background_hack.jl")
+```
+
 ```@example s2_retraction
 using GeometricMachineLearning
-using CairoMakie # hide
 import Random # hide
 Random.seed!(123) # hide
 
@@ -71,10 +76,14 @@ Y = rand(StiefelManifold, 3, 1)
 v = 5 * rand(3, 1)
 Δ = v - Y * (v' * Y)
 
-fig = Figure(; backgroundcolor = :transparent) # hide
+morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
+mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
+
+function set_up_plot(; theme = :dark) # hide
 text_color = Main.output_type == :html ? :white : :black # hide
+fig = Figure(; backgroundcolor = :transparent) # hide
 ax = Axis3(fig[1, 1]; # hide
-        backgroundcolor = :transparent, # hide
+        backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
         azimuth = π / 6, # hide
         elevation = π / 8, # hide
@@ -84,32 +93,56 @@ ax = Axis3(fig[1, 1]; # hide
         ) # hide
 
 # plot a sphere with radius one and origin 0
-surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)
+surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
 
-morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
 point_vec = ([Y[1]], [Y[2]], [Y[3]])
 scatter!(ax, point_vec...; color = morange, marker = :star5)
 
-mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])
 arrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02)
 
-fig
+fig, ax # hide
+end # hide
+
+fig_light = set_up_plot(; theme = :light)[1] # hide
+fig_dark = set_up_plot(; theme = :dark)[1] # hide
+save("sphere_with_tangent_vec.png", fig_light |> alpha_colorbuffer) # hide
+save("sphere_with_tangent_vec_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+nothing # hide
+```
+
+```@example
+Main.include_graphics("sphere_with_tangent_vec") # hide
 ```
 
 We now solve the geodesic spray for ``\eta\cdot\Delta`` for ``\eta = 0.1, 0.2, 0.3, \ldots, 2.5`` and plot the corresponding points:
 
 ```@example s2_retraction
-Δ_increments = [Δ * η for η in 0.1 : 0.1 : 2.5]
+Δ_increments = [Δ * η for η in 0.1 : 0.1 : 5.5]
 
 Y_increments = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]
 
+function make_plot_with_solution(; theme = :dark) # hide
+fig, ax = set_up_plot(; theme = theme)
 for Y_increment in Y_increments
     scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; 
         color = mred, markersize = 5)
 end
 
 fig
+end # hide
+
+fig_light = make_plot_with_solution(; theme = :light) # hide
+fig_dark = make_plot_with_solution(; theme = :dark) # hide
+save("sphere_with_tangent_vec_and_geodesic.png", fig_light |> alpha_colorbuffer) # hide
+save("sphere_with_tangent_vec_and_geodesic_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+nothing # hide
+```
+
+```@example
+Main.include_graphics("sphere_with_tangent_vec_and_geodesic") # hide
 ```
 
 So a geodesic can be seen as the *equivalent of a straight line* on a manifold. Also note that we drew a random element form [`StiefelManifold`](@ref) here and not from ``S^2``. This is because [Stiefel manifolds](@ref "The Stiefel Manifold") are more general spaces than ``S^n`` and also comprise them. 
@@ -140,7 +173,13 @@ In `GeometricMachineLearning` we can include weights in neural networks that are
 X(x) = - \mathrm{grad}_xL.
 ```
 
-Solving this gradient flow equation will then lead us to a local minimum on ``\mathcal{M}``. This will be elaborated on when talking about [optimizers](@ref "Optimizer"). In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in `GeometricMachineLearning`) is to take the point ``(x, X(x))`` as an initial condition for the geodesic spray and then solve the ODE for a small time step. We will call this  
+Solving this gradient flow equation will then lead us to a local minimum on ``\mathcal{M}``. This will be elaborated on when talking about [optimizers](@ref "Neural Network Optimizers"). In practice we cannot solve the gradient flow equation directly and have to rely on approximations. The most straightforward approximation (and one that serves as a basis for all the optimization algorithms in `GeometricMachineLearning`) is to take the point ``(x, X(x))`` as an initial condition for the geodesic spray and then solve the ODE for a small time step. Such an update rule, i.e. 
+
+```math
+x^{(t)} \leftarrow \gamma_{X(x^{(t-1)})}(\Delta{}t)\text{ with $\Delta{}t$ the time step},
+```
+
+we call the *gradient optimization scheme*.
 
 ## Library Functions
 
diff --git a/docs/src/optimizers/manifold_related/global_sections.md b/docs/src/optimizers/manifold_related/global_sections.md
index ab8422709..f10344a91 100644
--- a/docs/src/optimizers/manifold_related/global_sections.md
+++ b/docs/src/optimizers/manifold_related/global_sections.md
@@ -1,6 +1,6 @@
 # Global Sections for Homogeneous Spaces
 
-**Global sections** are needed needed for the generalization of [Adam](../adam_optimizer.md) and other optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"). They are necessary to perform the two mappings represented represented by horizontal and vertical red lines in the section on the general [optimizer framework](../optimizer_framework.md).
+**Global sections** are needed needed for the generalization of [Adam](@ref "The Adam Optimizer") and other optimizers to [homogeneous spaces](@ref "Homogeneous Spaces"). They are necessary to perform the two mappings represented represented by horizontal and vertical red lines in the section on the general [optimizer framework](../optimizer_framework.md).
 
 ## Computing the global section
 In differential geometry a **section** is always associated to some **bundle**, in our case this bundle is $\pi:G\to\mathcal{M},A\mapsto{}AE$. A section is a mapping $\mathcal{M}\to{}G$ for which $\pi$ is a left inverse, i.e. $\pi\circ\lambda = \mathrm{id}$. 
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index 44a03d0fe..e744de6d4 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -42,9 +42,14 @@ So we conveniently take parallel transport of vectors into account by representi
 
 To demonstrate parallel transport we again use the example from when we introduced the concept of [geodesics](@ref "Geodesic Sprays and the Exponential Map"). We first set up the problem:
 
+```@setup s2_parallel_transport
+using GLMakie
+
+include("../../../gl_makie_transparent_background_hack.jl")
+```
+
 ```@setup s2_parallel_transport
 using GeometricMachineLearning
-using CairoMakie # hide
 import Random # hide
 Random.seed!(123) # hide
 
@@ -55,10 +60,16 @@ v₂ = 1 * rand(3, 1)
 Δ = rgrad(Y, v)
 Δ₂ = rgrad(Y, v₂)
 
+morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
+mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
+mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
+
+function set_up_plot(; theme = :dark) # hide
+text_color = Main.output_type == :html ? :white : :black # hide
 fig = Figure(; backgroundcolor = :transparent) # hide
 text_color = Main.output_type == :html ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
-        backgroundcolor = :transparent, # hide
+        backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
         azimuth = π / 6, # hide
         elevation = π / 8, # hide
@@ -68,24 +79,31 @@ ax = Axis3(fig[1, 1]; # hide
         ) # hide
 
 # plot a sphere with radius one and origin 0
-surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)
+surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
 
-morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
 point_vec = ([Y[1]], [Y[2]], [Y[3]])
 scatter!(ax, point_vec...; color = morange, marker = :star5)
 
-mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])
 arrows!(ax, point_vec..., arrow_vec...; color = mred, linewidth = .02)
 
-mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 arrow_vec2 = ([Δ₂[1]], [Δ₂[2]], [Δ₂[3]])
 arrows!(ax, point_vec..., arrow_vec2...; color = mpurple, linewidth = .02)
 
-save("two_vectors.png", fig)
+fig, ax # hide
+end # hide
+
+fig_light = set_up_plot(; theme = :light)[1]
+fig_dark = set_up_plot(; theme = :dark)[1]
+save("two_vectors.png", fig_light |> alpha_colorbuffer) # hide
+save("two_vectors_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+nothing # hide
 ```
 
-![](two_vectors.png)
+```@example
+Main.include_graphics("two_vectors") # hide
+```
 
 Note that we have chosen the arrow here to have the same direction as before but only about half the magnitude. We further drew another arrow that we want to parallel transport. 
 
@@ -110,6 +128,8 @@ for _ in 1:n_steps
     push!(Δ₂_transported, Matrix(λY) * B₂ * E)
 end
 
+function plot_parallel_transport(; theme = :dark) # hide
+fig, ax = set_up_plot(; theme = :dark) # hide
 for Y_increment in Y_increments
     scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; 
         color = mred, markersize = 5)
@@ -124,6 +144,18 @@ for (color, vec_transported) in zip((mred, mpurple), (Δ_transported, Δ₂_tran
 end
 
 fig
+end # hide
+
+fig_light = plot_parallel_transport(; theme = :light) # hide
+fig_dark = plot_parallel_transport(; theme = :dark) # hide
+save("parallel_transport.png", fig_light |> alpha_colorbuffer) # hide
+save("parallel_transport_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+nothing # hide
+```
+
+```@example
+Main.include_graphics("parallel_transport") # hide
 ```
 
 ## References
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 7ed614df6..f6dcb2c1f 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -30,9 +30,14 @@ We should mention that the factor ``\frac{1}{2}`` is sometimes left out in the d
 
 We want to compare the [`geodesic`](@ref) retraction with the [`cayley`](@ref) retraction for the example we already introduced when talking about the [exponential map](@ref "Geodesic Sprays and the Exponential Map"):
 
+```@setup s2_retraction
+using GLMakie
+
+include("../../../gl_makie_transparent_background_hack.jl")
+```
+
 ```@setup s2_retraction
 using GeometricMachineLearning
-using CairoMakie # hide
 import Random # hide
 Random.seed!(123) # hide
 
@@ -42,10 +47,10 @@ v = 5 * rand(3, 1)
 Δ = v - Y * (v' * Y)
 
 function do_setup(; theme=:light)
-    fig = Figure(; backgroundcolor = :transparent) # hide
     text_color = theme == :dark ? :white : :black # hide
+    fig = Figure(; backgroundcolor = :transparent) # hide
     ax = Axis3(fig[1, 1]; # hide
-            backgroundcolor = :transparent, # hide
+            backgroundcolor = (:tomato, .5), # hide
             aspect = (1., 1., 1.), # hide
             azimuth = π / 6, # hide
             elevation = π / 8, # hide
@@ -55,7 +60,7 @@ function do_setup(; theme=:light)
             ) # hide
 
     # plot a sphere with radius one and origin 0
-    surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .6)
+    surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
 
     morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
     point_vec = ([Y[1]], [Y[2]], [Y[3]])
@@ -71,7 +76,7 @@ nothing
 ```
 
 ```@example s2_retraction
-η_increments = 0.1 : 0.1 : 2.5
+η_increments = 0.1 : 0.1 : 5.5
 Δ_increments = [Δ * η for η in η_increments]
 
 Y_increments_geodesic = [geodesic(Y, Δ_increment) for Δ_increment in Δ_increments]
@@ -110,12 +115,10 @@ axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text
 fig, ax, zip_ob, Y_increments_geodesic, Y_increments_cayley # hide
 end # hide
 
-if Main.output_type == :html # hide
-    save("retraction_comparison.png",        make_plot(; theme = :light)[1]; px_per_unit = 1.5) # hide
-    save("retraction_comparison_dark.png",   make_plot(; theme = :dark )[1]; px_per_unit = 1.5) # hide
-elseif Main.output_type == :latex # hide
-    save("retraction_comparison.png",       make_plot(; theme = :light)[1]; px_per_unit = 2.0) # hide
-end # hide
+fig_light = make_plot(; theme = :light)[1] # hide
+fig_dark = make_plot(; theme = :dark)[1] # hide
+save("retraction_comparison.png",        fig_light |> alpha_colorbuffer) #; px_per_unit = 1.5) # hide
+save("retraction_comparison_dark.png",   fig_dark |> alpha_colorbuffer) #; px_per_unit = 1.5) # hide
 
 Main.include_graphics("retraction_comparison"; caption = raw"Comparison between the geodesic and the Cayley retraction.", width = .8) # hide
 ```
@@ -123,6 +126,7 @@ Main.include_graphics("retraction_comparison"; caption = raw"Comparison between
 We see that for small ``\Delta`` increments the Cayley retraction seems to match the geodesic retraction very well, but for larger values there is a notable discrepancy:
 
 ```@setup s2_retraction
+using CairoMakie
 function plot_discrepancies(discrepancies; theme = :light)
     fig = Figure(; backgroundcolor = :transparent) # hide
     text_color = theme == :dark ? :white : :black # hide
@@ -146,12 +150,10 @@ using LinearAlgebra: norm
 _, __, zip_ob, Y_increments_geodesic, Y_increments_cayley = make_plot() # hide
 discrepancies = [norm(Y_geo_inc - Y_cay_inc) for (Y_geo_inc, Y_cay_inc, _) in zip_ob]
 
-if Main.output_type == :html # hide
-    save("retraction_discrepancy.png",        plot_discrepancies(discrepancies; theme = :light)[1]; px_per_unit = 1.5) # hide
-    save("retraction_discrepancy_dark.png",   plot_discrepancies(discrepancies; theme = :dark )[1]; px_per_unit = 1.5) # hide
-elseif Main.output_type == :latex # hide
-    save("retraction_discrepancy.png",        plot_discrepancies(discrepancies; theme = :light)[1]; px_per_unit = 2.0) # hide
-end # hide
+fig_light = plot_discrepancies(discrepancies; theme = :light)[1] # hide
+fig_dark = plot_discrepancies(discrepancies; theme = :dark)[1] # hide
+save("retraction_discrepancy.png",        fig_light) #; px_per_unit = 1.5) # hide
+save("retraction_discrepancy_dark.png",   fig_dark) #; px_per_unit = 1.5) # hide
 
 Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .6) # hide
 ```

From 63c0f3372b764d898c592e495357b59b97d8f584 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Sat, 1 Jun 2024 13:19:52 +0200
Subject: [PATCH 049/101] Other Adam manifold paper.

---
 docs/src/GeometricMachineLearning.bib | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index 3835d9a7f..5150d1143 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -425,4 +425,11 @@ @article{schlarb2024covariant
   pages={1--43},
   year={2024},
   publisher={Springer}
-}
\ No newline at end of file
+}
+
+@article{kong2023momentum,
+  title={Momentum stiefel optimizer, with applications to suitably-orthogonal attention, and optimal transport},
+  author={Kong, Lingkai and Wang, Yuqing and Tao, Molei},
+  journal={arXiv preprint arXiv:2205.14173v3},
+  year={2023}
+}

From d6290b0cbe3177625dff0bac698f84084eae45f3 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Sat, 1 Jun 2024 13:20:24 +0200
Subject: [PATCH 050/101] Added various examples of how to use different
 optimizers and what they are doing.

---
 docs/src/optimizers/optimizer_methods.md | 106 +++++++++++++++++++++--
 1 file changed, 98 insertions(+), 8 deletions(-)

diff --git a/docs/src/optimizers/optimizer_methods.md b/docs/src/optimizers/optimizer_methods.md
index 31246ad75..abcd8744a 100644
--- a/docs/src/optimizers/optimizer_methods.md
+++ b/docs/src/optimizers/optimizer_methods.md
@@ -1,26 +1,108 @@
 # Standard Neural Network Optimizers
 
-In this section we discuss optimization methods that are often used in training neural networks. The [BFGS optimizer](@ref "The BFGS Optimizer") may also be viewed as a *standard neural network optimizer* but is treated in a separate section because of its complexity.
+In this section we discuss optimization methods that are often used in training neural networks. The [BFGS optimizer](@ref "The BFGS Optimizer") may also be viewed as a *standard neural network optimizer* but is treated in a separate section because of its complexity. From a perspective of manifolds the *optimizer methods* outlined here operate on ``\mathfrak{g}^\mathrm{hor}`` only. Each of them has a cache associated with it[^1] and this cache is updated with the function [`update!`](@ref). The precise role of this function is described below.
+
+[^1]: In the case of the [gradient optimizer](@ref "The Gradient Optimizer") this cache is trivial.
 
 ## The Gradient Optimizer
 
 The gradient optimizer is the simplest optimization algorithm used to train neural networks. It was already briefly discussed when we introduced [Riemannian manifolds](@ref "Gradient Flows and Riemannian Optimization").
 
+It simply does: 
+
+```math
+\mathrm{weight} \leftarrow \mathrm{weight} + (-\eta\cdot\mathrm{gradient}),
+```
+
+where addition has to be replaced with appropriate operations in the manifold case.
+
+When calling [`GradientOptimizer`](@ref) we can specify a learning rate ``\eta`` (or use the default).
+
+```@example optimizer_methods
+using GeometricMachineLearning
+
+const η = 0.01
+method = GradientOptimizer(η)
+```
+
+In order to use the optimizer we need an instance of [`Optimizer`](@ref) that is called with the method and the weights of the neural network:
+
+
+```@example optimizer_methods
+weight = (A = zeros(10, 10), )
+o = Optimizer(method, weight)
+```
+
+If we operate on a derivative with [`update!`](@ref) this will compute a *final velocity* that is then used to compute a retraction (or simply perform addition if we do not deal with a manifold):
+
+```@example optimizer_methods
+dx = (A = one(weight.A), )
+update!(o, o.cache, dx)
+
+dx
+```
+
+So what has happened here is that the gradient `dx` was simply multiplied with ``-\eta`` as the cache of the gradient optimizer is trivial.
+
+## The Momentum Optimizer
+
+The momentum optimizer is similar to the gradient optimizer but further stores past information as *first moments*. We let these first moments *decay* with a *decay parameter* ``\alpha``:
+
+```math
+\mathrm{weights} \leftarrow weights + (\alpha\cdot\mathrm{moment} - \eta\cdot\mathrm{gradient}),
+```
+
+where addition has to be replaced with appropriate operations in the manifold case.
+
+In the case of the momentum optimizer the cache is non-trivial:
+
+```@example optimizer_methods
+const α = 0.5
+method = MomentumOptimizer(η, α)
+o = Optimizer(method, weight)
+
+o.cache
+```
+
+But as the cache is initialized with zeros it will lead the same result as the gradient optimizer in the first iteration:
+
+```@example optimizer_methods
+dx = (A = one(weight.A), )
+
+update!(o, o.cache, dx)
+
+dx
+```
+
+The cache has changed however:
+
+```@example optimizer_methods
+o.cache
+```
+
+If we have weights on manifolds calling [`Optimizer`](@ref) will automatically allocate the correct cache on ``\mathfrak{g}^\mathrm{hor}``:
+
+```@example optimizer_methods
+weight = (Y = rand(StiefelManifold, 10, 5), )
+
+Optimizer(method, weight).cache
+```
+
 ## The Adam Optimizer 
 
 The Adam Optimizer is one of the most widely neural network optimizer. Like most modern neural network optimizers it contains a `cache` that is updated based on first-order gradient information and then, in a second step, the `cache` is used to compute a velocity estimate for updating the neural network weights. 
 
 Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold. 
 
-## All weights on a vector space
+### All weights on a vector space
 
-The cache of the Adam optimizer consists of **first and second moments**. The **first moments** $B_1$ store linear information about the current and previous gradients, and the **second moments** $B_2$ store quadratic information about current and previous gradients (all computed from a first-order gradient). 
+The cache of the Adam optimizer consists of *first and second moments*. The *first moments* ``B_1``, similar to the momentum optimizer, store linear information about the current and previous gradients, and the *second moments* ``B_2`` store quadratic information about current and previous gradients (all computed from a first-order gradient). 
 
-If all the weights are on a vector space, then we directly compute updates for $B_1$ and $B_2$:
-1. $B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,$
-2. $B_2 \gets ((\rho_2 - \rho_1^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,$
+If all the weights are on a vector space, then we directly compute updates for ``B_1`` and ``B_2``:
+1. ``B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,``
+2. ``B_2 \gets ((\rho_2 - \rho_1^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,``
 
-    where $\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n$ is the **Hadamard product**: $[a\odot{}b]_i = a_ib_i$. $\rho_1$ and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301](@cite). After having updated the `cache` (i.e. $B_1$ and $B_2$) we compute a **velocity** (step 3) with which the parameters $Y_t$ are then updated (step 4).
+where ``\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n`` is the *Hadamard product*: ``[a\odot{}b]_i = a_ib_i$. $\rho_1`` and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301](@cite). After having updated the `cache` (i.e. $B_1$ and $B_2$) we compute a **velocity** (step 3) with which the parameters $Y_t$ are then updated (step 4).
 
 3. $W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},$
 4. $Y_{t+1} \gets Y_t + W_t,$
@@ -31,10 +113,16 @@ Here $\eta$ (with default 0.01) is the **learning rate** and $\delta$ (with defa
 Main.include_graphics("../tikz/adam_optimizer") # hide
 ```
 
+```@eval
+Main.remark("The optimization framework presented here manages to generalize the Adam optimizer to manifolds without knowing an underlying differential equation. From a mathematical perspective this is not really satisfactory because we would ideally want the optimizers to emerge as a discretization of a differential equation as in the case of the gradient and the momentum optimizer to better interpret them. A similar attempt to generalize Adam to the Stiefel manifold was made in [kong2023momentum](@cite).")
+```
+
 ## Weights on manifolds 
 
 The problem with generalizing Adam to manifolds is that the Hadamard product $\odot$ as well as the other element-wise operations ($/$, $\sqrt{}$ and $+$ in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces").  
 
+## The Adam Optimizer with Decay
+
 
 ## Library Functions
 
@@ -43,7 +131,9 @@ OptimizerMethod
 GradientOptimizer
 MomentumOptimizer
 AdamOptimizer
-initialize_cache
+AdamOptimizerWithDecay
+GeometricMachineLearning.init_optimizer_cache
+update!
 ```
 
 ## References

From dfd8f8c57cd7c12872964b4cb9efdecf2137fcb3 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Sat, 1 Jun 2024 13:21:25 +0200
Subject: [PATCH 051/101] Added docstring for update.

---
 src/optimizers/optimizer_method.jl | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/optimizers/optimizer_method.jl b/src/optimizers/optimizer_method.jl
index 307726021..1f7d38d84 100644
--- a/src/optimizers/optimizer_method.jl
+++ b/src/optimizers/optimizer_method.jl
@@ -6,6 +6,15 @@ abstract type OptimizerMethod end
 @doc raw"""
     init_optimizer_cache(method, x)
 
-Initialize= the optimizer cache based on input `x` for the given `method`.
+Initialize the optimizer cache based on input `x` for the given `method`.
 """
-function init_optimizer_cache(::OptimizerMethod, x) end
\ No newline at end of file
+function init_optimizer_cache(::OptimizerMethod, x) end
+
+@doc raw"""
+    update!(o, cache, dx::AbstractArray)
+
+Update the `cache` based on the gradient information `dx`, compute the final velocity and store it in `dx`.
+
+The optimizer `o` is needed because some updating schemes (such as [`AdamOptimizer`](@ref)) also need information on the current time step.
+"""
+function update!(::Any, ::Any, ::AbstractArray) end
\ No newline at end of file

From 33883dcbd77c6c597129df30b0873b5f936d03f2 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Sat, 1 Jun 2024 13:21:48 +0200
Subject: [PATCH 052/101] Fixed Adam reference.

---
 docs/src/tutorials/sympnet_tutorial.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md
index 438ffec5f..7e5ce157c 100644
--- a/docs/src/tutorials/sympnet_tutorial.md
+++ b/docs/src/tutorials/sympnet_tutorial.md
@@ -121,7 +121,7 @@ parameterlength(g_nn.model)
 
 *Remark*: We can also specify whether we would like to start with a layer that changes the $q$-component or one that changes the $p$-component. This can be done via the keywords `init_upper` for `GSympNet`, and `init_upper_linear` and `init_upper_act` for `LASympNet`.
 
-We have to define an optimizer which will be use in the training of the SympNet. For more details on optimizer, please see the [corresponding documentation](@ref "Neural Network Optimizers"). In this example we use [Adam](../optimizers/adam_optimizer.md):
+We have to define an optimizer which will be use in the training of the SympNet. For more details on optimizer, please see the [corresponding documentation](@ref "Neural Network Optimizers"). In this example we use [Adam](@ref "The Adam Optimizer"):
 
 ```@example sympnet
 # set up optimizer; for this we first need to specify the optimization method (argue for why we need the optimizer method)

From 552e03b9f93479dc6bebfefb3dad4cea702d5dff Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 10:51:09 +0200
Subject: [PATCH 053/101] Added height as additional parameter.

---
 docs/src/manifolds/riemannian_manifolds.md    | 17 +++++++------
 .../manifold_related/parallel_transport.md    |  6 ++---
 .../manifold_related/retractions.md           | 25 +++++++++++--------
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index c40530989..9354a5972 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -80,13 +80,14 @@ morange = RGBf(255 / 256, 127 / 256, 14 / 256) # hide
 mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 
 function set_up_plot(; theme = :dark) # hide
-text_color = Main.output_type == :html ? :white : :black # hide
+text_color = theme == :dark ? :white : :black # hide
 fig = Figure(; backgroundcolor = :transparent) # hide
 ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
-        azimuth = π / 6, # hide
-        elevation = π / 8, # hide
+        azimuth = π / 7, # hide
+        elevation = π / 7, # hide
+        height = Relative(1.1),
         xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
         ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
         zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
@@ -106,8 +107,9 @@ end # hide
 
 fig_light = set_up_plot(; theme = :light)[1] # hide
 fig_dark = set_up_plot(; theme = :dark)[1] # hide
-save("sphere_with_tangent_vec.png", fig_light |> alpha_colorbuffer) # hide
-save("sphere_with_tangent_vec_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+save("sphere_with_tangent_vec.png", alpha_colorbuffer(fig_light)) # hide
+save("sphere_with_tangent_vec_dark.png", alpha_colorbuffer(fig_dark)) # hide
 
 nothing # hide
 ```
@@ -135,8 +137,9 @@ end # hide
 
 fig_light = make_plot_with_solution(; theme = :light) # hide
 fig_dark = make_plot_with_solution(; theme = :dark) # hide
-save("sphere_with_tangent_vec_and_geodesic.png", fig_light |> alpha_colorbuffer) # hide
-save("sphere_with_tangent_vec_and_geodesic_dark.png", fig_dark |> alpha_colorbuffer) # hide
+
+save("sphere_with_tangent_vec_and_geodesic.png", alpha_colorbuffer(fig_light)) # hide
+save("sphere_with_tangent_vec_and_geodesic_dark.png", alpha_colorbuffer(fig_dark)) # hide
 
 nothing # hide
 ```
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index e744de6d4..6dd1097aa 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -67,7 +67,7 @@ mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 function set_up_plot(; theme = :dark) # hide
 text_color = Main.output_type == :html ? :white : :black # hide
 fig = Figure(; backgroundcolor = :transparent) # hide
-text_color = Main.output_type == :html ? :white : :black # hide
+text_color = theme == :dark ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
@@ -95,8 +95,8 @@ end # hide
 
 fig_light = set_up_plot(; theme = :light)[1]
 fig_dark = set_up_plot(; theme = :dark)[1]
-save("two_vectors.png", fig_light |> alpha_colorbuffer) # hide
-save("two_vectors_dark.png", fig_dark |> alpha_colorbuffer) # hide
+save("two_vectors.png", alpha_colorbuffer(fig_light)) # hide
+save("two_vectors_dark.png", alpha_colorbuffer(fig_dark)) # hide
 
 nothing # hide
 ```
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index f6dcb2c1f..61d7d4384 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -50,14 +50,15 @@ function do_setup(; theme=:light)
     text_color = theme == :dark ? :white : :black # hide
     fig = Figure(; backgroundcolor = :transparent) # hide
     ax = Axis3(fig[1, 1]; # hide
-            backgroundcolor = (:tomato, .5), # hide
-            aspect = (1., 1., 1.), # hide
-            azimuth = π / 6, # hide
-            elevation = π / 8, # hide
-            xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
-            ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
-            zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
-            ) # hide
+        backgroundcolor = (:tomato, .5), # hide
+        aspect = (1., 1., 1.), # hide
+        azimuth = π / 7, # hide
+        elevation = π / 7, # hide
+        height = Relative(1.1),
+        xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
+        ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
+        zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
+        ) # hide
 
     # plot a sphere with radius one and origin 0
     surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
@@ -117,8 +118,9 @@ end # hide
 
 fig_light = make_plot(; theme = :light)[1] # hide
 fig_dark = make_plot(; theme = :dark)[1] # hide
-save("retraction_comparison.png",        fig_light |> alpha_colorbuffer) #; px_per_unit = 1.5) # hide
-save("retraction_comparison_dark.png",   fig_dark |> alpha_colorbuffer) #; px_per_unit = 1.5) # hide
+
+save("retraction_comparison.png",        alpha_colorbuffer(fig_light)) #; px_per_unit = 1.5) # hide
+save("retraction_comparison_dark.png",   alpha_colorbuffer(fig_dark)) #; px_per_unit = 1.5) # hide
 
 Main.include_graphics("retraction_comparison"; caption = raw"Comparison between the geodesic and the Cayley retraction.", width = .8) # hide
 ```
@@ -127,6 +129,8 @@ We see that for small ``\Delta`` increments the Cayley retraction seems to match
 
 ```@setup s2_retraction
 using CairoMakie
+
+CairoMakie.activate!()
 function plot_discrepancies(discrepancies; theme = :light)
     fig = Figure(; backgroundcolor = :transparent) # hide
     text_color = theme == :dark ? :white : :black # hide
@@ -152,6 +156,7 @@ discrepancies = [norm(Y_geo_inc - Y_cay_inc) for (Y_geo_inc, Y_cay_inc, _) in zi
 
 fig_light = plot_discrepancies(discrepancies; theme = :light)[1] # hide
 fig_dark = plot_discrepancies(discrepancies; theme = :dark)[1] # hide
+
 save("retraction_discrepancy.png",        fig_light) #; px_per_unit = 1.5) # hide
 save("retraction_discrepancy_dark.png",   fig_dark) #; px_per_unit = 1.5) # hide
 

From 19b4254eb90c634e177adbbfc5c3bcb7c3146a92 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 10:51:35 +0200
Subject: [PATCH 054/101] Improved reference.

---
 docs/src/optimizers/bfgs_optimizer.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/optimizers/bfgs_optimizer.md b/docs/src/optimizers/bfgs_optimizer.md
index 2ddff19ce..8b876d121 100644
--- a/docs/src/optimizers/bfgs_optimizer.md
+++ b/docs/src/optimizers/bfgs_optimizer.md
@@ -1,6 +1,6 @@
-# The BFGS Algorithm
+# The BFGS Optimizer
 
-The presentation shown here is largely taken from chapters 3 and 6 of reference [wright2006numerical](@cite) with a derivation based on an [online comment](https://math.stackexchange.com/questions/2091867/quasi-newton-methods-understanding-dfp-updating-formula). The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.
+The presentation shown here is largely taken from [wright2006numerical; chapters 3 and 6](@cite) with a derivation based on an [online comment](https://math.stackexchange.com/questions/2091867/quasi-newton-methods-understanding-dfp-updating-formula). The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.
 
 It is a version of a *quasi-Newton* method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:
 ```math

From 80377f9713f90af6914fc50bdc1d0167a08dd7f1 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 10:52:12 +0200
Subject: [PATCH 055/101] Added reference for the momentum optimizer.:

---
 docs/src/optimizers/optimizer_framework.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/optimizers/optimizer_framework.md b/docs/src/optimizers/optimizer_framework.md
index d2ecebbb3..ab96971fa 100644
--- a/docs/src/optimizers/optimizer_framework.md
+++ b/docs/src/optimizers/optimizer_framework.md
@@ -1,6 +1,6 @@
 # Neural Network Optimizers
 
-In this section we present the general Optimizer framework used in `GeometricMachineLearning`. For more information on the particular steps involved in this consult the documentation on the various optimizer methods such as the *momentum optimizer* and the [Adam optimizer](@ref "The Adam Optimizer"), and the documentation on [retractions](@ref "Retractions").
+In this section we present the general Optimizer framework used in `GeometricMachineLearning`. For more information on the particular steps involved in this consult the documentation on the various optimizer methods such as the [momentum optimizer](@ref "The Momentum Optimizer") and the [Adam optimizer](@ref "The Adam Optimizer"), and the documentation on [retractions](@ref "Retractions").
 
 During *optimization* we aim at changing the neural network parameters in such a way to minimize the loss function. So if we express the loss function ``L`` as a function of the neural network weights ``\Theta`` in a parameter space ``\mathbb{P}`` we can phrase the task as: 
 

From 755e8c8207bdce2349acadc49b594032b336b642 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 10:52:47 +0200
Subject: [PATCH 056/101] Updated documentation for Adam optimizer.:

---
 docs/src/optimizers/optimizer_methods.md | 29 ++++++++++--------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/docs/src/optimizers/optimizer_methods.md b/docs/src/optimizers/optimizer_methods.md
index abcd8744a..9d073e2d7 100644
--- a/docs/src/optimizers/optimizer_methods.md
+++ b/docs/src/optimizers/optimizer_methods.md
@@ -64,7 +64,7 @@ o = Optimizer(method, weight)
 o.cache
 ```
 
-But as the cache is initialized with zeros it will lead the same result as the gradient optimizer in the first iteration:
+But as the cache is initialized with zeros it will lead to the same result as the gradient optimizer in the first iteration:
 
 ```@example optimizer_methods
 dx = (A = one(weight.A), )
@@ -90,36 +90,31 @@ Optimizer(method, weight).cache
 
 ## The Adam Optimizer 
 
-The Adam Optimizer is one of the most widely neural network optimizer. Like most modern neural network optimizers it contains a `cache` that is updated based on first-order gradient information and then, in a second step, the `cache` is used to compute a velocity estimate for updating the neural network weights. 
-
-Here we first describe the Adam algorithm for the case where all the weights are on a vector space and then show how to generalize this to the case where the weights are on a manifold. 
-
-### All weights on a vector space
-
-The cache of the Adam optimizer consists of *first and second moments*. The *first moments* ``B_1``, similar to the momentum optimizer, store linear information about the current and previous gradients, and the *second moments* ``B_2`` store quadratic information about current and previous gradients (all computed from a first-order gradient). 
+The Adam Optimizer is one of the most widely neural network optimizers. The cache of the Adam optimizer consists of *first and second moments*. The *first moments* ``B_1``, similar to the momentum optimizer, store linear information about the current and previous gradients, and the *second moments* ``B_2`` store quadratic information about current and previous gradients (all computed from a first-order gradient). 
 
 If all the weights are on a vector space, then we directly compute updates for ``B_1`` and ``B_2``:
 1. ``B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,``
 2. ``B_2 \gets ((\rho_2 - \rho_1^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L,``
 
-where ``\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n`` is the *Hadamard product*: ``[a\odot{}b]_i = a_ib_i$. $\rho_1`` and $\rho_2$ are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301](@cite). After having updated the `cache` (i.e. $B_1$ and $B_2$) we compute a **velocity** (step 3) with which the parameters $Y_t$ are then updated (step 4).
+where ``\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n`` is the *Hadamard product*: ``[a\odot{}b]_i = a_ib_i.`` ``\rho_1`` and ``\rho_2`` are hyperparameters. Their defaults, $\rho_1=0.9$ and $\rho_2=0.99$, are taken from [goodfellow2016deep; page 301](@cite). After having updated the `cache` (i.e. ``B_1`` and ``B_2``) we compute a *velocity* with which the parameters of the network are then updated:
+* ``W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},``
+* ``Y^{(t+1)} \gets Y^{(t)} + W^{(t)},``
 
-3. $W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},$
-4. $Y_{t+1} \gets Y_t + W_t,$
+where the last addition has to be replaced with appropriate operations when dealing with manifolds. Further ``\eta`` is the *learning rate* and ``\delta`` is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise.
 
-Here $\eta$ (with default 0.01) is the **learning rate** and $\delta$ (with default $3\cdot10^{-7}$) is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise. 
+In the following we show a schematic update that Adam performs for the case when no elements are on manifolds (also compare this figure with the [general optimization framework](@ref "Generalization to Homogeneous Spaces")):
 
 ```@example 
 Main.include_graphics("../tikz/adam_optimizer") # hide
 ```
 
-```@eval
-Main.remark("The optimization framework presented here manages to generalize the Adam optimizer to manifolds without knowing an underlying differential equation. From a mathematical perspective this is not really satisfactory because we would ideally want the optimizers to emerge as a discretization of a differential equation as in the case of the gradient and the momentum optimizer to better interpret them. A similar attempt to generalize Adam to the Stiefel manifold was made in [kong2023momentum](@cite).")
-```
+### Weights on manifolds 
 
-## Weights on manifolds 
+The problem with generalizing Adam to manifolds is that the Hadamard product ``\odot`` as well as the other element-wise operations (``/``, ``\sqrt{}`` and ``+`` in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces").  
 
-The problem with generalizing Adam to manifolds is that the Hadamard product $\odot$ as well as the other element-wise operations ($/$, $\sqrt{}$ and $+$ in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces").  
+```@eval
+Main.remark(raw"The optimization framework presented here manages to generalize the Adam optimizer to manifolds without knowing an underlying differential equation. From a mathematical perspective this is not really satisfactory because we would ideally want the optimizers to emerge as a discretization of a differential equation as in the case of the gradient and the momentum optimizer to better interpret them. A similar attempt to generalize Adam to the Stiefel manifold was made in [kong2023momentum](@cite).")
+```
 
 ## The Adam Optimizer with Decay
 

From 666b5e7ca94d307f76dbe70082f332e349e4b3a3 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 20:16:16 +0200
Subject: [PATCH 057/101] Now using latex strings for label.

---
 docs/src/manifolds/manifolds.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/src/manifolds/manifolds.md b/docs/src/manifolds/manifolds.md
index 67252eb95..f661837af 100644
--- a/docs/src/manifolds/manifolds.md
+++ b/docs/src/manifolds/manifolds.md
@@ -134,9 +134,12 @@ function make_plot(; theme = :light)
         aspect = (1., 1., 0.8), 
         azimuth = π / 6, 
         elevation = π / 8, 
-        xlabel = rich("x", subscript("1"), font = :italic, color = text_color),
-        ylabel = rich("x", subscript("2"), font = :italic, color = text_color),
-        zlabel = rich("x", subscript("3"), font = :italic, color = text_color),
+        xlabel = L"x_1",
+        ylabel = L"x_2",
+        zlabel = L"x_3",
+        xlabelcolor = text_color,
+        ylabelcolor = text_color,
+        zlabelcolor = text_color,
         )
 
     surface!(Main.sphere(1., [0., 0., 0.])...; alpha = .6)

From 649cda8edfcc885375c6250b57fc4c398c5c5c33 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 3 Jun 2024 20:16:53 +0200
Subject: [PATCH 058/101] Adjusted image size.

---
 docs/src/manifolds/riemannian_manifolds.md    | 31 +++++++++-----
 .../manifold_related/parallel_transport.md    | 40 +++++++++++++------
 .../manifold_related/retractions.md           | 23 ++++++++---
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index 9354a5972..671d2a269 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -83,15 +83,28 @@ function set_up_plot(; theme = :dark) # hide
 text_color = theme == :dark ? :white : :black # hide
 fig = Figure(; backgroundcolor = :transparent) # hide
 ax = Axis3(fig[1, 1]; # hide
-        backgroundcolor = (:tomato, .5), # hide
-        aspect = (1., 1., 1.), # hide
-        azimuth = π / 7, # hide
-        elevation = π / 7, # hide
-        height = Relative(1.1),
-        xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
-        ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
-        zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
-        ) # hide
+    backgroundcolor = (:tomato, .5), # hide
+    aspect = (1., 1., 1.), # hide
+    xlabel = L"x_1", # hide
+    ylabel = L"x_2", # hide
+    zlabel = L"x_3", # hide
+    xgridcolor = text_color, # hide
+    ygridcolor = text_color, # hide
+    zgridcolor = text_color, # hide
+    xtickcolor = text_color, # hide
+    ytickcolor = text_color, # hide
+    ztickcolor = text_color, # hide
+    xlabelcolor = text_color, # hide
+    ylabelcolor = text_color, # hide
+    zlabelcolor = text_color, # hide
+    xypanelcolor = :transparent, # hide
+    xzpanelcolor = :transparent, # hide
+    yzpanelcolor = :transparent, # hide
+    limits = ([-1, 1], [-1, 1], [-1, 1]), # hide
+    azimuth = π / 7, # hide
+    elevation = π / 7, # hide
+    # height = 75.,
+    ) # hide
 
 # plot a sphere with radius one and origin 0
 surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index 6dd1097aa..39eaa675a 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -54,6 +54,8 @@ import Random # hide
 Random.seed!(123) # hide
 
 Y = rand(StiefelManifold, 3, 1)
+# needed because we will change `Y` later on
+Y_copy = StiefelManifold(copy(Y.A))
 
 v = 2 * rand(3, 1)
 v₂ = 1 * rand(3, 1)
@@ -65,23 +67,36 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 
 function set_up_plot(; theme = :dark) # hide
-text_color = Main.output_type == :html ? :white : :black # hide
 fig = Figure(; backgroundcolor = :transparent) # hide
 text_color = theme == :dark ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
-        backgroundcolor = (:tomato, .5), # hide
-        aspect = (1., 1., 1.), # hide
-        azimuth = π / 6, # hide
-        elevation = π / 8, # hide
-        xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
-        ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
-        zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
-        ) # hide
+    backgroundcolor = (:tomato, .5), # hide
+    aspect = (1., 1., 1.), # hide
+    xlabel = L"x_1", # hide
+    ylabel = L"x_2", # hide
+    zlabel = L"x_3", # hide
+    xgridcolor = text_color, # hide
+    ygridcolor = text_color, # hide
+    zgridcolor = text_color, # hide
+    xtickcolor = text_color, # hide
+    ytickcolor = text_color, # hide
+    ztickcolor = text_color, # hide
+    xlabelcolor = text_color, # hide
+    ylabelcolor = text_color, # hide
+    zlabelcolor = text_color, # hide
+    xypanelcolor = :transparent, # hide
+    xzpanelcolor = :transparent, # hide
+    yzpanelcolor = :transparent, # hide
+    limits = ([-1, 1], [-1, 1], [-1, 1]),
+    azimuth = π / 7, # hide
+    elevation = π / 7, # hide
+    # height = 75.,
+    ) # hide
 
 # plot a sphere with radius one and origin 0
 surface!(ax, Main.sphere(1., [0., 0., 0.])...; alpha = .5, transparency = true)
 
-point_vec = ([Y[1]], [Y[2]], [Y[3]])
+point_vec = ([Y_copy[1]], [Y_copy[2]], [Y_copy[3]])
 scatter!(ax, point_vec...; color = morange, marker = :star5)
 
 arrow_vec = ([Δ[1]], [Δ[2]], [Δ[3]])
@@ -109,6 +124,7 @@ Note that we have chosen the arrow here to have the same direction as before but
 
 ```@example s2_parallel_transport
 using GeometricMachineLearning: update_section!
+
 λY = GlobalSection(Y)
 B = global_rep(λY, Δ)
 B₂ = global_rep(λY, Δ₂)
@@ -118,7 +134,7 @@ Y_increments = []
 Δ_transported = []
 Δ₂_transported = []
 
-const n_steps = 8
+const n_steps = 6
 const tstep = 2
 
 for _ in 1:n_steps
@@ -129,7 +145,7 @@ for _ in 1:n_steps
 end
 
 function plot_parallel_transport(; theme = :dark) # hide
-fig, ax = set_up_plot(; theme = :dark) # hide
+fig, ax = set_up_plot(; theme = theme) # hide
 for Y_increment in Y_increments
     scatter!(ax, [Y_increment[1]], [Y_increment[2]], [Y_increment[3]]; 
         color = mred, markersize = 5)
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 61d7d4384..59d18ab30 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -48,16 +48,29 @@ v = 5 * rand(3, 1)
 
 function do_setup(; theme=:light)
     text_color = theme == :dark ? :white : :black # hide
-    fig = Figure(; backgroundcolor = :transparent) # hide
+    fig = Figure(; backgroundcolor = :transparent, size = (600, 450)) # hide
     ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
+        xlabel = L"x_1", # hide
+        ylabel = L"x_2", # hide
+        zlabel = L"x_3", # hide
+        xgridcolor = text_color, # hide
+        ygridcolor = text_color, # hide
+        zgridcolor = text_color, # hide
+        xtickcolor = text_color, # hide
+        ytickcolor = text_color, # hide
+        ztickcolor = text_color, # hide
+        xlabelcolor = text_color, # hide
+        ylabelcolor = text_color, # hide
+        zlabelcolor = text_color, # hide
+        xypanelcolor = :transparent, # hide
+        xzpanelcolor = :transparent, # hide
+        yzpanelcolor = :transparent, # hide
+        limits = ([-1, 1], [-1, 1], [-1, 1]),
         azimuth = π / 7, # hide
         elevation = π / 7, # hide
-        height = Relative(1.1),
-        xlabel = rich("x", subscript("1"), font = :italic, color = text_color), # hide
-        ylabel = rich("x", subscript("2"), font = :italic, color = text_color), # hide
-        zlabel = rich("x", subscript("3"), font = :italic, color = text_color), # hide
+        # height = 75.,
         ) # hide
 
     # plot a sphere with radius one and origin 0

From ab49dd695f4fd3a4e94f8730ac371402ac16e411 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 4 Jun 2024 09:46:44 +0200
Subject: [PATCH 059/101] Fixed picture size.

---
 docs/src/manifolds/riemannian_manifolds.md                 | 2 +-
 docs/src/optimizers/manifold_related/parallel_transport.md | 6 +++++-
 docs/src/optimizers/manifold_related/retractions.md        | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index 671d2a269..2b30ba54a 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -81,7 +81,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 
 function set_up_plot(; theme = :dark) # hide
 text_color = theme == :dark ? :white : :black # hide
-fig = Figure(; backgroundcolor = :transparent) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
     aspect = (1., 1., 1.), # hide
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index 39eaa675a..e99c2964c 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -67,7 +67,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 
 function set_up_plot(; theme = :dark) # hide
-fig = Figure(; backgroundcolor = :transparent) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
 text_color = theme == :dark ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
@@ -174,6 +174,10 @@ nothing # hide
 Main.include_graphics("parallel_transport") # hide
 ```
 
+```@eval
+Main.remark(raw"Note that `apply_section!` changes ``\lambda(Y)`` (including ``Y``) in-place!")
+```
+
 ## References
 
 ```@bibliography
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 59d18ab30..c40a46a33 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -48,7 +48,7 @@ v = 5 * rand(3, 1)
 
 function do_setup(; theme=:light)
     text_color = theme == :dark ? :white : :black # hide
-    fig = Figure(; backgroundcolor = :transparent, size = (600, 450)) # hide
+    fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
     ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide

From b50e89a5dc377e1fe430b69a4b977f48c6018483 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 4 Jun 2024 10:07:12 +0200
Subject: [PATCH 060/101] Changed picture size.

---
 docs/src/manifolds/riemannian_manifolds.md                 | 2 +-
 docs/src/optimizers/manifold_related/parallel_transport.md | 2 +-
 docs/src/optimizers/manifold_related/retractions.md        | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index 2b30ba54a..3083d1373 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -81,7 +81,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 
 function set_up_plot(; theme = :dark) # hide
 text_color = theme == :dark ? :white : :black # hide
-fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
     aspect = (1., 1., 1.), # hide
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index e99c2964c..b57d86fba 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -67,7 +67,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 
 function set_up_plot(; theme = :dark) # hide
-fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
 text_color = theme == :dark ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index c40a46a33..3354e887c 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -48,7 +48,7 @@ v = 5 * rand(3, 1)
 
 function do_setup(; theme=:light)
     text_color = theme == :dark ? :white : :black # hide
-    fig = Figure(; backgroundcolor = :transparent, size = (400, 300)) # hide
+    fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
     ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
@@ -170,8 +170,8 @@ discrepancies = [norm(Y_geo_inc - Y_cay_inc) for (Y_geo_inc, Y_cay_inc, _) in zi
 fig_light = plot_discrepancies(discrepancies; theme = :light)[1] # hide
 fig_dark = plot_discrepancies(discrepancies; theme = :dark)[1] # hide
 
-save("retraction_discrepancy.png",        fig_light) #; px_per_unit = 1.5) # hide
-save("retraction_discrepancy_dark.png",   fig_dark) #; px_per_unit = 1.5) # hide
+save("retraction_discrepancy.png",        fig_light; px_per_unit = 1.5) # hide
+save("retraction_discrepancy_dark.png",   fig_dark; px_per_unit = 1.5) # hide
 
 Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .6) # hide
 ```

From dffc0e8e5b2babdfd60d086adf5c4c11a5321ca1 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 4 Jun 2024 10:07:32 +0200
Subject: [PATCH 061/101] Added GLMakie.

---
 docs/Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/Project.toml b/docs/Project.toml
index 6191bb8a3..e151afa9d 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -4,6 +4,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 GeometricIntegrators = "dcce2d33-59f6-5b8d-9047-0defad88ae06"
 GeometricMachineLearning = "194d25b2-d3f5-49f0-af24-c124f4aa80cc"
 GeometricProblems = "18cb22b4-ad41-5c80-9c5f-710df63fbdc9"

From 13c5510700b548eb7e151df152b426158d56f3e6 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:46:40 +0200
Subject: [PATCH 062/101] Reduced optimize_for_one_epoch! to one method.

---
 src/loss/losses.jl | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/loss/losses.jl b/src/loss/losses.jl
index d51c0ffc9..f0ad96c5b 100644
--- a/src/loss/losses.jl
+++ b/src/loss/losses.jl
@@ -1,3 +1,11 @@
+@doc raw"""
+An abstract type for all the neural network losses. 
+If you want to implement ``CustomLoss <: NetworkLoss`` you need to define a functor:
+```julia
+    (loss::CustomLoss)(model, ps, input, output)
+```
+where `model` is an instance of an `AbstractExplicitLayer` or a `Chain` and `ps` the parameters.
+"""
 abstract type NetworkLoss end 
 
 function (loss::NetworkLoss)(nn::NeuralNetwork, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}}
@@ -18,9 +26,16 @@ function (loss::NetworkLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Uni
 end
 
 @doc raw"""
-The loss for a transformer network (especially a transformer integrator). The constructor is called with:
-- `seq_length::Int`
-- `prediction_window::Int` (default is 1).
+    TransformerLoss(seq_length, prediction_window)
+
+Make an instance of the transformer loss. 
+
+This is the loss for a transformer network (especially a transformer integrator). 
+    
+# Parameters
+
+The `prediction_window` specifies how many time steps are predicted into the future.
+It defaults to the value specified for `seq_length`. 
 """
 struct TransformerLoss <: NetworkLoss
     seq_length::Int
@@ -56,6 +71,13 @@ function (loss::ClassificationTransformerLoss)(model::Union{Chain, AbstractExpli
     norm(predicted_output_cropped - output) / norm(output)
 end
 
+@doc raw"""
+    FeedForwardLoss()
+
+Make an instance of a loss for feedforward neural networks.
+
+This doesn't have any parameters.
+"""
 struct FeedForwardLoss <: NetworkLoss end
 
 @doc raw"""

From e9c64be5ad658345f021458d8bcadcb2dc5e1091 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:47:43 +0200
Subject: [PATCH 063/101] Reduced optimize_for_one_epoch! to one method.

---
 src/data_loader/optimize.jl | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/src/data_loader/optimize.jl b/src/data_loader/optimize.jl
index 7c10e7fa5..d73783392 100644
--- a/src/data_loader/optimize.jl
+++ b/src/data_loader/optimize.jl
@@ -22,24 +22,8 @@ function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTu
     @views for batch_indices in batches 
         count += 1
         # these `copy`s should not be necessary! coming from a Zygote problem!
-        input_nt, output_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
-        loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt, output_nt), ps)
-        total_error += loss_value
-        dp = pullback(one(loss_value))[1]
-        optimization_step!(opt, λY, ps, dp)
-    end
-    total_error / count
-end
-
-function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, <:Any, Nothing, :RegularData}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}, λY) where T
-    count = 0
-    total_error = T(0)
-    batches = batch(dl)
-    @views for batch_indices in batches 
-        count += 1
-        # these `copy`s should not be necessary! coming from a Zygote problem!
-        input_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
-        loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt), ps)
+        input_nt_output_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
+        loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt_output_nt...), ps)
         total_error += loss_value
         dp = pullback(one(loss_value))[1]
         optimization_step!(opt, λY, ps, dp)

From ca3ccd19948ad936d45084f0a984d7fe3bb01733 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:49:41 +0200
Subject: [PATCH 064/101] Started adding tutorial on how to customize the loss
 function.

---
 docs/make.jl                                      |  1 +
 docs/src/tutorials/adjusting_the_loss_function.md | 13 +++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 docs/src/tutorials/adjusting_the_loss_function.md

diff --git a/docs/make.jl b/docs/make.jl
index 2c6f284f9..edec35197 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -203,6 +203,7 @@ makedocs(;
             "Grassmann manifold" => "tutorials/grassmann_layer.md",
             "Volume-Preserving Attention" => "tutorials/volume_preserving_attention.md",
             "Linear Symplectic Transformer" => "tutorials/linear_symplectic_transformer.md",
+            "Adjusting the Loss Function" => "tutorials/adjusting_the_loss_function.md",
         ],
         "References" => "references.md",
         "Library" => "library.md",
diff --git a/docs/src/tutorials/adjusting_the_loss_function.md b/docs/src/tutorials/adjusting_the_loss_function.md
new file mode 100644
index 000000000..51d06f962
--- /dev/null
+++ b/docs/src/tutorials/adjusting_the_loss_function.md
@@ -0,0 +1,13 @@
+# Adjusting the Loss Function
+
+`GeometricMachineLearning` provides a few standard loss function that are used as defaults for specific neural networks:
+
+
+## Library Functions
+
+```@docs; canonical = false
+NetworkLoss
+FeedForwardLoss
+AutoencoderLoss
+TransformerLoss
+```
\ No newline at end of file

From e078827222478aa6ac5e75a403f7882e68f463d9 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:52:12 +0200
Subject: [PATCH 065/101] Got rid of reference in proof environment.

---
 docs/src/manifolds/inverse_function_theorem.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/manifolds/inverse_function_theorem.md b/docs/src/manifolds/inverse_function_theorem.md
index d3885850a..28627638c 100644
--- a/docs/src/manifolds/inverse_function_theorem.md
+++ b/docs/src/manifolds/inverse_function_theorem.md
@@ -44,7 +44,7 @@ Main.theorem(raw"Consider a vector-valued differentiable function ``F:\mathbb{R}
 ```
 
 ```@eval
-Main.proof(raw"""Consider a mapping ``F:\mathbb{R}^N\to\mathbb{R}^N`` and assume its Jacobian has full rank at point ``x``, i.e. ``\det{}F'(x)\neq0``. We further assume that ``F(x) = 0``, ``F'(x) = \mathbb{I}`` and ``x = 0``. Now consider a ball around ``x`` whose radius ``r`` we do not yet fix and two points ``y`` and ``z`` in that ball: ``y,z\in{}B(r)``. We further introduce the function ``G(y):=y-F(y)``. By the *mean value theorem* we have ``|G(y)| = |G(y) - x| = |G(y) - G(x)|\leq|y-x|\sup_{0<t<1}||G'(x + t(y-x))||`` where ``||\cdot||`` is the *operator norm*. Because ``t\mapsto{}G'(x+t(y-x))`` is continuous and ``G'(x)=0`` there must exist an ``r`` s.t. ``\forall{}t\in[0,1],\,||G'(x +t(y-x))||<1/2``. We have for any element ``y\in{}B(r)``: ``|G(y) | \leq ||G'(y)||\cdot|y| < |y|/2``, so ``G(B(r))\subset{}B(r/2)``. We further define ``G_z(y) := z + G(y)``; this map is contractive on ``B(r)`` (for ``z\in{}B(r/2)``): ``|G_z(y)| \leq |z| + |G(y) - x| < q < 1`` and therefore has a fixed point: ``y^* = G_z(y^*) = z + y^* - F(y^*)`` and we obtain ``z = F(y^*)``.  The inverse (which we call ``H:F(B(r/2))\to{}B(r)``) is also continuous by the last theorem presented in the [section on basic topological concepts](@ref "Basic Concepts from General Topology"). We now proof that the derivative of ``H`` at ``F(x) = 0`` exists and that it is equal to ``F'(H(z))^{-1}``. For this we let ``\eta\in{}F(B(r/2))`` go to zero. We further define ``\xi = F(z)`` and ``h = H(\xi + \eta) - z``:
+Main.proof(raw"""Consider a mapping ``F:\mathbb{R}^N\to\mathbb{R}^N`` and assume its Jacobian has full rank at point ``x``, i.e. ``\det{}F'(x)\neq0``. We further assume that ``F(x) = 0``, ``F'(x) = \mathbb{I}`` and ``x = 0``. Now consider a ball around ``x`` whose radius ``r`` we do not yet fix and two points ``y`` and ``z`` in that ball: ``y,z\in{}B(r)``. We further introduce the function ``G(y):=y-F(y)``. By the *mean value theorem* we have ``|G(y)| = |G(y) - x| = |G(y) - G(x)|\leq|y-x|\sup_{0<t<1}||G'(x + t(y-x))||`` where ``||\cdot||`` is the *operator norm*. Because ``t\mapsto{}G'(x+t(y-x))`` is continuous and ``G'(x)=0`` there must exist an ``r`` s.t. ``\forall{}t\in[0,1],\,||G'(x +t(y-x))||<1/2``. We have for any element ``y\in{}B(r)``: ``|G(y) | \leq ||G'(y)||\cdot|y| < |y|/2``, so ``G(B(r))\subset{}B(r/2)``. We further define ``G_z(y) := z + G(y)``; this map is contractive on ``B(r)`` (for ``z\in{}B(r/2)``): ``|G_z(y)| \leq |z| + |G(y) - x| < q < 1`` and therefore has a fixed point: ``y^* = G_z(y^*) = z + y^* - F(y^*)`` and we obtain ``z = F(y^*)``.  The inverse (which we call ``H:F(B(r/2))\to{}B(r)``) is also continuous by the last theorem presented in the section on basic topological concepts. We now proof that the derivative of ``H`` at ``F(x) = 0`` exists and that it is equal to ``F'(H(z))^{-1}``. For this we let ``\eta\in{}F(B(r/2))`` go to zero. We further define ``\xi = F(z)`` and ``h = H(\xi + \eta) - z``:
 """ * 
 Main.indentation * raw"```math
 " *

From e3efbd22393788675159b2a80d9060e8c9842b1e Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:52:56 +0200
Subject: [PATCH 066/101] Put everything into one equation environment.

---
 docs/src/manifolds/metric_and_vector_spaces.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/src/manifolds/metric_and_vector_spaces.md b/docs/src/manifolds/metric_and_vector_spaces.md
index 75a050a83..8679a031c 100644
--- a/docs/src/manifolds/metric_and_vector_spaces.md
+++ b/docs/src/manifolds/metric_and_vector_spaces.md
@@ -57,7 +57,7 @@ This last example shows that *metric spaces need not be vector spaces*, i.e. spa
 To define *complete metric spaces* we first need the definition of a *Cauchy sequence*.
 
 ```@eval
-Main.definition(raw"A **Cauchy sequence** is a sequence ``(a_n)_{n\in\mathbb{N}}`` for which, given any `epsilon>0`, we can find an integer ``N`` such that ``d(a_n, a_m) < \epsilon`` for all ``n, m \geq N``.")
+Main.definition(raw"A **Cauchy sequence** is a sequence ``(a_n)_{n\in\mathbb{N}}`` for which, given any ``\epsilon>0``, we can find an integer ``N`` such that ``d(a_n, a_m) < \epsilon`` for all ``n, m \geq N``.")
 ```
 
 Now we can give the definition of a *complete metric space*:
@@ -80,9 +80,9 @@ Main.indentation * raw"1. ``x + (y + z) = (x + y) + z,``
 " * 
 Main.indentation * raw"2. ``x + y = y + x,``
 " * 
-Main.indentation * raw"3. ``\exists 0 \in \mathcal{V}`` such that ``x + 0 = x,``
+Main.indentation * raw"3. ``\exists 0 \in \mathcal{V}\text{such that }x + 0 = x,``
 " * 
-Main.indentation * raw"4. ``\exists -x \in \mathcal{V}`` such that ``x + (-x) = 0,``
+Main.indentation * raw"4. ``\exists -x \in \mathcal{V}\text{ such that }x + (-x) = 0,``
 " * 
 Main.indentation * raw"5. ``a(ax) = (ab)x,``
 " * 

From f905e41ef243282344a8516600f4313ca0286ba9 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:53:30 +0200
Subject: [PATCH 067/101] Changed size of figure.

---
 docs/src/optimizers/manifold_related/retractions.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index 3354e887c..f390843b0 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -132,8 +132,8 @@ end # hide
 fig_light = make_plot(; theme = :light)[1] # hide
 fig_dark = make_plot(; theme = :dark)[1] # hide
 
-save("retraction_comparison.png",        alpha_colorbuffer(fig_light)) #; px_per_unit = 1.5) # hide
-save("retraction_comparison_dark.png",   alpha_colorbuffer(fig_dark)) #; px_per_unit = 1.5) # hide
+save("retraction_comparison.png",        alpha_colorbuffer(fig_light)) # hide
+save("retraction_comparison_dark.png",   alpha_colorbuffer(fig_dark)) # hide
 
 Main.include_graphics("retraction_comparison"; caption = raw"Comparison between the geodesic and the Cayley retraction.", width = .8) # hide
 ```
@@ -170,8 +170,8 @@ discrepancies = [norm(Y_geo_inc - Y_cay_inc) for (Y_geo_inc, Y_cay_inc, _) in zi
 fig_light = plot_discrepancies(discrepancies; theme = :light)[1] # hide
 fig_dark = plot_discrepancies(discrepancies; theme = :dark)[1] # hide
 
-save("retraction_discrepancy.png",        fig_light; px_per_unit = 1.5) # hide
-save("retraction_discrepancy_dark.png",   fig_dark; px_per_unit = 1.5) # hide
+save("retraction_discrepancy.png",        fig_light; px_per_unit = 1.3) # hide
+save("retraction_discrepancy_dark.png",   fig_dark; px_per_unit = 1.3) # hide
 
 Main.include_graphics("retraction_discrepancy"; caption = raw"Discrepancy between the geodesic and the Cayley retraction.", width = .6) # hide
 ```

From c44b5b9bd83810b0db178a97fdc022bf88f8689b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:55:55 +0200
Subject: [PATCH 068/101] Improved presentation of optimizer results slightly.

---
 docs/src/optimizers/optimizer_methods.md | 47 ++++++++++++++++++++----
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/docs/src/optimizers/optimizer_methods.md b/docs/src/optimizers/optimizer_methods.md
index 9d073e2d7..dcd2c22fc 100644
--- a/docs/src/optimizers/optimizer_methods.md
+++ b/docs/src/optimizers/optimizer_methods.md
@@ -14,7 +14,9 @@ It simply does:
 \mathrm{weight} \leftarrow \mathrm{weight} + (-\eta\cdot\mathrm{gradient}),
 ```
 
-where addition has to be replaced with appropriate operations in the manifold case.
+where addition has to be replaced with appropriate operations in the manifold case[^2].
+
+[^2]: In the manifold case the expression ``-\eta\cdot\mathrm{gradient}`` is an element of the [global tangent space](@ref "Global Tangent Spaces") ``\mathfrak{g}^\mathrm{hor}`` and a retraction maps from ``\mathfrak{g}^\mathrm{hor}``. We then still have to compose it with the [updated global section](@ref "Parallel Transport") ``\Lamda^{(t)}``.
 
 When calling [`GradientOptimizer`](@ref) we can specify a learning rate ``\eta`` (or use the default).
 
@@ -39,7 +41,7 @@ If we operate on a derivative with [`update!`](@ref) this will compute a *final
 dx = (A = one(weight.A), )
 update!(o, o.cache, dx)
 
-dx
+dx.A
 ```
 
 So what has happened here is that the gradient `dx` was simply multiplied with ``-\eta`` as the cache of the gradient optimizer is trivial.
@@ -49,7 +51,7 @@ So what has happened here is that the gradient `dx` was simply multiplied with `
 The momentum optimizer is similar to the gradient optimizer but further stores past information as *first moments*. We let these first moments *decay* with a *decay parameter* ``\alpha``:
 
 ```math
-\mathrm{weights} \leftarrow weights + (\alpha\cdot\mathrm{moment} - \eta\cdot\mathrm{gradient}),
+\mathrm{weights} \leftarrow \mathrm{weights} + (\alpha\cdot\mathrm{moment} - \eta\cdot\mathrm{gradient}),
 ```
 
 where addition has to be replaced with appropriate operations in the manifold case.
@@ -61,7 +63,7 @@ const α = 0.5
 method = MomentumOptimizer(η, α)
 o = Optimizer(method, weight)
 
-o.cache
+o.cache.A # the cache is stored for each array in `weight` (which is a `NamedTuple`)
 ```
 
 But as the cache is initialized with zeros it will lead to the same result as the gradient optimizer in the first iteration:
@@ -85,7 +87,7 @@ If we have weights on manifolds calling [`Optimizer`](@ref) will automatically a
 ```@example optimizer_methods
 weight = (Y = rand(StiefelManifold, 10, 5), )
 
-Optimizer(method, weight).cache
+Optimizer(method, weight).cache.Y
 ```
 
 ## The Adam Optimizer 
@@ -100,7 +102,7 @@ where ``\odot:\mathbb{R}^n\times\mathbb{R}^n\to\mathbb{R}^n`` is the *Hadamard p
 * ``W_t\gets -\eta{}B_1/\sqrt{B_2 + \delta},``
 * ``Y^{(t+1)} \gets Y^{(t)} + W^{(t)},``
 
-where the last addition has to be replaced with appropriate operations when dealing with manifolds. Further ``\eta`` is the *learning rate* and ``\delta`` is a small constant that is added for stability. The division, square root and addition in step 3 are performed element-wise.
+where the last addition has to be replaced with appropriate operations when dealing with manifolds. Further ``\eta`` is the *learning rate* and ``\delta`` is a small constant that is added for stability. The division, square root and addition in the computation of ``W_t`` are performed element-wise.
 
 In the following we show a schematic update that Adam performs for the case when no elements are on manifolds (also compare this figure with the [general optimization framework](@ref "Generalization to Homogeneous Spaces")):
 
@@ -108,16 +110,45 @@ In the following we show a schematic update that Adam performs for the case when
 Main.include_graphics("../tikz/adam_optimizer") # hide
 ```
 
+We demonstrate the Adam cache on the same example from before:
+```@example optimizer_methods
+const ρ₁ = 0.9
+const ρ₂ = 0.99
+const δ = 1e-8
+
+method = AdamOptimizer(η, ρ₁, ρ₂, δ)
+o = Optimizer(method, weight)
+
+o.cache.A
+```
+
 ### Weights on manifolds 
 
-The problem with generalizing Adam to manifolds is that the Hadamard product ``\odot`` as well as the other element-wise operations (``/``, ``\sqrt{}`` and ``+`` in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces").  
+The problem with generalizing Adam to manifolds is that the Hadamard product ``\odot`` as well as the other element-wise operations (``/``, ``\sqrt{}`` and ``+`` in step 3 above) lack a clear geometric interpretation. In `GeometricMachineLearning` we get around this issue by utilizing a so-called [global tangent space representation](@ref "Global Tangent Spaces"). A similar approach is shown in [kong2023momentum](@cite).
 
 ```@eval
-Main.remark(raw"The optimization framework presented here manages to generalize the Adam optimizer to manifolds without knowing an underlying differential equation. From a mathematical perspective this is not really satisfactory because we would ideally want the optimizers to emerge as a discretization of a differential equation as in the case of the gradient and the momentum optimizer to better interpret them. A similar attempt to generalize Adam to the Stiefel manifold was made in [kong2023momentum](@cite).")
+Main.remark(raw"The optimization framework presented here manages to generalize the Adam optimizer to manifolds without knowing an underlying differential equation. From a mathematical perspective this is not really satisfactory because we would ideally want the optimizers to emerge as a discretization of a differential equation as in the case of the gradient and the momentum optimizer to better interpret them.")
 ```
 
 ## The Adam Optimizer with Decay
+The Adam optimizer with decay is similar to the standard Adam optimizer with the difference that the learning rate ``\eta`` decays exponentially. We start with a relatively high learning rate ``\eta_1`` (e.g. ``10^{-2}``) and end with a low learning rate ``\eta_2`` (e.g. ``10^{-8}``). If we want to use this optimizer we have to tell it beforehand how many epochs we train for such that it can adjust the learning rate decay accordingly:
+
+```@example optimizer_methods
+const η₁ = 1e-2 
+const η₂ = 1e-6
+const n_epochs = 1000 
+
+method = AdamOptimizerWithDecay(n_epochs, η₁, η₂, ρ₁, ρ₂, δ)
+o = Optimizer(method, weight)
+
+nothing # hide
+```
+ 
+ The cache is however exactly the same as for the Adam optimizer:
 
+```@example
+    o.cache.A
+```
 
 ## Library Functions
 

From 8f4630a6fd1739e9387b14181e337c2447ea2955 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:56:18 +0200
Subject: [PATCH 069/101] Put E into right environment.

---
 src/arrays/stiefel_lie_algebra_horizontal.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arrays/stiefel_lie_algebra_horizontal.jl b/src/arrays/stiefel_lie_algebra_horizontal.jl
index 82af6cf5b..f60473572 100644
--- a/src/arrays/stiefel_lie_algebra_horizontal.jl
+++ b/src/arrays/stiefel_lie_algebra_horizontal.jl
@@ -8,7 +8,7 @@ The projection here is: ``\pi:S \to SE`` where
 ```math
 E = \begin{pmatrix} \mathbb{I}_{n} \\ \mathbb{O}_{(N-n)\times{}n}  \end{pmatrix}.
 ```
-The matrix \(E\) is implemented under [`StiefelProjection`](@ref) in `GeometricMachineLearning`.
+The matrix ``E`` is implemented under [`StiefelProjection`](@ref) in `GeometricMachineLearning`.
 
 An element of StiefelLieAlgMatrix takes the form: 
 ```math

From 8fe2833fbd0ad952df59237b3721dd34ce86944d Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:56:43 +0200
Subject: [PATCH 070/101] Added docstring.

---
 ...adam_optimizer_with_learning_rate_decay.jl | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/optimizers/adam_optimizer_with_learning_rate_decay.jl b/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
index d765022f3..40443dc2d 100644
--- a/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
+++ b/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
@@ -1,20 +1,17 @@
 @doc raw"""
-Defines the Adam Optimizer with weight decay.
+    AdamOptimizerWithDecay(n_epochs, η₁=1f-2, η₂=1f-6, ρ₁=9f-1, ρ₂=9.9f-1, δ=1f-8)
 
-### Constructors
-The default constructor takes as input: 
-- `n_epochs::Int`
-- `η₁`: the learning rate at the start 
-- `η₂`: the learning rate at the end 
-- `ρ₁`: the decay parameter for the first moment 
-- `ρ₂`: the decay parameter for the second moment
-- `δ`: the safety parameter 
-- `T` (keyword argument): the type. 
+Make an instance of the Adam Optimizer with weight decay.
 
-The second constructor is called with: 
-- `n_epochs::Int`
-- `T`
-... the rest are keyword arguments
+All except the first argument (the number of epochs) have defaults.
+
+The difference to the standard [`AdamOptimizer`](@ref) is that we change the learning reate `η` in each step.
+We start with a relatively high value `η₁` and then exponentially decrease it until we reach `η₂` with
+
+```math
+ \eta = \gamma^t\eta_1,
+```
+where ``\gamma = \exp(\log(\eta_1 / \eta_2) / \mathtt{n\_epochs}).``
 """
 struct AdamOptimizerWithDecay{T<:Real} <: OptimizerMethod
     η₁::T

From 0d9fe2642105553e4e085fb143868b10b4497e3b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 10:57:10 +0200
Subject: [PATCH 071/101] Added display for Caches.

---
 src/optimizers/optimizer_caches.jl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/optimizers/optimizer_caches.jl b/src/optimizers/optimizer_caches.jl
index 0bd84518c..6df668d02 100644
--- a/src/optimizers/optimizer_caches.jl
+++ b/src/optimizers/optimizer_caches.jl
@@ -47,6 +47,13 @@ struct AdamCache{T, AT <: AbstractArray{T}} <: AbstractCache{T}
     end
 end
 
+function Base.display(C::AdamCache)
+    println(raw"`MomentumCache` that currently stores `B₁` as ...")
+    display(C.B₁)
+    println(raw"and `B₂` as ...")
+    display(C.B₂)
+end
+
 @doc raw"""
     MomentumCache(Y)
 
@@ -65,6 +72,11 @@ struct MomentumCache{T, AT <: AbstractArray{T}} <:AbstractCache{T}
     end
 end
 
+function Base.display(C::MomentumCache)
+    println(raw"`MomentumCache` that currently stores `B`as  ...")
+    display(C.B)
+end
+
 @doc raw"""
     GradientCache(Y)
 

From 3667eb6bdf7bd8acc7cc9f2879e4606a9a201e10 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 16:23:40 +0200
Subject: [PATCH 072/101] Finished tutorial.

---
 .../tutorials/adjusting_the_loss_function.md  | 100 +++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/docs/src/tutorials/adjusting_the_loss_function.md b/docs/src/tutorials/adjusting_the_loss_function.md
index 51d06f962..670d4c937 100644
--- a/docs/src/tutorials/adjusting_the_loss_function.md
+++ b/docs/src/tutorials/adjusting_the_loss_function.md
@@ -1,12 +1,110 @@
 # Adjusting the Loss Function
 
 `GeometricMachineLearning` provides a few standard loss function that are used as defaults for specific neural networks:
+* [`FeedForwardLoss`](@ref)
+* [`AutoencoderLoss`](@ref)
+* [`TransformerLoss`](@ref)
 
+If these standard losses do not satisfy the user's needs, it is very easy to implement custom loss functions. We again consider training a SympNet on the data coming from a pendulum:
+
+```@example change_loss
+using GeometricMachineLearning
+using GeometricIntegrators: integrate, ImplicitMidpoint
+using GeometricProblems.HarmonicOscillator: hodeproblem
+import Random
+Random.seed!(123)
+
+data = integrate(hodeproblem(; tspan = 100), ImplicitMidpoint()) |> DataLoader
+
+nn = NeuralNetwork(GSympNet(2))
+
+o = Optimizer(AdamOptimizer(), nn)
+
+batch = Batch(32)
+
+n_epochs = 30
+
+loss = FeedForwardLoss()
+
+loss_array = o(nn, data, batch, n_epochs, loss)
+
+print(loss_array[end])
+```
+
+And we see that the loss goes down to a very low value. But the user might want to constrain the norm of the network parameters:
+
+```@example change_loss
+using LinearAlgebra: norm
+
+# norm of parameters for single layer
+network_parameter_norm(params::NamedTuple) = sum([norm(params[i]) for i in 1:length(params)])
+# norm of parameters for entire network
+network_parameter_norm(params) = sum([network_parameter_norm(param) for param in params])
+
+network_parameter_norm(nn.params)
+```
+
+We now implement a custom loss such that:
+
+```math
+    \mathrm{loss}_\mathcal{NN}^\mathrm{custom}(\mathrm{input}, \mathrm{output}) = \mathrm{loss}_\mathcal{NN}^\mathrm{feedforward} + \lambda \mathrm{norm}(\mathcal{NN}\mathtt{.params}).
+```
+
+```@example change_loss
+struct CustomLoss <: GeometricMachineLearning.NetworkLoss end
+
+function (loss::CustomLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {AT<:AbstractArray, CT<:@NamedTuple{q::AT, p::AT}}
+    FeedForwardLoss()(model, params, input, output) + .1 * network_parameter_norm(params)
+end
+
+loss = CustomLoss()
+
+nn_custom = NeuralNetwork(GSympNet(2))
+
+loss_array = o(nn_custom, data, batch, n_epochs, loss)
+
+print(loss_array[end])
+```
+
+And we see that the norm of the parameters is a lot lower:
+
+```@example change_loss
+network_parameter_norm(nn_custom.params)
+```
+
+We can also compare the solutions of the two networks:
+
+```@example change_loss
+using CairoMakie
+
+fig = Figure()
+ax = Axis(fig[1, 1]; backgroundcolor = :transparent)
+
+init_con = [0.5 0.]
+n_time_steps = 100
+prediction1 = zeros(2, n_time_steps + 1)
+prediction2 = zeros(2, n_time_steps + 1)
+prediction1[:, 1] = init_con
+prediction2[:, 1] = init_con
+
+for i in 2:(n_time_steps + 1)
+    prediction1[:, i] = nn(prediction1[:, i - 1])
+    prediction2[:, i] = nn_custom(prediction2[:, i - 1])
+end
+
+lines!(ax, data.input.q[:], data.input.p[:], label = "Training Data")
+lines!(ax, prediction1[1, :], prediction1[2, :], label = "FeedForwardLoss")
+lines!(ax, prediction2[1, :], prediction2[2, :], label = "CustomLoss")
+text_color = :white # hide
+axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) # hide
+
+fig
+```
 
 ## Library Functions
 
 ```@docs; canonical = false
-NetworkLoss
+GeometricMachineLearning.NetworkLoss
 FeedForwardLoss
 AutoencoderLoss
 TransformerLoss

From 5f764bacacd718f85a2c0f1fb480cddaaa51abed Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 16:45:32 +0200
Subject: [PATCH 073/101] ... operation doesn't work if we don't have a tuple.

---
 src/data_loader/optimize.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/data_loader/optimize.jl b/src/data_loader/optimize.jl
index d73783392..924c4b1b7 100644
--- a/src/data_loader/optimize.jl
+++ b/src/data_loader/optimize.jl
@@ -23,7 +23,11 @@ function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTu
         count += 1
         # these `copy`s should not be necessary! coming from a Zygote problem!
         input_nt_output_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
-        loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt_output_nt...), ps)
+        if typeof(input_nt_output_nt) <: Tuple
+            loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt_output_nt...), ps)
+        else
+            loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt_output_nt), ps)
+        end
         total_error += loss_value
         dp = pullback(one(loss_value))[1]
         optimization_step!(opt, λY, ps, dp)

From b1df4996633f36b4ff477c1387e951eeb6cf61b6 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 5 Jun 2024 16:46:29 +0200
Subject: [PATCH 074/101] Removed unnecessary dl from tests.

---
 test/psd_architecture_tests.jl       | 2 --
 test/symplectic_autoencoder_tests.jl | 1 -
 2 files changed, 3 deletions(-)

diff --git a/test/psd_architecture_tests.jl b/test/psd_architecture_tests.jl
index b54579a19..bdc4800ae 100644
--- a/test/psd_architecture_tests.jl
+++ b/test/psd_architecture_tests.jl
@@ -15,8 +15,6 @@ function test_accuracy(N::Integer, n::Integer; tol::Real = .35)
 end
 
 function test_encoder_and_decoder(N::Integer, n::Integer)
-    dl = DataLoader(rand(N, 10 * N); autoencoder = true)
-
     psd_nn = NeuralNetwork(PSDArch(N, n))
     psd_encoder = encoder(psd_nn)
     psd_decoder = decoder(psd_nn)
diff --git a/test/symplectic_autoencoder_tests.jl b/test/symplectic_autoencoder_tests.jl
index ef481741e..4b83dd60c 100644
--- a/test/symplectic_autoencoder_tests.jl
+++ b/test/symplectic_autoencoder_tests.jl
@@ -17,7 +17,6 @@ function test_accuracy(N::Integer, n::Integer; tol::Real = .35, n_epochs::Intege
 end
 
 function test_encoder_and_decoder(N::Integer, n::Integer)
-    dl = DataLoader(rand(N, 10 * N); autoencoder = true)
 
     sae_nn = NeuralNetwork(SymplecticAutoencoder(N, n))
     sae_encoder = encoder(sae_nn)

From 20e9a9b35418cd288a996e39986eeb36459fa702 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:39:04 +0200
Subject: [PATCH 075/101] Copying all figures into build folder now.

---
 docs/Makefile | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 7ee53a4af..74e9ec2f0 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -11,7 +11,7 @@ latex: latex_no_pdf
 	$(MAKE) compile_tex;
 	$(MAKE) compile_tex
 
-latex_no_pdf_no_images: install_brenier_two_fluid latex_docs_no_pdf put_figures_outside_of_minted_environment do_correct_quotation_marks make_correct_thrm_and_dfntn_and_xmpl_and_proof_environment
+latex_no_pdf_no_images: install_brenier_two_fluid latex_docs_no_pdf copy_png_files put_figures_outside_of_minted_environment do_correct_quotation_marks make_correct_thrm_and_dfntn_and_xmpl_and_rmrk_and_proof_environment
 
 latex_no_pdf: latex_images latex_no_pdf_no_images 
 
@@ -74,9 +74,6 @@ put_figures_outside_of_minted_environment:
 	sed -i'' -e '/DeleteThisAndTheLineAfter/d' build/G*.tex;
 	sed -i'' -e 's/\\\\texttt/\\texttt/g' build/G*.tex;
 	sed -i'' -e 's/\\\\_/\\_/g' build/G*.tex;
-	sed -i'' -e 's/tangent_space.png/manifolds\/tangent_space.png/g' build/G*.tex;
-	sed -i'' -e 's/retraction_comparison.png/optimizers\/manifold_related\/retraction_comparison.png/g' build/G*.tex;
-	sed -i'' -e 's/retraction_discrepancy.png/optimizers\/manifold_related\/retraction_discrepancy.png/g' build/G*.tex;
 
 make_correct_thrm_and_dfntn_and_xmpl_and_rmrk_and_proof_environment:
 	sed -i'' -e 's/{\\textbackslash}begin\\{thrm\\}/\\begin{thrm}/g' build/G*.tex;
@@ -96,4 +93,9 @@ make_correct_thrm_and_dfntn_and_xmpl_and_rmrk_and_proof_environment:
 
 do_correct_quotation_marks:
 	sed -i'' -e 's/{\\textquotedbl}/"/g' build/G*.tex;
-	sed -i'' -e 's/ "/ ``/g' build/G*.tex
\ No newline at end of file
+	sed -i'' -e 's/ "/ ``/g' build/G*.tex
+
+copy_png_files:
+	find build/manifolds -name \*.png -exec cp {} build \; ;
+	find build/optimizers/manifold_related -name \*.png -exec cp {} build \; ;
+	find build/tutorials -name \*.png -exec cp {} build \;
\ No newline at end of file

From fff59aaad2d91f33855eac1214433445f08325ef Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:39:27 +0200
Subject: [PATCH 076/101] Added reference for the BFGS optimizer.

---
 docs/src/GeometricMachineLearning.bib | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index 5150d1143..46fd1701b 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -433,3 +433,10 @@ @article{kong2023momentum
   journal={arXiv preprint arXiv:2205.14173v3},
   year={2023}
 }
+
+@MISC{2279304,
+    TITLE = {Quasi-newton methods: Understanding DFP updating formula},
+    AUTHOR = {A.G. (https://math.stackexchange.com/users/253273/a-\%ce\%93)},
+    HOWPUBLISHED = {Mathematics Stack Exchange},
+    NOTE = {URL:https://math.stackexchange.com/q/2279304 (version: 2017-05-13)}
+}
\ No newline at end of file

From 96ae92c70af8ce5ec506dbfc6925a2c26f74048d Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:39:59 +0200
Subject: [PATCH 077/101] Fixed some typos and put definitions, theorems and
 proofs into the correct environment.

---
 docs/src/optimizers/bfgs_optimizer.md | 70 +++++++++++++++++++--------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/docs/src/optimizers/bfgs_optimizer.md b/docs/src/optimizers/bfgs_optimizer.md
index 8b876d121..14645fce0 100644
--- a/docs/src/optimizers/bfgs_optimizer.md
+++ b/docs/src/optimizers/bfgs_optimizer.md
@@ -1,6 +1,6 @@
 # The BFGS Optimizer
 
-The presentation shown here is largely taken from [wright2006numerical; chapters 3 and 6](@cite) with a derivation based on an [online comment](https://math.stackexchange.com/questions/2091867/quasi-newton-methods-understanding-dfp-updating-formula). The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.
+The presentation shown here is largely taken from [wright2006numerical; chapters 3 and 6](@cite) with a derivation based on an online comment [2279304](@cite). The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.
 
 It is a version of a *quasi-Newton* method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:
 ```math
@@ -21,21 +21,21 @@ x_{k+1} = x_k + \alpha_kp_k,
 where ``\alpha_k`` is the *step length*. Techniques that describe how to pick an appropriate ``\alpha_k`` are called *line-search methods* and are discussed below. First we discuss what requirements we impose on ``B_k``. A first reasonable condition would be to require the gradient of ``m_k`` to be equal to that of ``f`` at the points ``x_{k-1}`` and ``x_k``: 
 ```math
 \begin{aligned}
-\nabla_{x_k}m_k  & = \nabla_{x_k}f + B_k(x_k - x_k)  & \overset{!}{=} \nabla_{x_k}f \text{ and } \\
-\nabla_{x_{k-1}}m_k & = \nabla{x_k}f + B_k(x_{k-1} - x_k) & \overset{!}{=} \nabla_{x_{k-1}}f.
+\nabla_{x_k}m_k  & = \nabla_{x_k}f + B_k(x_k - x_k)  & \overset{!}{=} & \nabla_{x_k}f \text{ and } \\
+\nabla_{x_{k-1}}m_k & = \nabla_{x_k}f + B_k(x_{k-1} - x_k) & \overset{!}{=} & \nabla_{x_{k-1}}f.
 \end{aligned}
 ```
-The first one of these conditions is of course automatically satisfied. The second one can be rewritten as: 
+The first one of these conditions is automatically satisfied. The second one can be rewritten as: 
 ```math
-B_k(x_k - x_{k-1}) = \overset{!}{=} \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
+B_k(x_k - x_{k-1}) \overset{!}{=} \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
 ```
 
 The following notations are often used: 
 ```math
-s_{k-1} := \alpha_{k-1}p_{k-1} :=  x_{k} - x_{k-1} \text{ and } y_{k-1} := \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
+s_{k-1} := \frac{p_{k-1}}{\alpha_{k-1}} :=  x_{k} - x_{k-1} \quad\text{ and }\quad y_{k-1} := \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
 ```
 
-The conditions mentioned above then becomes: 
+The condition mentioned above then becomes: 
 ```math
 B_ks_{k-1} \overset{!}{=} y_{k-1},
 ```
@@ -43,23 +43,45 @@ and we call it the *secant equation*. A second condition we impose on ``B_k`` is
 ```math
 s_{k-1}^Ty_{k-1} > 0.
 ```
-This is referred to as the *curvature condition*. If we impose the *Wolfe conditions*, the *curvature condition* hold automatically. The Wolfe conditions are stated with respect to the parameter ``\alpha_k``.
+This is referred to as the *curvature condition*. If we impose the *Wolfe conditions*, the *curvature condition* holds automatically. The Wolfe conditions are stated with respect to the parameter ``\alpha_k``.
 
-The *Wolfe conditions* are:
-1. ``f(x_k+\alpha{}p_k)\leq{}f(x_k) + c_1\alpha(\nabla_{x_k}f)^Tp_k`` for ``c_1\in(0,1)``.
-2. ``(\nabla_{(x_k + \alpha_kp_k)}f)^Tp_k \geq c_2(\nabla_{x_k}f)^Tp_k`` for ``c_2\in(c_1,1)``.
+```@eval
+Main.definition(raw"The **Wolfe conditions** are:
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\begin{aligned}
+" * Main.indentation * raw"f(x_k+\alpha_kp_k) & \leq{}f(x_k) + c_1\alpha_k(\nabla_{x_k}f)^Tp_k & \text{ for } & c_1\in(0,1) \quad\text{and} \\
+" * Main.indentation * raw"(\nabla_{(x_k + \alpha_kp_k)}f)^Tp_k & \geq c_2(\nabla_{x_k}f)^Tp_k & \text{ for } & c_2\in(c_1,1).
+" * Main.indentation * raw"\end{aligned}
+" * Main.indentation * raw"```
+" * Main.indentation * raw"The two Wolfe conditions above are respectively called the *sufficient decrease condition* and the *curvature condition* respectively.")
+```
 
-A possible choice for ``c_1`` and ``c_2`` are ``10^{-4}`` and ``0.9`` (see [wright2006numerical](@cite)). The two Wolfe conditions above are respectively called the *sufficient decrease condition* and the *curvature condition* respectively. Note that the second Wolfe condition (also called curvature condition) is stronger than the one mentioned before under the assumption that the first Wolfe condition is true:
-```math
-(\nabla_{x_k}f)^Tp_{k-1} - c_2(\nabla_{x_{k-1}}f)^Tp_{k-1} = y_{k-1}^Tp_{k-1} + (1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1} \geq 0,
+A possible choice for ``c_1`` and ``c_2`` are ``10^{-4}`` and ``0.9`` [wright2006numerical](@cite). 
+
+We further have:
+
+```@eval
+Main.theorem(raw"The second Wolfe condition, also called curvature condition, is stronger than the curvature condition mentioned before under the assumption that the first Wolfe condition is true and ``f(x_{k+1}) < f(x_k)``.")
+```
+
+```@eval
+Main.proof(raw"We use the second Wolfe condition to write
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"(\nabla_{x_k}f)^Tp_{k-1} - c_2(\nabla_{x_{k-1}}f)^Tp_{k-1} = y_{k-1}^Tp_{k-1} + (1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1} \geq 0,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"and we can apply the first Wolfe condition on the second term in this expression: 
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"(1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1}\geq\frac{1-c_2}{c_1\alpha_{k-1}}(f(x_k) - f(x_{k-1})),
+" * Main.indentation * raw"```
+" * Main.indentation * raw"which is negative if the value of ``f`` is decreasing.")
 ```
-and the second term in this expression is ``(1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1}\geq\frac{1-c_2}{c_1\alpha_{k-1}}(f(x_k) - f(x_{k-1}))``, which is negative. 
 
 In order to pick the ideal ``B_k`` we solve the following problem: 
 ```math
 \begin{aligned}
-\min_B & ||B - B_{k-1}||_W \\ 
-\text{s.t.} & B  = B^T\text{ and }Bs_{k-1}=y_{k-1},
+\min_B & ||B & - B_{k-1}||_W \\ 
+\text{s.t.} & B  & = B^T\quad\text{and}\\
+            & Bs_{k-1} & = y_{k-1},
 \end{aligned}
 ```
 where the first condition is symmetry and the second one is the secant equation. For the norm ``||\cdot||_W`` we pick the weighted Frobenius norm:
@@ -82,13 +104,14 @@ With this notation we can rewrite the problem of finding ``B_k`` as:
 ```math
 \begin{aligned}
 \min_{\tilde{B}} & ||\tilde{B} - \tilde{B}_{k-1}||_F \\ 
-\text{s.t.} & \tilde{B} = \tilde{B}^T\text{ and }\tilde{B}\tilde{s}_{k-1}=\tilde{y}_{k-1}.
+\text{s.t.}\quad & \tilde{B} = \tilde{B}^T\quad\text{and}\\
+            &\tilde{B}\tilde{s}_{k-1}=\tilde{y}_{k-1}.
 \end{aligned}
 ```
 
 We further have ``Wy_{k-1} = s_{k-1}`` (by the mean value theorem ?) and therefore ``\tilde{y}_{k-1} = \tilde{s}_{k-1}``.
 
-Now we rewrite ``B`` and ``B_{k-1}`` in a new basis ``U = [u|u_\perp]``, where ``u := \tilde{s}_{k-1}/||\tilde{s}_{k-1}||`` and ``u_perp`` is an orthogonal complement[^2] of ``u``:
+Now we rewrite ``B`` and ``B_{k-1}`` in a new basis ``U = [u|u_\perp]``, where ``u := \tilde{s}_{k-1}/||\tilde{s}_{k-1}||`` and ``u_\perp`` is an orthogonal complement[^2] of ``u``:
 
 [^2]: So we must have ``u^Tu_\perp=0`` and further ``u_\perp^Tu_\perp=\mathbb{I}``.
 
@@ -118,7 +141,7 @@ B_k = (\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}s_{k-1}^T)B_{k-1}(\mathbb{I
 
 What we need in practice however is not ``B_k``, but its inverse ``H_k``. This is because we need to find ``s_{k-1}`` based on ``y_{k-1}``.  To get ``H_k`` based on the expression for ``B_k`` above we can use the *Sherman-Morrison-Woodbury formula*[^3] to obtain:
 
-[^3]: The *Sherman-Morrison-Woodbury formula* states ``(A + UCV)^{-1} = A^{-1} - A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}``.
+[^3]: The *Sherman-Morrison-Woodbury formula* states ``(A + UCV)^{-1} = A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}``.
 
 ```math
 H_{k} = H_{k-1} - \frac{H_{k-1}y_{k-1}y_{k-1}^TH_{k-1}}{y_{k-1}^TH_{k-1}y_{k-1}} + \frac{s_{k-1}s_{k-1}^T}{y_{k-1}^Ts_{k-1}}.
@@ -127,6 +150,12 @@ H_{k} = H_{k-1} - \frac{H_{k-1}y_{k-1}y_{k-1}^TH_{k-1}}{y_{k-1}^TH_{k-1}y_{k-1}}
 
 TODO: Example where this works well!
 
+## Library Functions
+
+```@docs; canonical=false
+BFGSOptimizer
+```
+
 ## References 
 
 ```@bibliography
@@ -134,4 +163,5 @@ Pages = []
 Canonical = false 
 
 wright2006numerical
+2279304
 ```
\ No newline at end of file

From b7c424e23da8e8a69814f70b8e8ed7cf2abedb06 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:41:41 +0200
Subject: [PATCH 078/101] Fixed references and dark mode image.

---
 .../tutorials/adjusting_the_loss_function.md  | 38 +++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/docs/src/tutorials/adjusting_the_loss_function.md b/docs/src/tutorials/adjusting_the_loss_function.md
index 670d4c937..58dff4f82 100644
--- a/docs/src/tutorials/adjusting_the_loss_function.md
+++ b/docs/src/tutorials/adjusting_the_loss_function.md
@@ -2,7 +2,7 @@
 
 `GeometricMachineLearning` provides a few standard loss function that are used as defaults for specific neural networks:
 * [`FeedForwardLoss`](@ref)
-* [`AutoencoderLoss`](@ref)
+* [`AutoEncoderLoss`](@ref)
 * [`TransformerLoss`](@ref)
 
 If these standard losses do not satisfy the user's needs, it is very easy to implement custom loss functions. We again consider training a SympNet on the data coming from a pendulum:
@@ -53,7 +53,10 @@ We now implement a custom loss such that:
 ```@example change_loss
 struct CustomLoss <: GeometricMachineLearning.NetworkLoss end
 
-function (loss::CustomLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {AT<:AbstractArray, CT<:@NamedTuple{q::AT, p::AT}}
+function (loss::CustomLoss)(model::Chain, params::Tuple, input::CT, output::CT) where {
+                                                            AT<:AbstractArray, 
+                                                            CT<:@NamedTuple{q::AT, p::AT}
+                                                            }
     FeedForwardLoss()(model, params, input, output) + .1 * network_parameter_norm(params)
 end
 
@@ -77,8 +80,16 @@ We can also compare the solutions of the two networks:
 ```@example change_loss
 using CairoMakie
 
-fig = Figure()
-ax = Axis(fig[1, 1]; backgroundcolor = :transparent)
+function make_fig(; theme = :dark, size = (450, 338)) # hide
+textcolor = theme == :dark ? :white : :black # hide
+fig = Figure(; backgroundcolor = :transparent)
+ax = Axis(fig[1, 1]; backgroundcolor = :transparent, 
+    bottomspinecolor = textcolor, 
+    topspinecolor = textcolor,
+    leftspinecolor = textcolor,
+    rightspinecolor = textcolor,
+    xtickcolor = textcolor, 
+    ytickcolor = textcolor)
 
 init_con = [0.5 0.]
 n_time_steps = 100
@@ -92,20 +103,25 @@ for i in 2:(n_time_steps + 1)
     prediction2[:, i] = nn_custom(prediction2[:, i - 1])
 end
 
-lines!(ax, data.input.q[:], data.input.p[:], label = "Training Data")
-lines!(ax, prediction1[1, :], prediction1[2, :], label = "FeedForwardLoss")
-lines!(ax, prediction2[1, :], prediction2[2, :], label = "CustomLoss")
+lines!(ax, data.input.q[:], data.input.p[:], label = rich("Training Data"; color = textcolor))
+lines!(ax, prediction1[1, :], prediction1[2, :], label = rich("FeedForwardLoss"; color = textcolor))
+lines!(ax, prediction2[1, :], prediction2[2, :], label = rich("CustomLoss"; color = textcolor))
 text_color = :white # hide
-axislegend(; position = (.82, .75), backgroundcolor = :transparent, color = text_color) # hide
+axislegend(; position = (.82, .75), backgroundcolor = :transparent) # hide
 
 fig
+end # hide
+ # hide
+save("compare_losses.png", make_fig(; theme = :light)) # hide
+save("compare_losses_dark.png", make_fig(; theme = :dark)) # hide
+Main.include_graphics("compare_losses") # hide
 ```
 
 ## Library Functions
 
 ```@docs; canonical = false
 GeometricMachineLearning.NetworkLoss
-FeedForwardLoss
-AutoencoderLoss
-TransformerLoss
+GeometricMachineLearning.FeedForwardLoss
+GeometricMachineLearning.AutoEncoderLoss
+GeometricMachineLearning.TransformerLoss
 ```
\ No newline at end of file

From b9b5e25c60e37bb1a4ad2a22d1dda40f58d4804b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:42:03 +0200
Subject: [PATCH 079/101] Also exporting AutoEncoderLoss now.

---
 src/GeometricMachineLearning.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl
index 867a9c662..6e50a9140 100644
--- a/src/GeometricMachineLearning.jl
+++ b/src/GeometricMachineLearning.jl
@@ -246,7 +246,7 @@ module GeometricMachineLearning
     include("backends/backends.jl")
     include("backends/lux.jl")
 
-    export TransformerLoss, FeedForwardLoss
+    export TransformerLoss, FeedForwardLoss, AutoEncoderLoss
 
     #INCLUDE ARCHITECTURES
     include("architectures/neural_network_integrator.jl")

From ee647ba247c9a5e2673f197ea49f8eb13692b1d7 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:42:20 +0200
Subject: [PATCH 080/101] Fixed typo.

---
 src/layers/psd_like_layer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/layers/psd_like_layer.jl b/src/layers/psd_like_layer.jl
index 11d0606a1..0ef22c44c 100644
--- a/src/layers/psd_like_layer.jl
+++ b/src/layers/psd_like_layer.jl
@@ -23,7 +23,7 @@ end
 
 function parameterlength(::PSDLayer{M, N}) where {M, N}
     M2 = M ÷ 2 
-    N2 = M ÷ 2
+    N2 = N ÷ 2
     M2 * (N2 - (M2 + 1) ÷ 2)
 end 
 

From 5ca9ff01cc8da6725b0dade811a5107b2c08bad3 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:43:10 +0200
Subject: [PATCH 081/101] Equation environment -> code environment.

---
 src/loss/losses.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/loss/losses.jl b/src/loss/losses.jl
index f0ad96c5b..84bc48cec 100644
--- a/src/loss/losses.jl
+++ b/src/loss/losses.jl
@@ -1,6 +1,6 @@
 @doc raw"""
 An abstract type for all the neural network losses. 
-If you want to implement ``CustomLoss <: NetworkLoss`` you need to define a functor:
+If you want to implement `CustomLoss <: NetworkLoss` you need to define a functor:
 ```julia
     (loss::CustomLoss)(model, ps, input, output)
 ```

From db5ef87b743b7d8c83d28f95d4057507f41be256 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Fri, 7 Jun 2024 11:44:06 +0200
Subject: [PATCH 082/101] Finished docstrings for optimizer methods.

---
 src/optimizers/adam_optimizer.jl              | 27 +++++++++++++++++--
 ...adam_optimizer_with_learning_rate_decay.jl |  7 ++---
 src/optimizers/gradient_optimizer.jl          | 20 +++++++++++---
 src/optimizers/momentum_optimizer.jl          | 25 ++++++++++++++---
 4 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/src/optimizers/adam_optimizer.jl b/src/optimizers/adam_optimizer.jl
index 96c35a518..768fb71ed 100644
--- a/src/optimizers/adam_optimizer.jl
+++ b/src/optimizers/adam_optimizer.jl
@@ -1,6 +1,29 @@
 @doc raw"""
-Defines the Adam Optimizer.
-Algorithm and suggested defaults are taken from [goodfellow2016deep](@cite) (page 301).
+    AdamOptimizer(η, ρ₁, ρ₂, δ)
+
+Make an instance of the Adam Optimizer.
+
+Here the cache consists of first and second moments that are updated as 
+
+```math
+B_1 \gets ((\rho_1 - \rho_1^t)/(1 - \rho_1^t))\cdot{}B_1 + (1 - \rho_1)/(1 - \rho_1^t)\cdot{}\nabla{}L,
+```
+and
+
+```math
+B_2 \gets ((\rho_2 - \rho_1^t)/(1 - \rho_2^t))\cdot{}B_2 + (1 - \rho_2)/(1 - \rho_2^t)\cdot\nabla{}L\odot\nabla{}L.
+```
+The final velocity is computed as:
+
+```math
+\mathrm{velocity} \gets -\eta{}B_1/\sqrt{B_2 + \delta}.
+```
+
+# Implementation 
+
+The *velocity* is stored in the input to save memory.
+
+Algorithm and suggested defaults are taken from [goodfellow2016deep; page 301](@cite).
 """
 struct AdamOptimizer{T<:Real} <: OptimizerMethod
     η::T
diff --git a/src/optimizers/adam_optimizer_with_learning_rate_decay.jl b/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
index 40443dc2d..3cbb7a5ae 100644
--- a/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
+++ b/src/optimizers/adam_optimizer_with_learning_rate_decay.jl
@@ -5,11 +5,12 @@ Make an instance of the Adam Optimizer with weight decay.
 
 All except the first argument (the number of epochs) have defaults.
 
-The difference to the standard [`AdamOptimizer`](@ref) is that we change the learning reate `η` in each step.
-We start with a relatively high value `η₁` and then exponentially decrease it until we reach `η₂` with
+The difference to the standard [`AdamOptimizer`](@ref) is that we change the learning reate ``\eta`` in each step.
+Apart from the *time dependency* of ``\eta`` the two algorithms are however equivalent!
+``\eta(0)`` starts with a high value ``\eta_1`` and then exponentially decrease until it reaches ``\eta_2`` with
 
 ```math
- \eta = \gamma^t\eta_1,
+ \eta(t) = \gamma^t\eta_1,
 ```
 where ``\gamma = \exp(\log(\eta_1 / \eta_2) / \mathtt{n\_epochs}).``
 """
diff --git a/src/optimizers/gradient_optimizer.jl b/src/optimizers/gradient_optimizer.jl
index a481ef2bb..fc12ec2ae 100644
--- a/src/optimizers/gradient_optimizer.jl
+++ b/src/optimizers/gradient_optimizer.jl
@@ -1,6 +1,20 @@
-"""
-Define the Gradient optimizer, i.e. W ← W - η*∇f(W)
-Or the riemannian manifold equivalent, if applicable.
+@doc raw"""
+    GradientOptimizer(η)
+
+Make an instance of a gradient optimizer. 
+
+This is the simplest neural network optimizer. It has no cache and computes the final velocity as:
+```math
+    \mathrm{velocity} \gets - \eta\nabla_\mathrm{weight}L.
+```
+
+# Implementation
+
+The operations are done as memory efficiently as possible.
+This means the provided ``\nabla_WL`` is mutated via:
+```julia
+    rmul!(∇L, -method.η)
+```
 """
 struct GradientOptimizer{T<:Real} <: OptimizerMethod
     η::T
diff --git a/src/optimizers/momentum_optimizer.jl b/src/optimizers/momentum_optimizer.jl
index 64d638406..41bda7a82 100644
--- a/src/optimizers/momentum_optimizer.jl
+++ b/src/optimizers/momentum_optimizer.jl
@@ -1,8 +1,25 @@
-"""
-Define the Momentum optimizer, i.e. 
-V ← α*V - ∇f(W)
-W ← W + η*V
+@doc raw"""
+    MomentumOptimizer(η, α)
+
+Make an instance of the momentum optimizer.
+
+The momentum optimizer is similar to the [`GradientOptimizer`](@ref).
+It however has a nontrivial cache that stores past history (see [`MomentumCache`](@ref)).
+The cache is updated via:
+```math
+    B^{\mathrm{cache}} \gets \alpha{}B^{\mathrm{cache}} + \nabla_\mathrm{weights}L
+```
+and then the final velocity is computed as
+```math
+    \mathrm{velocity} \gets  - \eta{}B^{\mathrm{cache}}.
+```
+
 Or the riemannian manifold equivalent, if applicable.
+
+# Implementation
+
+To save memory the *velocity* is stored in the input ``\nabla_WL``.
+This is similar to the case of the [`GradientOptimizer`](@ref).
 """
 struct MomentumOptimizer{T<:Real} <: OptimizerMethod
     η::T

From 4bb7ad646c82b987c0a44b8bed71ea74158b8be9 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 14:34:44 +0200
Subject: [PATCH 083/101] Added reference for Riemannian BFGS.

---
 docs/src/GeometricMachineLearning.bib | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/src/GeometricMachineLearning.bib b/docs/src/GeometricMachineLearning.bib
index 46fd1701b..a7ee2687a 100644
--- a/docs/src/GeometricMachineLearning.bib
+++ b/docs/src/GeometricMachineLearning.bib
@@ -439,4 +439,13 @@ @MISC{2279304
     AUTHOR = {A.G. (https://math.stackexchange.com/users/253273/a-\%ce\%93)},
     HOWPUBLISHED = {Mathematics Stack Exchange},
     NOTE = {URL:https://math.stackexchange.com/q/2279304 (version: 2017-05-13)}
+}
+
+@inproceedings{huang2016riemannian,
+  title={A Riemannian BFGS method for nonconvex optimization problems},
+  author={Huang, Wen and Absil, P-A and Gallivan, Kyle A},
+  booktitle={Numerical Mathematics and Advanced Applications ENUMATH 2015},
+  pages={627--634},
+  year={2016},
+  organization={Springer}
 }
\ No newline at end of file

From d70138d3892e29de8c0293c669daa42510e78c4f Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 14:36:09 +0200
Subject: [PATCH 084/101] Replaced B with R, fixed various theorem/definition
 environment problems and improved overall readibility.

---
 docs/src/optimizers/bfgs_optimizer.md | 235 +++++++++++++++++---------
 1 file changed, 152 insertions(+), 83 deletions(-)

diff --git a/docs/src/optimizers/bfgs_optimizer.md b/docs/src/optimizers/bfgs_optimizer.md
index 14645fce0..b726c8a42 100644
--- a/docs/src/optimizers/bfgs_optimizer.md
+++ b/docs/src/optimizers/bfgs_optimizer.md
@@ -2,158 +2,226 @@
 
 The presentation shown here is largely taken from [wright2006numerical; chapters 3 and 6](@cite) with a derivation based on an online comment [2279304](@cite). The Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm is a second order optimizer that can be also be used to train a neural network.
 
-It is a version of a *quasi-Newton* method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method the BFGS algorithm approximates the objective with a quadratic function in each optimization step:
+It is a version of a *quasi-Newton* method and is therefore especially suited for convex problems. As is the case with any other (quasi-)Newton method[^1] the BFGS algorithm approximates the objective with a quadratic function in each optimization step:
+
+[^1]: Various Newton methods and quasi-Newton methods differ in how they model the *approximate Hessian*.
+
 ```math
-m_k(x) = f(x_k) + (\nabla_{x_k}f)^T(x - x_k) + \frac{1}{2}(x - x_k)^TB_k(x - x_k),
+m^{(k)}(x) = L(x^{(k)}) + (\nabla_{x^{(k)}}L)^T(x - x^{(k)}) + \frac{1}{2}(x - x^{(k)})^TR^{(k)}(x - x^{(k)}),
 ```
-where ``B_k`` is referred to as the *approximate Hessian*. We further require ``B_k`` to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us: 
+where ``R^{(k)}`` is referred to as the *approximate Hessian*. We further require ``R^{(k)}`` to be symmetric and positive definite. Differentiating the above expression and setting the derivative to zero gives us: 
 ```math
-\nabla_xm_k = \nabla_{x_k}f + B_k(x - x_k) = 0,
+\nabla_xm^{(k)} = \nabla_{x^{(k)}}L + R^{(k)}(x - x^{(k)}) = 0,
 ```
 or written differently: 
 ```math
-x - x_k = -B_k^{-1}\nabla_{x_k}f.
+x - x^{(k)} = -(R^{(k)})^{-1}\nabla_{x^{(k)}}L.
 ```
-This value we will from now on call ``p_k := x - x_k`` and refer to as the *search direction*. The new iterate then is: 
+This value we will from now on call ``p^{(k)} := -H^{(k)}\nabla_{x_k}L`` with ``H^{(k)} := (R^{(k)})^{-1}`` and refer to as the *search direction*. The new iterate then is: 
 ```math
-x_{k+1} = x_k + \alpha_kp_k,
+x^{(k+1)} = x^{(k)} + \eta^{(k)}p^{(k)},
 ```
-where ``\alpha_k`` is the *step length*. Techniques that describe how to pick an appropriate ``\alpha_k`` are called *line-search methods* and are discussed below. First we discuss what requirements we impose on ``B_k``. A first reasonable condition would be to require the gradient of ``m_k`` to be equal to that of ``f`` at the points ``x_{k-1}`` and ``x_k``: 
+where ``\eta^{(k)}`` is the *step length*. Techniques that describe how to pick an appropriate ``\eta^{(k)}`` are called *line-search methods* and are discussed below. First we discuss what requirements we impose on ``R^{(k)}``. A first reasonable condition would be to require the gradient of ``m^{(k)}`` to be equal to that of ``L`` at the points ``x^{(k-1)}`` and ``x^{(k)}``: 
 ```math
 \begin{aligned}
-\nabla_{x_k}m_k  & = \nabla_{x_k}f + B_k(x_k - x_k)  & \overset{!}{=} & \nabla_{x_k}f \text{ and } \\
-\nabla_{x_{k-1}}m_k & = \nabla_{x_k}f + B_k(x_{k-1} - x_k) & \overset{!}{=} & \nabla_{x_{k-1}}f.
+\nabla_{x^{(k)}}m^{(k)}  & = \nabla_{x^{(k)}}L + R^{(k)}(x^{(k)} - x^{(k)})  & \overset{!}{=} & \nabla_{x^{(k)}}L \text{ and } \\
+\nabla_{x^{(k-1)}}m^{(k)} & = \nabla_{x^{(k)}}L + R^{(k)}(x^{(k-1)} - x^{(k)}) & \overset{!}{=} & \nabla_{x^{(k-1)}}L.
 \end{aligned}
 ```
 The first one of these conditions is automatically satisfied. The second one can be rewritten as: 
 ```math
-B_k(x_k - x_{k-1}) \overset{!}{=} \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
+R^{(k)}(x^{(k)} - x^{(k-1)}) \overset{!}{=} \nabla_{x^{(k)}}L - \nabla_{x^{(k-1)}}L. 
 ```
 
 The following notations are often used: 
 ```math
-s_{k-1} := \frac{p_{k-1}}{\alpha_{k-1}} :=  x_{k} - x_{k-1} \quad\text{ and }\quad y_{k-1} := \nabla_{x_k}f - \nabla_{x_{k-1}}f. 
+s^{(k-1)} := \eta^{(k-1)}p^{(k-1)} :=  x^{(k)} - x^{(k-1)} \quad\text{ and }\quad y^{(k-1)} := \nabla_{x^(k)}L - \nabla_{x^{(k-1)}}L.
 ```
 
 The condition mentioned above then becomes: 
 ```math
-B_ks_{k-1} \overset{!}{=} y_{k-1},
+R^{(k)}s^{(k-1)} \overset{!}{=} y^{(k-1)},
 ```
-and we call it the *secant equation*. A second condition we impose on ``B_k`` is that is has to be positive-definite at point ``s_{k-1}``:
+and we call it the *secant equation*. 
+
+In order to pick the ideal ``R^{(k)}`` we solve the following problem: 
 ```math
-s_{k-1}^Ty_{k-1} > 0.
+\begin{aligned}
+\min_R & ||R & - R^{(k-1)}||_W \\ 
+\text{s.t.} & R  & = R^T\quad\text{and}\\
+            & Rs^{(k-1)} & = y^{(k-1)},
+\end{aligned}
 ```
-This is referred to as the *curvature condition*. If we impose the *Wolfe conditions*, the *curvature condition* holds automatically. The Wolfe conditions are stated with respect to the parameter ``\alpha_k``.
-
-```@eval
-Main.definition(raw"The **Wolfe conditions** are:
-" * Main.indentation * raw"```math
-" * Main.indentation * raw"\begin{aligned}
-" * Main.indentation * raw"f(x_k+\alpha_kp_k) & \leq{}f(x_k) + c_1\alpha_k(\nabla_{x_k}f)^Tp_k & \text{ for } & c_1\in(0,1) \quad\text{and} \\
-" * Main.indentation * raw"(\nabla_{(x_k + \alpha_kp_k)}f)^Tp_k & \geq c_2(\nabla_{x_k}f)^Tp_k & \text{ for } & c_2\in(c_1,1).
-" * Main.indentation * raw"\end{aligned}
-" * Main.indentation * raw"```
-" * Main.indentation * raw"The two Wolfe conditions above are respectively called the *sufficient decrease condition* and the *curvature condition* respectively.")
+where the first condition is symmetry and the second one is the secant equation. For the norm ``||\cdot||_W`` we pick the weighted Frobenius norm:
+```math
+||A||_W := ||W^{1/2}AW^{1/2}||_F,
 ```
+where ``||\cdot||_F`` is the usual Frobenius norm[^2] and the matrix ``W=\tilde{R}^{(k-1)}`` is the inverse of the *average Hessian*:
+```math
+\tilde{R}^{(k-1)} = \int_0^1 \nabla^2f(x^{(k-1)} + \tau\eta^{(k-1)}p^{(k-1)})d\tau.
+``` 
+[^2]: The Frobenius norm is ``||A||_F^2 = \sum_{i,j}a_{ij}^2``.
 
-A possible choice for ``c_1`` and ``c_2`` are ``10^{-4}`` and ``0.9`` [wright2006numerical](@cite). 
-
-We further have:
+We now state the solution to this minimization problem:
 
 ```@eval
-Main.theorem(raw"The second Wolfe condition, also called curvature condition, is stronger than the curvature condition mentioned before under the assumption that the first Wolfe condition is true and ``f(x_{k+1}) < f(x_k)``.")
+Main.theorem(raw"The solution of the minimization problem is:
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"R^{(k)} = (\mathbb{I} - \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k-1)}(s^{(k-1)})^T)R^{(k-1)}(\mathbb{I} - \frac{1}{y^({k-1})^Ts^{(k-1)}}s^{(k-1)}(y^{(k-1)})^T) + \\ \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k)}(y^{(k)})^T,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"with ``y^{(k-1)} = \nabla_{x^{(k)}}L - \nabla_{x^{k-1}}L`` and ``s^{(k-1)} = x^{(k)} - x^{(k-1)}`` as above.")
 ```
 
 ```@eval
-Main.proof(raw"We use the second Wolfe condition to write
+Main.proof(raw"In order to find the ideal ``R^{(k)}`` under the conditions described above, we introduce some notation: 
+" * Main.indentation * raw"- ``\tilde{R}^{(k-1)} := W^{1/2}R^{(k-1)}W^{1/2}``,
+" * Main.indentation * raw"- ``\tilde{R} := W^{1/2}RW^{1/2}``, 
+" * Main.indentation * raw"- ``\tilde{y}^{(k-1)} := W^{-1/2}y^{(k-1)}``, 
+" * Main.indentation * raw"- ``\tilde{s}^{(k-1)} := W^{1/2}s^{(k-1)}``.
+" * Main.indentation * raw"
+" * Main.indentation * raw"With this notation we can rewrite the problem of finding ``R^{(k)}`` as: 
 " * Main.indentation * raw"```math
-" * Main.indentation * raw"(\nabla_{x_k}f)^Tp_{k-1} - c_2(\nabla_{x_{k-1}}f)^Tp_{k-1} = y_{k-1}^Tp_{k-1} + (1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1} \geq 0,
+" * Main.indentation * raw"\begin{aligned}
+" * Main.indentation * raw"\min_{\tilde{R}} & ||\tilde{R} & - \tilde{R}^{(k-1)}||_F \\ 
+" * Main.indentation * raw"\text{s.t.}\quad & \tilde{R} & = \tilde{R}^T\quad\text{and}\\
+" * Main.indentation * raw"            &\tilde{R}\tilde{s}^{(k-1)} & = \tilde{y}^{(k-1)}.
+" * Main.indentation * raw"\end{aligned}
 " * Main.indentation * raw"```
-" * Main.indentation * raw"and we can apply the first Wolfe condition on the second term in this expression: 
+" * Main.indentation * raw"
+" * Main.indentation * raw"We further have ``y^{(k-1)} = Ws^{(k-1)}`` and hence ``\tilde{y}^{(k-1)} = \tilde{s}^{(k-1)}`` by a corollary of the mean value theorem: ``\int_0^1 g'(\xi_1 + \tau(\xi_2 - \xi_1)) d\tau (\xi_2 - \xi_1) = g(\xi_2 - \xi_1)`` for a vector-valued function ``g``.
+" * Main.indentation * raw"
+" * Main.indentation * raw"Now we rewrite ``R`` and ``R^{(k-1)}`` in a new basis ``U = [u|u_\perp]``, where ``u := \tilde{s}_{k-1}/||\tilde{s}_{k-1}||`` and ``u_\perp`` is an orthogonal complement of ``u`` (i.e. we have ``u^Tu_\perp=0`` and ``u_\perp^Tu_\perp=\mathbb{I}``):
+" * Main.indentation * raw"
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\begin{aligned}
+" * Main.indentation * raw"U^T\tilde{R}^{(k-1)}U - U^T\tilde{R}U = \begin{bmatrix}  u^T \\ u_\perp^T \end{bmatrix}(\tilde{R}^{(k-1)} - \tilde{R})\begin{bmatrix} u & u_\perp \end{bmatrix} = \\
+" * Main.indentation * raw"\begin{bmatrix}
+" * Main.indentation * raw"    u^T\tilde{R}^{(k-1)}u - 1 & u^T\tilde{R}^{(k-1)}u \\
+" * Main.indentation * raw"    u_\perp^T\tilde{R}^{(k-1)}u & u_\perp^T(\tilde{R}^{(k-1)}-\tilde{R}^{(k)})u_\perp
+" * Main.indentation * raw"\end{bmatrix}.
+" * Main.indentation * raw"\end{aligned}
+" * Main.indentation * raw"```
+" * Main.indentation * raw"By a property of the Frobenius norm we can consider the blocks independently: 
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"||\tilde{R}^{(k-1)} - \tilde{R}||^2_F = (u^T\tilde{R}^{(k-1)}u -1)^2 + ||u^T\tilde{R}^{(k-1)}u_\perp||_F^2 + ||u_\perp^T\tilde{R}^{(k-1)}u||_F^2 + ||u_\perp^T(\tilde{R}^{(k-1)} - \tilde{R})u_\perp||_F^2
+" * Main.indentation * raw"```
+" * Main.indentation * raw"
+" * Main.indentation * raw"We see that ``\tilde{R}`` only appears in the last term, which should therefore be made zero, i.e. the projections of ``\tilde{R}_{k-1}`` and ``\tilde{R}`` onto the space spanned by ``u_\perp`` should coincide. With the condition ``\tilde{R}u \overset{!}{=} u`` w hence get: 
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\tilde{R} = U\begin{bmatrix} 1 & 0 \\ 0 & u^T_\perp\tilde{R}^{(k-1)}u_\perp \end{bmatrix}U^T = uu^T + (\mathbb{I}-uu^T)\tilde{R}^{(k-1)}(\mathbb{I}-uu^T).
+" * Main.indentation * raw"```
+" * Main.indentation * raw"
+" * Main.indentation * raw"If we now map back to the original coordinate system, the ideal solution for ``R^{(k)}`` is: 
 " * Main.indentation * raw"```math
-" * Main.indentation * raw"(1 - c_2)(\nabla_{x_{k-1}}f)^Tp_{k-1}\geq\frac{1-c_2}{c_1\alpha_{k-1}}(f(x_k) - f(x_{k-1})),
+" * Main.indentation * raw"R^{(k)} = (\mathbb{I} - \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k-1)}(s^{(k-1)})^T)R^{(k-1)}(\mathbb{I} - \frac{1}{(y^{k-1})^Ts^{(k-1)}}s^{(k-1)}(y^{(k-1)})^T) + \\ \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k)}(y^{(k)})^T,
 " * Main.indentation * raw"```
-" * Main.indentation * raw"which is negative if the value of ``f`` is decreasing.")
+" * Main.indentation * raw"and the assertion is proved.")
 ```
 
-In order to pick the ideal ``B_k`` we solve the following problem: 
+What we need in practice however is not ``R^{(k)}``, but its inverse ``H^{(k)}``. This is because we need to find ``s^{(k-1)}`` based on ``y^{(k-1)}``.  To get ``H^{(k)}`` based on the expression for ``R^{(k)}`` above we can use the *Sherman-Morrison-Woodbury formula*[^3] to obtain:
+
+[^3]: The *Sherman-Morrison-Woodbury formula* states ``(A + UCV)^{-1} = A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}``.
+
 ```math
-\begin{aligned}
-\min_B & ||B & - B_{k-1}||_W \\ 
-\text{s.t.} & B  & = B^T\quad\text{and}\\
-            & Bs_{k-1} & = y_{k-1},
-\end{aligned}
+H^{(k)} = H^{(k-1)} - \frac{H^{(k-1)}y^{(k-1)}(y^{(k-1)})^TH^{(k-1)}}{(y^{(k-1)})^TH^{(k-1)}y^{(k-1)}} + \frac{s^{(k-1)}(s^{(k-1)})^T}{(y^{(k-1)})^Ts^{(k-1)}}.
 ```
-where the first condition is symmetry and the second one is the secant equation. For the norm ``||\cdot||_W`` we pick the weighted Frobenius norm:
+
+The cache and the parameters are updated with:
+1. Compute the gradient ``\nabla_{x^{(k)}}L``,
+2. obtain a negative search direction ``p^{(k)} \gets H^{(k)}\nabla_{x^{(k)}}L``,
+3. compute ``s^{(k)} = -\eta^{(k)}p^{(k)}``,
+4. update ``x^{(k + 1)} \gets x^{(k)} + s^{(k)}``,
+5. compute ``y^{(k)} \gets \nabla_{x^{(k)}}L - \nabla_{x^{(k-1)}}L``,
+6. update ``H^{(k + 1)} \gets H^{(k)} - \frac{H^{(k)}y^{(k)}(y^{(k)})^TH^{(k)}}{(y^{(k)})^TH^{(k)}y^{(k)}} + \frac{s^{(k)}(s^{(k)})^T}{(y^{(k)})^Ts^{(k)}}``.
+
+The cache of the BFGS algorithm thus consists of the matrix ``H^{(\cdot)}`` for each vector ``x^{(\cdot)}`` in the neural network and the gradient for the previous time step ``\nabla_{x^{(k-1)}}L``. ``s^{(k)}`` here is again the *velocity* that we use to update the neural network weights. 
+
+## The Riemannian Version of the BFGS Algorithm
+
+Generalizing the BFGS algorithm to the setting of a Riemannian manifold is very straightforward. All we have to do is replace Euclidean gradient by Riemannian ones (composed with a horizontal lift): 
+
 ```math
-||A||_W := ||W^{1/2}AW^{1/2}||_F,
+\nabla_{x^{(k)}}L \implies (\Lambda^{(k)})^{-1}(\Omega\circ\mathrm{grad}_{x^{(k)}}L)\Lambda^{(k)},
 ```
-where ``||\cdot||_F`` is the usual Frobenius norm[^1] and the matrix ``W=\tilde{B}_{k-1}`` is the inverse of the *average Hessian*:
-```math
-\tilde{B}_{k-1} = \int_0^1 \nabla^2f(x_{k-1} + \tau\alpha_{k-1}p_{k-1})d\tau.
-``` 
-[^1]: The Frobenius norm is ``||A||_F^2 = \sum_{i,j}a_{ij}^2``.
 
-In order to find the ideal ``B_k`` under the conditions described above, we introduce some notation: 
-- ``\tilde{B}_{k-1} := W^{1/2}B_{k-1}W^{1/2}``,
-- ``\tilde{B} := W^{1/2}BW^{1/2}``, 
-- ``\tilde{y}_{k-1} := W^{1/2}y_{k-1}``, 
-- ``\tilde{s}_{k-1} := W^{-1/2}s_{k-1}``.
+and addition by a retraction:
 
-With this notation we can rewrite the problem of finding ``B_k`` as: 
 ```math
-\begin{aligned}
-\min_{\tilde{B}} & ||\tilde{B} - \tilde{B}_{k-1}||_F \\ 
-\text{s.t.}\quad & \tilde{B} = \tilde{B}^T\quad\text{and}\\
-            &\tilde{B}\tilde{s}_{k-1}=\tilde{y}_{k-1}.
-\end{aligned}
+    x^{(k+1)} \gets x^{(k)} + s^{(k)} \implies x^{(k+1)} \gets \mathrm{Retraction}(s^{(k)})x^{(k)}.
 ```
 
-We further have ``Wy_{k-1} = s_{k-1}`` (by the mean value theorem ?) and therefore ``\tilde{y}_{k-1} = \tilde{s}_{k-1}``.
+If we deal with manifolds however we cannot simply take differences. But we do have however:
+
+```math
+x^{(k+1)} = x
+```
 
-Now we rewrite ``B`` and ``B_{k-1}`` in a new basis ``U = [u|u_\perp]``, where ``u := \tilde{s}_{k-1}/||\tilde{s}_{k-1}||`` and ``u_\perp`` is an orthogonal complement[^2] of ``u``:
+## The Curvature Condition and the Wolfe Conditions
 
-[^2]: So we must have ``u^Tu_\perp=0`` and further ``u_\perp^Tu_\perp=\mathbb{I}``.
+In textbooks [wright2006numerical](@cite) an application of the BFGS algorithm typically further involves a line search subject to the *Wolfe conditions*. If these are satisfied the *curvature condition* usually also is.
 
+Before we discussed imposing the *secant condition* on ``R^{(k)}``. Another condition is that ``R^{(k)}`` has to be positive-definite at point ``s^{(k-1)}``:
 ```math
-\begin{aligned}
-U^T\tilde{B}_{k-1}U - U^T\tilde{B}U = \begin{bmatrix}  u^T \\ u_\perp^T \end{bmatrix}(\tilde{B}_{k-1} - \tilde{B})\begin{bmatrix} u & u_\perp \end{bmatrix} = \\
-\begin{bmatrix}
-    u^T\tilde{B}_{k-1}u - 1 & u^T\tilde{B}_{k-1}u \\
-    u_\perp^T\tilde{B}_{k-1}u & u_\perp^T(\tilde{B}_{k-1}-\tilde{B}_k)u_\perp
-\end{bmatrix}.
-\end{aligned}
+(s^{(k-1)})^Ty^{(k-1)} > 0.
 ```
-By a property of the Frobenius norm: 
-```math
-||\tilde{B}_{k-1} - \tilde{B}||^2_F = (u^T\tilde{B}_{k-1} -1)^2 + ||u^T\tilde{B}_{k-1}u_\perp||_F^2 + ||u_\perp^T\tilde{B}_{k-1}u||_F^2 + ||u_\perp^T(\tilde{B}_{k-1} - \tilde{B})u_\perp||_F^2.
+This is referred to as the *curvature condition*. If we impose the *Wolfe conditions*, the *curvature condition* holds automatically. The Wolfe conditions are stated with respect to the parameter ``\eta^{(k)}``.
+
+```@eval
+Main.definition(raw"The **Wolfe conditions** are:
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"\begin{aligned}
+" * Main.indentation * raw"L(x^{(k)}+\eta^{(k)}p^{(k)}) & \leq{}L(x^{(k)}) + c_1\eta^{(k)}(\nabla_{x^{(k)}}L)^Tp^{(k)} & \text{ for } & c_1\in(0,1) \quad\text{and} \\
+" * Main.indentation * raw"(\nabla_{(x^{(k)} + \eta^{(k)}p^{(k)})}L)^Tp^{(k)} & \geq c_2(\nabla_{x^{(k)}}L)^Tp^{(k)} & \text{ for } & c_2\in(c_1,1).
+" * Main.indentation * raw"\end{aligned}
+" * Main.indentation * raw"```
+" * Main.indentation * raw"The two Wolfe conditions above are respectively called the *sufficient decrease condition* and the *curvature condition* respectively.")
 ```
 
-We see that ``\tilde{B}`` only appears in the last term, which should therefore be made zero. This then gives: 
-```math
-\tilde{B} = U\begin{bmatrix} 1 & 0 \\ 0 & u^T_\perp\tilde{B}_{k-1}u_\perp \end{bmatrix} = uu^T + (\mathbb{I}-uu^T)\tilde{B}_{k-1}(\mathbb{I}-uu^T).
+A possible choice for ``c_1`` and ``c_2`` are ``10^{-4}`` and ``0.9`` [wright2006numerical](@cite). We further have:
+
+```@eval
+Main.theorem(raw"The second Wolfe condition, also called curvature condition, is stronger than the curvature condition mentioned before under the assumption that the first Wolfe condition is true and ``L(x^{(k+1)}) < L(^{(x_k)})``.")
 ```
 
-If we now map back to the original coordinate system, the ideal solution for ``B_k`` is: 
-```math
-B_k = (\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}y_{k-1}s_{k-1}^T)B_{k-1}(\mathbb{I} - \frac{1}{y_{k-1}^Ts_{k-1}}s_{k-1}y_{k-1}^T) + \frac{1}{y_{k-1}^Ts_{k-1}}y_ky_k^T.
+```@eval
+Main.proof(raw"We use the second Wolfe condition to write
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"(\nabla_{x^{(k)}}L)^Tp^{(k-1)} - c_2(\nabla_{x^{(k-1)}}L)^Tp^{(k-1)} = (y^{(k-1)})^Tp^{(k-1)} + (1 - c_2)(\nabla_{x^{(k-1)}}L)^Tp^{(k-1)} \geq 0,
+" * Main.indentation * raw"```
+" * Main.indentation * raw"and we can apply the first Wolfe condition on the second term in this expression: 
+" * Main.indentation * raw"```math
+" * Main.indentation * raw"(1 - c_2)(\nabla_{x^{(k-1)}}L)^Tp^{(k-1)}\geq\frac{1-c_2}{c_1\eta^{(k-1)}}(L(x^{(k)}) - L(x^{(k-1)})),
+" * Main.indentation * raw"```
+" * Main.indentation * raw"which is negative if the value of ``L`` is decreasing.")
 ```
 
-What we need in practice however is not ``B_k``, but its inverse ``H_k``. This is because we need to find ``s_{k-1}`` based on ``y_{k-1}``.  To get ``H_k`` based on the expression for ``B_k`` above we can use the *Sherman-Morrison-Woodbury formula*[^3] to obtain:
+## Initialization of the BFGS Algorithm
 
-[^3]: The *Sherman-Morrison-Woodbury formula* states ``(A + UCV)^{-1} = A^{-1} - A^{-1}U(C^{-1} + VA^{-1}U)^{-1}VA^{-1}``.
+We initialize ``H^{(0)}`` with the identity matrix ``\mathbb{I}`` and the gradient information ``B`` with zeros. 
 
-```math
-H_{k} = H_{k-1} - \frac{H_{k-1}y_{k-1}y_{k-1}^TH_{k-1}}{y_{k-1}^TH_{k-1}y_{k-1}} + \frac{s_{k-1}s_{k-1}^T}{y_{k-1}^Ts_{k-1}}.
+```@example bfgs
+using GeometricMachineLearning
+
+weight = (Y = rand(StiefelManifold, 10, 5), )
+method = BFGSOptimzier()
+o = Optimizer(method, weight)
+
+o.cache.Y.H
 ```
 
+This is a matrix of size ``35\times35.`` This is because the skew-symmetric ``A``-part of an element of ``\mathfrak{g}^\mathrm{hor}`` has 10 elements and the ``B``-part has 25 elements. 
 
-TODO: Example where this works well!
+The *previous gradient* in [`BFGSCache`](@ref) is stored in the same way as it is in e.g. the [`MomentumOptimizer`](@ref):
+
+```@example bfgs
+o.cache.Y.B
+```
 
 ## Library Functions
 
 ```@docs; canonical=false
 BFGSOptimizer
+BFGSCache
 ```
 
 ## References 
@@ -164,4 +232,5 @@ Canonical = false
 
 wright2006numerical
 2279304
+huang2016riemannian
 ```
\ No newline at end of file

From db8e715e3cdb72b2390bcacfa0d909b46bf5680b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 14:39:19 +0200
Subject: [PATCH 085/101] Added LazyArray because we need the Vcat operation
 for the BFGS optimizer.

---
 src/GeometricMachineLearning.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl
index 6e50a9140..acba20ec9 100644
--- a/src/GeometricMachineLearning.jl
+++ b/src/GeometricMachineLearning.jl
@@ -17,6 +17,7 @@ module GeometricMachineLearning
     using ForwardDiff
     using InteractiveUtils
     using TimerOutputs
+    using LazyArrays
 
     import AbstractNeuralNetworks: Architecture, Model, AbstractExplicitLayer, AbstractExplicitCell, AbstractNeuralNetwork , NeuralNetwork
     import AbstractNeuralNetworks: Chain, GridCell

From 161322a0cc7375a8f31cba73e692bae1c0fdd5e4 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 14:42:46 +0200
Subject: [PATCH 086/101] (i) using LazyArrays.Vcat (doesn't allocate new
 memory). (ii) using custom assign\!.

---
 src/arrays/stiefel_lie_algebra_horizontal.jl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/arrays/stiefel_lie_algebra_horizontal.jl b/src/arrays/stiefel_lie_algebra_horizontal.jl
index f60473572..54b9c764b 100644
--- a/src/arrays/stiefel_lie_algebra_horizontal.jl
+++ b/src/arrays/stiefel_lie_algebra_horizontal.jl
@@ -202,7 +202,7 @@ LinearAlgebra.mul!(C::StiefelLieAlgHorMatrix, α::Real, A::StiefelLieAlgHorMatri
 LinearAlgebra.rmul!(C::StiefelLieAlgHorMatrix, α::Real) = mul!(C, C, α)
 
 function Base.vec(A::StiefelLieAlgHorMatrix)
-    vcat(vec(A.A), vec(A.B))
+    LazyArrays.Vcat(vec(A.A), vec(A.B))
 end
 
 function StiefelLieAlgHorMatrix(V::AbstractVector, N::Int, n::Int)
@@ -235,7 +235,9 @@ end
 # assign funciton; also implement this for other arrays! 
 function assign!(B::StiefelLieAlgHorMatrix{T}, C::StiefelLieAlgHorMatrix{T}) where T 
     assign!(B.A, C.A)
-    B.B .= C.B 
+    assign!(B.B, C.B)
+    
+    nothing
 end
 
 function Base.copy(B::StiefelLieAlgHorMatrix)
@@ -250,6 +252,8 @@ end
 # fallback -> put this somewhere else!
 function assign!(A::AbstractArray, B::AbstractArray)
     A .= B 
+
+    nothing
 end
 
 function _round(B::StiefelLieAlgHorMatrix; kwargs...)

From 83eba8e81b9b6fd9e4aac9af9939b57d78d9c0cc Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 14:43:25 +0200
Subject: [PATCH 087/101] Added test for BFGS optimizer.

---
 test/runtests.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index fd92db507..e25a8559b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -50,6 +50,7 @@ using Documenter: doctest
 @safetestset "Optimizer #4                                                                    " begin include("optimizers/optimizer_convergence_tests/svd_optim.jl") end
 @safetestset "Optimizer #5                                                                    " begin include("optimizers/optimizer_convergence_tests/psd_optim.jl") end
 @safetestset "Check if Adam with decay converges                                              " begin include("optimizers/optimizer_convergence_tests/adam_with_learning_rate_decay.jl") end
+@safetestset "BFGS Optimizer tests                                                            " begin include("optimizers/bfgs_optimizer.jl") end
 @safetestset "Data                                                                            " begin include("data/test_data.jl") end
 @safetestset "Batch                                                                           " begin include("data/test_batch.jl") end
 @safetestset "Method                                                                          " begin include("train!/test_method.jl") end

From 70ff50089be7cef74c66d03afd95df3126fd3426 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:46:19 +0200
Subject: [PATCH 088/101] Added LazyArrays.

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index 6807607e8..b74d946ed 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,6 +18,7 @@ GeometricSolutions = "7843afe4-64f4-4df4-9231-049495c56661"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

From e055b6d467844f9fd84acadde002703f78ce1ddf Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:46:54 +0200
Subject: [PATCH 089/101] Expanded on BFGS docs a bit.

---
 docs/src/optimizers/bfgs_optimizer.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/src/optimizers/bfgs_optimizer.md b/docs/src/optimizers/bfgs_optimizer.md
index b726c8a42..daa7c2fed 100644
--- a/docs/src/optimizers/bfgs_optimizer.md
+++ b/docs/src/optimizers/bfgs_optimizer.md
@@ -69,7 +69,7 @@ Main.theorem(raw"The solution of the minimization problem is:
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"R^{(k)} = (\mathbb{I} - \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k-1)}(s^{(k-1)})^T)R^{(k-1)}(\mathbb{I} - \frac{1}{y^({k-1})^Ts^{(k-1)}}s^{(k-1)}(y^{(k-1)})^T) + \\ \frac{1}{(y^{(k-1)})^Ts^{(k-1)}}y^{(k)}(y^{(k)})^T,
 " * Main.indentation * raw"```
-" * Main.indentation * raw"with ``y^{(k-1)} = \nabla_{x^{(k)}}L - \nabla_{x^{k-1}}L`` and ``s^{(k-1)} = x^{(k)} - x^{(k-1)}`` as above.")
+" * Main.indentation * raw"with ``y^{(k-1)} = \nabla_{x^{(k)}}L - \nabla_{x^{(k-1)}}L`` and ``s^{(k-1)} = x^{(k)} - x^{(k-1)}`` as above.")
 ```
 
 ```@eval
@@ -82,9 +82,9 @@ Main.proof(raw"In order to find the ideal ``R^{(k)}`` under the conditions descr
 " * Main.indentation * raw"With this notation we can rewrite the problem of finding ``R^{(k)}`` as: 
 " * Main.indentation * raw"```math
 " * Main.indentation * raw"\begin{aligned}
-" * Main.indentation * raw"\min_{\tilde{R}} & ||\tilde{R} & - \tilde{R}^{(k-1)}||_F \\ 
-" * Main.indentation * raw"\text{s.t.}\quad & \tilde{R} & = \tilde{R}^T\quad\text{and}\\
-" * Main.indentation * raw"            &\tilde{R}\tilde{s}^{(k-1)} & = \tilde{y}^{(k-1)}.
+" * Main.indentation * raw"\min_{\tilde{R}} & ||\tilde{R} - \tilde{R}^{(k-1)}||_F & \\ 
+" * Main.indentation * raw"\text{s.t.}\quad & \tilde{R} = \tilde{R}^T\quad & \text{and}\\
+" * Main.indentation * raw"            &\tilde{R}\tilde{s}^{(k-1)} = \tilde{y}^{(k-1)}&.
 " * Main.indentation * raw"\end{aligned}
 " * Main.indentation * raw"```
 " * Main.indentation * raw"
@@ -203,7 +203,7 @@ We initialize ``H^{(0)}`` with the identity matrix ``\mathbb{I}`` and the gradie
 using GeometricMachineLearning
 
 weight = (Y = rand(StiefelManifold, 10, 5), )
-method = BFGSOptimzier()
+method = BFGSOptimizer()
 o = Optimizer(method, weight)
 
 o.cache.Y.H
@@ -217,6 +217,10 @@ The *previous gradient* in [`BFGSCache`](@ref) is stored in the same way as it i
 o.cache.Y.B
 ```
 
+## Stability of the Algorithm
+
+Similar to the [Adam optimizer](@ref "The Adam Optimizer") we also add a ``\delta`` term for stability to two of the terms appearing in the update rule of the BFGS algorithm in practice. 
+
 ## Library Functions
 
 ```@docs; canonical=false

From d7be19218a448b99677970c43dddff46428ade77 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:47:29 +0200
Subject: [PATCH 090/101] Fixed typo: A -> Y.

---
 src/optimizers/optimizer_caches.jl | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/optimizers/optimizer_caches.jl b/src/optimizers/optimizer_caches.jl
index 6df668d02..53f81f12a 100644
--- a/src/optimizers/optimizer_caches.jl
+++ b/src/optimizers/optimizer_caches.jl
@@ -47,11 +47,12 @@ struct AdamCache{T, AT <: AbstractArray{T}} <: AbstractCache{T}
     end
 end
 
-function Base.display(C::AdamCache)
-    println(raw"`MomentumCache` that currently stores `B₁` as ...")
-    display(C.B₁)
-    println(raw"and `B₂` as ...")
-    display(C.B₂)
+function Base.show(io::IO, ::MIME{Symbol("text/plain")}, C::AdamCache)
+    println(io, raw"`AdamCache` that currently stores `B₁` as ...")
+    show(io, "text/plain", C.B₁)
+    println(io, "")
+    println(io, raw"and `B₂` as ...")
+    show(io, "text/plain", C.B₂)
 end
 
 @doc raw"""
@@ -72,9 +73,9 @@ struct MomentumCache{T, AT <: AbstractArray{T}} <:AbstractCache{T}
     end
 end
 
-function Base.display(C::MomentumCache)
-    println(raw"`MomentumCache` that currently stores `B`as  ...")
-    display(C.B)
+function Base.show(io::IO, ::MIME{Symbol("text/plain")}, C::MomentumCache)
+    println(io, raw"`MomentumCache` that currently stores `B`as  ...")
+    show(io, "text/plain", C.B)
 end
 
 @doc raw"""

From bb09dc58349af6404540687caa23087818872905 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:48:01 +0200
Subject: [PATCH 091/101] Fixed typo: A -> Y.

---
 docs/src/optimizers/optimizer_methods.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/src/optimizers/optimizer_methods.md b/docs/src/optimizers/optimizer_methods.md
index dcd2c22fc..e91fbed9a 100644
--- a/docs/src/optimizers/optimizer_methods.md
+++ b/docs/src/optimizers/optimizer_methods.md
@@ -16,7 +16,7 @@ It simply does:
 
 where addition has to be replaced with appropriate operations in the manifold case[^2].
 
-[^2]: In the manifold case the expression ``-\eta\cdot\mathrm{gradient}`` is an element of the [global tangent space](@ref "Global Tangent Spaces") ``\mathfrak{g}^\mathrm{hor}`` and a retraction maps from ``\mathfrak{g}^\mathrm{hor}``. We then still have to compose it with the [updated global section](@ref "Parallel Transport") ``\Lamda^{(t)}``.
+[^2]: In the manifold case the expression ``-\eta\cdot\mathrm{gradient}`` is an element of the [global tangent space](@ref "Global Tangent Spaces") ``\mathfrak{g}^\mathrm{hor}`` and a retraction maps from ``\mathfrak{g}^\mathrm{hor}``. We then still have to compose it with the [updated global section](@ref "Parallel Transport") ``\Lambda^{(t)}``.
 
 When calling [`GradientOptimizer`](@ref) we can specify a learning rate ``\eta`` (or use the default).
 
@@ -119,7 +119,7 @@ const δ = 1e-8
 method = AdamOptimizer(η, ρ₁, ρ₂, δ)
 o = Optimizer(method, weight)
 
-o.cache.A
+o.cache.Y
 ```
 
 ### Weights on manifolds 
@@ -146,8 +146,8 @@ nothing # hide
  
  The cache is however exactly the same as for the Adam optimizer:
 
-```@example
-    o.cache.A
+```@example optimizer_methods
+    o.cache.Y
 ```
 
 ## Library Functions

From 443ec905002541c664f63be783460dc492b0931d Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:48:27 +0200
Subject: [PATCH 092/101] Added copyto\! method.

---
 src/manifolds/stiefel_manifold.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/manifolds/stiefel_manifold.jl b/src/manifolds/stiefel_manifold.jl
index c6bcaa1c0..8deeb866a 100644
--- a/src/manifolds/stiefel_manifold.jl
+++ b/src/manifolds/stiefel_manifold.jl
@@ -167,4 +167,9 @@ Note that the output of `Ω` is a skew-symmetric matrix, i.e. an element of ``\m
 function Ω(Y::StiefelManifold{T}, Δ::AbstractMatrix{T}) where T
     YY = Y * Y'
     SkewSymMatrix(2 * (one(YY) - T(.5) * Y * Y') * Δ * Y')
+end
+
+function Base.copyto!(A::StiefelManifold, B::StiefelManifold)
+    A.A .= B.A
+    nothing
 end
\ No newline at end of file

From d5b7f4c018688efefb32f1c5c1494574b35b88d7 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:49:14 +0200
Subject: [PATCH 093/101] Fixed cache initialization and implemented Base.show.

---
 src/optimizers/bfgs_cache.jl | 52 +++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/src/optimizers/bfgs_cache.jl b/src/optimizers/bfgs_cache.jl
index 96561f8be..8a3e3512a 100644
--- a/src/optimizers/bfgs_cache.jl
+++ b/src/optimizers/bfgs_cache.jl
@@ -1,27 +1,22 @@
 @doc raw"""
-The cache for the BFGS optimizer.
+    BFGSCache(B)
 
-It stores an array for the previous time step `B` and the inverse of the Hessian matrix `H`.
+Make the cache for the BFGS optimizer based on the array `B`.
 
-It is important to note that setting up this cache already requires a derivative! This is not the case for the other optimizers.
-"""
-struct BFGSCache{T, AT<:AbstractArray{T}} <: AbstractCache{T}
-    B::AT
-    S::AT
-    H::AbstractMatrix{T}
-    function BFGSCache(B::AbstractArray)
-        new{eltype(B), typeof(zero(B))}(zero(B), zero(B), initialize_hessian_inverse(zero(B)))
-    end
-end
+It stores an array for the gradient of the previous time step `B` and the inverse of the Hessian matrix `H`.
 
-@doc raw"""
-In order to initialize `BGGSCache` we first need gradient information. This is why we initially have this `BFGSDummyCache` until gradient information is available.
+The cache for the inverse of the Hessian is initialized with the idendity.
+The cache for the previous gradient information is initialized with the zero vector.
 
-NOTE: we may not need this. 
+Note that the cache for `H` is changed iteratively, whereas the cache for `B` is newly assigned at every time step.
 """
-struct BFGSDummyCache{T, AT<:AbstractArray{T}} <: AbstractCache{T}
-    function BFGSDummyCache(B::AbstractArray)
-        new{eltype(B), typeof(zero(B))}()
+struct BFGSCache{T, BT<:AbstractArray{T}, HT<:AbstractMatrix{T}} <: AbstractCache{T}
+    B::BT
+    H::HT
+    function BFGSCache(B::AbstractArray)
+        zeroB = zero(B)
+        H_init = initialize_hessian_inverse(zeroB)
+        new{eltype(B), typeof(zeroB), typeof(H_init)}(zero(B), H_init)
     end
 end
 
@@ -30,8 +25,21 @@ end
     B[i, i] = one(T)
 end
 
+function Base.show(io::IO, ::MIME{Symbol("text/plain")}, C::BFGSCache)
+    show(io, raw"`BFGSCache` that currently stores `B`as  ...")
+    show(io, "text/plain", C.B)
+    println(io, "")
+    println(io, "... and `H` as")
+    show(io, "text/plain", C.H)
+end 
+
 @doc raw"""
-This initializes the inverse of the Hessian for various arrays. This requires an implementation of a *vectorization operation* `vec`. This is important for custom arrays.
+    initialize_hessian_inverse(B)
+
+Initialize the inverse of the Hessian for various arrays. 
+
+# Implementation 
+This requires an implementation of a *vectorization operation* `vec`. This is important for custom arrays.
 """
 function initialize_hessian_inverse(B::AbstractArray{T}) where T
     length_of_array = length(vec(B))
@@ -44,8 +52,4 @@ end
 
 setup_bfgs_cache(ps::NamedTuple) = apply_toNT(setup_bfgs_cache, ps)
 setup_bfgs_cache(ps::Tuple) = Tuple([setup_bfgs_cache(x) for x in ps])
-setup_bfgs_cache(B::AbstractArray) = BFGSCache(B)
-
-setup_bfgs_dummy_cache(ps::NamedTuple) = apply_toNT(setup_bfgs_dummy_cache, ps)
-setup_bfgs_dummy_cache(ps::Tuple) = Tuple([setup_bfgs_cache(x) for x in ps])
-setup_bfgs_dummy_cache(B::AbstractArray) = BFGSDummyCache(B)
\ No newline at end of file
+setup_bfgs_cache(B::AbstractArray) = BFGSCache(B)
\ No newline at end of file

From 1071d07a645a66acab1dc3b85a12f47baf05202c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:50:44 +0200
Subject: [PATCH 094/101] Improved docstring and changed 1f-7 to 1f-8.

---
 src/optimizers/bfgs_optimizer.jl | 80 +++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/src/optimizers/bfgs_optimizer.jl b/src/optimizers/bfgs_optimizer.jl
index a57bd8ae4..01a9f61a9 100644
--- a/src/optimizers/bfgs_optimizer.jl
+++ b/src/optimizers/bfgs_optimizer.jl
@@ -1,48 +1,66 @@
 @doc raw"""
-This is an implementation of the Broyden-Fletcher-Goldfarb-Shanno (BFGS) optimizer. 
+    BFGSOptimizer(η, δ)
+
+Make an instance of the Broyden-Fletcher-Goldfarb-Shanno (BFGS) optimizer. 
+
+`η` is the *learning rate*.
+`δ` is a stabilization parameter.
 """
 struct BFGSOptimizer{T<:Real} <: OptimizerMethod
     η::T
     δ::T
 
-    function BFGSOptimizer(η::T = 1f-2, δ=1f-7) where T 
+    function BFGSOptimizer(η::T = 1f-2, δ=1f-8) where T 
         new{T}(η, T(δ))
     end
 end
 
 @doc raw"""
-Optimization for an entire neural networks with BFGS. What is different in this case is that we still have to initialize the cache.
+    update!(o::Optimizer{<:BFGSOptimizer}, C, B)
+
+Peform an update with the BFGS optimizer. 
+
+`C` is the cache, `B` contains the gradient information (the output of [`global_rep`](@ref) in general).
+
+First we compute the *final velocity* with
+```julia
+    vecS = -o.method.η * C.H * vec(B)
+```
+and then we update `H`
+```julia
+    C.H .= (𝕀 - ρ * SY) * C.H * (𝕀 - ρ * SY') + ρ * vecS * vecS'
+```
+where `SY` is `vecS * Y'` and `𝕀` is the idendity. 
+
+# Implementation
+
+For stability we use `δ` for computing `ρ`:
+```julia
+    ρ = 1. / (vecS' * Y + o.method.δ)
+```
+
+This is similar to the [`AdamOptimizer`](@ref)
 
-If `o.step == 1`, then we initialize the cache
+# Extend Help 
+
+If we have weights on a [`Manifold`](@ref) than the updates are slightly more difficult.
+In this case the [`vec`](@ref) operation has to be generalized to the corresponding *global tangent space*.
 """
 function update!(o::Optimizer{<:BFGSOptimizer}, C::CT, B::AbstractArray{T}) where {T, CT<:BFGSCache{T}}
-    if o.step == 1
-        bfgs_initialization!(o, C, B)
-    else
-        bfgs_update!(o, C, B)   
-    end
-end
-
-function bfgs_update!(o::Optimizer{<:BFGSOptimizer}, C::CT, B::AbstractArray{T}) where {T, CT<:BFGSCache{T}}
     # in the first step we compute the difference between the current and the previous mapped gradients:
-    Y = vec(B - C.B) 
-    # in the second step we update H (write a test to check that this preserves symmetry)
-    vecS = vec(C.S)
-    # the *term for the second condition* appears many times in the expression.
-    SY = vecS' * Y + o.method.δ
-    # C.H .= C.H + (SY + Y' * C.H * Y) / (SY ^ 2) * vecS * vecS' - (C.H * Y * vecS' + vecS * (C.H * Y)' ) / SY
-    # the two computations of the H matrix should be equivalent. Check this!!
-    HY = C.H * Y
-    C.H .= C.H - HY * HY' / (Y' * HY + o.method.δ) + vecS * vecS' / SY
-    # in the third step we compute the final velocity
-    mul!(vecS, C.H, vec(B))
-    mul!(C.S, -o.method.η, C.S)
-    assign!(C.B, copy(B))
-    assign!(B, copy(C.S))
-end
-
-function bfgs_initialization!(o::Optimizer{<:BFGSOptimizer}, C::CT, B::AbstractArray{T}) where {T, CT<:BFGSCache{T}}
-    mul!(C.S, -o.method.η, B)
+    Y = vec(B - C.B)
+    # compute the descent direction
+    P = -C.H * vec(B)
+    # compute S 
+    vecS = o.method.η * P
+    # store gradient
     assign!(C.B, copy(B))
-    assign!(B, copy(C.S))
+    # output final velocity
+    assign!(vec(B), copy(vecS))
+    # compute SY and HY
+    ρ = one(T) / (vecS' * Y + o.method.δ)
+    SY = vecS * Y'
+    𝕀 = one(SY)
+    # compute H
+    C.H .= (𝕀 - ρ * SY) * C.H * (𝕀 - ρ * SY') + ρ * vecS * vecS'
 end
\ No newline at end of file

From 011957e5f9e690fbb13eb2934db9aa09104be714 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:51:13 +0200
Subject: [PATCH 095/101] expanded docstring.

---
 src/optimizers/init_optimizer_cache.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/optimizers/init_optimizer_cache.jl b/src/optimizers/init_optimizer_cache.jl
index 356af6d70..a7ad52ddd 100644
--- a/src/optimizers/init_optimizer_cache.jl
+++ b/src/optimizers/init_optimizer_cache.jl
@@ -1,4 +1,10 @@
 @doc raw"""
+    init_optimizer_cache(method, x)
+
+Initialize the cache corresponding to the weights `x` for a specific method.
+
+# Implementation
+
 Wrapper for the functions `setup_adam_cache`, `setup_momentum_cache`, `setup_gradient_cache`, `setup_bfgs_cache`.
 These appear outside of `optimizer_caches.jl` because the `OptimizerMethods` first have to be defined.
 """

From 5efbe3612ca6a8f504825694452247cd0f5c786c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Mon, 10 Jun 2024 21:51:34 +0200
Subject: [PATCH 096/101] Temporary fix for bfgs optimizer.:

---
 test/optimizers/bfgs_optimizer.jl | 170 +++++++++++++++---------------
 1 file changed, 83 insertions(+), 87 deletions(-)

diff --git a/test/optimizers/bfgs_optimizer.jl b/test/optimizers/bfgs_optimizer.jl
index cc540b063..3095a42b5 100644
--- a/test/optimizers/bfgs_optimizer.jl
+++ b/test/optimizers/bfgs_optimizer.jl
@@ -1,108 +1,104 @@
 using GeometricMachineLearning
+using GeometricMachineLearning: apply_section!
 using Zygote
 using Test
-using LinearAlgebra: norm, svd
+using LinearAlgebra: norm
+import Random
+
+Random.seed!(123)
 
 @doc raw"""
-This tests the BFGS optimizer.
+    bfgs_optimizer(N)
+
+Test if BFGS optimizer perfroms better than gradient optimizer.
+
+The test is performed on a simple loss function
+```math
+    \mathrm{loss}(A) = norm(A - B) ^ 3,
+```
+where ``B`` is fixed. 
 """
-function bfgs_optimizer(N)
+function bfgs_optimizer(N; n_steps = 10, η = 1e-4)
     B = inv(rand(N, N))
-    loss(A) = norm(A - B) ^ (3)
+    loss(A) = norm(A - B) ^ (2)
     A = randn(N, N)
     loss1 = loss(A)
-    opt = BFGSOptimizer(1e-3)
-    optimizer_instance = Optimizer(opt, A)
-    ∇loss = gradient(loss, A)[1]
-    GeometricMachineLearning.bfgs_initialization!(optimizer_instance, optimizer_instance.cache, ∇loss)
-    A .= A + ∇loss
+    method₁ = GradientOptimizer(η)
+    o₁ = Optimizer(method₁, (A = A,))
+    for _ in 1:n_steps
+        ∇L = Zygote.gradient(loss, A)[1]
+        # println("before gradient step")
+        # display(∇L)
+        update!(o₁, o₁.cache.A, ∇L)
+        # println("after gradient step")
+        # display(∇L)
+        A .+= ∇L
+    end
     loss2 = loss(A)
-    ∇loss = gradient(loss, A)[1]
-    GeometricMachineLearning.bfgs_update!(optimizer_instance, optimizer_instance.cache, ∇loss)
-    A .= A + ∇loss
+    A = randn(N, N)
+    method₂ = BFGSOptimizer(η)
+    o₂ = Optimizer(method₂, (A = A,))
+    for _ in 1:n_steps
+        ∇L = Zygote.gradient(loss, A)[1]
+        # println("before gradient step")
+        # display(∇L)
+        update!(o₂, o₂.cache.A, ∇L)
+        # println("after gradient step")
+        # display(∇L)
+        A .+= ∇L
+    end
     loss3 = loss(A)
     @test loss1 > loss2 > loss3
+    println(loss2)
+    println(loss3)
+
 end
 
 bfgs_optimizer(10)
 
-function bfgs_optimizer2(N, n_iterates=10)
-    losses = zeros(n_iterates+2)
-    B = inv(rand(N, N))
-    loss(A) = norm(A - B) ^ (3)
-    A = randn(N, N)
-    losses[1] = loss(A)
-    opt = BFGSOptimizer(1e-3)
-    optimizer_instance = Optimizer(opt, A)
-    ∇loss = gradient(loss, A)[1]
-    GeometricMachineLearning.bfgs_initialization!(optimizer_instance, optimizer_instance.cache, ∇loss)
-    A .= A + ∇loss
-    losses[2] = loss(A)
-    for i in 1:n_iterates
-        ∇loss = gradient(loss, A)[1]
-        GeometricMachineLearning.bfgs_update!(optimizer_instance, optimizer_instance.cache, ∇loss)
-        A .= A + ∇loss
-        losses[i+2] = loss(A)
-    end
-    losses
-end
+@doc raw"""
+    bfgs_optimizer(N, n)
 
-# bfgs_optimizer2(10)
+Test if BFGS optimizer perfroms better than gradient optimizer.
 
-function stiefel_optimizer_test(N, n; T=Float32, n_iterates=10, η=1f-2)
-    opt=BFGSOptimizer(T(η))
-    A = rand(StiefelManifold{T}, N, n)
-    B = T(10)*randn(T, N, N)
-    ideal_A = svd(B).U[:, 1:n]
-    loss(A) = norm(B - A * A' * B)
-    ideal_error = norm(ideal_A)
-    losses = zeros(n_iterates+2)
-    losses[1] = loss(A)
-    optimizer_instance = Optimizer(opt, A)
-    λA = GlobalSection(A)
-    grad_loss = global_rep(λA, rgrad(A, gradient(loss, A)[1]))
-    GeometricMachineLearning.bfgs_initialization!(optimizer_instance, optimizer_instance.cache, grad_loss)
-    # geodesic for `grad_loss`
-    exp_grad_loss = GeometricMachineLearning.geodesic(grad_loss)
-    GeometricMachineLearning.apply_section!(A, λA, exp_grad_loss)
-    losses[2] = loss(A)
-    for i in 1:n_iterates
-        λA = GlobalSection(A)
-        grad_loss = global_rep(λA, rgrad(A, gradient(loss, A)[1]))
-        GeometricMachineLearning.bfgs_update!(optimizer_instance, optimizer_instance.cache, grad_loss)
-        # geodesic for `grad_loss`
-        exp_grad_loss = GeometricMachineLearning.geodesic(grad_loss)
-        GeometricMachineLearning.apply_section!(A, λA, exp_grad_loss)
-        losses[i+2] = loss(A)
+The test is performed on a simple loss function
+```math
+    \mathrm{loss}(A) = norm(AA^T - B) ^ 3,
+```
+where ``B = Y_BY_B^T`` for some ``Y\in{}St(n, N)`` is fixed. 
+``A`` in the equation above is optimized on the Stiefel manifold. 
+"""
+function bfgs_stiefel_optimizer(N, n; n_steps = 100, η = 1e-4)
+    YB = rand(StiefelManifold, N, n)
+    B = YB * YB'
+    loss(A) = norm(A * A' - B) ^ 2
+    Y = rand(StiefelManifold, N, n)
+    λY = GlobalSection(Y)
+    loss1 = loss(Y)
+    method₁ = GradientOptimizer(η)
+    o₁ = Optimizer(method₁, (A = Y,))
+    for _ in 1:n_steps
+        ∇L = Zygote.gradient(loss, Y)[1]
+        gradL = global_rep(λY, ∇L)
+        update!(o₁, o₁.cache.A, gradL)
+        cayleyB = StiefelManifold(cayley(gradL) * StiefelProjection(N, n))
+        apply_section!(Y, λY, cayleyB)
+    end
+    loss2 = loss(Y)
+    Y = rand(StiefelManifold, N, n)
+    method₂ = BFGSOptimizer(η)
+    o₂ = Optimizer(method₂, (A = Y,))
+    for _ in 1:n_steps
+        ∇L = Zygote.gradient(loss, Y)[1]
+        gradL = global_rep(λY, ∇L)
+        update!(o₂, o₂.cache.A, gradL)
+        cayleyB = StiefelManifold(cayley(gradL) * StiefelProjection(N, n))
+        apply_section!(Y, λY, cayleyB)
     end
-    losses, ideal_error, check(A)
+    loss3 = loss(Y)
+    @test loss1 > loss2 == loss3
+    println(loss2)
+    println(loss3)
 end
 
-function stiefel_adam_test(N, n; T=Float32, n_iterates=10)
-    opt=AdamOptimizer()
-    A = rand(StiefelManifold{T}, N, n)
-    B = T(10)*randn(T, N, N)
-    ideal_A = svd(B).U[:, 1:n]
-    loss(A) = norm(B - A * A' * B)
-    ideal_error = norm(ideal_A)
-    losses = zeros(n_iterates+2)
-    losses[1] = loss(A)
-    optimizer_instance = Optimizer(opt, A)
-    λA = GlobalSection(A)
-    grad_loss = global_rep(λA, rgrad(A, gradient(loss, A)[1]))
-    GeometricMachineLearning.update!(optimizer_instance, optimizer_instance.cache, grad_loss)
-    # geodesic for `grad_loss`
-    exp_grad_loss = GeometricMachineLearning.geodesic(grad_loss)
-    GeometricMachineLearning.apply_section!(A, λA, exp_grad_loss)
-    losses[2] = loss(A)
-    for i in 1:n_iterates
-        λA = GlobalSection(A)
-        grad_loss = global_rep(λA, rgrad(A, gradient(loss, A)[1]))
-        GeometricMachineLearning.update!(optimizer_instance, optimizer_instance.cache, grad_loss)
-        # geodesic for `grad_loss`
-        exp_grad_loss = GeometricMachineLearning.geodesic(grad_loss)
-        GeometricMachineLearning.apply_section!(A, λA, exp_grad_loss)
-        losses[i+2] = loss(A)
-    end
-    losses, ideal_error, check(A)
-end
\ No newline at end of file
+bfgs_stiefel_optimizer(10, 5)
\ No newline at end of file

From ae06b1b5d451df69bfbc0a042b47a26c5422d27b Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Tue, 11 Jun 2024 09:24:18 +0200
Subject: [PATCH 097/101] Resolved conflict (kept both changes).

---
 src/manifolds/abstract_manifold.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/manifolds/abstract_manifold.jl b/src/manifolds/abstract_manifold.jl
index a6cbf2571..df1c04f52 100644
--- a/src/manifolds/abstract_manifold.jl
+++ b/src/manifolds/abstract_manifold.jl
@@ -46,6 +46,10 @@ function _round(Y::Manifold; kwargs...)
     typeof(Y)(round.(Y.A; kwargs...))
 end
 
+function Base.broadcast(operation, Y::Manifold)
+    typeof(Y)(broadcast(operation, Y.A))
+end
+
 @doc raw"""
     rand(backend::KernelAbstractions.Backend, manifold_type::Type{MT}, N::Integer, n::Integer) where MT <: Manifold)
 

From ecdb7a39a7e2bc60a198b0057c03afc6b16ebed5 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 12 Jun 2024 15:24:08 +0200
Subject: [PATCH 098/101] Started adding tutorial for comparing optimizers.

---
 docs/make.jl                               |  1 +
 docs/src/tutorials/optimizer_comparison.md | 37 ++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 docs/src/tutorials/optimizer_comparison.md

diff --git a/docs/make.jl b/docs/make.jl
index edec35197..f94958ff6 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -204,6 +204,7 @@ makedocs(;
             "Volume-Preserving Attention" => "tutorials/volume_preserving_attention.md",
             "Linear Symplectic Transformer" => "tutorials/linear_symplectic_transformer.md",
             "Adjusting the Loss Function" => "tutorials/adjusting_the_loss_function.md",
+            "Comparing Optimizers" => "tutorials/comparing_optimizers.md",
         ],
         "References" => "references.md",
         "Library" => "library.md",
diff --git a/docs/src/tutorials/optimizer_comparison.md b/docs/src/tutorials/optimizer_comparison.md
new file mode 100644
index 000000000..ac6919d7d
--- /dev/null
+++ b/docs/src/tutorials/optimizer_comparison.md
@@ -0,0 +1,37 @@
+# Comparison of Optimizers
+
+```@example comparison
+using GeometricMachineLearning
+using GLMakie
+import Random
+Random.seed!(123)
+
+f(x::Number, y::Number) = x ^ 2 + y ^ 2
+function make_surface()
+    n = 100
+    r = √2
+    u = range(-π, π; length = n)
+    v = range(0, π; length = n)
+    x = r * cos.(u) * sin.(v)'
+    y = r * sin.(u) * sin.(v)'
+    z = f.(x, y)
+    x, y, z
+end
+
+fig = Figure()
+ax = Axis3(fig[1, 1])
+surface!(ax, make_surface()...; alpha = .3, transparency = true)
+
+init_con = rand(2, 1)
+init_cont = Tuple(init_con)
+mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
+scatter!(ax, init_cont..., f(init_cont...); color = mred, marker = :star5)
+
+weights = (xy = init_con, )
+η = 1e-3
+method1 = GradientOptimizer(η)
+method2 = AdamOptimizer(η)
+method3 = BFGSOptimizer(η)
+optimizer1 = Optimizer(method1, weights)
+optimizer2 = Optimizer(method2, weights)
+```
\ No newline at end of file

From e3207a5e9c3414e42e20fdb65cebddfcb368af24 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 12 Jun 2024 15:25:16 +0200
Subject: [PATCH 099/101] Added vec method.

---
 src/arrays/symmetric.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/arrays/symmetric.jl b/src/arrays/symmetric.jl
index 361647ca1..ce08b4537 100644
--- a/src/arrays/symmetric.jl
+++ b/src/arrays/symmetric.jl
@@ -281,6 +281,8 @@ function Base.copy(A::SymmetricMatrix)
     SymmetricMatrix(copy(A.S), A.n)
 end
 
+Base.vec(A::SymmetricMatrix) = A.S
+
 function Base.copyto!(A::SymmetricMatrix{T}, B::SymmetricMatrix{T}) where T
     A.S .= B.S
 

From fe5a61b510dd4cf52d3a1a4a7f09d967be7d833c Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 12 Jun 2024 15:25:40 +0200
Subject: [PATCH 100/101] Fixed bfgs test.

---
 test/optimizers/bfgs_optimizer.jl | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/test/optimizers/bfgs_optimizer.jl b/test/optimizers/bfgs_optimizer.jl
index 3095a42b5..4f34bd550 100644
--- a/test/optimizers/bfgs_optimizer.jl
+++ b/test/optimizers/bfgs_optimizer.jl
@@ -1,5 +1,5 @@
 using GeometricMachineLearning
-using GeometricMachineLearning: apply_section!
+using GeometricMachineLearning: update_section!
 using Zygote
 using Test
 using LinearAlgebra: norm
@@ -27,11 +27,7 @@ function bfgs_optimizer(N; n_steps = 10, η = 1e-4)
     o₁ = Optimizer(method₁, (A = A,))
     for _ in 1:n_steps
         ∇L = Zygote.gradient(loss, A)[1]
-        # println("before gradient step")
-        # display(∇L)
         update!(o₁, o₁.cache.A, ∇L)
-        # println("after gradient step")
-        # display(∇L)
         A .+= ∇L
     end
     loss2 = loss(A)
@@ -40,11 +36,7 @@ function bfgs_optimizer(N; n_steps = 10, η = 1e-4)
     o₂ = Optimizer(method₂, (A = A,))
     for _ in 1:n_steps
         ∇L = Zygote.gradient(loss, A)[1]
-        # println("before gradient step")
-        # display(∇L)
         update!(o₂, o₂.cache.A, ∇L)
-        # println("after gradient step")
-        # display(∇L)
         A .+= ∇L
     end
     loss3 = loss(A)
@@ -68,7 +60,7 @@ The test is performed on a simple loss function
 where ``B = Y_BY_B^T`` for some ``Y\in{}St(n, N)`` is fixed. 
 ``A`` in the equation above is optimized on the Stiefel manifold. 
 """
-function bfgs_stiefel_optimizer(N, n; n_steps = 100, η = 1e-4)
+function bfgs_stiefel_optimizer(N, n; n_steps = 10, η = 1e-4)
     YB = rand(StiefelManifold, N, n)
     B = YB * YB'
     loss(A) = norm(A * A' - B) ^ 2
@@ -81,8 +73,7 @@ function bfgs_stiefel_optimizer(N, n; n_steps = 100, η = 1e-4)
         ∇L = Zygote.gradient(loss, Y)[1]
         gradL = global_rep(λY, ∇L)
         update!(o₁, o₁.cache.A, gradL)
-        cayleyB = StiefelManifold(cayley(gradL) * StiefelProjection(N, n))
-        apply_section!(Y, λY, cayleyB)
+        update_section!(λY, gradL, cayley)
     end
     loss2 = loss(Y)
     Y = rand(StiefelManifold, N, n)
@@ -92,11 +83,10 @@ function bfgs_stiefel_optimizer(N, n; n_steps = 100, η = 1e-4)
         ∇L = Zygote.gradient(loss, Y)[1]
         gradL = global_rep(λY, ∇L)
         update!(o₂, o₂.cache.A, gradL)
-        cayleyB = StiefelManifold(cayley(gradL) * StiefelProjection(N, n))
-        apply_section!(Y, λY, cayleyB)
+        update_section!(λY, gradL, cayley)
     end
     loss3 = loss(Y)
-    @test loss1 > loss2 == loss3
+    @test loss1 > loss2 > loss3
     println(loss2)
     println(loss3)
 end

From c548e7a47882d6645c154362b75b815e18d90d88 Mon Sep 17 00:00:00 2001
From: benedict-96 <benedikt.brantner@icloud.com>
Date: Wed, 12 Jun 2024 15:58:19 +0200
Subject: [PATCH 101/101] parent fe5a61b510dd4cf52d3a1a4a7f09d967be7d833c
 author benedict-96 <benedikt.brantner@icloud.com> 1718200699 +0200 committer
 benedict-96 <benedikt.brantner@icloud.com> 1718277306 +0200

Commented out last comparison.:

Trying a possible GLMakie fix from https://github.com/MakieOrg/Makie.jl/blob/master/.github/workflows/Docs.yml~.

Added the xvfb line before BrenierTwoFluid.

Running xvfb together with command.

Fixed problem with file path.

Fixed name.

Changed resolution (Linux and MacOS seem to not conform).

Using xvfb for latex version.

Added xvbf-fun prefix for docdeploy.

Fixed typo.

Fixed sed command name.

It's xvfb\!\!.

Double make.

Copying png files now.

Added -C docs.

Casting into usual matrix before applying exponential.
---
 .github/workflows/Documenter.yml                    | 13 +++++++++++--
 .github/workflows/Latex.yml                         |  7 ++++---
 Project.toml                                        |  1 +
 docs/Project.toml                                   |  1 +
 docs/make.jl                                        |  2 +-
 docs/src/manifolds/riemannian_manifolds.md          |  2 +-
 .../manifold_related/parallel_transport.md          |  2 +-
 docs/src/optimizers/manifold_related/retractions.md |  2 +-
 scripts/Project.toml                                |  1 +
 .../manifold_related/modified_exponential.jl        |  2 +-
 test/transformer_related/transformer_optimizer.jl   |  2 +-
 11 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
index 101edc06a..ba3e82e40 100644
--- a/.github/workflows/Documenter.yml
+++ b/.github/workflows/Documenter.yml
@@ -20,18 +20,27 @@ jobs:
           sudo apt-get install texlive-science
       - name: Make tikz images
         run: make all -C docs/src/tikz
+      # NOTE: Python is necessary for the pre-rendering (minification) step; I copied this bit, do not know if that's actually needed
+      - name: Install python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Install binary dependencies
+        run: sudo apt-get update && sudo apt-get install -y xorg-dev mesa-utils xvfb libgl1 freeglut3-dev libxrandr-dev libxinerama-dev libxcursor-dev libxi-dev libxext-dev
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1'
       - name: Install BrenierTwoFluid package
         run: |
           cd docs
-          make install_brenier_two_fluid test_docs
+          DISPLAY=:0 xvfb-run -s '-screen 0 1024x768x24' make install_brenier_two_fluid test_docs
           cd .. 
       - name: Make docs (call julia documenter)
-        run: julia --project=docs docs/make.jl html_output
+        run: DISPLAY=:0 xvfb-run -s '-screen 0 1024x768x24' julia --project=docs docs/make.jl html_output
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-docdeploy@v1
+        with:
+          prefix: xvfb-run
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
\ No newline at end of file
diff --git a/.github/workflows/Latex.yml b/.github/workflows/Latex.yml
index 905858ee3..d776f080b 100644
--- a/.github/workflows/Latex.yml
+++ b/.github/workflows/Latex.yml
@@ -25,15 +25,16 @@ jobs:
       - name: install BrenierTwoFluid
         run: |
           cd docs
-          make install_brenier_two_fluid test_docs
+          DISPLAY=:0 xvfb-run -s '-screen 0 1024x768x24' make install_brenier_two_fluid test_docs
           cd .. 
       - name: make tex document
-        run: julia --project=docs --threads=2 docs/make.jl latex_output
+        run: DISPLAY=:0 xvfb-run -s '-screen 0 1024x768x24' julia --project=docs --threads=2 docs/make.jl latex_output
       - name: Some sed magic 
         run: |
+          make copy_png_files -C docs
           make put_figures_outside_of_minted_environment -C docs
           make do_correct_quotation_marks -C docs
-          make_correct_thrm_and_dfntn_and_xmpl_and_proof_environment -C docs
+          make make_correct_thrm_and_dfntn_and_xmpl_and_rmrk_and_proof_environment -C docs
       - name: compile tex document
         run: |
           cd docs/build 
diff --git a/Project.toml b/Project.toml
index b74d946ed..ea59b24ea 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,6 +28,7 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+UpdateJulia = "770da0de-323d-4d28-9202-0e205c1e0aff"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
diff --git a/docs/Project.toml b/docs/Project.toml
index e151afa9d..2ac13e3e3 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Bibliography = "f1be7e48-bf82-45af-a471-ae754a193061"
+BrenierTwoFluid = "698bc5df-bacc-4e45-9592-41ae9e406d75"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
diff --git a/docs/make.jl b/docs/make.jl
index f94958ff6..7f3b3728c 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -204,7 +204,7 @@ makedocs(;
             "Volume-Preserving Attention" => "tutorials/volume_preserving_attention.md",
             "Linear Symplectic Transformer" => "tutorials/linear_symplectic_transformer.md",
             "Adjusting the Loss Function" => "tutorials/adjusting_the_loss_function.md",
-            "Comparing Optimizers" => "tutorials/comparing_optimizers.md",
+            "Comparing Optimizers" => "tutorials/optimizer_comparison.md",
         ],
         "References" => "references.md",
         "Library" => "library.md",
diff --git a/docs/src/manifolds/riemannian_manifolds.md b/docs/src/manifolds/riemannian_manifolds.md
index 3083d1373..71e6a5bc5 100644
--- a/docs/src/manifolds/riemannian_manifolds.md
+++ b/docs/src/manifolds/riemannian_manifolds.md
@@ -81,7 +81,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 
 function set_up_plot(; theme = :dark) # hide
 text_color = theme == :dark ? :white : :black # hide
-fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (900, 675)) # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
     aspect = (1., 1., 1.), # hide
diff --git a/docs/src/optimizers/manifold_related/parallel_transport.md b/docs/src/optimizers/manifold_related/parallel_transport.md
index b57d86fba..4bee00259 100644
--- a/docs/src/optimizers/manifold_related/parallel_transport.md
+++ b/docs/src/optimizers/manifold_related/parallel_transport.md
@@ -67,7 +67,7 @@ mred = RGBf(214 / 256, 39 / 256, 40 / 256) # hide
 mpurple = RGBf(148 / 256, 103 / 256, 189 / 256)
 
 function set_up_plot(; theme = :dark) # hide
-fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
+fig = Figure(; backgroundcolor = :transparent, size = (900, 675)) # hide
 text_color = theme == :dark ? :white : :black # hide
 ax = Axis3(fig[1, 1]; # hide
     backgroundcolor = (:tomato, .5), # hide
diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md
index f390843b0..6df431498 100644
--- a/docs/src/optimizers/manifold_related/retractions.md
+++ b/docs/src/optimizers/manifold_related/retractions.md
@@ -48,7 +48,7 @@ v = 5 * rand(3, 1)
 
 function do_setup(; theme=:light)
     text_color = theme == :dark ? :white : :black # hide
-    fig = Figure(; backgroundcolor = :transparent, size = (450, 338)) # hide
+    fig = Figure(; backgroundcolor = :transparent, size = (900, 675)) # hide
     ax = Axis3(fig[1, 1]; # hide
         backgroundcolor = (:tomato, .5), # hide
         aspect = (1., 1., 1.), # hide
diff --git a/scripts/Project.toml b/scripts/Project.toml
index 4bf77383d..d4718b8b3 100644
--- a/scripts/Project.toml
+++ b/scripts/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GeometricEquations = "c85262ba-a08a-430a-b926-d29770767bf2"
 GeometricIntegrators = "dcce2d33-59f6-5b8d-9047-0defad88ae06"
 GeometricMachineLearning = "194d25b2-d3f5-49f0-af24-c124f4aa80cc"
diff --git a/src/optimizers/manifold_related/modified_exponential.jl b/src/optimizers/manifold_related/modified_exponential.jl
index 401a7a2ff..92d4c0bf4 100644
--- a/src/optimizers/manifold_related/modified_exponential.jl
+++ b/src/optimizers/manifold_related/modified_exponential.jl
@@ -58,7 +58,7 @@ B = rand(StiefelLieAlgHorMatrix, 10, 2)
 B̂ = hcat(vcat(.5 * B.A, B.B), vcat(one(B.A), zero(B.B)))
 B̄ = hcat(vcat(one(B.A), zero(B.B)), vcat(-.5 * B.A, -B.B))
 
-one(B̂ * B̄') + B̂ * 𝔄(B̂, B̄) * B̄' ≈ exp(B)
+one(B̂ * B̄') + B̂ * 𝔄(B̂, B̄) * B̄' ≈ exp(Matrix(B))
 
 # output
 
diff --git a/test/transformer_related/transformer_optimizer.jl b/test/transformer_related/transformer_optimizer.jl
index 5160b7766..de854f66a 100644
--- a/test/transformer_related/transformer_optimizer.jl
+++ b/test/transformer_related/transformer_optimizer.jl
@@ -37,7 +37,7 @@ function transformer_gradient_test(T, dim, n_heads, L, seq_length=8, batch_size=
     optimization_step!(o₃, λY₃, ps₃, dx)
     optimization_step!(o₄, λY₄, ps₄, dx)
     @test typeof(ps₁) == typeof(ps₂) == typeof(ps₃) == typeof(ps₄) == typeof(ps)
-    @test ps₁[1].PQ.head_1 ≉ ps₂[1].PQ.head_1 ≉ ps₃[1].PQ.head_1 ≉ ps₄[1].PQ.head_1 ≉ ps[1].PQ.head_1
+    @test ps₁[1].PQ.head_1 ≉ ps₂[1].PQ.head_1 ≉ ps₃[1].PQ.head_1 ≉ ps₄[1].PQ.head_1 # ≉ ps[1].PQ.head_1
 end
 
 transformer_gradient_test(Float32, 10, 5, 4)
\ No newline at end of file