diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 000000000..1096178d9 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,68 @@ +steps: + - label: ":julia: Julia {{matrix.julia}} + CUDA GPU" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.julia}}" + - JuliaCI/julia-test#v1: + test_args: "--quickfail" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - ext + agents: + queue: "juliagpu" + cuda: "*" + env: + GROUP: "CUDA" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 240 + matrix: + setup: + julia: + - "1" + - "1.6" + - "nightly" + adjustments: + - with: + julia: "1.6" + soft_fail: true + - with: + julia: "nightly" + soft_fail: true + - label: "Documentation" + plugins: + - JuliaCI/julia#v1: + version: "1" + - JuliaCI/julia-coverage#v1: + codecov: true + dirs: + - src + - ext + command: | + julia --project --code-coverage=user --color=yes --threads=3 -e ' + println("--- :julia: Instantiating project") + using Pkg + Pkg.instantiate() + Pkg.activate("docs") + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + println("+++ :julia: Building documentation") + + Pkg.activate("docs") + include("docs/tutorials.jl") + Pkg.activate("docs") + include("docs/make.jl")' + agents: + queue: "juliagpu" + cuda: "*" + env: + DATADEPS_ALWAYS_ACCEPT: true + JULIA_DEBUG: "Documenter" + GKSwstype: "100" # https://discourse.julialang.org/t/generation-of-documentation-fails-qt-qpa-xcb-could-not-connect-to-display/60988 + if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft + timeout_in_minutes: 240 + +env: + SECRET_CODECOV_TOKEN: 
"jQ0BMTQgyZx7QGyU0Q2Ec7qB9mtE2q/tDu0FsfxvEG7/zOAGvXkyXrzIFFOQxvDoFcP+K2+hYZKMxicYdNqzr5wcxu505aNGN2GM3wyegAr+hO6q12bCFYx6qXzU9FLCCdeqINqn9gUSSOlGtWNFrbAlrTyz/D4Yo66TqBDzvaLL63FMnhCLaXW/zJt3hNuEAJaPY2O6Ze1rX2WZ3Y+i+s3uQ8aLImtoCJhPe8CRx+OhuYiTzGhynFfGntZ0738/1RN4gNM0S/hTC4gLE7XMVBanJpGh32rFaiDwW4zAyXKBrDkL3QA3MS1RvLTJxGJ085S16hCk0C4ddAhZCvIM9Q==;U2FsdGVkX1+bXdFeKMs5G79catOCyby2n07A2fg0FjVAvrjQLZ0yfvDS4paJiFikLkodho0khz2YALKb2Y0K6w==" + SECRET_DOCUMENTER_KEY: "iRC4P/r5o9pARB670eK9jPlKQKgkTMDAyvp2GbLG8WwLuT8T1VcWx/o4+ofGlzbTh5Z+LuFgPXfgqkjGuoWLcocHNm78xQMNMywB4rcLB2shqp8xG2vhglgnTBBS4EiyPAtVqGyi5AKmfF95PfkJvnI0Lqg5P/RWQvNGywLAR0Ikgr/lqocm2CvkFGbpMzpGxGvj76JYOusVeKvGAp698TXqPabSZR2oZQLfYnEZnaO8ivkqvMGQSXfgzoIMjCOrN1rSa84SWeI9BDeBslzDHwaYGlvjpfCyviiLtKj4t5Acl1gVE0qxxZxWuALIU6z+C1W8TbW7ZDCBUFs6UTIT+Q==;U2FsdGVkX1+/HSgg1skLszz835vSO6mEtXMhG62ohQQUc5opdo7kEIAG2wCoJPQrqGyaF9kKDVvrN5G2MdjUyaLBYlv90RzXhjTiMNFdgI3M4K500xKq3itY/aEL7hUSMRKxTos8u4xhdbRboY4rPcqgtCJ2LHEjNxmml/NfEo/8lk291rGoEYQLTvKP9cuo4enmEVVRhqmabBzt1MDz0m4c8RufJWW2Ni4osaKRkYPjl/ijJ38wvRUZIiyCX7uofh+3iCKWn0111q5xFhn256Pm79Cx2ZP+yTp9sMsVNMJZ3UJ5r18F3H+zFHWWQSoiWpHn2WNB/2VUEyt0Lp1LnogKru96P2oYkXi6kqrA+qlLISUUU7R7ggJU0IRS6MjSGDyVzlaZG8m+RmY0bmQKrDwSeq1JMGkBpjwPY1o4yOnFRB7Rj1bzToLtd2IFSa8x0a2dUSyL5pBlyWklzZCxPp05R53RNSOi2KfhNfdZU2H7xEj5+z2aV5OidzowXIyYH8FlusMdk3NAOsvTbmBGiwvN4Zub9Exli06ZwARu/oJHLRh+hgOErIJ7DoX6nPrAtofSy6Etydpt+c4HkVZtGPWFSTMNWIGNx2NB1IfveOTU60H5emQ7zow5grXz4VTczqvCIh2hoQdSR4Oplr6+tDDLhtcGGHchHt473o2ygQ1m1tg7oSvMN7jmkUV1N6GniQofmlbr8d5LK4i/QtfC5GHCKIg3ohRlDvuvvKzvVWofgHX3NhXFTKK/CWAIp76iOaCWJcI562SpKyn+pFqYKpatJ42WfF3VbNpJYVMYMai5BwAE2RyZ6FhHbsaHq/NXO/dRJwHeDm4Pc/LFlGFdzpdbuf+w2DoePc56PlNmKsLNlZVlwbWcExKttI8nz3Th3aHNNtbIbD9awf1RdDspudQrTPWkyEopDVm7TkOj/J891U5p24PF5dasIJR19Tqpic3LVJuBXYRbL/Z79VRjeE3wBGLTDdhzJMA8TrS+yMSCF80bIw/F44o4WbA3Ya425mph9MIt/a137osRKATYqbustmVW/LfIyVhuHCOCRQsqTyFU+ff6Tp0EE2i1du90wosr+UutXiubYphCmuKkZONPbiXjpW1CAi40iAwxfgOVqAl13y4FlUp4EiGS7hPBUbvvEXMqT3ssfL+mlideH/v08PQCRcyG03zcCjCTm
jXCggqHd+eEXhnsNZ4PFKCKiN+znR5SW+/p+kJTaBrX2e/kMU6kzjwb4NyNmZie0hHSneVtwJ1FuXJk/Zph4quv5KugCCx21xb5pePqxfKRW5jtW6r2Rc7OSNN4BHjwAcj8fOVV+12Ak7//o8mRh0aveYfoEvjCdaI8OPfjduDGfmzPUvXiqV9kGpovdlDUATyoVa3l1CowJ5r8KDOD6Ps89OG7TV2c7Wzxq2FQVjMFXxv/4wMZR1F/0zyH+ofPLVZjK3039z35GD4uoOW9Uc7WSr4FbxxuCDwOXWgstuk3rk6ASZFSe7RIwE/Y16d/aqzI+LG8pHqaEdhg6o6Y6JxBYNQo/JoglUOHwD+N5g5n9vfBNzf0xTlE/r0yjO3LCHyWzCnWr3QdKgzm6EDyL8GO+yQIbtXtw6lRQB/UEZ+ayt175r08Yhey95IsPwLVDFRRlG6pYwmzTlQOEwvqDI8SDMWboU+jp6a5jrbaAmqiIkaoiIzrV1QDp1x+Sqj0veqN+RtcpXLawJevz8dm76H+Mmp1br61nwvGcBaOKukICVj3iLeeu5tV5NoEJznWPwveHrcarZtKvOOeJbydmNAz286i0F1ocX337dt17jIkRv9sHbfqAVapob+eT7F3N/UY99GWGDVbXzaruQwsuPPR6MbLolG6buHQaKX3OZ/zJqGWfEAHw5yJKoKNe8aSgY2DsoITqPlbNRQQmOIMuF8ffD8L1stD/P5Ohth5Nql2W+l6y87/nqxkJ9y4FFS4QzrMrl9ztugfsRoYyeSWRydLUHlTCv155VsGAxjCMBQg1rP99Smfd02EbCFlWlypIw/zem0LZ1zVuz/Wjb03n+dzi2GIKRlTrt6YMrGGAcKI+3Pf1D0rsDhXNkdFUjOeofUkDbBr/splYCKLucDHFVdN88XyaQoj2fBymNJ4BqvK64TVOLwPGAQvh/rHZ5PkJR3lMI4fg+Kxdl9/5xDjkD9aV+yRvfqVGodNW/qofq34nrdb3co1tZ4BxtSANKdJg3Fv6U0I4DOMVsJTeOn/918M31rif0rKAwnHAkeyQVbZyEsFoqxvE8gUFs1zTRwZJWlmY0xnuVcM8pOh6hULeYGiF57ZlbvymygYqObe58YgrChRnF4NhKIIYzuz7mOSKRXqF3Cr0LNYHcktUH9wrqISxiHbaUQceYZ1D0q8UfiayeK9yppMkltcDUL9M93xjTGJK8pVzARXn6ETuEsNTtLvbU/KMDY7bnVc7n08suLCk1YeJB/sn0wuTbPt+27NeYIG1YXBEE0dsgJW4z64489h71v4xws856gFOHZx0L/nkW7l328HA3jltbgJFl52mQHAJwUZrt5sJef/k7gsTdX1zQtjKN8lFjo4qpvJUpenmO9nT+Wty5cjohlETBos8CdSqj4SjEu7/UhDt52evt33EayoWJ8TjKd4VRFYCXnM6eGnSMDqUU5f7DxVjrwHnT26jtq9ijKTiAxls7fYjN8TGT/S3CHZZAK1u5gSbWfkFOcE+mioboNwDvuvysjL6de+bsc7r35w4hLFnPmKemcde4pNQfEnuelBFJqwYZbcAkhN8AmtqIWPXBw9n3eUx/TJgMFEIoB/frNDRbB0WJKdBkjdE1NVvAUl3jDnZbWjG6rqE+6UvyGqKBpd0FRYAfg3ss3hVB70uluULKUBVazlNIQlqX+qYEMBXaDIkxcftre8KYebQyJnxiOB5V+eELvm6L28bK4Xh2tpXzJL7aDlQnL8dRNvQdZgDL62EXYhrc3mz0I/p7br3KMcnei/LaPRAgcsW7WKLwzE5id6JnpOJj4VXdkX7IUB4xQjDRsGKxhjbklMVFA8g/801khNlwzU/IoXsHBgTs7yZoFX/oo4Jyp514hwqPlvJEgci0OHiSA6Mx3le2nUh0SQH+AzFJ2vi7Bn1a4psiuqd+vJJ1iuNw5CBCZlV+GO8sG93BBGnLzZDoRvkIMbzwESFP3JYZ/lKs29CB2Adobl9
YbwP3he0I9cD0A/RPC70gzTdVEfL6T4iPUhBr1Bn3YlUPeC2QvCTbpKkxDsfzchuq/y0xlmL4E7Rdb+4TSMlViXfnc6aoD9vvPMWLJFF2qrxRLKhUTse5V6RoE+EVmHSiX0Vd7sd/bYp7asOC0b1xL+zjfJ5DSrtMA/P8L1p+CoLNXgVfgzCB3sCa+GLSLS2INsL1Qtnfkl8IGaMDeV+VAyHjY0HCj0l1X99f/RzD6TYrZAkLS8h1EM/JjomglhVG9/HTKS20BBJeos5ifrVd38rhONJy0HCP28pn4rCIyIE4bNG+1tEsHAg4FDYgh/OYuBsaGYgha9TGV5lGIxmVCECq3IPpkPN1CsLqv3KuDvNeH6XOOAzVtFj4VoIV6QgRLP8+94ZiiEDaPQxQ7BZoqrqFYrxWHDtEuon46VtQ3Nfq/1Rq/HvszJv6JE77w7qvKlxG9sXgxzCDRqNrG83cwY2hpDBr8U0hPMrEx977Weja1aG/rG6uirNBcY5qAAOLDo+9RvV1xqvWFF8SkT97tzNUHbzw8tuUlCT9m4rshCG+jBw59rpUZwW+eR1ih9qU7Nyr3oNgi/zmkORF1duym8VSfW5dxtRBIqxxM0oSWoHti+HSd0VLdHw8jRpbQddMBr1sjD1jIgp3w2dU4oEthzStKCPY2/lAWBm+1Es1okGhEM3I939DRcYOjfJnTCtJLJ9DTKycVDMerXvHnCgImZ0Oh4mtLF+63hn+9wUc56owFeNqs+NJHqmBBFX2uNr/Rj9mzYkRRPsYYSyCB7jIS+Z8Zall6W3dwLcsE3uw/oPKx5bJDAhnp7kZgzLC0zlS2D0ZcNZuW2uUtwhZJM6OOyV+FUFgizmpIQAQ8Nm6n/1yk0asB4jZFf221a9ZmzvUfWKmmIR7OxX3qBH9x2uMMhemv9LZdEHMcjTeIXRYciMLWUNeWagYhDgV1cRBGCDTh2EhHvYX7ZXfpsHjLOR+sAEr7uR3siitf/mRkiLfT2YBgTACKKoj05UuC8aknEV4T5bWiye+gKGioml5G/fWYHyHow37g6D84n0cBTWmI0oPlg+rqpeRLOeYaTeCXOtM/7M1FHuGvzmBnag2vhKY2tpjVrg2nI3p4SRlzTyoQkyMfRXN87v5nAheVcLgrYtkv9aX7R6VMZ1UIsxn62ZHFa2IR6skB/xw7RRuJY5r5FIWs1LqIQDaon5L4C4v9rnBxMYoUM" \ No newline at end of file diff --git a/docs/make.jl b/docs/make.jl index 5d262ff3d..0b9b97dcb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -30,12 +30,26 @@ makedocs(; "Horizontal Lift" => "optimizers/manifold_related/horizontal_lift.md", "Global Sections" => "optimizers/manifold_related/global_sections.md", "Retractions" => "optimizers/manifold_related/retractions.md", + "Geodesic Retraction" => "optimizers/manifold_related/geodesic.md", + "Cayley Retraction" => "optimizers/manifold_related/cayley.md", "Adam Optimizer" => "optimizers/adam_optimizer.md", ], "Special Neural Network Layers" => [ "Attention" => "layers/attention_layer.md", "Multihead Attention" => "layers/multihead_attention_layer.md", ], + "Data Loader" =>[ + "Routines" => 
"data_loader/data_loader.md", + "Snapshot matrix" => "data_loader/snapshot_matrix.md", + ], + "Reduced Order Modelling" =>[ + "POD and Autoencoders" => "reduced_order_modeling/autoencoder.md", + "PSD and Symplectic Autoencoders" => "reduced_order_modeling/symplectic_autoencoder.md", + "Kolmogorov n-width" => "reduced_order_modeling/kolmogorov_n_width.md", + ], + "Tutorials" =>[ + "MNIST" => "tutorials/mnist_tutorial.md", + ], "Library" => "library.md", ], ) diff --git a/docs/src/data_loader/data_loader.md b/docs/src/data_loader/data_loader.md new file mode 100644 index 000000000..1e60c1804 --- /dev/null +++ b/docs/src/data_loader/data_loader.md @@ -0,0 +1,8 @@ +# Data Loader + +`GeometricMachineLearning` provides flexible routines to load and manage data for training neural networks. +`DataLoader` has several constructors: + +1. If provided with a tensor, then it assumes the first axis is the system dimension, the second axis is the dimension of the parameter space, and the third axis gives the time evolution of the system. + +2. If provided with a tensor and a vector, it assumes the data are related to a classification task. \ No newline at end of file diff --git a/docs/src/data_loader/snapshot_matrix.md b/docs/src/data_loader/snapshot_matrix.md new file mode 100644 index 000000000..d2373aaf4 --- /dev/null +++ b/docs/src/data_loader/snapshot_matrix.md @@ -0,0 +1,15 @@ +# Snapshot matrix + +The snapshot matrix stores solutions of the high-dimensional ODE (obtained from discretizing a PDE). This is then used to construct [reduced bases](../reduced_order_modeling/autoencoder.md) in a data-driven way. So (for a single parameter[^1]) the snapshot matrix takes the following form: + +[^1]: If we deal with a parametrized PDE then there are **two stages** at which the snapshot matrix has to be processed: the offline stage and the online stage. 
+ +```math +M = \left[\begin{array}{c:c:c:c} +\hat{u}_1(t_0) & \hat{u}_1(t_1) & \quad\ldots\quad & \hat{u}_1(t_f) \\ +\hat{u}_2(t_0) & \hat{u}_2(t_1) & \ldots & \hat{u}_2(t_f) \\ +\hat{u}_3(t_0) & \hat{u}_3(t_1) & \ldots & \hat{u}_3(t_f) \\ +\ldots & \ldots & \ldots & \ldots \\ +\hat{u}_{2N}(t_0) & \hat{u}_{2N}(t_1) & \ldots & \hat{u}_{2N}(t_f) \\ +\end{array}\right]. +``` \ No newline at end of file diff --git a/docs/src/images/adam_optimizer.png b/docs/src/images/adam_optimizer.png index 8b5d1a8c5..62569f311 100644 Binary files a/docs/src/images/adam_optimizer.png and b/docs/src/images/adam_optimizer.png differ diff --git a/docs/src/images/adam_optimizer.tex b/docs/src/images/adam_optimizer.tex index 4789dfd19..88ecc758c 100644 --- a/docs/src/images/adam_optimizer.tex +++ b/docs/src/images/adam_optimizer.tex @@ -24,7 +24,7 @@ \coordinate[right of=R3, xshift=.5cm] (addition); \node[fit=(R2)(cache)(R3)(R4)(addition),draw, ultra thick, rounded corners, label=below:$\mathtt{optimization\_step(o, ::}\mathbb{R}^N\mathtt{,::}\mathbb{R}^N\mathtt{)}$] (optimization_step) {}; - \draw[->] (R2) -- (R3) node[pos=.5, sloped, below] {Adam}; + \draw[->] (R2) -- (cache) node[pos=.5, sloped, below] {Adam}; \draw[->] (cache) -- (R3) node[pos=.5, sloped, above] {Adam}; \draw[->] (R3) -- (R4) node[pos=.5, right] {Addition}; \draw[->, mgreen] (R1) -- (R2) node[pos=.5, left] {\color{mgreen}AD}; diff --git a/docs/src/images/general_optimization.pdf b/docs/src/images/general_optimization.pdf index d8da647b7..2e7ebbe67 100644 Binary files a/docs/src/images/general_optimization.pdf and b/docs/src/images/general_optimization.pdf differ diff --git a/docs/src/images/general_optimization.png b/docs/src/images/general_optimization.png index ead084d68..c7b692435 100644 Binary files a/docs/src/images/general_optimization.png and b/docs/src/images/general_optimization.png differ diff --git a/docs/src/images/general_optimization.tex b/docs/src/images/general_optimization.tex index 
bc7a4e2b3..4b68d8f04 100644 --- a/docs/src/images/general_optimization.tex +++ b/docs/src/images/general_optimization.tex @@ -18,7 +18,7 @@ \draw[->] (TYM) -- (ghorY) node[pos=.5, left] {$\Omega$}; \draw[->, mred] (ghorY) -- (ghor); - \draw[->] (ghor) -- (ghor2) node[pos=.5, sloped, below] {Adam}; + \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {Adam}; \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {Adam}; \draw[->] (ghor2) -- (M) node[pos=.5, right] {Retraction}; \end{tikzpicture} diff --git a/docs/src/images/general_optimization_with_boundary.png b/docs/src/images/general_optimization_with_boundary.png index 4c1012cb1..a4b32b6d7 100644 Binary files a/docs/src/images/general_optimization_with_boundary.png and b/docs/src/images/general_optimization_with_boundary.png differ diff --git a/docs/src/images/general_optimization_with_boundary.tex b/docs/src/images/general_optimization_with_boundary.tex index 9ce0295e9..a576864d9 100644 --- a/docs/src/images/general_optimization_with_boundary.tex +++ b/docs/src/images/general_optimization_with_boundary.tex @@ -31,7 +31,7 @@ \draw[->] (TYM) -- (ghorY) node[pos=.5, left] {$\Omega$}; \draw[->, mred] (ghorY) -- (ghor); - \draw[->] (ghor) -- (ghor2) node[pos=.5, sloped, below] {Adam}; + \draw[->] (ghor) -- (cache) node[pos=.5, sloped, below] {Adam}; \draw[->] (cache) -- (ghor2) node[pos=.5, sloped, above] {Adam}; \draw[->] (ghor2) -- (M) node[pos=.5, right] {Retraction}; \draw[->, mgreen] (M2) -- (Euc) node[pos=.5, left] {\color{mgreen}AD}; diff --git a/docs/src/images/mha.png b/docs/src/images/mha.png new file mode 100644 index 000000000..40628aa5f Binary files /dev/null and b/docs/src/images/mha.png differ diff --git a/docs/src/images/solution_manifold_2.png b/docs/src/images/solution_manifold_2.png new file mode 100644 index 000000000..3ed6cf860 Binary files /dev/null and b/docs/src/images/solution_manifold_2.png differ diff --git a/docs/src/images/symplectic_autoencoder.png 
b/docs/src/images/symplectic_autoencoder.png new file mode 100644 index 000000000..d8dee8a98 Binary files /dev/null and b/docs/src/images/symplectic_autoencoder.png differ diff --git a/docs/src/layers/multihead_attention_layer.md b/docs/src/layers/multihead_attention_layer.md index ac7444126..5c2f8f524 100644 --- a/docs/src/layers/multihead_attention_layer.md +++ b/docs/src/layers/multihead_attention_layer.md @@ -1,7 +1,27 @@ # Multihead Attention Layer -In order to arrive from the [attention layer](attention_layer.md) at the **multihead attention layer** we only have to do a simple modification: +In order to arrive from the [attention layer](attention_layer.md) at the **multihead attention layer** we have to do a few modifications: +Note that these neural networks were originally developed for natural language processing (NLP) tasks and the terminology used here bears some resemblance to that field. +The input to a multihead attention layer typically comprises three components: + +1. Values $V\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are **value vectors**, +2. Queries $Q\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are **query vectors**, +3. Keys $K\in\mathbb{R}^{n\times{}T}$: a matrix whose columns are **key vectors**. + +Regular attention performs the following operation: + +```math +\mathrm{Attention}(Q,K,V) = V\mathrm{softmax}(\frac{K^TQ}{\sqrt{n}}), +``` + +where $n$ is the dimension of the vectors in $V$, $Q$ and $K$. The softmax activation function here acts column-wise, so it can be seen as a transformation $\mathrm{softmax}:\mathbb{R}^{T}\to\mathbb{R}^T$ with $[\mathrm{softmax}(v)]_i = e^{v_i}/\left(\sum_{j=1}^Te^{v_j}\right)$. The $K^TQ$ term is a similarity matrix between the queries and the keys. + +The transformer contains a **self-attention mechanism**, i.e. takes an input $X$ and then transforms it linearly to $V$, $Q$ and $K$, i.e. $V = P^VX$, $Q = P^QX$ and $K = P^KX$. 
What distinguishes the multihead attention layer from the singlehead attention layer, is that there is not just one $P^V$, $P^Q$ and $P^K$, but there are several: one for each **head** of the multihead attention layer. After computing the individual values, queries and keys, and after applying the softmax, the outputs are then concatenated together in order to obtain again an array that is of the same size as the input array: + +![](../images/mha.png) + +Here the various $P$ matrices can be interpreted as being projections onto lower-dimensional subspaces, hence the designation by the letter $P$. Because of this interpretation as projection matrices onto smaller spaces that should **capture features in the input data** it makes sense to constrain these elements to be part of the Stiefel manifold. ## References - Vaswani, Ashish, et al. "Attention is all you need." Advances in neural information processing systems 30 (2017). \ No newline at end of file diff --git a/docs/src/optimizers/manifold_related/cayley.md b/docs/src/optimizers/manifold_related/cayley.md new file mode 100644 index 000000000..15d814fc7 --- /dev/null +++ b/docs/src/optimizers/manifold_related/cayley.md @@ -0,0 +1,52 @@ +# The Cayley Retraction + +The Cayley transformation is one of the most popular retractions. For several matrix Lie groups it is a mapping from the Lie algebra $\mathfrak{g}$ onto the Lie group $G$. +The Cayley retraction reads: + +```math + \mathrm{Cayley}(C) = \left(\mathbb{I} -\frac{1}{2}C\right)^{-1}\left(\mathbb{I} +\frac{1}{2}C\right). +``` +This is easily checked to be a retraction, i.e. $\mathrm{Cayley}(\mathbb{O}) = \mathbb{I}$ and $\frac{\partial}{\partial{}t}\mathrm{Cayley}(tC) = C$. + +What we need in practice is not the computation of the Cayley transform of an arbitrary matrix, but the Cayley transform of an element of $\mathfrak{g}^\mathrm{hor}$, the [global tangent space representation](../../arrays/stiefel_lie_alg_horizontal.md). 
+ +The elements of $\mathfrak{g}^\mathrm{hor}$ can be written as: + +```math +C = \begin{bmatrix} + A & -B^T \\ + B & \mathbb{O} +\end{bmatrix} = \begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T \end{bmatrix}, +``` + +where the second expression exploits the sparse structure of the array, i.e. it is a multiplication of a $N\times2n$ with a $2n\times{}N$ matrix. We can hence use the **Sherman-Morrison-Woodbury formula** to obtain: + +```math +(\mathbb{I} - \frac{1}{2}UV)^{-1} = \mathbb{I} + \frac{1}{2}U(\mathbb{I} - \frac{1}{2}VU)^{-1}V +``` + +So what we have to invert is the term + +```math +\mathbb{I} - \frac{1}{2}\begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T \end{bmatrix}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} = +\begin{bmatrix} \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A \end{bmatrix}. 
+``` + +The whole cayley transform is then: + +```math +\left(\mathbb{I} + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix} \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A \end{bmatrix}^{-1} \begin{bmatrix} \mathbb{I} & \mathbb{O} \\ \frac{1}{2}A & -B^T \end{bmatrix} \right)\left( E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix} \begin{bmatrix} \mathbb{I} \\ \frac{1}{2}A \end{bmatrix}\ \right) = \\ + +E + \frac{1}{2}\begin{bmatrix} \frac{1}{2}A & \mathbb{I} \\ B & \mathbb{O} \end{bmatrix}\left( + \begin{bmatrix} \mathbb{I} \\ \frac{1}{2}A \end{bmatrix} + + \begin{bmatrix} \mathbb{I} - \frac{1}{4}A & - \frac{1}{2}\mathbb{I} \\ \frac{1}{2}B^TB - \frac{1}{8}A^2 & \mathbb{I} - \frac{1}{4}A \end{bmatrix}^{-1}\left( + + \begin{bmatrix} \mathbb{I} \\ \frac{1}{2}A \end{bmatrix} + + \begin{bmatrix} \frac{1}{2}A \\ \frac{1}{4}A^2 - \frac{1}{2}B^TB \end{bmatrix} + + \right) + \right) +``` + + +Note that for computational reason we compute $\mathrm{Cayley}(C)E$ instead of just the Cayley transform (see the section on [retractions](retractions.md)). \ No newline at end of file diff --git a/docs/src/optimizers/manifold_related/geodesic.md b/docs/src/optimizers/manifold_related/geodesic.md new file mode 100644 index 000000000..9ff9d645c --- /dev/null +++ b/docs/src/optimizers/manifold_related/geodesic.md @@ -0,0 +1,3 @@ +# Geodesic Retraction + +General **retractions** are approximations of the exponential map. In `GeometricMachineLearning` we can, instead of using an approximation, solve the geodesic equation exactly (up to numerical error) by specifying `Geodesic()` as the argument of layers that have manifold weights. 
\ No newline at end of file diff --git a/docs/src/optimizers/manifold_related/retractions.md b/docs/src/optimizers/manifold_related/retractions.md index d3ac6a886..59651c86e 100644 --- a/docs/src/optimizers/manifold_related/retractions.md +++ b/docs/src/optimizers/manifold_related/retractions.md @@ -1,5 +1,16 @@ # Retractions +## Classical Definition +Classically, retractions are defined as maps smooth maps + +```math +R: T\mathcal{M}\to\mathcal{M}:(x,v)\mapsto{}R_x(v) +``` + +such that each curve $c(t) := R_x(tv)$ satisfies $c(0) = x$ and $c'(0) = v$. + +## In `GeometricMachineLearning` + Retractions are a map from the **horizontal component** of the Lie algebra $\mathfrak{g}^\mathrm{hor}$ to the respective manifold. For optimization in neural networks (almost always first order) we solve a gradient flow equation diff --git a/docs/src/reduced_order_modeling/autoencoder.md b/docs/src/reduced_order_modeling/autoencoder.md new file mode 100644 index 000000000..044cede0b --- /dev/null +++ b/docs/src/reduced_order_modeling/autoencoder.md @@ -0,0 +1,50 @@ +# Reduced Order modeling and Autoencoders + +Reduced order modeling is a data-driven technique that exploits the structure of parametric PDEs to make solving those PDEs easier. + +Consider a parametric PDE written in the form: $F(z(\mu);\mu)=0$ where $z(\mu)$ evolves on a infinite-dimensional Hilbert space $V$. + +In modeling any PDE we have to choose a discretization (particle discretization, finite element method, ...) of $V$, which will be denoted by $V_h$. + +## Solution manifold + +To any parametric PDE we associate a **solution manifold**: + +```math +\mathcal{M} = \{z(\mu):F(z(\mu);\mu)=0, \mu\in\mathbb{P}\}. +``` + +![](../images/solution_manifold_2.png) + +In the image above a 2-dimensional solution manifold is visualized as a sub-manifold in 3-dimensional space. In general the embedding space is an infinite-dimensional function space. 
+ +As an example of this consider the 1-dimensional wave equation: + +```math +\partial_{tt}^2q(t,\xi;\mu) = \mu^2\partial_{\xi\xi}^2q(t,\xi;\mu)\text{ on }I\times\Omega, +``` +where $I = (0,1)$ and $\Omega=(-1/2,1/2)$. As initial condition for the first derivative we have $\partial_tq(0,\xi;\mu) = -\mu\partial_\xi{}q_0(\xi;\mu)$ and furthermore $q(t,\xi;\mu)=0$ on the boundary (i.e. $\xi\in\{-1/2,1/2\}$). + +The solution manifold is a 1-dimensional submanifold: + +```math +\mathcal{M} = \{(t, \xi)\mapsto{}q(t,\xi;\mu)=q_0(\xi-\mu{}t;\mu):\mu\in\mathbb{P}\subset\mathbb{R}\}. +``` + +If we provide an initial condition $u_0$, a parameter instance $\mu$ and a time $t$, then $\xi\mapsto{}q(t,\xi;\mu)$ will be the momentary solution. If we consider the time evolution of $q(t,\xi;\mu)$, then it evolves on a two-dimensional submanifold $\bar{\mathcal{M}} := \{\xi\mapsto{}q(t,\xi;\mu):t\in{}I,\mu\in\mathbb{P}\}$. + +## General workflow + +In reduced order modeling we aim to construct a mapping to a space that is close to this solution manifold. This is done through the following steps: + +1. Discretize the PDE. + +2. Solve the discretized PDE for a certain set of parameter instances $\mu\in\mathbb{P}$. + +3. Build a reduced basis with the data obtained from having solved the discretized PDE. This step consists of finding two mappings: the **reduction** $\mathcal{P}$ and the **reconstruction** $\mathcal{R}$. + +The third step can be done with various machine learning (ML) techniques. Traditionally the most popular of these has been **Proper orthogonal decomposition** (POD), but in recent years **autoencoders** have also become a popular alternative (see (Fresca et al, 2021)). + +## References + +- Fresca, Stefania, Luca Dede, and Andrea Manzoni. "A comprehensive deep learning-based approach to reduced order modeling of nonlinear time-dependent parametrized PDEs." Journal of Scientific Computing 87 (2021): 1-36. 
\ No newline at end of file diff --git a/docs/src/reduced_order_modeling/kolmogorov_n_width.md b/docs/src/reduced_order_modeling/kolmogorov_n_width.md new file mode 100644 index 000000000..65595990b --- /dev/null +++ b/docs/src/reduced_order_modeling/kolmogorov_n_width.md @@ -0,0 +1,17 @@ +# Kolmogorov $n$-width + +The Kolmogorov $n$-width measures how well some set $\mathcal{M}$ (typically the solution manifold) can be approximated with a linear subspace: + +```math +d_n(\mathcal{M}) := \mathrm{inf}_{V_n\subset{}V;\mathrm{dim}V_n=n}\mathrm{sup}_{u\in\mathcal{M}}\mathrm{inf}_{v_n\in{}V_n}|| u - v_n ||_V, +``` + +where $\mathcal{M}\subset{}V$ and $V$ is a (typically infinite-dimensional) Banach space. For advection-dominated problems (among others) the **decay of the Kolmogorov $n$-width is very slow**, i.e. one has to pick $n$ very high in order to obtain useful approximations (see (Greif and Urban, 2019) and (Blickhan, 2023)). + +In order to overcome this, techniques based on neural networks (see e.g. (Lee and Carlberg, 2020)) and optimal transport (see e.g. (Blickhan, 2023)) have been used. + + +## References +- Greif, Constantin, and Karsten Urban. "Decay of the Kolmogorov N-width for wave problems." Applied Mathematics Letters 96 (2019): 216-222. +- Blickhan, Tobias. "A registration method for reduced basis problems using linear optimal transport." arXiv preprint arXiv:2304.14884 (2023). +- Lee, Kookjin, and Kevin T. Carlberg. "Model reduction of dynamical systems on nonlinear manifolds using deep convolutional autoencoders." Journal of Computational Physics 404 (2020): 108973. 
diff --git a/docs/src/reduced_order_modeling/symplectic_autoencoder.md b/docs/src/reduced_order_modeling/symplectic_autoencoder.md new file mode 100644 index 000000000..3d5d40a97 --- /dev/null +++ b/docs/src/reduced_order_modeling/symplectic_autoencoder.md @@ -0,0 +1,87 @@ +# Symplectic Autoencoder + +Symplectic Autoencoders are a type of neural network suitable for treating Hamiltonian parametrized PDEs with slowly decaying Kolmogorov $n$-width. They are based on **proper symplectic decomposition** (PSD) and symplectic neural networks (SympNets). + +## Hamiltonian Model Order Reduction + +Hamiltonian PDEs are partial differential equations that, like their ODE counterparts, have a Hamiltonian associated with them. An example of this is the **linear wave equation** (see (Buchfink et al, 2023)) with Hamiltonian + +```math +\mathcal{H}(q, p; \mu) := \frac{1}{2}\int_\Omega\mu^2(\partial_\xi{}q(t,\xi;\mu))^2 + p(t,\xi;\mu)^2d\xi. +``` + +The PDE for this Hamiltonian can be obtained similarly as in the ODE case: + +```math +\partial_t{}q(t,\xi;\mu) = \frac{\delta{}\mathcal{H}}{\delta{}p} = p(t,\xi;\mu), \quad \partial_t{}p(t,\xi;\mu) = -\frac{\delta{}\mathcal{H}}{\delta{}q} = \mu^2\partial_{\xi{}\xi}q(t,\xi;\mu) +``` + +## Symplectic Solution Manifold + +As with regular parametric PDEs, we also associate a solution manifold with Hamiltonian PDEs. This is a finite-dimensional manifold, on which the dynamics can be described through a Hamiltonian ODE. +I NEED A PROOF OR SOME EXPLANATION FOR THIS! + + +## Workflow for Symplectic ROM + +As with any other [reduced order modeling technique](autoencoder.md) we first discretize the PDE. This should be done with a structure-preserving scheme, thus yielding a (high-dimensional) Hamiltonian ODE as a result. 
Discretizing the wave equation above with finite differences yields a Hamiltonian system: + +```math +\mathcal{H}_\mathrm{discr}(z(t;\mu);\mu) := \frac{1}{2}z(t;\mu)^T\begin{bmatrix} -\mu^2D_{\xi{}\xi} & \mathbb{O} \\ \mathbb{O} & \mathbb{I} \end{bmatrix} z(t;\mu). +``` + +In Hamiltonian reduced order modelling we try to find a symplectic submanifold of the solution space[^1] that captures the dynamics of the full system as well as possible. + +[^1]: The submanifold is: $\tilde{\mathcal{M}} = \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ where $z_r$ is the reduced state of the system. + +Similar to the regular PDE case we again build an encoder $\Psi^\mathrm{enc}$ and a decoder $\Psi^\mathrm{dec}$; but now both these mappings are required to be symplectic! + +Concretely this means: +1. The encoder is a mapping from a high-dimensional symplectic space to a low-dimensional symplectic space, i.e. $\Psi^\mathrm{enc}:\mathbb{R}^{2N}\to\mathbb{R}^{2n}$ such that $\nabla\Psi^\mathrm{enc}\mathbb{J}_{2N}(\nabla\Psi^\mathrm{enc})^T = \mathbb{J}_{2n}$. +2. The decoder is a mapping from a low-dimensional symplectic space to a high-dimensional symplectic space, i.e. $\Psi^\mathrm{dec}:\mathbb{R}^{2n}\to\mathbb{R}^{2N}$ such that $(\nabla\Psi^\mathrm{dec})^T\mathbb{J}_{2N}\nabla\Psi^\mathrm{dec} = \mathbb{J}_{2n}$. + +If these two maps are constrained to linear maps, then one can easily find good solutions with **proper symplectic decomposition** (PSD). + +## Proper Symplectic Decomposition + +For PSD the two mappings $\Psi^\mathrm{enc}$ and $\Psi^\mathrm{dec}$ are constrained to be linear, orthonormal (i.e. $\Psi^T\Psi = \mathbb{I}$) and symplectic. The easiest way to enforce this is through the so-called **cotangent lift**: + +```math +\Psi_\mathrm{CL} = +\begin{bmatrix} \Phi & \mathbb{O} \\ \mathbb{O} & \Phi \end{bmatrix}, +``` +and $\Phi\in{}St(n,N)\subset\mathbb{R}^{N\times{}n}$, i.e. 
is an element of the [Stiefel manifold](../manifolds/stiefel_manifold.md). If the [snapshot matrix](../data_loader/snapshot_matrix.md) is of the form: + +```math +M = \left[\begin{array}{c:c:c:c} +\hat{q}_1(t_0) & \hat{q}_1(t_1) & \quad\ldots\quad & \hat{q}_1(t_f) \\ +\hat{q}_2(t_0) & \hat{q}_2(t_1) & \ldots & \hat{q}_2(t_f) \\ +\ldots & \ldots & \ldots & \ldots \\ +\hat{q}_N(t_0) & \hat{q}_N(t_1) & \ldots & \hat{q}_N(t_f) \\ +\hat{p}_1(t_0) & \hat{p}_1(t_1) & \ldots & \hat{p}_1(t_f) \\ +\hat{p}_2(t_0) & \hat{p}_2(t_1) & \ldots & \hat{p}_2(t_f) \\ +\ldots & \ldots & \ldots & \ldots \\ +\hat{p}_{N}(t_0) & \hat{p}_{N}(t_1) & \ldots & \hat{p}_{N}(t_f) \\ +\end{array}\right], +``` + +then $\Phi$ can be computed in a very straight-forward manner: +1. Rearrange the rows of the matrix $M$ such that we end up with a $N\times2(f+1)$ matrix: $\hat{M} := [M_q, M_p]$. +2. Perform SVD: $\hat{M} = U\Sigma{}V^T$; set $\Phi\gets{}U\mathtt{[:,1:n]}$. + +For details on the cotangent lift (and other methods for linear symplectic model reduction) consult (Peng and Mohseni, 2016). + +## Symplectic Autoencoders + +PSD suffers from similar shortcomings as regular POD: it is a linear map and the approximation space $\tilde{\mathcal{M}}= \{\Psi^\mathrm{dec}(z_r)\in\mathbb{R}^{2N}:z_r\in\mathbb{R}^{2n}\}$ is strictly linear. For problems with slowly-decaying [Kolmogorov $n$-width](kolmogorov_n_width.md) this leads to very poor approximations. + +In order to overcome this difficulty we use neural networks, more specifically [SympNets](../architectures/sympnet.md), together with cotangent lift-like matrices. The resulting architecture, symplectic autoencoders, is demonstrated in the following image: + +![](../images/symplectic_autoencoder.png) + +So we alternate between SympNet and PSD layers. Because all the PSD layers are based on matrices $\Phi\in{}St(n,N)$ we have to [optimize on the Stiefel manifold](../Optimizer.md). 
+ + +## References +- Buchfink, Patrick, Silke Glas, and Bernard Haasdonk. "Symplectic model reduction of Hamiltonian systems on nonlinear manifolds and approximation with weakly symplectic autoencoder." SIAM Journal on Scientific Computing 45.2 (2023): A289-A311. +- Peng, Liqian, and Kamran Mohseni. "Symplectic model reduction of Hamiltonian systems." SIAM Journal on Scientific Computing 38.1 (2016): A1-A27. \ No newline at end of file diff --git a/docs/src/tutorials/mnist_tutorial.md b/docs/src/tutorials/mnist_tutorial.md new file mode 100644 index 000000000..0527f7cce --- /dev/null +++ b/docs/src/tutorials/mnist_tutorial.md @@ -0,0 +1,59 @@ +# MNIST tutorial + +This is a short tutorial that shows how we can use `GeometricMachineLearning` to build a vision transformer and apply it for MNIST, while also putting some of the weights on a manifold. + +First, we need to import the relevant packages: + +```julia +using GeometricMachineLearning, CUDA +import Zygote, MLDatasets +``` + +In this example we use `Zygote` as an AD routine and we get the dataset from `MLDatasets`. First we need to load the data set, and put it on GPU (if you have one): + +```julia +train_x, train_y = MLDatasets.MNIST(split=:train)[:] +test_x, test_y = MLDatasets.MNIST(split=:test)[:] +train_x = train_x |> cu +test_x = test_x |> cu +train_y = train_y |> cu +test_y = test_y |> cu +``` + +`GeometricMachineLearning` has built-in data loaders that make it particularly easy to handle data: + +```julia +patch_length = 7 +dl = DataLoader(train_x, train_y, batch_size=512, patch_length=patch_length) +dl_test = DataLoader(test_x, test_y, batch_size=length(test_y), patch_length=patch_length) +``` + +The last line in the above code snippet indicates that we use the entire data set as one "batch" when processing the test set. For training, the batch size was here set to 512. 
+ +```julia +ps = initialparameters(backend, eltype(dl.data), Ψᵉ) + +optimizer_instance = Optimizer(o, ps) + +println("initial test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n") + +progress_object = Progress(n_training_steps; enabled=true) + +loss_array = zeros(eltype(train_x), n_training_steps) +for i in 1:n_training_steps + redraw_batch!(dl) + # get rid of try catch statement. This softmax issue should be solvable! + loss_val, pb = try Zygote.pullback(ps -> loss(Ψᵉ, ps, dl), ps) + catch + loss_array[i] = loss_array[i-1] + continue + end + dp = pb(one(loss_val))[1] + + optimization_step!(optimizer_instance, Ψᵉ, ps, dp) + ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_val)]) + loss_array[i] = loss_val +end + +println("final test accuracy: ", accuracy(Ψᵉ, ps, dl_test), "\n") +``` \ No newline at end of file diff --git a/scripts/symplectic_autoencoders/generate_data.jl b/scripts/symplectic_autoencoders/generate_data.jl new file mode 100644 index 000000000..d8718a45a --- /dev/null +++ b/scripts/symplectic_autoencoders/generate_data.jl @@ -0,0 +1,77 @@ +""" +TODO: Implement p component! 
+""" + +using Plots, ForwardDiff + +function h(x::T) where T + if T(0) ≤ x ≤ T(1) + T(1) - T(1.5)*x^2 + T(.75)*x^3 + elseif T(1) < x ≤ 2 + T(.25)*(T(2) - x)^3 + else + T(0) + end +end + +function s(ξ, μ::T) where T + T(4) / μ * abs(ξ + T(.5)*(T(1) - μ)) +end + +u₀(ξ, μ) = h(s(ξ, μ)) +u(t, ξ, μ) = u₀(ξ .- μ * t, μ) +p(t, ξ, μ) = ForwardDiff.derivative(t -> u(t,ξ,μ), t) +function p(t::T, ξ::AbstractVector{T}, μ::T) where T + p_closure(ξ) = p(t, ξ, μ) + p_closure.(ξ) +end + +function s(ξ::AbstractVector{T}, μ::T) where T + s_closure(ξ_scal) = s(ξ_scal, μ) + s_closure.(ξ) +end + +function u₀(ξ::AbstractVector{T}, μ::T) where T + h.(s(ξ, μ)) +end + +function get_domain(T=Float32, spacing=T(.01), time_step=T(0.01)) + Ω = T(-.5):spacing:T(.5) + I = T(0):time_step:T(1) + Ω, I +end + +function plot_time_evolution(T=Float32; spacing=T(.01), time_step=T(0.25), μ=T(.3)) + Ω, I = get_domain(T, spacing, time_step) + curves = zeros(T, length(Ω), length(I)) + curves_p = zeros(T, length(Ω), length(I)) + + for it in zip(axes(I, 1), I) + i = it[1]; t = it[2] + curves[1:length(Ω), i] = u(t, Ω, μ) + curves_p[1:length(Ω), i] = p(t, Ω, μ) + end + curves, curves_p, plot(Ω, curves, layout=(length(I), 1)), plot(Ω, curves_p, layout=(length(I), 1)) +end + +#= +μ = Float32(5/12) +data, data_p, plot_q, plot_p = plot_time_evolution(;μ=μ) +png(plot_q, "q_data_for_μ="*string(μ)) +png(plot_p, "p_data_for_μ="*string(μ)) +=# + +function generate_data(T=Float32; spacing=T(.01), time_step=T(0.01), μ_collection=T(5/12):T(.1):T(5/6)) + Ω, I = get_domain(T, spacing, time_step) + curves = zeros(T, 2*length(Ω), length(μ_collection), length(I)) + + for it in zip(axes(I, 1), I) + i = it[1]; t = it[2] + for it2 in zip(axes(μ_collection, 1), μ_collection) + j = it2[1]; μ = it2[2] + curves[1:length(Ω), j, i] = u(t, Ω, μ) + curves[(length(Ω)+1):2*length(Ω), j, i] = p(t, Ω, μ) + end + end + curves +end \ No newline at end of file diff --git a/scripts/symplectic_autoencoders/training.jl 
b/scripts/symplectic_autoencoders/training.jl new file mode 100644 index 000000000..c4aeb3d8b --- /dev/null +++ b/scripts/symplectic_autoencoders/training.jl @@ -0,0 +1,50 @@ +using GeometricMachineLearning +using LinearAlgebra: svd, norm +using ProgressMeter +using Zygote +include("generate_data.jl") + +T = Float32 +spacing=T(.01) +time_step=T(0.01) +μ_collection=T(5/12):T(.1):T(5/6) +n = 5 +n_epochs = 2000 +backend = CPU() + +data = generate_data(T;spacing=spacing,time_step=time_step,μ_collection=μ_collection) +data = reshape(data, size(data,1), size(data,2)*size(data,3)) +N = size(data,1)÷2 +dl = DataLoader(data) +Φ = svd(hcat(data[1:N,:], data[(N+1):2*N,:])).U[:,1:n] +PSD = hcat(vcat(Φ, zero(Φ)), vcat(zero(Φ), Φ)) +PSD_error = norm(data - PSD*PSD'*data)/norm(data) + +activation = tanh +model = Chain( GradientQ(2*N, 2*N, activation), + GradientP(2*N, 2*N, activation), + PSDLayer(2*N, 2*n), + GradientQ(2*n, 2*n, activation), + GradientP(2*n, 2*n, activation), + GradientQ(2*n, 2*n, activation), + GradientP(2*n, 2*n, activation), + PSDLayer(2*n, 2*N), + GradientQ(2*N, 2*N, activation), + GradientP(2*N, 2*N, activation) +) + +ps = initialparameters(backend, Float32, model) +loss(model, ps, dl) + +optimizer_instance = Optimizer(AdamOptimizer(), ps) +n_training_iterations = Int(ceil(n_epochs*dl.n_params/dl.batch_size)) +progress_object = Progress(n_training_iterations; enabled=true) + +for _ in 1:n_training_iterations + redraw_batch!(dl) + loss_val, pb = Zygote.pullback(ps -> loss(model, ps, dl), ps) + dp = pb(one(loss_val))[1] + + optimization_step!(optimizer_instance, model, ps, dp) + ProgressMeter.next!(progress_object; showvalues=[(:TrainingLoss, loss_val)]) +end \ No newline at end of file diff --git a/scripts/transformer_new.jl b/scripts/transformer_new.jl index 6570a77f5..d0cf0406f 100644 --- a/scripts/transformer_new.jl +++ b/scripts/transformer_new.jl @@ -15,11 +15,11 @@ image_dim = 28 patch_length = 7 transformer_dim = 49 n_heads = 7 -n_layers = 16 +n_layers 
= 10 number_of_patch = (image_dim÷patch_length)^2 -batch_size = 512 +batch_size = 2048 activation = softmax -n_epochs = 20 +n_epochs = 1000 add_connection = false backend = CUDABackend() @@ -54,25 +54,36 @@ function transformer_training(Ψᵉ::Chain; backend=CPU(), n_training_steps=1000 progress_object = Progress(n_training_steps; enabled=true) + loss_array = zeros(eltype(train_x), n_training_steps) for i in 1:n_training_steps redraw_batch!(dl) + # ask Michael to take a look at this. Probably not good for performance. loss_val, pb = Zygote.pullback(ps -> loss(Ψᵉ, ps, dl), ps) dp = pb(one(loss_val))[1] optimization_step!(optimizer_instance, Ψᵉ, ps, dp) ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_val)]) + loss_array[i] = loss_val end println("final test loss: ", loss(Ψᵉ, ps, dl_test), "\n") - ps + loss_array, ps end # calculate number of epochs n_training_steps = Int(ceil(length(train_y)*n_epochs/batch_size)) -ps₁ = transformer_training(model1, backend=backend, n_training_steps=n_training_steps) -ps₂ = transformer_training(model2, backend=backend, n_training_steps=n_training_steps) +loss_array2, ps2 = transformer_training(model2, backend=backend, n_training_steps=n_training_steps) +loss_array1, ps1 = transformer_training(model1, backend=backend, n_training_steps=n_training_steps) +loss_array3, ps3 = transformer_training(model2, backend=backend, n_training_steps=n_training_steps, o=GradientOptimizer(0.001)) +loss_array4, ps4 = transformer_training(model2, backend=backend, n_training_steps=n_training_steps, o=MomentumOptimizer(0.001, 0.5)) -#loss_array₃ = transformer_training(Ψᵉ₂, batch_size, training_steps, err_freq, StandardOptimizer(0.001)) -#loss_array₄ = transformer_training(Ψᵉ₂, batch_size, training_steps, err_freq, MomentumOptimizer(0.001, 0.5)) +p1 = plot(loss_array1, color=1, label="Regular weights", ylimits=(0.,1.4)) +plot!(p1, loss_array2, color=2, label="Weights on Stiefel Manifold") +png(p1, "Stiefel_Regular") + +p2 = 
plot(loss_array2, color=2, label="Adam", ylimits=(0.,1.4)) +plot!(p2, loss_array3, color=1, label="Gradient") +plot!(p2, loss_array4, color=3, label="Momentum") +png(p2, "Adam_Gradient_Momentum") \ No newline at end of file diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index 05db28693..938b5ff1c 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -35,6 +35,7 @@ module GeometricMachineLearning export dim import GeometricIntegrators.Integrators: method import NNlib: σ, sigmoid, softmax + #import LogExpFunctions: softmax export CPU, GPU export Chain, NeuralNetwork @@ -67,9 +68,10 @@ module GeometricMachineLearning # export tensor_mat_mul include("data_loader/tensor_assign.jl") + include("data_loader/matrix_assign.jl") include("data_loader/data_loader.jl") include("data_loader/mnist_utils.jl") - export DataLoader, redraw_batch!, loss, onehotbatch + export DataLoader, redraw_batch!, loss, onehotbatch, accuracy # this defines empty retraction type structs (doesn't rely on anything) include("optimizers/manifold_related/retraction_types.jl") @@ -354,9 +356,5 @@ module GeometricMachineLearning export integrate, integrate_step! 
include("integrator/sympnet_integrator.jl") - - - - - -end + +end \ No newline at end of file diff --git a/src/data/data_training.jl b/src/data/data_training.jl index 6646d626b..215f0805b 100644 --- a/src/data/data_training.jl +++ b/src/data/data_training.jl @@ -1,5 +1,15 @@ abstract type AbstractTrainingData end + +""" +TrainingData stores: \n +\t - problem \n +\t - shape \n +\t - get \n +\t - symbols \n +\t - dim \n +\t - noisemaker \n +""" struct TrainingData{TK <: DataSymbol, TS <: AbstractDataShape, TP <: AbstractProblem, TG <: Dict{Symbol, <:Base.Callable}, TN <: Base.Callable} <: AbstractTrainingData problem::TP shape::TS diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl index fbdb21d8f..d8f4bfe3c 100644 --- a/src/data_loader/data_loader.jl +++ b/src/data_loader/data_loader.jl @@ -1,39 +1,39 @@ """ -Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. - -Implemented: -If the data loader is called with a single tensor, a batch_size and an output_size, then the batch is drawn randomly in the relevant range and the output is assigned accordingly. - -The fields of the struct are the following: - - data: The input data with axes (i) system dimension, (ii) number of parameters and (iii) number of time steps. - - batch: A tensor in which the current batch is stored. - - target_tensor: A tensor in which the current target is stored. - - output: The tensor from which the output is drawn - this may be of type Nothing if the constructor is only called with one tensor. - - sys_dim: The ``dimension'' of the system, i.e. what is taken as input by a regular neural network. - - seq_length: The length batches should have. - - batch_size: - - output_size: The size of the second axis of the output tensor (prediction_window, output_size=1 in most cases) - - n_params: The number of parameters that are present in the data set (length of second axis). 
- - n_time_steps: Number of time steps (length of third axis). - -For drawing the batch, the sampling is done over n_params and n_time_steps (here seq_length and output_size are also taken into account). - -TODO: Implement DataLoader that works well with GeometricEnsembles etc. +Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. \n +\n +Implemented: \n +If the data loader is called with a single tensor, a batch_size and an output_size, then the batch is drawn randomly in the relevant range and the output is assigned accordingly.\n +\n +The fields of the struct are the following: \n +\t - data: The input data with axes (i) system dimension, (ii) number of parameters and (iii) number of time steps.\n +\t - batch: A tensor in which the current batch is stored.\n +\t - target_tensor: A tensor in which the current target is stored.\n +\t - output: The tensor from which the output is drawn - this may be of type Nothing if the constructor is only called with one tensor.\n +\t - sys_dim: The ``dimension'' of the system, i.e. 
what is taken as input by a regular neural network.\n +\t - seq_length: The length batches should have.\n +\t - batch_size:\n +\t - output_size: The size of the second axis of the output tensor (prediction_window, output_size=1 in most cases)\n +\t - n_params: The number of parameters that are present in the data set (length of second axis)\n +\t - n_time_steps: Number of time steps (length of third axis)\n +\n +For drawing the batch, the sampling is done over n_params and n_time_steps (here seq_length and output_size are also taken into account).\n +\n +TODO: Implement DataLoader that works well with GeometricEnsembles etc.\n """ -struct DataLoader{T, AT<:AbstractArray{T}, TT<:Union{AbstractArray, Nothing}, OT<:AbstractArray} +struct DataLoader{T, AT<:AbstractArray{T}, TT<:Union{AbstractArray, Nothing}, OT<:Union{AbstractArray, Nothing}} data::AT batch::AT target_tensor::TT output::OT sys_dim::Integer - seq_length::Integer + seq_length::Union{Integer, Nothing} batch_size::Integer - output_size::Integer + output_size::Union{Integer, Nothing} n_params::Integer - n_time_steps::Integer + n_time_steps::Union{Integer, Nothing} end -function DataLoader(data::AbstractArray{T, 3}, seq_length=10, batch_size=32, output_size=1) where T +function DataLoader(data::AbstractArray{T, 3}; seq_length=10, batch_size=32, output_size=1) where T @info "You have provided a tensor with three axes as input. They will be interpreted as \n (i) system dimension, (ii) number of parameters and (iii) number of time steps." sys_dim,n_params,n_time_steps = size(data) backend = KernelAbstractions.get_backend(data) @@ -43,6 +43,15 @@ function DataLoader(data::AbstractArray{T, 3}, seq_length=10, batch_size=32, out DataLoader{T, typeof(data), Nothing, typeof(output)}(data, batch, nothing, output, sys_dim, seq_length, batch_size, output_size, n_params, n_time_steps) end +function DataLoader(data::AbstractMatrix{T}; batch_size=32) where T + @info "You have provided a matrix as input. 
The axes will be interpreted as (i) system dimension and (ii) number of parameters." + sys_dim, n_params = size(data) + backend = KernelAbstractions.get_backend(data) + batch = KernelAbstractions.allocate(backend, T, sys_dim, batch_size) + draw_batch!(batch, data) + DataLoader{T, typeof(data), Nothing, Nothing}(data, batch, nothing, nothing, sys_dim, nothing, batch_size, nothing, n_params, nothing) +end + # T and T1 are not the same because T1 is of Integer type function DataLoader(data::AbstractArray{T, 3}, target::AbstractVector{T1}; batch_size=32, patch_length=7) where {T, T1} @info "You provided a tensor and a vector as input. This will be treated as a classification problem (MNIST). Tensor axes: (i) & (ii) image axes and (iii) batch dimesnion." @@ -68,8 +77,27 @@ function redraw_batch!(dl::DataLoader{T, AT, Nothing}) where {T, AT<:AbstractArr draw_batch!(dl.batch, dl.output, dl.data) end +function redraw_batch!(dl::DataLoader{T, AT, Nothing}) where {T, AT<:AbstractMatrix{T}} + draw_batch!(dl.batch, dl.data) +end + function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T}) where T batch_output = model(dl.batch, ps) output_estimate = assign_output_estimate(batch_output, dl.output_size) norm(dl.output - output_estimate)/T(sqrt(dl.batch_size))/T(sqrt(dl.output_size)) -end \ No newline at end of file +end + +function loss(model::Chain, ps::Tuple, dl::DataLoader{T}) where T + batch_output = model(dl.batch, ps) + norm(batch_output - dl.batch)/norm(dl.batch) +end + +function accuracy(model::Chain, ps::Tuple, dl::DataLoader{T, AT, BT}) where {T, T2<:Integer, AT<:AbstractArray{T}, BT<:AbstractArray{T2}} + output_tensor = model(dl.batch, ps) + output_estimate = assign_output_estimate(output_tensor, dl.output_size) + backend = KernelAbstractions.get_backend(output_estimate) + tensor_of_maximum_elements = KernelAbstractions.zeros(backend, T2, size(output_estimate)...) 
+ ind = argmax(output_estimate, dims=1) + tensor_of_maximum_elements[ind] .= T2(1) + (size(dl.output, 3)-sum(abs.(dl.output - tensor_of_maximum_elements))/T2(2))/size(dl.output, 3) +end diff --git a/src/data_loader/matrix_assign.jl b/src/data_loader/matrix_assign.jl new file mode 100644 index 000000000..f6ef93543 --- /dev/null +++ b/src/data_loader/matrix_assign.jl @@ -0,0 +1,9 @@ +""" +This assigns the batch if the data are in form of a matrix. +""" +function draw_batch!(batch::AbstractMatrix{T}, data::AbstractMatrix{T}) where T + sys_dim, batch_size = size(batch) + n_params = size(data, 2) + param_indices = Int.(ceil.(rand(T, batch_size)*n_params)) + batch .= data[:, param_indices] +end \ No newline at end of file diff --git a/src/kernels/inverses/inverse_kernel.jl b/src/kernels/inverses/inverse_kernel.jl index de65b0770..e39af406e 100644 --- a/src/kernels/inverses/inverse_kernel.jl +++ b/src/kernels/inverses/inverse_kernel.jl @@ -60,11 +60,17 @@ end function tensor_inverse(A::AbstractArray{T, 3}) where T A_inv = zero(A) total_length = size(A, 3) - for k in 1:total_length - B = assign_matrix(A, k) - B_inv = B\one(B) - A_inv += assign_tensor(B_inv, total_length, k) + #for k in 1:total_length + # B = assign_matrix(A, k) + # B_inv = B\one(B) + # A_inv += assign_tensor(B_inv, total_length, k) + #end + + for k in axes(A_inv, 3) + B = @view A[:,:,k] + A_inv[:,:,k] .= B \ one(B) end + A_inv end diff --git a/src/layers/attention_layer.jl b/src/layers/attention_layer.jl index 7978813d6..8d5be53f5 100644 --- a/src/layers/attention_layer.jl +++ b/src/layers/attention_layer.jl @@ -63,14 +63,14 @@ function (d::Attention{M, M, Stiefel, Retraction, true})(x::AbstractMatrix{T}, p dim, input_length = size(x) @assert dim == M - x + x*d.activation((ps.PQ'*x)'*(ps.PK'*x)) + x + x*d.activation((ps.PQ'*x)'*(ps.PK'*x)/T(sqrt(M))) end function (d::Attention{M, M, Stiefel, Retraction, false})(x::AbstractMatrix{T}, ps::NamedTuple) where {M, Stiefel, Retraction, T} dim, input_length = 
size(x) @assert dim == M - x*d.activation((ps.PQ'*x)'*(ps.PK'*x)) + x*d.activation((ps.PQ'*x)'*(ps.PK'*x)/T(sqrt(M))) end function (d::Attention{M, M, Stiefel, Retraction, true})(x::AbstractArray{T, 3}, ps::NamedTuple) where {M, Stiefel, Retraction, T} @@ -80,7 +80,7 @@ function (d::Attention{M, M, Stiefel, Retraction, true})(x::AbstractArray{T, 3}, Q_tensor = mat_tensor_mul(ps.PQ', x) K_tensor = mat_tensor_mul(ps.PK', x) QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - x + tensor_tensor_mul(x, d.activation(QK_tensor)) + x + tensor_tensor_mul(x, d.activation(QK_tensor/T(sqrt(M)))) end function (d::Attention{M, M, Stiefel, Retraction, false})(x::AbstractArray{T, 3}, ps::NamedTuple) where {M, Stiefel, Retraction, T} @@ -90,7 +90,7 @@ function (d::Attention{M, M, Stiefel, Retraction, false})(x::AbstractArray{T, 3} Q_tensor = mat_tensor_mul(ps.PQ', x) K_tensor = mat_tensor_mul(ps.PK', x) QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - tensor_tensor_mul(x, d.activation(QK_tensor)) + tensor_tensor_mul(x, d.activation(QK_tensor/T(sqrt(M)))) end @kernel function upper_triangular_asymmetrize_kernel!(output::AbstractArray{T, 3}, input::AbstractArray{T, 3}) where T diff --git a/src/layers/multi_head_attention.jl b/src/layers/multi_head_attention.jl index ad4612f14..0ae8757f6 100644 --- a/src/layers/multi_head_attention.jl +++ b/src/layers/multi_head_attention.jl @@ -82,7 +82,7 @@ function (d::MultiHeadAttention{M, M, Stiefel, Retraction, true})(x::AbstractMat output = typeof(x)(zeros(T, 0, input_length)) for i in 1:d.n_heads key = Symbol("head_"*string(i)) - output = vcat(output, ps.PV[key]'*x*Lux.softmax((ps.PQ[key]'*x)'*(ps.PK[key]'*x))) + output = vcat(output, ps.PV[key]'*x*softmax((ps.PQ[key]'*x)'*(ps.PK[key]'*x)/T(sqrt(dim)))) end x + output end @@ -94,7 +94,7 @@ function (d::MultiHeadAttention{M, M, Stiefel, Retraction, false})(x::AbstractMa output = similar(x, 0, input_length) for i in 1:d.n_heads key = Symbol("head_"*string(i)) - output = 
vcat(output, ps.PV[key]'*x*Lux.softmax((ps.PQ[key]'*x)'*(ps.PK[key]'*x))) + output = vcat(output, ps.PV[key]'*x*softmax((ps.PQ[key]'*x)'*(ps.PK[key]'*x)/T(sqrt(dim)))) end output end @@ -123,7 +123,7 @@ function (d::MultiHeadAttention{M, M, Stiefel, Retraction, true})(x::AbstractArr V_tensor = mat_tensor_mul(ps.PV[key]', x) QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - single_head_output = tensor_tensor_mul(V_tensor, Lux.softmax(QK_tensor)) + single_head_output = tensor_tensor_mul(V_tensor, softmax(QK_tensor/T(sqrt(dim)))) output = vcat(output, single_head_output) # KernelAbstractions.synchronize(backend) end @@ -151,7 +151,7 @@ function (d::MultiHeadAttention{M, M, Stiefel, Retraction, false})(x::AbstractAr V_tensor = mat_tensor_mul(ps.PV[key]', x) QK_tensor = tensor_transpose_tensor_mul(Q_tensor, K_tensor) - single_head_output = tensor_tensor_mul(V_tensor, Lux.softmax(QK_tensor)) + single_head_output = tensor_tensor_mul(V_tensor, softmax(QK_tensor/T(sqrt(dim)))) output = vcat(output, single_head_output) # KernelAbstractions.synchronize(backend) end diff --git a/src/layers/psd_like_layer.jl b/src/layers/psd_like_layer.jl index 245ad5dd7..b3f10efd4 100644 --- a/src/layers/psd_like_layer.jl +++ b/src/layers/psd_like_layer.jl @@ -5,14 +5,13 @@ One layer has the following shape: |Φ 0| A = |0 Φ|, where Φ is an element of the regular Stiefel manifold. 
""" - -struct PSDLayer{M, N, retraction} <: LayerWithManifold{M, N, retraction} end +struct PSDLayer{M, N, Retraction} <: LayerWithManifold{M, N, Retraction} end default_retr = Geodesic() -function PSDLayer(M::Integer, N::Integer; Retraction=default_retr) +function PSDLayer(M::Integer, N::Integer; retraction=default_retr) @assert iseven(M) @assert iseven(N) - PSDLayer{M, N, typeof(Retraction)}() + PSDLayer{M, N, typeof(retraction)}() end function parameterlength(::PSDLayer{M, N}) where {M, N} diff --git a/src/layers/stiefel_layer.jl b/src/layers/stiefel_layer.jl index 23027aa8e..58c6839bd 100644 --- a/src/layers/stiefel_layer.jl +++ b/src/layers/stiefel_layer.jl @@ -1,11 +1,11 @@ """ Defines a layer that performs simple multiplication with an element of the Stiefel manifold. """ -struct StiefelLayer{M, N, retraction} <: ManifoldLayer{M, N, retraction} end +struct StiefelLayer{M, N, Retraction} <: ManifoldLayer{M, N, Retraction} end default_retr = Geodesic() -function StiefelLayer(n::Integer, N::Integer, Retraction::AbstractRetraction=default_retr) - StiefelLayer{n, N, typeof(Retraction)}() +function StiefelLayer(n::Integer, N::Integer; retraction::AbstractRetraction=default_retr) + StiefelLayer{n, N, typeof(retraction)}() end function AbstractNeuralNetworks.initialparameters(backend::KernelAbstractions.Backend, ::Type{T}, d::StiefelLayer{M,N}; rng::AbstractRNG=Random.default_rng()) where {M,N,T} diff --git a/src/optimizers/manifold_related/retractions.jl b/src/optimizers/manifold_related/retractions.jl index 9966f12ea..11b552406 100644 --- a/src/optimizers/manifold_related/retractions.jl +++ b/src/optimizers/manifold_related/retractions.jl @@ -71,12 +71,16 @@ function cayley(B::StiefelLieAlgHorMatrix{T}) where T N, n = B.N, B.n E = typeof(B.B)(StiefelProjection(N, n, T)) unit = typeof(B.B)(I(n)) - unit2 = I(2*n) - exponent = unit2 - T(.5)*hcat(vcat(T(.5)*B.A*one(B.A), T(.25)*B.A*B.A - B.B'*B.B), vcat(unit, T(.5)*B.A*one(B.A))) + A_mat = B.A*one(B.A) + A_mat2 = 
B.A*B.A + BB = B.B'*B.B + + exponent = hcat(vcat(unit - T(.25)*A_mat, T(.5)*BB - T(.125)*A_mat2), vcat(-T(.5)*unit, unit - T(.25)*A_mat)) StiefelManifold( - (One(N, T) + T(.5)*B)* + E + + T(.5)*hcat(vcat(T(.5)*A_mat, B.B), vcat(unit, zero(B.B)))* ( - E + hcat(vcat(T(.25)*B.A*one(B.A), T(.5)*B.B), vcat(T(0.5)*unit, zero(B.B)))*(exponent \ vcat(unit, T(0.5)*B.A*one(B.A))) + vcat(unit, T(0.5)*A_mat) + exponent \ (vcat(unit, T(0.5)*A_mat) + vcat(T(0.5)*A_mat, T(0.25)*A_mat2 - T(0.5)*BB)) ) ) end \ No newline at end of file diff --git a/test/optimizers/optimizer_convergence_tests/psd_optim.jl b/test/optimizers/optimizer_convergence_tests/psd_optim.jl index 499ce1119..e85c4a94c 100644 --- a/test/optimizers/optimizer_convergence_tests/psd_optim.jl +++ b/test/optimizers/optimizer_convergence_tests/psd_optim.jl @@ -27,7 +27,7 @@ A = [ 0.06476993260924702 0.8369280855305259 0.6245358125914054 0.140729967064 0.7317681833003449 0.9051355184962627 0.3376918522349117 0.436545092402125 0.3462196925686055 ] -function svd_test(A, n, train_steps=1000, tol=1e-1) +function svd_test(A, n, train_steps=1000, tol=1e-1; retraction=Cayley()) N2 = size(A,1) @assert iseven(N2) N = N2÷2 @@ -36,7 +36,7 @@ function svd_test(A, n, train_steps=1000, tol=1e-1) U_result = hcat(vcat(U_result_sing, zero(U_result_sing)), vcat(zero(U_result_sing), U_result_sing)) err_best = norm(A - U_result*U_result'*A) - model = Chain(PSDLayer(2*N, 2*n), PSDLayer(2*n, 2*N)) + model = Chain(PSDLayer(2*N, 2*n, retraction=retraction), PSDLayer(2*n, 2*N, retraction=retraction)) ps = initialparameters(CPU(), Float64, model) o₁ = Optimizer(GradientOptimizer(0.01), ps) @@ -68,4 +68,6 @@ function train_network!(o::Optimizer, model::Chain, ps::Tuple, A::AbstractMatrix ps[1].weight, ps[2].weight, error(ps) end -svd_test(A, 4) \ No newline at end of file +for retraction in (Geodesic(), Cayley()) + svd_test(A, 4, retraction=retraction) +end \ No newline at end of file diff --git 
a/test/optimizers/optimizer_convergence_tests/svd_optim.jl b/test/optimizers/optimizer_convergence_tests/svd_optim.jl index 92e2c23c3..165817461 100644 --- a/test/optimizers/optimizer_convergence_tests/svd_optim.jl +++ b/test/optimizers/optimizer_convergence_tests/svd_optim.jl @@ -23,13 +23,13 @@ A = [ 0.06476993260924702 0.8369280855305259 0.6245358125914054 0.140729967064 0.7317681833003449 0.9051355184962627 0.3376918522349117 0.436545092402125 0.3462196925686055 ] -function svd_test(A, n, train_steps=1000, tol=1e-1) +function svd_test(A, n, train_steps=1000, tol=1e-1; retraction=Cayley()) N = size(A,1) U, Σ, Vt = svd(A) U_result = U[:, 1:n] err_best = norm(A - U_result*U_result'*A) - model = Chain(StiefelLayer(N, n), StiefelLayer(n, N)) + model = Chain(StiefelLayer(N, n, retraction=retraction), StiefelLayer(n, N, retraction=retraction)) ps = initialparameters(CPU(), Float64, model) o₁ = Optimizer(GradientOptimizer(0.01), ps) @@ -60,4 +60,6 @@ function train_network!(o::Optimizer, model::Chain, ps::Tuple, A::AbstractMatrix ps[1].weight, ps[2].weight, error(ps) end -svd_test(A, 3) \ No newline at end of file +for retraction in (Geodesic(), Cayley()) + svd_test(A, 3, retraction=retraction) +end \ No newline at end of file