From 44d70f457662f2e7c24b3e5d953787b080108795 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 07:59:15 +0200 Subject: [PATCH 01/16] Added transformer integrator. --- src/GeometricMachineLearning.jl | 1 + src/architectures/transformer_integrator.jl | 69 +++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 src/architectures/transformer_integrator.jl diff --git a/src/GeometricMachineLearning.jl b/src/GeometricMachineLearning.jl index 24fbebadb..ea9808d6c 100644 --- a/src/GeometricMachineLearning.jl +++ b/src/GeometricMachineLearning.jl @@ -246,6 +246,7 @@ module GeometricMachineLearning #INCLUDE ARCHITECTURES include("architectures/neural_network_integrator.jl") + include("architectures/transformer_integrator.jl") include("architectures/sympnet.jl") include("architectures/autoencoder.jl") include("architectures/fixed_width_network.jl") diff --git a/src/architectures/transformer_integrator.jl b/src/architectures/transformer_integrator.jl new file mode 100644 index 000000000..4aaae2719 --- /dev/null +++ b/src/architectures/transformer_integrator.jl @@ -0,0 +1,69 @@ +@doc raw""" +Encompasses various transformer architectures, such as the structure-preserving transformer and the linear symplectic transformer. +""" +abstract type TransformerIntegrator <: Architecture end + +struct DummyTransformer <: TransformerIntegrator + seq_length::Int +end + +@doc raw""" +This function computes a trajectory for a Transformer that has already been trained for valuation purposes. + +It takes as input: +- `nn`: a `NeuralNetwork` (that has been trained). +- `ics`: initial conditions (a matrix in ``\mathbb{R}^{2n\times\mathtt{seq\_length}}`` or `NamedTuple` of two matrices in ``\mathbb{R}^{n\times\mathtt{seq\_length}}``) +- `n_points::Int=100` (keyword argument): The number of steps for which we run the prediction. +""" +function Base.iterate(nn::NeuralNetwork{<:TransformerIntegrator}, ics::NamedTuple{(:q, :p), Tuple{AT, AT}}; n_points::Int = 100, prediction_window::Union{Nothing, Int} = 1) where {T, AT<:AbstractMatrix{T}} + + seq_length = nn.architecture.seq_length + + n_dim = size(ics.q, 1) + backend = KernelAbstractions.get_backend(ics.q) + + n_iterations = Int(ceil((n_points - seq_length) / prediction_window)) + # Array to store the predictions + q_valuation = KernelAbstractions.allocate(backend, T, n_dim, seq_length + n_iterations * prediction_window) + p_valuation = KernelAbstractions.allocate(backend, T, n_dim, seq_length + n_iterations * prediction_window) + + # Initialisation + q_valuation[:,1:seq_length] = ics.q + p_valuation[:,1:seq_length] = ics.p + + # iteration in phase space + @views for i in 1:n_iterations + start_index = (i - 1) * prediction_window + 1 + @views qp_temp = (q = q_valuation[:, start_index:(start_index + seq_length - 1)], p = p_valuation[:, start_index:(start_index + seq_length - 1)]) + qp_prediction = nn(qp_temp) + q_valuation[seq_length + (i - 1) * prediction_window, seq_length + i * prediction_window] = qp_prediction.q[:, (seq_length - prediction_window + 1):end] + p_valuation[seq_length + (i - 1) * prediction_window, seq_length + i * prediction_window] = qp_prediction.p[:, (seq_length - prediction_window + 1):end] + end + + (q=q_valuation[:, 1:n_points], p=p_valuation[:, 1:n_points]) +end + +function Base.iterate(nn::NeuralNetwork{<:TransformerIntegrator}, ics::AT; n_points::Int = 100, prediction_window::Union{Nothing, Int} = 1) where {T, AT<:AbstractMatrix{T}} + + seq_length = typeof(nn.architecture) <: RegularTransformerIntegrator ? 
prediction_window : nn.architecture.seq_length + + n_dim = size(ics, 1) + backend = KernelAbstractions.get_backend(ics) + + n_iterations = Int(ceil((n_points - seq_length) / prediction_window)) + # Array to store the predictions + valuation = KernelAbstractions.allocate(backend, T, n_dim, seq_length + n_iterations * prediction_window) + + # Initialisation + valuation[:,1:seq_length] = ics + + # iteration in phase space + @views for i in 1:n_iterations + start_index = (i - 1) * prediction_window + 1 + temp = valuation[:, start_index:(start_index + seq_length - 1)] + prediction = nn(copy(temp)) + valuation[:, (seq_length + (i - 1) * prediction_window + 1):(seq_length + i * prediction_window)] = prediction[:, (seq_length - prediction_window + 1):end] + end + + valuation[:, 1:n_points] +end \ No newline at end of file From 7bf59f6f8b6edd0ae0317d995ec7412fb45f1b7b Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:00:50 +0200 Subject: [PATCH 02/16] Added new DataLoader tests. --- test/batch/batch_functor.jl | 57 +++++++++++++++++++ test/data_loader/batch_data_loader_qp_test.jl | 15 +++-- .../data_loader_optimization_step.jl | 3 +- .../data_loader/draw_batch_for_tensor_test.jl | 10 ++-- test/data_loader/mnist_utils.jl | 8 ++- .../optimizer_functor_with_adam.jl | 14 +++-- test/runtests.jl | 7 ++- 7 files changed, 90 insertions(+), 24 deletions(-) create mode 100644 test/batch/batch_functor.jl diff --git a/test/batch/batch_functor.jl b/test/batch/batch_functor.jl new file mode 100644 index 000000000..d2f0683da --- /dev/null +++ b/test/batch/batch_functor.jl @@ -0,0 +1,57 @@ +using Test +using GeometricMachineLearning +import Random + +Random.seed!(123) + +const data_set_raw = 1:10 +const data_set = reshape(data_set_raw, 1, 10) + +const dl = DataLoader(data_set) + +function test_batching_over_one_axis(batch_size::Int=3) + batch = Batch(batch_size) + tuple_indices = batch(dl) + @test length(tuple_indices) == Int(ceil(10 / batch_size)) + tuple_indices_sorted = sort(vcat(tuple_indices...)) + indices_sorted = [tuple_index[2] for tuple_index in tuple_indices_sorted] + @test indices_sorted == data_set_raw +end + +test_batching_over_one_axis() +test_batching_over_one_axis(5) + +const data_set_raw₂ = hcat(1:10, 21:30) +const data_set₂ = reshape(data_set_raw₂, 1, 10, 2) + +const dl₂ = DataLoader(data_set₂) + +function convert_vector_of_tuples_to_array(indices::Vector{<:Tuple}) + elem_in_tuple = length(indices[1]) + index_array = zeros(Int, length(indices), elem_in_tuple) + for (i, elem) in zip(axes(indices, 1), indices) + index_array[i, :] .= elem + end + index_array +end + +sort_according_to_first_column(A::Matrix) = A[sortperm(A[:, 1]), :] + +function test_batching_over_two_axis_with_seq_length(batch_size::Int=3, seq_length::Int=2, prediction_window::Int=1) + batch = Batch(batch_size, seq_length, prediction_window) + tuple_indices₂ = batch(dl₂) + @test length(tuple_indices₂) == Int(ceil(2 * (10 - (seq_length - 1) - batch.prediction_window) / batch_size)) # have to multiply with two because we deal with tuples + num_elems = 10 - (batch.seq_length -1) - batch.prediction_window + indices₁ = hcat(1:num_elems, 1 * ones(Int, num_elems)) + indices₂ = hcat(1:num_elems, 2 * ones(Int, num_elems)) + true_indices₁ = sort_according_to_first_column(vcat(indices₁, indices₂)) + true_indices₂ = sort_according_to_first_column(vcat(indices₂, indices₁)) + # convert indices to a format such that we can compare them to the true indices + indices_for_comparison = 
convert_vector_of_tuples_to_array(vcat(tuple_indices₂...)) + # sort the indices + indices_for_comparison = sort_according_to_first_column(indices_for_comparison) + @test indices_for_comparison == true_indices₁ || indices_for_comparison == true_indices₂ +end + +test_batching_over_two_axis_with_seq_length() +test_batching_over_two_axis_with_seq_length(3, 2, 2) \ No newline at end of file diff --git a/test/data_loader/batch_data_loader_qp_test.jl b/test/data_loader/batch_data_loader_qp_test.jl index 5c4ba35b7..f968932cf 100644 --- a/test/data_loader/batch_data_loader_qp_test.jl +++ b/test/data_loader/batch_data_loader_qp_test.jl @@ -5,17 +5,19 @@ import Random Random.seed!(1234) function dummy_qp_data_matrix(dim=2, number_data_points=200, T=Float32) - (q = rand(T, dim, number_data_points), p = (rand(T, dim, number_data_points))) + @assert iseven(dim) + (q = rand(T, dim ÷ 2, number_data_points), p = (rand(T, dim ÷ 2, number_data_points))) end function dummy_qp_data_tensor(dim=2, number_of_time_steps=100, number_of_parameters=20, T=Float32) - (q = rand(T, dim, number_of_time_steps, number_of_parameters), p = (rand(T, dim, number_of_time_steps, number_of_parameters))) + @assert iseven(dim) + (q = rand(T, dim ÷ 2, number_of_time_steps, number_of_parameters), p = (rand(T, dim ÷ 2, number_of_time_steps, number_of_parameters))) end function test_data_loader(dim=2, number_of_time_steps=100, number_of_parameters=20, batch_size=10, T=Float32) dl1 = DataLoader(dummy_qp_data_matrix(dim, number_of_time_steps, T)) - dl2 = DataLoader(dummy_qp_data_tensor(dim, number_of_time_steps, number_of_parameters)) + dl2 = DataLoader(dummy_qp_data_tensor(dim, number_of_time_steps, number_of_parameters, T)) arch1 = GSympNet(dl1) arch2 = GSympNet(dl2) @@ -23,15 +25,12 @@ function test_data_loader(dim=2, number_of_time_steps=100, number_of_parameters= nn1 = NeuralNetwork(arch1, CPU(), T) nn2 = NeuralNetwork(arch2, CPU(), T) - loss1 = GeometricMachineLearning.loss(nn1, dl1) - loss2 = GeometricMachineLearning.loss(nn2, dl2) - batch = Batch(batch_size) o₁ = Optimizer(GradientOptimizer(), nn1) - # o₂ = Optimizer(GradientOptimizer(), nn2) + o₂ = Optimizer(GradientOptimizer(), nn2) o₁(nn1, dl1, batch) - # o₂(nn2, dl2, batch) + o₂(nn2, dl2, batch) end test_data_loader() \ No newline at end of file diff --git a/test/data_loader/data_loader_optimization_step.jl b/test/data_loader/data_loader_optimization_step.jl index 7a245b0e2..82e972618 100644 --- a/test/data_loader/data_loader_optimization_step.jl +++ b/test/data_loader/data_loader_optimization_step.jl @@ -13,7 +13,8 @@ function test_data_loader(sys_dim, n_time_steps, n_params, T=Float32) # first argument is sys_dim, second is number of heads, third is number of units model = Transformer(dl.input_dim, 2, 1) ps = initialparameters(model, CPU(), T) - dx = Zygote.gradient(ps -> GeometricMachineLearning.loss(model, ps, dl), ps)[1] + loss = GeometricMachineLearning.TransformerLoss(n_time_steps) + dx = Zygote.gradient(ps -> loss(model, ps, dl.input, dl.input), ps)[1] ps_copy = deepcopy(ps) o = Optimizer(GradientOptimizer(), ps) optimization_step!(o, model, ps, dx) diff --git a/test/data_loader/draw_batch_for_tensor_test.jl b/test/data_loader/draw_batch_for_tensor_test.jl index d384a36f5..c777f205a 100644 --- a/test/data_loader/draw_batch_for_tensor_test.jl +++ b/test/data_loader/draw_batch_for_tensor_test.jl @@ -4,18 +4,18 @@ import Random Random.seed!(1234) - -function test_data_loader_for_qp_tensor(system_dim2::Int, input_time_steps::Int, n_params::Int, batch_size::Int, 
seq_length::Int) +function test_data_loader_for_qp_tensor(system_dim2::Int, input_time_steps::Int, n_params::Int, batch_size::Int, seq_length::Int, prediction_window::Union{Int, Nothing}=nothing) + _prediction_window = isnothing(prediction_window) ? seq_length : prediction_window dummy_data = (q = rand(system_dim2, input_time_steps, n_params), p = rand(system_dim2, input_time_steps, n_params)) dl = DataLoader(dummy_data) - batch = Batch(batch_size, seq_length) + batch = isnothing(prediction_window) ? Batch(batch_size, seq_length) : Batch(batch_size, seq_length, prediction_window) batch_indices_all = batch(dl) for batch_indices in batch_indices_all batched_array = convert_input_and_batch_indices_to_array(dl, batch, batch_indices) - @test size(batched_array[1].q) == (system_dim2, seq_length, batch_size) - @test size(batched_array[2].q) == (system_dim2, 1, batch_size) + @test size(batched_array[1].q) == (system_dim2, seq_length, batch_size) + @test size(batched_array[2].q) == (system_dim2, _prediction_window, batch_size) end end diff --git a/test/data_loader/mnist_utils.jl b/test/data_loader/mnist_utils.jl index e65fe1ddd..bcda1bd72 100644 --- a/test/data_loader/mnist_utils.jl +++ b/test/data_loader/mnist_utils.jl @@ -65,12 +65,14 @@ function test_optimizer_for_classification_layer(; dim₁=28, dim₂=28, number_ model = Classification(patch_length * patch_length, 10, activation_function) ps = initialparameters(model, CPU(), T) - loss₁ = GeometricMachineLearning.loss(model, ps, dl) + loss = GeometricMachineLearning.ClassificationTransformerLoss() + loss_dl(model::GeometricMachineLearning.AbstractExplicitLayer, ps::Union{Tuple, NamedTuple}, dl::DataLoader) = loss(model, ps, dl.input, dl.output) + loss₁ = loss_dl(model, ps, dl) opt = Optimizer(GradientOptimizer(), ps) - dx = Zygote.gradient(ps -> GeometricMachineLearning.loss(model, ps, dl), ps)[1] + dx = Zygote.gradient(ps -> loss_dl(model, ps, dl), ps)[1] optimization_step!(opt, model, ps, dx) - loss₂ = GeometricMachineLearning.loss(model, ps, dl) + loss₂ = loss_dl(model, ps, dl) @test loss₂ < loss₁ end diff --git a/test/data_loader/optimizer_functor_with_adam.jl b/test/data_loader/optimizer_functor_with_adam.jl index 55cc6e236..61cc3dbc7 100644 --- a/test/data_loader/optimizer_functor_with_adam.jl +++ b/test/data_loader/optimizer_functor_with_adam.jl @@ -12,12 +12,12 @@ end """ This creates a dummy MNIST data set; i.e. its output are two tensors that look similarly to the ones in the MNIST data set. 
""" -function create_dummy_mnist(;T=Float32, dim₁=6, dim₂=6, n_images=10) +function create_dummy_mnist(; T=Float32, dim₁=6, dim₂=6, n_images=10) rand(T, dim₁, dim₂, n_images), Int.(floor.(10*rand(T, n_images))) end -function test_optimizer_functor_with_adam(;T=Float32, dim₁=6, dim₂=6, n_images=10, patch_length=3) +function test_optimization_with_adam(;T=Float32, dim₁=6, dim₂=6, n_images=10, patch_length=3) dl = DataLoader(create_dummy_mnist(T=T, dim₁=dim₁, dim₂=dim₂, n_images=n_images)...; patch_length=patch_length) # batch size is equal to two @@ -28,15 +28,17 @@ function test_optimizer_functor_with_adam(;T=Float32, dim₁=6, dim₂=6, n_imag ps = initialparameters(model, CPU(), Float32) - loss₁ = GeometricMachineLearning.loss(model, ps, dl) + loss = GeometricMachineLearning.ClassificationTransformerLoss() + + loss₁ = loss(model, ps, dl.input, dl.output) opt = Optimizer(AdamOptimizer(), ps) - loss_average = optimize_for_one_epoch!(opt, model, ps, dl, batch) + loss_average = optimize_for_one_epoch!(opt, model, ps, dl, batch, loss) - loss₃ = GeometricMachineLearning.loss(model, ps, dl) + loss₃ = loss(model, ps, dl.input, dl.output) #check if the loss decreases during optimization @test loss₁ > loss_average > loss₃ end -test_optimizer_functor_with_adam() \ No newline at end of file +test_optimization_with_adam() \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 5242b3c81..46206770b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,11 +8,14 @@ using SafeTestsets @safetestset "Scalar multiplication tests for custom arrays " begin include("arrays/scalar_multiplication_for_custom_arrays.jl") end @safetestset "Matrix multiplication tests for custom arrays " begin include("arrays/matrix_multiplication_for_custom_arrays.jl") end @safetestset "Test constructors for custom arrays " begin include("arrays/constructor_tests_for_custom_arrays.jl") end +@safetestset "Test triangular matrices " begin include("arrays/triangular.jl") end @safetestset "Manifolds (Grassmann): " begin include("manifolds/grassmann_manifold.jl") end @safetestset "Gradient Layer " begin include("layers/gradient_layer_tests.jl") end @safetestset "Test symplecticity of upscaling layer " begin include("layers/sympnet_layers_test.jl") end @safetestset "Hamiltonian Neural Network " begin include("hamiltonian_neural_network_tests.jl") end @safetestset "Manifold Neural Network Layers " begin include("layers/manifold_layers.jl") end + +@safetestset "Custom inverse for 2x2, 3x3, 4x4, 5x5 matrices " begin include("kernels/tensor_inverse.jl") end @safetestset "Custom AD rules for kernels " begin include("custom_ad_rules/kernel_pullbacks.jl") end @safetestset "ResNet " begin include("layers/resnet_tests.jl") end @@ -54,4 +57,6 @@ using SafeTestsets @safetestset "Test triangular arrays " begin include("arrays/triangular.jl") end -@safetestset "Test volume-preserving feedforward neural network " begin include("layers/volume_preserving_feedforward.jl") end \ No newline at end of file +@safetestset "Test volume-preserving feedforward neural network " begin include("layers/volume_preserving_feedforward.jl") end + +@safetestset "Batch functor(s) " begin include("batch/batch_functor.jl") end \ No newline at end of file From 980d16aa0b195c1903d35a70e88fd3763dc45647 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:01:20 +0200 Subject: [PATCH 03/16] Implemented new DataLoader interface. 
--- docs/src/data_loader/data_loader.md | 4 +- src/data_loader/batch.jl | 204 ++++++++++++---------------- src/data_loader/data_loader.jl | 104 +++++++++++--- 3 files changed, 178 insertions(+), 134 deletions(-) diff --git a/docs/src/data_loader/data_loader.md b/docs/src/data_loader/data_loader.md index 5a2590b6f..01d5d5788 100644 --- a/docs/src/data_loader/data_loader.md +++ b/docs/src/data_loader/data_loader.md @@ -69,7 +69,7 @@ Markdown.parse(description(Val(:batch_functor_matrix))) using GeometricMachineLearning # hide matrix_data = rand(Float32, 2, 10) -dl = DataLoader(matrix_data) +dl = DataLoader(matrix_data; autoencoder=true) batch = Batch(3) batch(dl) @@ -112,7 +112,7 @@ We can also sample tensor data. ```@example using GeometricMachineLearning # hide -qp_data = (q = rand(Float32, 2, 8, 3), p = rand(Float32, 2, 8, 3)) +qp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3)) dl = DataLoader(qp_data) # also specify sequence length here diff --git a/src/data_loader/batch.jl b/src/data_loader/batch.jl index f93c84f59..c0a0cba9f 100644 --- a/src/data_loader/batch.jl +++ b/src/data_loader/batch.jl @@ -1,7 +1,16 @@ description(::Val{:Batch}) = raw""" -`Batch` is a struct with an associated functor that acts on an instance of `DataLoader`. +`Batch` is a struct whose functor acts on an instance of `DataLoader` to produce a sequence of training samples for one epoch. -The constructor of `Batch` takes `batch_size` (an integer) as input argument. Optionally we can provide `seq_length` if we deal with time series data and want to draw batches of a certain *length* (i.e. a range contained in the second dimension of the input array). +## The Constructor + +The constructor for `Batch` is called with: +- `batch_size::Int` +- `seq_length::Int` (optional) +- `prediction_window::Int` (optional) + +## The functor + +An instance of `Batch` can be called on an instance of `DataLoader` to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. """ description(::Val{:batch_functor_matrix}) = raw""" @@ -23,43 +32,48 @@ $(description(Val(:batch_functor_matrix))) $(description(Val(:batch_functor_tensor))) """ -struct Batch{seq_type <: Union{Nothing, Integer}} +struct Batch{BatchType} batch_size::Int - seq_length::seq_type + seq_length::Int + prediction_window::Int +end - function Batch(batch_size, seq_length = nothing) - new{typeof(seq_length)}(batch_size, seq_length) - end +# add an additional constructor for `TransformerLoss` taking batch as input. +TransformerLoss(batch::Batch{:Transformer}) = TransformerLoss(batch.seq_length, batch.prediction_window) + +Base.iterate(nn::NeuralNetwork, ics, batch::Batch{:Transformer}; n_points = 100) = iterate(nn, ics; n_points = n_points, prediction_window = batch.prediction_window) + +function Batch(batch_size::Int) + Batch{:FeedForward}(batch_size, 1, 1) end -hasseqlength(::Batch{<:Integer}) = true -hasseqlength(::Batch{<:Nothing}) = false +function Batch(batch_size::Int, seq_length::Int) + Batch{:Transformer}(batch_size, seq_length, seq_length) +end -""" -This function is called when either dealing with a matrix or a tensor where we always consider the entire time series. 
-""" -function batch_over_one_axis(batch::Batch, number_columns::Int) - indices = shuffle(1:number_columns) - n_batches = Int(ceil(number_columns / batch.batch_size)) - batches = () - for batch_number in 1:(n_batches - 1) - batches = (batches..., indices[(batch_number - 1) * batch.batch_size + 1 : batch_number * batch.batch_size]) - end - (batches..., indices[(n_batches - 1) * batch.batch_size + 1:number_columns]) +function Batch(batch_size::Int, seq_length::Int, prediction_window::Int) + Batch{:Transformer}(batch_size, seq_length, prediction_window) end -function (batch::Batch{<:Nothing})(dl::DataLoader{T, BT, OT, RegularData}) where {T, AT<:AbstractArray{T}, BT<:Union{AT, NamedTuple{(:q, :p), Tuple{AT, AT}}}, OT} - batch_over_one_axis(batch, dl.n_params) +Batch(::Int, ::Nothing, ::Int) = error("Cannot provide prediction window alone. Need sequence length!") + +@doc raw""" +Gives the number of batches. Inputs are of type `DataLoader` and `Batch`. + +Here the big distinction is between data that are *time-series like* and data that are *autoencoder like*. +""" +function number_of_batches(dl::DataLoader{T, AT, OT, :TimeSeries}, batch::Batch) where {T, BT<:AbstractArray{T, 3}, AT<:Union{BT, NamedTuple{(:q, :p), Tuple{BT, BT}}}, OT} + @assert dl.input_time_steps > (batch.seq_length + batch.prediction_window) "The number of time steps has to be greater than sequence length + prediction window." + Int(ceil((dl.input_time_steps - (batch.seq_length - 1) - batch.prediction_window) * dl.n_params / batch.batch_size)) end -function (batch::Batch{<:Nothing})(dl::DataLoader{T, BT, OT, TimeSteps}) where {T, AT<:AbstractArray{T}, BT<:Union{AT, NamedTuple{(:q, :p), Tuple{AT, AT}}}, OT} - batch_over_one_axis(batch, dl.input_time_steps) +function number_of_batches(dl::DataLoader{T, AT, OT, :RegularData}, batch::Batch) where {T, BT<:AbstractArray{T, 3}, AT<:Union{BT, NamedTuple{(:q, :p), Tuple{BT, BT}}}, OT} + Int(ceil(dl.input_time_steps * dl.n_params / batch.batch_size)) end -# Batching for tensor with three axes (unsupervised learning). -function (batch::Batch{<:Integer})(dl::DataLoader{T, BT, Nothing}) where {T, AT<:AbstractArray{T, 3}, BT<:Union{AT, NamedTuple{(:q, :p), Tuple{AT, AT}}}} - time_indices = shuffle(1:(dl.input_time_steps - batch.seq_length)) - parameter_indices = shuffle(1:dl.n_params) +function batch_over_two_axes(batch::Batch, number_columns::Int, third_dim::Int, dl::DataLoader) + time_indices = shuffle(1:number_columns) + parameter_indices = shuffle(1:third_dim) complete_indices = Iterators.product(time_indices, parameter_indices) |> collect |> vec batches = () n_batches = number_of_batches(dl, batch) @@ -67,21 +81,14 @@ function (batch::Batch{<:Integer})(dl::DataLoader{T, BT, Nothing}) where {T, AT< batches = (batches..., complete_indices[(batch_number - 1) * batch.batch_size + 1 : batch_number * batch.batch_size]) end (batches..., complete_indices[(n_batches - 1) * batch.batch_size + 1:end]) -end - -@doc raw""" -Gives the number of bathces. Inputs are of type `DataLoader` and `Batch`. 
-""" -function number_of_batches(dl::DataLoader{T, AT}, batch::Batch) where {T, BT<:AbstractMatrix{T}, AT<:Union{BT, NamedTuple{(:q, :p), Tuple{BT, BT}}}} - Int(ceil(dl.input_time_steps / batch.batch_size)) end -function number_of_batches(dl::DataLoader{T, AT}, batch::Batch{Nothing}) where {T, BT<:AbstractArray{T, 3}, AT<:Union{BT, NamedTuple{(:q, :p), Tuple{BT, BT}}}} - Int(ceil(dl.n_params / batch.batch_size)) +function (batch::Batch)(dl::DataLoader{T, BT, OT, :RegularData}) where {T, AT<:AbstractArray{T, 3}, BT<:Union{AT, NamedTuple{(:q, :p), Tuple{AT, AT}}}, OT} + batch_over_two_axes(batch, dl.input_time_steps, dl.n_params, dl) end -function number_of_batches(dl::DataLoader{T, AT}, batch::Batch{<:Integer}) where {T, BT<:AbstractArray{T, 3}, AT<:Union{BT, NamedTuple{(:q, :p), Tuple{BT, BT}}}} - Int(ceil((dl.input_time_steps - batch.seq_length) * dl.n_params / batch.batch_size)) +function (batch::Batch)(dl::DataLoader{T, BT, OT, :TimeSeries}) where {T, AT<:AbstractArray{T, 3}, BT<:Union{AT, NamedTuple{(:q, :p), Tuple{AT, AT}}}, OT} + batch_over_two_axes(batch, dl.input_time_steps - (batch.seq_length - 1) - batch.prediction_window, dl.n_params, dl) end @kernel function assign_input_from_vector_of_tuples_kernel!(q_input::AT, p_input::AT, input::NamedTuple{(:q, :p), Tuple{AT, AT}}, indices::AbstractArray{Int, 2}) where {T, AT<:AbstractArray{T, 3}} @@ -98,27 +105,23 @@ end end @kernel function assign_output_from_vector_of_tuples_kernel!(q_output::AT, p_output::AT, input::NamedTuple{(:q, :p), Tuple{AT, AT}}, indices::AbstractArray{Int, 2}, seq_length::Int) where {T, AT<:AbstractArray{T, 3}} - i, k = @index(Global, NTuple) + i, j, k = @index(Global, NTuple) - q_output[i, 1, k] = input.q[i, indices[1, k] + seq_length, indices[2, k]] - p_output[i, 1, k] = input.p[i, indices[1, k] + seq_length, indices[2, k]] + q_output[i, j, k] = input.q[i, indices[1, k] + seq_length + j - 1, indices[2, k]] + p_output[i, j, k] = input.p[i, indices[1, k] + seq_length + j - 1, indices[2, k]] end @kernel function assign_output_from_vector_of_tuples_kernel!(output::AT, data::AT, indices::AbstractArray{Int, 2}, seq_length::Int) where {T, AT<:AbstractArray{T, 3}} - i, k = @index(Global, NTuple) + i, j, k = @index(Global, NTuple) - output[i, 1, k] = data[i, indices[1, k] + seq_length, indices[2, k]] + output[i, j, k] = data[i, indices[1, k] + seq_length + j - 1, indices[2, k]] end -""" -Takes the output of the batch functor and uses it to create the corresponding array (NamedTuples). -""" -function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT}, batch::Batch, batch_indices_tuple::Vector{Tuple{Int, Int}}) where {T, AT<:AbstractArray{T, 3}, BT<:NamedTuple{(:q, :p), Tuple{AT, AT}}} - backend = KernelAbstractions.get_backend(dl.input.q) - q_input = KernelAbstractions.allocate(backend, T, dl.input_dim ÷ 2, batch.seq_length, length(batch_indices_tuple)) - p_input = similar(q_input) +# this is neeced if we want to use the vector of tuples in a kernel +function convert_vector_of_tuples_to_matrix(backend::Backend, batch_indices_tuple::Vector{Tuple{Int, Int}}) + _batch_size = length(batch_indices_tuple) - batch_indices = KernelAbstractions.allocate(backend, Int, 2, length(batch_indices_tuple)) + batch_indices = KernelAbstractions.allocate(backend, Int, 2, _batch_size) batch_indices_temp = zeros(Int, size(batch_indices)...) 
for t in axes(batch_indices_tuple, 1) batch_indices_temp[1, t] = batch_indices_tuple[t][1] @@ -126,14 +129,31 @@ function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT}, batch:: end batch_indices = typeof(batch_indices)(batch_indices_temp) + batch_indices +end + +""" +Takes the output of the batch functor and uses it to create the corresponding array (NamedTuples). +""" +function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT}, batch::Batch, batch_indices_tuple::Vector{Tuple{Int, Int}}) where {T, AT<:AbstractArray{T, 3}, BT<:NamedTuple{(:q, :p), Tuple{AT, AT}}} + backend = KernelAbstractions.get_backend(dl.input.q) + + # the batch size is smaller for the last batch + _batch_size = length(batch_indices_tuple) + + batch_indices = convert_vector_of_tuples_to_matrix(backend, batch_indices_tuple) + + q_input = KernelAbstractions.allocate(backend, T, dl.input_dim ÷ 2, batch.seq_length, _batch_size) + p_input = similar(q_input) + assign_input_from_vector_of_tuples! = assign_input_from_vector_of_tuples_kernel!(backend) - assign_input_from_vector_of_tuples!(q_input, p_input, dl.input, batch_indices, ndrange=(dl.input_dim ÷ 2, batch.seq_length, length(batch_indices_tuple))) + assign_input_from_vector_of_tuples!(q_input, p_input, dl.input, batch_indices, ndrange=(dl.input_dim ÷ 2, batch.seq_length, _batch_size)) - q_output = KernelAbstractions.allocate(backend, T, dl.input_dim ÷ 2, 1, length(batch_indices_tuple)) + q_output = KernelAbstractions.allocate(backend, T, dl.input_dim ÷ 2, batch.prediction_window, _batch_size) p_output = similar(q_output) assign_output_from_vector_of_tuples! = assign_output_from_vector_of_tuples_kernel!(backend) - assign_output_from_vector_of_tuples!(q_output, p_output, dl.input, batch_indices, batch.seq_length, ndrange=(dl.input_dim ÷ 2, length(batch_indices_tuple))) + assign_output_from_vector_of_tuples!(q_output, p_output, dl.input, batch_indices, batch.seq_length, ndrange=(dl.input_dim ÷ 2, batch.prediction_window, _batch_size)) (q = q_input, p = p_input), (q = q_output, p = p_output) end @@ -143,75 +163,31 @@ Takes the output of the batch functor and uses it to create the corresponding ar """ function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT}, batch::Batch, batch_indices_tuple::Vector{Tuple{Int, Int}}) where {T, BT<:AbstractArray{T, 3}} backend = KernelAbstractions.get_backend(dl.input) - input = KernelAbstractions.allocate(backend, T, dl.input_dim, batch.seq_length, length(batch_indices_tuple)) - batch_indices = KernelAbstractions.allocate(backend, Int, 2, length(batch_indices_tuple)) - batch_indices_temp = zeros(Int, size(batch_indices)...) - for t in axes(batch_indices_tuple, 1) - batch_indices_temp[1, t] = batch_indices_tuple[t][1] - batch_indices_temp[2, t] = batch_indices_tuple[t][2] - end - batch_indices = typeof(batch_indices)(batch_indices_temp) + # the batch size is smaller for the last batch + _batch_size = length(batch_indices_tuple) + + batch_indices = convert_vector_of_tuples_to_matrix(backend, batch_indices_tuple) + + input = KernelAbstractions.allocate(backend, T, dl.input_dim, batch.seq_length, _batch_size) assign_input_from_vector_of_tuples! 
= assign_input_from_vector_of_tuples_kernel!(backend) - assign_input_from_vector_of_tuples!(input, dl.input, batch_indices, ndrange=(dl.input_dim, batch.seq_length, length(batch_indices_tuple))) + assign_input_from_vector_of_tuples!(input, dl.input, batch_indices, ndrange=(dl.input_dim, batch.seq_length, _batch_size)) - output = KernelAbstractions.allocate(backend, T, dl.input_dim, 1, length(batch_indices_tuple)) + output = KernelAbstractions.allocate(backend, T, dl.input_dim, batch.prediction_window, _batch_size) assign_output_from_vector_of_tuples! = assign_output_from_vector_of_tuples_kernel!(backend) - assign_output_from_vector_of_tuples!(output, dl.input, batch_indices, batch.seq_length, ndrange=(dl.input_dim, length(batch_indices_tuple))) + assign_output_from_vector_of_tuples!(output, dl.input, batch_indices, batch.seq_length, ndrange=(dl.input_dim, batch.prediction_window, _batch_size)) input, output end -function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}, batch::Batch) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}} - optimize_for_one_epoch!(opt, model, ps, dl, batch, loss) -end - -function optimize_for_one_epoch!(opt::Optimizer, nn::NeuralNetwork, dl::DataLoader, batch::Batch) - optimize_for_one_epoch!(opt, nn.model, nn.params, dl, batch) -end -""" -This routine is called if a `DataLoader` storing *symplectic data* (i.e. a `NamedTuple`) is supplied. -""" -function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT}, batch::Batch, loss) where {T, AT<:NamedTuple} - count = 0 - total_error = T(0) - batches = batch(dl) - @views for batch_indices in batches - count += 1 - input_batch = (q=copy(dl.input.q[:,batch_indices]), p=copy(dl.input.p[:,batch_indices])) - # add +1 here !!!! - output_batch = (q=copy(dl.input.q[:,batch_indices.+1]), p=copy(dl.input.p[:,batch_indices.+1])) - loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_batch, output_batch), ps) - total_error += loss_value - dp = pullback(one(loss_value))[1] - optimization_step!(opt, model, ps, dp) - end - total_error / count -end - -@doc raw""" -A functor for `Optimizer`. It is called with: - - `nn::NeuralNetwork` - - `dl::DataLoader` - - `batch::Batch` - - `n_epochs::Int` - - `loss` - -The last argument is a function through which `Zygote` differentiates. This argument is optional; if it is not supplied `GeometricMachineLearning` defaults to an appropriate loss for the `DataLoader`. 
-""" -function (o::Optimizer)(nn::NeuralNetwork, dl::DataLoader, batch::Batch, n_epochs::Int, loss) - progress_object = ProgressMeter.Progress(n_epochs; enabled=true) - loss_array = zeros(n_epochs) - for i in 1:n_epochs - loss_array[i] = optimize_for_one_epoch!(o, nn.model, nn.params, dl, batch, loss) - ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_array[i])]) - end - loss_array -end +# for the case when the DataLoader also contains an output +function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT, OT}, ::Batch, batch_indices_tuple::Vector{Tuple{Int, Int}}) where {T, T1, BT<:AbstractArray{T, 3}, OT<:AbstractArray{T1, 3}} + _batch_indices = [batch_index[1] for batch_index in batch_indices_tuple] + input_batch = copy(dl.input[:, :, _batch_indices]) + output_batch = copy(dl.output[:, :, _batch_indices]) -function (o::Optimizer)(nn::NeuralNetwork, dl::DataLoader, batch::Batch, n_epochs::Int=1) - o(nn, dl, batch, n_epochs, loss) + input_batch, output_batch end \ No newline at end of file diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl index 14f9fe4b5..be2f2cd6b 100644 --- a/src/data_loader/data_loader.jl +++ b/src/data_loader/data_loader.jl @@ -1,8 +1,34 @@ description(::Val{:DataLoader}) = raw""" Data Loader is a struct that creates an instance based on a tensor (or different input format) and is designed to make training convenient. -The fields of the struct are the following: -- `data`: The input data with axes (i) system dimension, (ii) number of parameters and (iii) number of time steps. +## Constructor + +The data loader can be called with various inputs: + +### A single matrix + +If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps. + +### A single tensor + +If the data loader is called with a single tensor (and no other arguments are given), then this is interpreted as an *integration problem* with the second axis indicating the time step and the third one indicating the parameters. + +### A tensor and a vector + +This is a special case (MNIST classification problem). + +### A `NamedTuple` with fields `q` and `p`. + +The `NamedTuple` contains (i) two matrices or (ii) two tensors. + +### An `EnsembleSolution` + +The `EnsembleSolution` typically comes from `GeometricProblems`. +""" + +#= +The fields of the `DataLoader` struct are the following: +- `input`: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters. - `output`: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning). - `input_dim`: The *dimension* of the system, i.e. what is taken as input by a regular neural network. - `input_time_steps`: The length of the entire time series of the data @@ -11,28 +37,25 @@ The fields of the struct are the following: - `output_time_steps`: The size of the second axis of the output tensor (also called `prediction_window`, `output_time_steps=1` in most cases) If for the output we have a tensor whose second axis has length 1, we still store it as a tensor and not a matrix for consistency. 
-""" +=# """ $(description(Val(:DataLoader))) """ -struct DataLoader{T, AT<:Union{NamedTuple, AbstractArray{T}}, OT<:Union{AbstractArray, Nothing}, TimeSteps} +struct DataLoader{T, AT<:Union{NamedTuple, AbstractArray{T}}, OT<:Union{AbstractArray, Nothing}, DataType} input::AT output::OT input_dim::Int - input_time_steps::Union{Int, Nothing} - n_params::Union{Int, Nothing} + input_time_steps::Int + n_params::Int output_dim::Union{Int, Nothing} output_time_steps::Union{Int, Nothing} end -struct TimeSteps end -struct RegularData end - function DataLoader(data::AbstractArray{T, 3}) where T @info "You have provided a tensor with three axes as input. They will be interpreted as \n (i) system dimension, (ii) number of time steps and (iii) number of params." input_dim, input_time_steps, n_params = size(data) - DataLoader{T, typeof(data), Nothing, TimeSteps}(data, nothing, input_dim, input_time_steps, n_params, nothing, nothing) + DataLoader{T, typeof(data), Nothing, :TimeSeries}(data, nothing, input_dim, input_time_steps, n_params, nothing, nothing) end description(::Val{:data_loader_constructor_matrix}) = raw""" @@ -49,13 +72,17 @@ function DataLoader(data::AbstractMatrix{T}; autoencoder=true) where T if autoencoder ==false input_dim, time_steps = size(data) - return DataLoader{T, typeof(data), Nothing, TimeSteps}(data, nothing, input_dim, time_steps-1, nothing, nothing, nothing) + reshaped_data = reshape(data, input_dim, time_steps, 1) + return DataLoader{T, typeof(reshaped_data), Nothing, :TimeSeries}(reshaped_data, nothing, input_dim, time_steps, 1, nothing, nothing) elseif autoencoder ==true input_dim, n_params = size(data) - return DataLoader{T, typeof(data), Nothing, RegularData}(data, nothing, input_dim, nothing, n_params, nothing, nothing) + reshaped_data = reshape(data, input_dim, 1, n_params) + return DataLoader{T, typeof(reshaped_data), Nothing, :RegularData}(reshaped_data, nothing, input_dim, 1, n_params, nothing, nothing) end end +DataLoader(data::AbstractVector) = DataLoader(reshape(data, 1, length(data))) + # T and T1 are not the same because T1 is of Integer type function DataLoader(data::AbstractArray{T, 3}, target::AbstractVector{T1}; patch_length=7) where {T, T1} @info "You provided a tensor and a vector as input. This will be treated as a classification problem (MNIST). Tensor axes: (i) & (ii) image axes and (iii) parameter dimension." 
@@ -64,7 +91,7 @@ function DataLoader(data::AbstractArray{T, 3}, target::AbstractVector{T1}; patch number_of_patches = (im_dim₁ ÷ patch_length) * (im_dim₂ ÷ patch_length) target = onehotbatch(target) data_preprocessed = split_and_flatten(data, patch_length=patch_length, number_of_patches=number_of_patches) - DataLoader{T, typeof(data_preprocessed), typeof(target), RegularData}( + DataLoader{T, typeof(data_preprocessed), typeof(target), :RegularData}( data_preprocessed, target, patch_length^2, number_of_patches, n_params, 10, 1 ) end @@ -83,25 +110,66 @@ function DataLoader(data::NamedTuple{(:q, :p), Tuple{AT, AT}}; autoencoder=false if autoencoder == false dim2, time_steps = size(data.q) - return DataLoader{T, typeof(data), Nothing, TimeSteps}(data, nothing, dim2 * 2, time_steps - 1, nothing, nothing, nothing) + reshaped_data = (q = reshape(data.q, dim2, time_steps, 1), p = reshape(data.p, dim2, time_steps, 1)) + return DataLoader{T, typeof(reshaped_data), Nothing, :TimeSeries}(reshaped_data, nothing, dim2 * 2, time_steps, 1, nothing, nothing) elseif autoencoder == true dim2, n_params = size(data.q) - return DataLoader{T, typeof(data), Nothing, RegularData}(data, nothing, dim2 * 2, nothing, n_params, nothing, nothing) + reshaped_data = (q = reshape(data.q, dim2, 1, n_params), p = reshape(data.p, dim2, 1, n_params)) + return DataLoader{T, typeof(reshaped_data), Nothing, :RegularData}(reshaped_data, nothing, dim2 * 2, 1, n_params, nothing, nothing) end end -function DataLoader(data::NamedTuple{(:q, :p), Tuple{AT, AT}}; output_time_steps=1) where {T, AT<:AbstractArray{T, 3}} +function DataLoader(data::NamedTuple{(:q, :p), Tuple{AT, AT}}) where {T, AT<:AbstractArray{T, 3}} @info "You have provided a NamedTuple with keys q and p; the data are tensors. This is interpreted as *symplectic data*." dim2, time_steps, n_params = size(data.q) - DataLoader{T, typeof(data), Nothing, TimeSteps}(data, nothing, dim2 * 2, time_steps - 1, n_params, nothing, output_time_steps) + DataLoader{T, typeof(data), Nothing, :TimeSeries}(data, nothing, dim2 * 2, time_steps, n_params, nothing, nothing) +end + +DataLoader(data::NamedTuple{(:q, :p), Tuple{VT, VT}}) where {VT <: AbstractVector} = DataLoader((q = reshape(data.q, 1, length(data.q)), p = reshape(data.p, 1, length(data.p)))) + +""" +Constructor for `EnsembleSolution` form package `GeometricSolutions`. +""" +function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) where {T, T1, DT, ST <: GeometricSolution{T, T1, NamedTuple{(:q, :p), Tuple{DT, DT}}}} + + sys_dim, input_time_steps, n_params = length(ensemble_solution.s[1].q[0]), length(ensemble_solution.t), length(ensemble_solution.s) + + data = (q = zeros(sys_dim, input_time_steps, n_params), p = zeros(sys_dim, input_time_steps, n_params)) + + for (solution, i) in zip(ensemble_solution.s, axes(ensemble_solution.s, 1)) + for dim in 1:sys_dim + data.q[dim, :, i] = solution.q[:, dim] + data.p[dim, :, i] = solution.p[:, dim] + end + end + + DataLoader(data) +end + +""" +Constructor for `EnsembleSolution` from package `GeometricSolutions`. 
+""" +function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) where {T, T1, DT, ST <: GeometricSolution{T, T1, NamedTuple{(:q, ), Tuple{DT}}}} + + sys_dim, input_time_steps, n_params = length(ensemble_solution.s[1].q[0]), length(ensemble_solution.t), length(ensemble_solution.s) + + data = zeros(sys_dim, input_time_steps, n_params) + + for (solution, i) in zip(ensemble_solution.s, axes(ensemble_solution.s, 1)) + for dim in 1:sys_dim + data[dim, :, i] = solution.q[:, dim] + end + end + + DataLoader(data) end @doc raw""" Computes the accuracy (as opposed to the loss) of a neural network classifier. It takes as input: -- `model::Chain`: +- `model::Chain` - `ps`: parameters of the network - `dl::DataLoader` """ From 666cdfe26a68567381c84eeaada700884fe3a141 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:01:48 +0200 Subject: [PATCH 04/16] Classified SympNet as NeuralNetworkIntegrator. --- src/architectures/sympnet.jl | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/src/architectures/sympnet.jl b/src/architectures/sympnet.jl index dba449c4e..e7362959a 100644 --- a/src/architectures/sympnet.jl +++ b/src/architectures/sympnet.jl @@ -4,7 +4,7 @@ SympNet type encompasses GSympNets and LASympnets. TODO: -[ ] add bias to `LASympNet`! """ -abstract type SympNet{AT} <: Architecture end +abstract type SympNet{AT} <: NeuralNetworkIntegrator end @doc raw""" `LASympNet` is called with **a single input argument**, the **system dimension**, or with an instance of `DataLoader`. Optional input arguments are: @@ -144,35 +144,4 @@ function Chain(arch::LASympNet{AT, true, true}) where {AT} layers = isodd(j) ? (layers..., LinearLayerQ(arch.dim)) : (layers..., LinearLayerP(arch.dim)) end Chain(layers...) -end - -@doc raw""" -This function computes a trajectory for a SympNet that has already been trained for valuation purposes. - -It takes as input: -- `nn`: a `NeuralNetwork` (that has been trained). -- `ics`: initial conditions (a `NamedTuple` of two vectors) -""" -function Iterate_Sympnet(nn::NeuralNetwork{<:SympNet}, ics::NamedTuple{(:q, :p), Tuple{AT, AT}}; n_points = 100) where {T, AT<:AbstractVector{T}} - - n_dim = length(ics.q) - backend = KernelAbstractions.get_backend(ics.q) - - # Array to store the predictions - q_valuation = KernelAbstractions.allocate(backend, T, n_dim, n_points) - p_valuation = KernelAbstractions.allocate(backend, T, n_dim, n_points) - - # Initialisation - @views q_valuation[:,1] = ics.q - @views p_valuation[:,1] = ics.p - - #Computation of phase space - @views for i in 2:n_points - qp_temp = (q=q_valuation[:,i-1], p=p_valuation[:,i-1]) - qp_prediction = nn(qp_temp) - q_valuation[:,i] = qp_prediction.q - p_valuation[:,i] = qp_prediction.p - end - - (q=q_valuation, p=p_valuation) end \ No newline at end of file From 670ceb53850f098cf7d8a6dc3ac9bd38e047fc13 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:02:04 +0200 Subject: [PATCH 05/16] Updated optimization routines. 
--- src/data_loader/optimize.jl | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/src/data_loader/optimize.jl b/src/data_loader/optimize.jl index 04ba27181..e6eb1c766 100644 --- a/src/data_loader/optimize.jl +++ b/src/data_loader/optimize.jl @@ -15,24 +15,7 @@ output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoc ``` This is done because any **reverse differentiation** routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of zygote: `loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps)` (if the loss only depends on the parameters). """ -function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}, batch::Batch, loss) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}} - count = 0 - total_error = T(0) - batches = batch(dl) - @views for batch_indices in batches - count += 1 - # these `copy`s should not be necessary! coming from a Zygote problem! - input_batch = copy(dl.input[:, :, batch_indices]) - output_batch = copy(dl.output[:, :, batch_indices]) - loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_batch, output_batch), ps) - total_error += loss_value - dp = pullback(one(loss_value))[1] - optimization_step!(opt, model, ps, dp) - end - total_error / count -end - -function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, CT, Nothing}, batch::Batch, loss::NetworkLoss) where {T, AT<:AbstractArray{T, 3}, BT<:NamedTuple{(:q, :p), Tuple{AT, AT}}, CT<:Union{AT, BT}} +function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T}, batch::Batch, loss::Union{typeof(loss), NetworkLoss}) where T count = 0 total_error = T(0) batches = batch(dl) @@ -68,16 +51,12 @@ function (o::Optimizer)(nn::NeuralNetwork, dl::DataLoader, batch::Batch, n_epoch loss_array end -#= -function (o::Optimizer)(nn::NeuralNetwork{<:TransformerIntegrator}, dl::DataLoader, batch::Batch{Int}, n_epochs::Int=1) +function (o::Optimizer)(nn::NeuralNetwork{<:TransformerIntegrator}, dl::DataLoader, batch::Batch{:Transformer}, n_epochs::Int=1) loss = TransformerLoss(batch) o(nn, dl, batch, n_epochs, loss) end -function (o::Optimizer)(nn::NeuralNetwork{<:NeuralNetworkIntegrator}, dl::DataLoader, batch::Batch{Int}, n_epochs::Int=1) +function (o::Optimizer)(nn::NeuralNetwork{<:NeuralNetworkIntegrator}, dl::DataLoader, batch::Batch{:FeedForward}, n_epochs::Int=1) loss = FeedForwardLoss() o(nn, dl, batch, n_epochs, loss) end - -(o::Optimizer)(nn::NeuralNetwork{<:NeuralNetworkIntegrator}, dl::DataLoader, batch::Batch{Nothing}, n_epochs::Int=1) = o(nn, dl, Batch(batch.batch_size, 1), n_epochs) -=# \ No newline at end of file From a79ca6d8c6e9a5a6596ce9ac542bdd3256b52683 Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:02:23 +0200 Subject: [PATCH 06/16] using _diff and _norm internally (also for NamedTuples). --- src/utils.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 16f447f53..230145293 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -37,8 +37,16 @@ function apply_toNT(fun, ps::NamedTuple...) NamedTuple{keys(ps[1])}(fun(p...) 
for p in zip(ps...)) end -# overloaded + operation -_add(dx₁::NamedTuple, dx₂::NamedTuple) = apply_toNT( _add, dx₁, dx₂) +# overload norm +_norm(dx::NT) where {AT <: AbstractArray, NT <: NamedTuple{(:q, :p), Tuple{AT, AT}}} = (norm(dx.q) + norm(dx.p)) / √2 # we need this because of a Zygote problem +_norm(dx::NamedTuple) = sum(apply_toNT(norm, dx)) / √length(dx) +_norm(A::AbstractArray) = norm(A) + +# overloaded +/- operation +_diff(dx₁::NT, dx₂::NT) where {AT <: AbstractArray, NT <: NamedTuple{(:q, :p), Tuple{AT, AT}}} = (q = dx₁.q - dx₂.q, p = dx₁.p - dx₂.p) # we need this because of a Zygote problem +_diff(dx₁::NamedTuple, dx₂::NamedTuple) = apply_toNT(_diff, dx₁, dx₂) +_diff(A::AbstractArray, B::AbstractArray) = A - B +_add(dx₁::NamedTuple, dx₂::NamedTuple) = apply_toNT(_add, dx₁, dx₂) _add(A::AbstractArray, B::AbstractArray) = A + B function add!(C::AbstractVecOrMat, A::AbstractVecOrMat, B::AbstractVecOrMat) From f1bd863b508026c6ec3722c4c313aadb6740affd Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 08:02:32 +0200 Subject: [PATCH 07/16] Updated loss. --- src/loss/losses.jl | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/loss/losses.jl b/src/loss/losses.jl index ad16bb49f..a610db73a 100644 --- a/src/loss/losses.jl +++ b/src/loss/losses.jl @@ -1,9 +1,22 @@ abstract type NetworkLoss end -function (loss::NetworkLoss)(nn::NeuralNetwork, input::AT, output::AT) where {AT <: AbstractArray} +function (loss::NetworkLoss)(nn::NeuralNetwork, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}} loss(nn.model, nn.params, input, output) end +function _compute_loss(output_prediction::CT1, output::CT2) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}, CT1 <: CT, CT2 <: CT} + _norm(_diff(output_prediction, output)) / _norm(output) +end + +function _compute_loss(model::Union{AbstractExplicitLayer, Chain}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}} + output_prediction = model(input, ps) + _compute_loss(output_prediction, output) +end + +function (loss::NetworkLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::CT, output::CT) where {AT<:AbstractArray, BT <: NamedTuple{(:q, :p), Tuple{AT, AT}}, CT <: Union{AT, BT}} + _compute_loss(model, ps, input, output) +end + @doc raw""" The loss for a transformer network (especially a transformer integrator). The constructor is called with: - `seq_length::Int` @@ -14,16 +27,16 @@ struct TransformerLoss <: NetworkLoss prediction_window::Int end -TransformerLoss(seq_length::Int) = TransformerLoss(seq_length, 1) +TransformerLoss(seq_length::Int) = TransformerLoss(seq_length, seq_length) @doc raw""" This crops the output array of the neural network so that it conforms with the output it should be compared to. This is needed for the transformer loss. 
""" -function crop_array_for_transformer_loss(nn_output::AT, output::AT) where {T, AT<:AbstractArray{T, 3}} +function crop_array_for_transformer_loss(nn_output::AT, output::BT) where {T, T2, AT <: AbstractArray{T, 3}, BT <: AbstractArray{T2, 3}} @view nn_output[axes(output, 1), axes(output, 2) .+ size(nn_output, 2) .- size(output, 2), axes(output, 3)] end -function (loss::TransformerLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {T, AT <: Union{AbstractArray{T, 2}, AbstractArray{T, 3}}} +function (loss::TransformerLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {T, AT <: Union{AbstractArray{T, 2}, AbstractArray{T, 3}}} input_dim, input_seq_length = size(input) output_dim, output_prediction_window = size(output) @assert input_dim == output_dim @@ -32,11 +45,15 @@ function (loss::TransformerLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, inp predicted_output_uncropped = model(input, ps) predicted_output_cropped = crop_array_for_transformer_loss(predicted_output_uncropped, output) - norm(predicted_output_cropped - output) / norm(output) + _compute_loss(predicted_output_cropped, output) end -struct FeedForwardLoss <: NetworkLoss end +struct ClassificationTransformerLoss <: NetworkLoss end + +function (loss::ClassificationTransformerLoss)(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::AbstractArray, output::AbstractArray) + predicted_output_uncropped = model(input, ps) + predicted_output_cropped = crop_array_for_transformer_loss(predicted_output_uncropped, output) + norm(predicted_output_cropped - output) / norm(output) +end -function (loss::FeedForwardLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {AT <: AbstractArray} - norm(model(input, ps) - output) / norm(output) -end \ No newline at end of file +struct FeedForwardLoss <: NetworkLoss end \ No newline at end of file From ec39e67ad86884e4beb4a00bf06fa52771a9c3bd Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 15:46:25 +0200 Subject: [PATCH 08/16] adjusted to new API. --- docs/src/tutorials/sympnet_tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md index 7fa3a1ba6..5f1c4aa17 100644 --- a/docs/src/tutorials/sympnet_tutorial.md +++ b/docs/src/tutorials/sympnet_tutorial.md @@ -173,8 +173,8 @@ ics = (q=qp_data.q[:,1], p=qp_data.p[:,1]) steps_to_plot = 200 #predictions -la_trajectory = Iterate_Sympnet(la_nn, ics; n_points = steps_to_plot) -g_trajectory = Iterate_Sympnet(g_nn, ics; n_points = steps_to_plot) +la_trajectory = iterate(la_nn, ics; n_points = steps_to_plot) +g_trajectory = iterate(g_nn, ics; n_points = steps_to_plot) using Plots p2 = plot(qp_data.q'[1:steps_to_plot], qp_data.p'[1:steps_to_plot], label="training data") From 72f09c1a82be5e5a249e74b806ca961e8d4c360f Mon Sep 17 00:00:00 2001 From: benedict-96 Date: Fri, 12 Apr 2024 19:22:25 +0200 Subject: [PATCH 09/16] Added tests for all changes. 
---
 test/runtests.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 64b37f323..aab723333 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -62,4 +62,7 @@ using SafeTestsets
 @safetestset "SympNet integrator " begin include("sympnet_integrator.jl") end
 @safetestset "Regular transformer integrator " begin include("regular_transformer_integrator.jl") end
-@safetestset "Batch functor(s) " begin include("batch/batch_functor.jl") end
\ No newline at end of file
+@safetestset "Batch functor(s) " begin include("batch/batch_functor.jl") end
+
+@safetestset "SympNet integrator " begin include("sympnet_integrator.jl") end
+@safetestset "Regular transformer integrator " begin include("regular_transformer_integrator.jl") end
\ No newline at end of file

From 4ec05aa3249bd14b343574658a5ad3aeb0e30184 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Fri, 12 Apr 2024 19:21:06 +0200
Subject: [PATCH 10/16] Adjusted to new interface.

---
 docs/src/tutorials/sympnet_tutorial.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/tutorials/sympnet_tutorial.md b/docs/src/tutorials/sympnet_tutorial.md
index 5f1c4aa17..35066f75e 100644
--- a/docs/src/tutorials/sympnet_tutorial.md
+++ b/docs/src/tutorials/sympnet_tutorial.md
@@ -174,7 +174,7 @@ steps_to_plot = 200
 #predictions
 la_trajectory = iterate(la_nn, ics; n_points = steps_to_plot)
-g_trajectory = iterate(g_nn, ics; n_points = steps_to_plot)
+g_trajectory = iterate(g_nn, ics; n_points = steps_to_plot)

 using Plots
 p2 = plot(qp_data.q'[1:steps_to_plot], qp_data.p'[1:steps_to_plot], label="training data")

From e0ff55e7685af218cf6998f78f3f0dbc902b1fac Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Fri, 12 Apr 2024 15:55:55 +0200
Subject: [PATCH 11/16] Fixed typo.

---
 src/architectures/regular_transformer_integrator.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/architectures/regular_transformer_integrator.jl b/src/architectures/regular_transformer_integrator.jl
index 0da6c2350..6655f564e 100644
--- a/src/architectures/regular_transformer_integrator.jl
+++ b/src/architectures/regular_transformer_integrator.jl
@@ -3,7 +3,7 @@ The regular transformer used as an integrator (multi-step method).
 The constructor is called with the following arguments:
 - `sys_dim::Int`
-- `transformer_dim::Int`: the defualt is `transformer_dim = sys_dim`.
+- `transformer_dim::Int`: the default is `transformer_dim = sys_dim`.
 - `n_blocks::Int`: The default is `1`.
 - `n_heads::Int`: the number of heads in the multihead attentio layer (default is `n_heads = sys_dim`)
 - `L::Int` the number of transformer blocks (default is `L = 2`).

From 088eb1020da947ae4362ddcdcf8b3b93662053cc Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Mon, 15 Apr 2024 11:54:19 +0200
Subject: [PATCH 12/16] Simplified docs.

---
 src/data_loader/data_loader.jl | 67 ++++++++++++----------------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/src/data_loader/data_loader.jl b/src/data_loader/data_loader.jl
index be2f2cd6b..a70b75598 100644
--- a/src/data_loader/data_loader.jl
+++ b/src/data_loader/data_loader.jl
@@ -4,43 +4,33 @@ Data Loader is a struct that creates an instance based on a tensor (or different input format).

 ## Constructor

 The data loader can be called with various inputs:
+- **A single vector**: If the data loader is called with a single vector (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the second axis indicates parameter values and/or time steps and the system has a single degree of freedom (i.e. the system dimension is one).
+- **A single matrix**: If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
+- **A single tensor**: If the data loader is called with a single tensor, then this is interpreted as an *integration problem* with the second axis indicating the time step and the third one indicating the parameters.
+- **A tensor and a vector**: This is a special case (MNIST classification problem). For the MNIST problem, for example, the inputs are ``n_p`` matrices (first input argument) and ``n_p`` integers (second input argument).
+- **A `NamedTuple` with fields `q` and `p`**: The `NamedTuple` contains (i) two matrices or (ii) two tensors.
+- **An `EnsembleSolution`**: The `EnsembleSolution` typically comes from `GeometricProblems`.
+
+When we supply a single vector or a single matrix as input to `DataLoader` and further set `autoencoder = false` (keyword argument), then the data are stored as an *integration problem* and the second axis is assumed to indicate time steps.
+"""
-
-### A single matrix
-
-If the data loader is called with a single matrix (and no other arguments are given), then this is interpreted as an autoencoder problem, i.e. the first axis is assumed to indicate the degrees of freedom of the system and the second axis indicates parameter values and/or time steps.
-
-### A single tensor
-
-If the data loader is called with a single tensor (and no other arguments are given), then this is interpreted as an *integration problem* with the second axis indicating the time step and the third one indicating the parameters.
-
-### A tensor and a vector
-
-This is a special case (MNIST classification problem).
-
-### A `NamedTuple` with fields `q` and `p`.
-
-The `NamedTuple` contains (i) two matrices or (ii) two tensors.
-
-### An `EnsembleSolution`
-
-The `EnsembleSolution` typically comes from `GeometricProblems`.
 """

 """
 $(description(Val(:DataLoader)))

 ## Fields of `DataLoader`

-#= The fields of the `DataLoader` struct are the following:
-- `input`: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.
-- `output`: The tensor that contains the output (supervised learning) - this may be of type Nothing if the constructor is only called with one tensor (unsupervised learning).
-- `input_dim`: The *dimension* of the system, i.e. what is taken as input by a regular neural network.
-- `input_time_steps`: The length of the entire time series of the data
-- `n_params`: The number of parameters that are present in the data set (length of third axis)
-- `output_dim`: The dimension of the output tensor (first axis).
-- `output_time_steps`: The size of the second axis of the output tensor (also called `prediction_window`, `output_time_steps=1` in most cases)
+ - `input`: The input data with axes (i) system dimension, (ii) number of time steps and (iii) number of parameters.
+ - `output`: The tensor that contains the output (supervised learning) - this may be of type `Nothing` if the constructor is only called with one tensor (unsupervised learning).
+ - `input_dim`: The *dimension* of the system, i.e. what is taken as input by a regular neural network.
+ - `input_time_steps`: The length of the entire time series (length of the second axis).
+ - `n_params`: The number of parameters that are present in the data set (length of third axis)
+ - `output_dim`: The dimension of the output tensor (first axis). If `output` is of type `Nothing`, then this is also of type `Nothing`.
+ - `output_time_steps`: The size of the second axis of the output tensor. If `output` is of type `Nothing`, then this is also of type `Nothing`.
-If for the output we have a tensor whose second axis has length 1, we still store it as a tensor and not a matrix for consistency.
-=#
+
+### The `input` and `output` fields of `DataLoader`
+
+Even though the arguments to the constructor may be vectors or matrices, internally `DataLoader` always stores tensors.
 """
 struct DataLoader{T, AT<:Union{NamedTuple, AbstractArray{T}}, OT<:Union{AbstractArray, Nothing}, DataType}
     input::AT
@@ -58,15 +48,6 @@ function DataLoader(data::AbstractArray{T, 3}) where T
     DataLoader{T, typeof(data), Nothing, :TimeSeries}(data, nothing, input_dim, input_time_steps, n_params, nothing, nothing)
 end

-description(::Val{:data_loader_constructor_matrix}) = raw"""
-The constructor for the data loader, when called on a matrix, also takes an optional argument `autoencoder`. If set to true than the data loader assumes we are dealing with an *autoencoder problem* and the field `n_params` in the `DataLoader` object will be set to the number of columns of our input matrix.
-If `autoencoder=false`, then the field `input_time_steps` of the `DataLoader` object will be set to the *number of columns minus 1*. This is because in this case the data are used to train a neural network integrator and we need to leave at least one time step after the last one free in order to have something that we can compare the prediction to.
-So we have that for an input of form ``(z^{(0)}, \ldots, z^{(T)})`` `input_time_steps` is ``T``.
-"""
-
-"""
-$(description(Val(:data_loader_constructor_matrix)))
-"""
 function DataLoader(data::AbstractMatrix{T}; autoencoder=true) where T
     @info "You have provided a matrix as input. The axes will be interpreted as (i) system dimension and (ii) number of parameters."
@@ -81,7 +62,7 @@ function DataLoader(data::AbstractMatrix{T}; autoencoder=true) where T
     end
 end

-DataLoader(data::AbstractVector) = DataLoader(reshape(data, 1, length(data)))
+DataLoader(data::AbstractVector; autoencoder=true) = DataLoader(reshape(data, 1, length(data)); autoencoder = autoencoder)

 # T and T1 are not the same because T1 is of Integer type
 function DataLoader(data::AbstractArray{T, 3}, target::AbstractVector{T1}; patch_length=7) where {T, T1}
@@ -129,7 +110,7 @@ end
 DataLoader(data::NamedTuple{(:q, :p), Tuple{VT, VT}}) where {VT <: AbstractVector} = DataLoader((q = reshape(data.q, 1, length(data.q)), p = reshape(data.p, 1, length(data.p))))

 """
-Constructor for `EnsembleSolution` form package `GeometricSolutions`.
+Constructor for `EnsembleSolution` from package `GeometricSolutions` with fields `q` and `p`.
 """

 function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) where {T, T1, DT, ST <: GeometricSolution{T, T1, NamedTuple{(:q, :p), Tuple{DT, DT}}}}
@@ -148,7 +129,7 @@ function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) wher
 """
-Constructor for `EnsembleSolution` from package `GeometricSolutions`.
+Constructor for `EnsembleSolution` from package `GeometricSolutions` with field `q`.
 """

 function DataLoader(ensemble_solution::EnsembleSolution{T, T1, Vector{ST}}) where {T, T1, DT, ST <: GeometricSolution{T, T1, NamedTuple{(:q, ), Tuple{DT}}}}

From c71290329436841126a3699caf98b65fd935bcd5 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Mon, 15 Apr 2024 11:55:10 +0200
Subject: [PATCH 13/16] Put part of the DataLoader docs to the corresponding md file.

---
 docs/src/data_loader/data_loader.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/src/data_loader/data_loader.md b/docs/src/data_loader/data_loader.md
index 01d5d5788..f0d196fea 100644
--- a/docs/src/data_loader/data_loader.md
+++ b/docs/src/data_loader/data_loader.md
@@ -25,10 +25,9 @@ SnapshotTensor = rand(Float32, 10, 100, 5)
 dl = DataLoader(SnapshotTensor)
 ```

-```@eval
-using GeometricMachineLearning, Markdown
-Markdown.parse(description(Val(:data_loader_constructor_matrix)))
-```
+The constructor for the data loader, when called on a matrix, also takes an optional argument `autoencoder`. If set to true than the data loader assumes we are dealing with an *autoencoder problem* and the field `n_params` in the `DataLoader` object will be set to the number of columns of our input matrix.
+If `autoencoder=false`, then the field `input_time_steps` of the `DataLoader` object will be set to the *number of columns minus 1*. This is because in this case the data are used to train a neural network integrator and we need to leave at least one time step after the last one free in order to have something that we can compare the prediction to.
+So we have that for an input of form ``(z^{(0)}, \ldots, z^{(T)})`` `input_time_steps` is ``T``.

 ```@example
 using GeometricMachineLearning # hide

From f3d2410e04224b57d4676b0311109baed9c8fa41 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Mon, 15 Apr 2024 11:55:46 +0200
Subject: [PATCH 14/16] fixed typo. skew_sym_mul_pullback -> symmetric_mul_pullback.

---
 src/kernels/kernel_ad_routines/mat_tensor_mul.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kernels/kernel_ad_routines/mat_tensor_mul.jl b/src/kernels/kernel_ad_routines/mat_tensor_mul.jl
index 3059c4adc..79ab0d546 100644
--- a/src/kernels/kernel_ad_routines/mat_tensor_mul.jl
+++ b/src/kernels/kernel_ad_routines/mat_tensor_mul.jl
@@ -253,7 +253,7 @@ function ChainRulesCore.rrule(::typeof(mat_tensor_mul), B::SymmetricMatrix{T}, A
         return f̄, SymmetricMatrix(dS, B.n), dA
     end

-    return C, skew_sym_mul_pullback
+    return C, symmetric_mul_pullback
 end

From 4ec05aa3249bd14b343574658a5ad3aeb0e30184 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Mon, 15 Apr 2024 12:25:51 +0200
Subject: [PATCH 15/16] Reworked section on DataLoader by moving some parts of the docstrings to the md file.
---
 docs/src/data_loader/data_loader.md | 53 +++++++++++++++--------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/docs/src/data_loader/data_loader.md b/docs/src/data_loader/data_loader.md
index f0d196fea..e4faac393 100644
--- a/docs/src/data_loader/data_loader.md
+++ b/docs/src/data_loader/data_loader.md
@@ -8,7 +8,7 @@ Markdown.parse(description(Val(:DataLoader)))
 The data loader can be called with various types of arrays as input, for example a [snapshot matrix](snapshot_matrix.md):

 ```@example
-using GeometricMachineLearning # hide
+using GeometricMachineLearning # hide

 SnapshotMatrix = rand(Float32, 10, 100)

 dl = DataLoader(SnapshotMatrix)
 ```

 or a snapshot tensor:

 ```@example
-using GeometricMachineLearning # hide
+using GeometricMachineLearning # hide

 SnapshotTensor = rand(Float32, 10, 100, 5)

 dl = DataLoader(SnapshotTensor)
 ```

-The constructor for the data loader, when called on a matrix, also takes an optional argument `autoencoder`. If set to true than the data loader assumes we are dealing with an *autoencoder problem* and the field `n_params` in the `DataLoader` object will be set to the number of columns of our input matrix.
-If `autoencoder=false`, then the field `input_time_steps` of the `DataLoader` object will be set to the *number of columns minus 1*. This is because in this case the data are used to train a neural network integrator and we need to leave at least one time step after the last one free in order to have something that we can compare the prediction to.
-So we have that for an input of form ``(z^{(0)}, \ldots, z^{(T)})`` `input_time_steps` is ``T``.
+Here the `DataLoader` has different properties `:RegularData` and `:TimeSeries`. This indicates that in the first case we treat all columns in the input tensor independently (this is mostly used for autoencoder problems), whereas in the second case we have *time series-like data*, which are mostly used for integration problems.
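(Editorial note, not part of this patch: the `:RegularData` and `:TimeSeries` tags mentioned above are stored as the last type parameter of `DataLoader`, as the tensor constructor in PATCH 12 shows. The following sketch assumes that the matrix constructor uses the `:RegularData` tag as described in the prose; only the `:TimeSeries` case is visible in the diffs above.)

```julia
using GeometricMachineLearning

dl_matrix = DataLoader(rand(Float32, 10, 100))     # columns treated independently (autoencoder-style data)
dl_tensor = DataLoader(rand(Float32, 10, 100, 5))  # second axis interpreted as time steps

typeof(dl_matrix)  # last type parameter assumed to be :RegularData
typeof(dl_tensor)  # last type parameter is :TimeSeries (see the constructor in PATCH 12)
```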
+We can also treat a problem whose input is a matrix as a time series-like problem by providing an additional keyword argument, `autoencoder=false`:

 ```@example
 using GeometricMachineLearning # hide
@@ -44,7 +43,7 @@ using GeometricMachineLearning, Markdown
 Markdown.parse(description(Val(:data_loader_for_named_tuple)))
 ```

-```@example
+```@example named_tuple_tensor
 using GeometricMachineLearning # hide

 SymplecticSnapshotTensor = (q = rand(Float32, 10, 100, 5), p = rand(Float32, 10, 100, 5))

 dl = DataLoader(SymplecticSnapshotTensor)
 ```

-## Convenience functions
+```@example named_tuple_tensor
+dl.input_dim
+```
+
+## The `Batch` struct

 ```@eval
 using GeometricMachineLearning, Markdown
 Markdown.parse(description(Val(:Batch)))
 ```

-```@eval
-using GeometricMachineLearning, Markdown
-Markdown.parse(description(Val(:batch_functor_matrix)))
-```

 ```@example
 using GeometricMachineLearning # hide

 matrix_data = rand(Float32, 2, 10)
-dl = DataLoader(matrix_data; autoencoder=true)
+dl = DataLoader(matrix_data; autoencoder = true)

 batch = Batch(3)
 batch(dl)
 ```

 This also works if the data are in ``qp`` form:

 ```@example
-using GeometricMachineLearning # hide
+using GeometricMachineLearning # hide

 qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
-dl = DataLoader(qp_data; autoencoder=true)
+dl = DataLoader(qp_data; autoencoder = true)

 batch = Batch(3)
 batch(dl)
 ```

+In those two examples the `autoencoder` keyword was set to `true` (the default). This is why the first index was always `1`. This changes if we set `autoencoder = false`:
+
 ```@example
-using GeometricMachineLearning # hide
+using GeometricMachineLearning # hide

 qp_data = (q = rand(Float32, 2, 10), p = rand(Float32, 2, 10))
-dl = DataLoader(qp_data; autoencoder=false) # false is default
+dl = DataLoader(qp_data; autoencoder = false)

 batch = Batch(3)
 batch(dl)
 ```

 Specifically the routines do the following:
-1. ``\mathtt{n\_indices}\leftarrow \mathtt{n\_params}\lor\mathtt{input\_time\_steps}``
-2. ``\mathtt{indices} \leftarrow \mathtt{shuffle}(\mathtt{1:\mathtt{n\_indices}})``
-3. ``\mathcal{I}_i \leftarrow \mathtt{indices[(i - 1)} \cdot \mathtt{batch\_size} + 1 \mathtt{:} i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1)``
-4. ``\mathcal{I}_\mathtt{last} \leftarrow \mathtt{indices[}(\mathtt{n\_batches} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}``
+1. ``\mathtt{n\_indices}\leftarrow \mathtt{n\_params}\lor\mathtt{input\_time\_steps},``
+2. ``\mathtt{indices} \leftarrow \mathtt{shuffle}(\mathtt{1:\mathtt{n\_indices}}),``
+3. ``\mathcal{I}_i \leftarrow \mathtt{indices[(i - 1)} \cdot \mathtt{batch\_size} + 1 \mathtt{:} i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1),``
+4. ``\mathcal{I}_\mathtt{last} \leftarrow \mathtt{indices[}(\mathtt{n\_batches} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}.``

 Note that the routines are implemented in such a way that no two indices appear double.

 ## Sampling from a tensor

 We can also sample tensor data.

 ```@example
 using GeometricMachineLearning # hide

 qp_data = (q = rand(Float32, 2, 20, 3), p = rand(Float32, 2, 20, 3))
 dl = DataLoader(qp_data)

 batch = Batch(4, 5)
 batch(dl)
 ```

 Sampling from a tensor is done the following way (``\mathcal{I}_i`` again denotes the batch indices for the ``i``-th batch):
-1. ``\mathtt{time\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:}(\mathtt{input\_time\_steps} - \mathtt{seq\_length})``
-2. ``\mathtt{parameter\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:n\_params})``
-3. ``\mathtt{complete\_indices} \leftarrow \mathtt{Iterators.product}(\mathtt{time\_indices}, \mathtt{parameter\_indices}) \mathtt{|> collect |> vec}``
-3. ``\mathcal{I}_i \leftarrow \mathtt{complete\_indices[}(i - 1) \cdot \mathtt{batch\_size} + 1 : i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1)``
-4. ``\mathcal{I}_\mathrm{last} \leftarrow \mathtt{complete\_indices[}(\mathrm{last} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}``
+1. ``\mathtt{time\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:}(\mathtt{input\_time\_steps} - \mathtt{seq\_length} - \mathtt{prediction\_window})),``
+2. ``\mathtt{parameter\_indices} \leftarrow \mathtt{shuffle}(\mathtt{1:n\_params}),``
+3. ``\mathtt{complete\_indices} \leftarrow \mathtt{product}(\mathtt{time\_indices}, \mathtt{parameter\_indices}),``
+4. ``\mathcal{I}_i \leftarrow \mathtt{complete\_indices[}(i - 1) \cdot \mathtt{batch\_size} + 1 : i \cdot \mathtt{batch\_size]}\text{ for }i=1, \ldots, (\mathrm{last} -1),``
+5. ``\mathcal{I}_\mathrm{last} \leftarrow \mathtt{complete\_indices[}(\mathrm{last} - 1) \cdot \mathtt{batch\_size} + 1\mathtt{:end]}.``

 This algorithm can be visualized the following way (here `batch_size = 4`):

From 608a043faa240c4d7e21ef607e49fde17be87c29 Mon Sep 17 00:00:00 2001
From: benedict-96
Date: Mon, 15 Apr 2024 12:26:24 +0200
Subject: [PATCH 16/16] Updated docs.

---
 src/data_loader/batch.jl | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/src/data_loader/batch.jl b/src/data_loader/batch.jl
index c0a0cba9f..8aa970287 100644
--- a/src/data_loader/batch.jl
+++ b/src/data_loader/batch.jl
@@ -1,5 +1,5 @@ description(::Val{:Batch}) = raw"""
-`Batch` is a struct whose functor acts on an instance of `DataLoader` to produce a sequence of training samples for one epoch.
+`Batch` is a struct whose functor acts on an instance of `DataLoader` to produce a sequence of training samples for training for one epoch.

 ## The Constructor

 The constructor for `Batch` is called with:
 - `batch_size::Int`
 - `seq_length::Int` (optional)
 - `prediction_window::Int` (optional)

+The first one of these arguments is required; it indicates the number of training samples in a batch. If we deal with time series data then we can additionally supply a *sequence length* and a *prediction window* as input arguments to `Batch`. These indicate the number of input vectors and the number of output vectors.

 ## The functor

-An instance of `Batch` can be called on an instance of `DataLoader` to produce a sequence of samples that contain all the input data, i.e. for training for one epoch.
-"""
-
-description(::Val{:batch_functor_matrix}) = raw"""
-For a snapshot matrix (or a `NamedTuple` of the form `(q=A, p=B)` where `A` and `B` are matrices), the functor for `Batch` is called on an instance of `DataLoader`. It then returns a tuple of batch indices:
-- for `autoencoder=true`: ``(\mathcal{I}_1, \ldots, \mathcal{I}_{\lceil\mathtt{n\_params/batch\_size}\rceil})``, where the index runs from 1 to the number of batches, which is the number of columns in the snapshot matrix divided by the batch size (rounded up).
-- for `autoencoder=false`: ``(\mathcal{I}_1, \ldots, \mathcal{I}_{\lceil\mathtt{dl.input\_time\_steps/batch\_size}\rceil})``, where the index runs from 1 to the number of batches, which is the number of columns in the snapshot matrix (minus one) divided by the batch size (rounded up).
-"""
-
-description(::Val{:batch_functor_tensor}) = raw"""
-The functor for batch is called with an instance on `DataLoader`. It then returns a tuple of batch indices: ``(\mathcal{I}_1, \ldots, \mathcal{I}_{\lceil\mathtt{dl.n\_params/batch\_size}\rceil})``, where the index runs from 1 to the number of batches, which is the number of parameters divided by the batch size (rounded up).
-"""
+An instance of `Batch` can be called on an instance of `DataLoader` to produce a sequence of samples that contain all the input data, i.e. for training for one epoch. The output of applying `batch::Batch` to `dl::DataLoader` is a tuple of vectors of integers. Each of these vectors contains two integers: the first is the *time index* and the second one is the *parameter index*.
 """

 """
-## `Batch` constructor
 $(description(Val(:Batch)))
-
-## `Batch functor`
-$(description(Val(:batch_functor_matrix)))
-
-$(description(Val(:batch_functor_tensor)))
 """
 struct Batch{BatchType}
     batch_size::Int
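    # Editorial sketch (not part of this patch): typical use of the functor described in the
    # docstring above, with hypothetical example values; the return format is taken from the prose.
    #
    #   dl    = DataLoader(rand(Float32, 2, 20, 3))  # (system dimension, time steps, parameters)
    #   batch = Batch(4, 5, 1)                       # batch_size, seq_length, prediction_window
    #   for indices in batch(dl)
    #       # each element pairs a time index with a parameter index
    #   end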