
Merge pull request #129 from JuliaGNI/loss_routines
Loss routines
michakraus authored Apr 10, 2024
2 parents af93bf5 + 7bbf45e commit 8825732
Showing 7 changed files with 189 additions and 53 deletions.
6 changes: 4 additions & 2 deletions src/GeometricMachineLearning.jl
@@ -61,9 +61,11 @@ module GeometricMachineLearning
# from GeometricBase to print docs
export description

# the functionality in this script doesn't require anything else defined in GML, but some of the other scripts in that folder do.
include("data_loader/data_loader.jl")

include("loss/loss_routines.jl")
include("loss/losses.jl")

include("kernels/assign_q_and_p.jl")
include("kernels/tensor_mat_mul.jl")
include("kernels/tensor_tensor_mul.jl")
@@ -381,5 +383,5 @@ module GeometricMachineLearning

export ReducedSystem, compute_reduction_error, compute_projection_error, reduced_vector_field_from_full_explicit_vector_field, perform_integration_reduced, perform_integration_full

include("loss/loss_routines.jl")
include("data_loader/optimize.jl")
end
50 changes: 0 additions & 50 deletions src/data_loader/batch.jl
@@ -84,40 +84,6 @@ function number_of_batches(dl::DataLoader{T, AT}, batch::Batch{<:Integer}) where
Int(ceil((dl.input_time_steps - batch.seq_length) * dl.n_params / batch.batch_size))
end

@doc raw"""
Optimize for an entire epoch. For this you have to supply:
- an instance of the optimizer.
- the neural network model
- the parameters of the model
- the data (in form of `DataLoader`)
- an instance of `Batch` that contains `batch_size` (and optionally `seq_length`)
With the optional argument:
- the loss, which takes the `model`, the parameters `ps`, an input batch and an output batch as arguments.
The output of `optimize_for_one_epoch!` is the average loss over all batches of the epoch:
```math
output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoch}loss(\theta^{(t-1)}).
```
The average loss comes at no extra cost because any **reverse differentiation** routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of Zygote: `loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps)` (if the loss only depends on the parameters).
"""
function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}, batch::Batch, loss) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}}
count = 0
total_error = T(0)
batches = batch(dl)
@views for batch_indices in batches
count += 1
# these `copy`s should not be necessary! coming from a Zygote problem!
input_batch = copy(dl.input[:, :, batch_indices])
output_batch = copy(dl.output[:, :, batch_indices])
loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_batch, output_batch), ps)
total_error += loss_value
dp = pullback(one(loss_value))[1]
optimization_step!(opt, model, ps, dp)
end
total_error / count
end

@kernel function assign_input_from_vector_of_tuples_kernel!(q_input::AT, p_input::AT, input::NamedTuple{(:q, :p), Tuple{AT, AT}}, indices::AbstractArray{Int, 2}) where {T, AT<:AbstractArray{T, 3}}
i, j, k = @index(Global, NTuple)

@@ -198,22 +164,6 @@ function convert_input_and_batch_indices_to_array(dl::DataLoader{T, BT}, batch::
input, output
end

function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, CT, Nothing}, batch::Batch, loss) where {T, AT<:AbstractArray{T, 3}, BT<:NamedTuple{(:q, :p), Tuple{AT, AT}}, CT<:Union{AT, BT}}
count = 0
total_error = T(0)
batches = batch(dl)
@views for batch_indices in batches
count += 1
# these `copy`s should not be necessary! coming from a Zygote problem!
input_nt, output_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt, output_nt), ps)
total_error += loss_value
dp = pullback(one(loss_value))[1]
optimization_step!(opt, model, ps, dp)
end
total_error / count
end

function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}, batch::Batch) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}}
optimize_for_one_epoch!(opt, model, ps, dl, batch, loss)
end
83 changes: 83 additions & 0 deletions src/data_loader/optimize.jl
@@ -0,0 +1,83 @@
@doc raw"""
Optimize for an entire epoch. For this you have to supply:
- an instance of the optimizer.
- the neural network model
- the parameters of the model
- the data (in form of `DataLoader`)
- an instance of `Batch` that contains `batch_size` (and optionally `seq_length`)
With the optional argument:
- the loss, which takes the `model`, the parameters `ps`, an input batch and an output batch as arguments.
The output of `optimize_for_one_epoch!` is the average loss over all batches of the epoch:
```math
output = \frac{1}{\mathtt{steps\_per\_epoch}}\sum_{t=1}^\mathtt{steps\_per\_epoch}loss(\theta^{(t-1)}).
```
The average loss comes at no extra cost because any **reverse differentiation** routine always has two outputs: a pullback and the value of the function it is differentiating. In the case of Zygote: `loss_value, pullback = Zygote.pullback(ps -> loss(ps), ps)` (if the loss only depends on the parameters).
"""
function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, AT, BT}, batch::Batch, loss) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}}
count = 0
total_error = T(0)
batches = batch(dl)
@views for batch_indices in batches
count += 1
# these `copy`s should not be necessary! coming from a Zygote problem!
input_batch = copy(dl.input[:, :, batch_indices])
output_batch = copy(dl.output[:, :, batch_indices])
loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_batch, output_batch), ps)
total_error += loss_value
dp = pullback(one(loss_value))[1]
optimization_step!(opt, model, ps, dp)
end
total_error / count
end

function optimize_for_one_epoch!(opt::Optimizer, model, ps::Union{Tuple, NamedTuple}, dl::DataLoader{T, CT, Nothing}, batch::Batch, loss::NetworkLoss) where {T, AT<:AbstractArray{T, 3}, BT<:NamedTuple{(:q, :p), Tuple{AT, AT}}, CT<:Union{AT, BT}}
count = 0
total_error = T(0)
batches = batch(dl)
@views for batch_indices in batches
count += 1
# these `copy`s should not be necessary! coming from a Zygote problem!
input_nt, output_nt = convert_input_and_batch_indices_to_array(dl, batch, batch_indices)
loss_value, pullback = Zygote.pullback(ps -> loss(model, ps, input_nt, output_nt), ps)
total_error += loss_value
dp = pullback(one(loss_value))[1]
optimization_step!(opt, model, ps, dp)
end
total_error / count
end
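
As an aside to the docstring above: the reverse-mode pattern it describes can be sketched in isolation with a toy loss (all names and shapes below are purely illustrative, not part of this file):

```julia
using Zygote

# toy parameters, data and loss; purely illustrative
ps = (W = randn(2, 2), b = randn(2))
x, y = randn(2), randn(2)
toy_loss(p) = sum(abs2, p.W * x .+ p.b .- y)

# a single reverse pass returns the loss value together with the pullback ...
loss_value, pullback = Zygote.pullback(toy_loss, ps)
# ... and seeding the pullback with one(loss_value) gives the gradients w.r.t. ps
dp = pullback(one(loss_value))[1]
```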

@doc raw"""
A functor for `Optimizer`. It is called with:
- `nn::NeuralNetwork`
- `dl::DataLoader`
- `batch::Batch`
- `n_epochs::Int`
- `loss`
The last argument is the loss (an instance of `NetworkLoss`) through which `Zygote` differentiates. This argument is optional; if it is not supplied, `GeometricMachineLearning` defaults to an appropriate loss for the `DataLoader`.
"""
function (o::Optimizer)(nn::NeuralNetwork, dl::DataLoader, batch::Batch, n_epochs::Int, loss::NetworkLoss)
progress_object = ProgressMeter.Progress(n_epochs; enabled=true)
loss_array = zeros(n_epochs)
for i in 1:n_epochs
loss_array[i] = optimize_for_one_epoch!(o, nn.model, nn.params, dl, batch, loss)
ProgressMeter.next!(progress_object; showvalues = [(:TrainingLoss, loss_array[i])])
end
loss_array
end

#=
function (o::Optimizer)(nn::NeuralNetwork{<:TransformerIntegrator}, dl::DataLoader, batch::Batch{Int}, n_epochs::Int=1)
loss = TransformerLoss(batch)
o(nn, dl, batch, n_epochs, loss)
end
function (o::Optimizer)(nn::NeuralNetwork{<:NeuralNetworkIntegrator}, dl::DataLoader, batch::Batch{Int}, n_epochs::Int=1)
loss = FeedForwardLoss()
o(nn, dl, batch, n_epochs, loss)
end
(o::Optimizer)(nn::NeuralNetwork{<:NeuralNetworkIntegrator}, dl::DataLoader, batch::Batch{Nothing}, n_epochs::Int=1) = o(nn, dl, Batch(batch.batch_size, 1), n_epochs)
=#
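
Putting the pieces together, a schematic training call with the new functor could look as follows (it mirrors the test added below; the architecture, sizes and epoch count are placeholders):

```julia
using GeometricMachineLearning
using GeometricMachineLearning: FeedForwardLoss

# placeholder data and network; see test/network_losses/losses_and_optimization.jl
dl    = DataLoader(reshape(sin.(0:0.01:2π), 1, :, 1))
nn    = NeuralNetwork(Chain(Dense(1, 5, tanh), Dense(5, 1, identity)), CPU(), eltype(dl))
o     = Optimizer(AdamOptimizer(), nn)
batch = Batch(5, 1)

loss_array = o(nn, dl, batch, 5, FeedForwardLoss())   # average loss per epoch
```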
32 changes: 31 additions & 1 deletion src/loss/loss_routines.jl
@@ -12,7 +12,7 @@ It takes as input:
- `input::Union{Array, NamedTuple}`
- `output::Union{Array, NamedTuple}`
"""
function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::AT, output::BT) where {T, T1, AT<:AbstractArray{T, 3}, BT<:AbstractArray{T1, 3}}
function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::AT, output::BT) where {T, T1, AT<:AbstractArray{T}, BT<:AbstractArray{T1}}
output_estimate = model(input, ps)
norm(output - output_estimate) / norm(output)
end
@@ -46,6 +46,36 @@ function loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, Named
nt_norm(nt_diff(output_estimate, input)) / nt_norm(input)
end

@doc raw"""
The transformer loss works similarly to the regular loss, but with the difference that ``\mathcal{NN}(input)`` and ``output`` may have different sizes.
It takes as input:
- `model::Union{Chain, AbstractExplicitLayer}`
- `ps::Union{Tuple, NamedTuple}`
- `input::Union{Array, NamedTuple}`
- `output::Union{Array, NamedTuple}`
"""
function transformer_loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::BT, output::BT) where {T, BT<:AbstractArray{T}}
norm(
crop_array_for_transformer_loss(model(input, ps), output) -
output
) /
norm(input)
end

function transformer_loss(model::Union{Chain, AbstractExplicitLayer}, ps::Union{Tuple, NamedTuple}, input::NT, output::NT) where {T, AT<:AbstractArray{T}, NT<:NamedTuple{(:q, :p,), Tuple{AT, AT}}}
output_estimate = model(input, ps)

nt_norm(
nt_diff(
(q = crop_array_for_transformer_loss(output_estimate.q, output.q),
p = crop_array_for_transformer_loss(output_estimate.p, output.p)),
output
)
) /
nt_norm(input)
end


@doc raw"""
Alternative call of the loss function. This takes as input:
42 changes: 42 additions & 0 deletions src/loss/losses.jl
@@ -0,0 +1,42 @@
abstract type NetworkLoss end

function (loss::NetworkLoss)(nn::NeuralNetwork, input::AT, output::AT) where {AT <: AbstractArray}
loss(nn.model, nn.params, input, output)
end

@doc raw"""
The loss for a transformer network (especially a transformer integrator). The constructor is called with:
- `seq_length::Int`
- `prediction_window::Int` (default is 1).
"""
struct TransformerLoss <: NetworkLoss
seq_length::Int
prediction_window::Int
end

TransformerLoss(seq_length::Int) = TransformerLoss(seq_length, 1)

@doc raw"""
This crops the output array of the neural network so that it conforms with the output it should be compared to. This is needed for the transformer loss.
"""
function crop_array_for_transformer_loss(nn_output::AT, output::AT) where {T, AT<:AbstractArray{T, 3}}
@view nn_output[axes(output, 1), axes(output, 2) .+ size(nn_output, 2) .- size(output, 2), axes(output, 3)]
end
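
For a concrete picture of what this view selects, assume the sizes below (chosen only for illustration, not fixed by the code):

```julia
nn_output = rand(2, 5, 3)   # (dim, seq_length, batch_size)
output    = rand(2, 1, 3)   # (dim, prediction_window, batch_size)

cropped = @view nn_output[axes(output, 1),
                          axes(output, 2) .+ size(nn_output, 2) .- size(output, 2),
                          axes(output, 3)]

size(cropped) == (2, 1, 3)  # true: only the last prediction_window time steps are kept
```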

function (loss::TransformerLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {T, AT <: Union{AbstractArray{T, 2}, AbstractArray{T, 3}}}
input_dim, input_seq_length = size(input)
output_dim, output_prediction_window = size(output)
@assert input_dim == output_dim
@assert input_seq_length == loss.seq_length
@assert output_prediction_window == loss.prediction_window

predicted_output_uncropped = model(input, ps)
predicted_output_cropped = crop_array_for_transformer_loss(predicted_output_uncropped, output)
norm(predicted_output_cropped - output) / norm(output)
end

struct FeedForwardLoss <: NetworkLoss end

function (loss::FeedForwardLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {AT <: AbstractArray}
norm(model(input, ps) - output) / norm(output)
end
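
Since every concrete loss only has to provide the four-argument method, user-defined losses can plug into the same `NetworkLoss` interface; a hypothetical sketch (assuming it is defined where `NetworkLoss` and `Chain` are in scope):

```julia
# hypothetical example, not part of this file
struct MeanAbsoluteErrorLoss <: NetworkLoss end

function (loss::MeanAbsoluteErrorLoss)(model::Chain, ps::Union{Tuple, NamedTuple}, input::AT, output::AT) where {AT <: AbstractArray}
    # mean absolute deviation between network prediction and target
    sum(abs, model(input, ps) - output) / length(output)
end
```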
27 changes: 27 additions & 0 deletions test/network_losses/losses_and_optimization.jl
@@ -0,0 +1,27 @@
using GeometricMachineLearning
using GeometricMachineLearning: FeedForwardLoss
using Test
import Random

Random.seed!(123)

const sin_vector = sin.(0:0.01:2π)
const dl = DataLoader(reshape(sin_vector, 1, length(sin_vector), 1))

function setup_network(dl::DataLoader{T}) where T
arch = Chain(Dense(1, 5, tanh), ResNet(5, tanh), Dense(5, 1, identity))
NeuralNetwork(arch, CPU(), T)
end

function train_network(; n_epochs=5)
nn = setup_network(dl)
loss = FeedForwardLoss()

o = Optimizer(AdamOptimizer(), nn)
batch = Batch(5, 1)
loss_array = o(nn, dl, batch, n_epochs, loss)
T = eltype(dl)
@test loss_array[end] / loss_array[1] < T(0.1)
end

train_network()
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -46,6 +46,8 @@ using SafeTestsets
@safetestset "Optimizer functor with data loader for Adam " begin include("data_loader/optimizer_functor_with_adam.jl") end
@safetestset "Test data loader for a tensor (q and p data) " begin include("data_loader/draw_batch_for_tensor_test.jl") end

@safetestset "Test NetworkLoss + Optimizer " begin include("network_losses/losses_and_optimization.jl") end

@safetestset "Test parallel inverses " begin include("kernels/tensor_inverse.jl") end
@safetestset "Test parallel Cayley " begin include("kernels/tensor_cayley.jl") end

