accumgrad and clipgrad #57

Merged · 1 commit · May 20, 2023
52 changes: 50 additions & 2 deletions docs/src/guides.md
@@ -26,11 +26,59 @@ TODO

## Gradient accumulation

TODO
Gradient accumulation is a technique that allows you to simulate larger batch sizes by accumulating gradients over multiple batches. This is useful when you want to use a large batch size but your GPU does not have enough memory.

Optimisers.jl supports gradient accumulation through the `AccumGrad` rule:

```
AccumGrad(n::Int)

A rule constructed as `OptimiserChain(AccumGrad(n), Rule())` will accumulate gradients for `n` steps, before applying `Rule` to the mean of these `n` gradients.

This is useful for training with effective batch sizes too large for the available memory. Instead of computing the gradient for batch size `b` at once, compute it for size `b/n` and accumulate `n` such gradients.
```
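
Outside of Tsunami, the effect of `AccumGrad` can be seen in a plain Optimisers.jl update loop. The following is a minimal sketch, assuming a toy NamedTuple "model" and random stand-in gradients; the function name `accumulation_demo` is made up for illustration:

```julia
using Optimisers

# Optimisers.jl works on any nested structure of arrays, so a NamedTuple
# with a single parameter vector is enough for a demonstration.
function accumulation_demo()
    model = (w = zeros(Float32, 3),)

    # Accumulate 4 gradients, then apply AdamW once to their mean.
    opt_state = Optimisers.setup(OptimiserChain(AccumGrad(4), AdamW(1f-2)), model)

    for step in 1:8
        grad = (w = randn(Float32, 3),)  # stand-in for a real gradient
        opt_state, model = Optimisers.update!(opt_state, model, grad)
        # model.w only changes on every 4th call, when the averaged gradient reaches AdamW
    end
    return model
end
```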

`AccumGrad` can also be easily integrated into Tsunami's `configure_optimisers`:

```julia
using Optimisers

function Tsunami.configure_optimisers(model::Model, trainer)
    opt = OptimiserChain(AccumGrad(5), AdamW(1e-3))
    opt_state = Optimisers.setup(opt, model)
    return opt_state
end
```

## Gradient clipping

TODO
Gradient clipping is a technique that limits the magnitude of individual gradient components or of the overall gradient norm. This is useful to prevent exploding gradients and improve training stability.

Optimisers.jl supports gradient clipping with the `ClipGrad` and `ClipNorm` rules:

```
ClipGrad(δ = 10f0)

Restricts every gradient component to obey -δ ≤ dx[i] ≤ δ.
```
```
ClipNorm(ω = 10f0, p = 2; throw = true)

Scales any gradient array for which norm(dx, p) > ω to stay at this threshold (unless p==0).
Throws an error if the norm is infinite or NaN, which you can turn off with throw = false.
```
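
The difference between the two rules can be sketched on a hypothetical toy model with a hand-written gradient (not taken from the Tsunami docs): `ClipGrad` clamps each gradient component independently, while `ClipNorm` rescales the whole gradient array.

```julia
using Optimisers

# Hypothetical toy model with a single parameter array.
model = (w = zeros(Float32, 3),)
grad = (w = Float32[0.5, -2.0, 0.05],)

# ClipGrad(0.1) clamps each component to [-0.1, 0.1] before AdamW sees it.
state_grad = Optimisers.setup(OptimiserChain(ClipGrad(0.1), AdamW(1e-3)), model)
state_grad, model = Optimisers.update!(state_grad, model, grad)

# ClipNorm(1.0) instead rescales the whole array so that norm(dx, 2) ≤ 1.0.
state_norm = Optimisers.setup(OptimiserChain(ClipNorm(1.0), AdamW(1e-3)), model)
state_norm, model = Optimisers.update!(state_norm, model, grad)
```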

Gradient clipping can also be easily integrated into Tsunami's `configure_optimisers`:

```julia
using Optimisers

function Tsunami.configure_optimisers(model::Model, trainer)
    opt = OptimiserChain(ClipGrad(0.1), AdamW(1e-3))
    opt_state = Optimisers.setup(opt, model)
    return opt_state
end
```
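
Since both `AccumGrad` and the clipping rules are ordinary Optimisers.jl rules, they can also be combined in a single chain. The chain below is only an illustrative sketch; the ordering and hyperparameters are assumptions, not a recommendation from the docs:

```julia
using Optimisers

# Accumulate 5 gradients, clip the averaged gradient component-wise, then apply AdamW.
function Tsunami.configure_optimisers(model::Model, trainer)
    opt = OptimiserChain(AccumGrad(5), ClipGrad(0.1), AdamW(1e-3))
    return Optimisers.setup(opt, model)
end
```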

## Mixed precision training

6 changes: 3 additions & 3 deletions src/fluxmodule.jl
@@ -88,13 +88,13 @@ set as the learning rate for the next epoch.
```julia
using Optimisers, ParameterScheduler

function Tsunami.configure_optimisers(model::Model)
function Tsunami.configure_optimisers(model::Model, trainer)
return Optimisers.setup(AdamW(1e-3), model)
end

# Now with a scheduler dropping the learning rate by a factor 10
# at epochs [50, 100, 200] starting from the initial value of 1e-2
function Tsunami.configure_optimisers(model::Model)
function Tsunami.configure_optimisers(model::Model, trainer)

function lr_scheduler(epoch)
if epoch <= 50
@@ -113,7 +113,7 @@ function Tsunami.configure_optimisers(model::Model)
end

# Same as above but using the ParameterScheduler package.
function Tsunami.configure_optimisers(model::Model)
function Tsunami.configure_optimisers(model::Model, trainer)
lr_scheduler = ParameterScheduler.Step(1e-2, 1/10, [50, 50, 100])
opt = Optimisers.setup(AdamW(), model)
return lr_scheduler, opt