From d442e91608e7f767c32128553845eaa0f6316a3b Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 7 Jun 2021 13:11:08 +0200
Subject: [PATCH 01/49] add kinetic stress autodiff example

---
 Project.toml                           |  3 +
 test/autodiff-stress/stress-kinetic.jl | 76 ++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 test/autodiff-stress/stress-kinetic.jl

diff --git a/Project.toml b/Project.toml
index 6b9f0d341e..04dba238a2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.3.0"
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -28,6 +29,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -35,6 +37,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 spglib_jll = "ac4a9f1e-bdb2-5204-990c-47c8b2f70d4e"
 
 [compat]
diff --git a/test/autodiff-stress/stress-kinetic.jl b/test/autodiff-stress/stress-kinetic.jl
new file mode 100644
index 0000000000..918146a545
--- /dev/null
+++ b/test/autodiff-stress/stress-kinetic.jl
@@ -0,0 +1,76 @@
+# Very basic setup, useful for testing
+using DFTK
+using LinearAlgebra
+using BenchmarkTools
+
+a = 10.26  # Silicon lattice constant in Bohr
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms = [Si => [ones(3)/8, -ones(3)/8]]
+
+model = model_LDA(lattice, atoms)
+kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+Ecut = 15          # kinetic energy cutoff in Hartree -- can increase to make G_vectors larger (larger solve time)
+basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+
+@time scfres = self_consistent_field(basis, tol=1e-8) # 75.068789 seconds (138.55 M allocations: 8.145 GiB, 4.59% gc time, 24.68% compilation time)
+
+# TODO try to rewrite for Zygote (performance optimizations)
+# e.g. translate loops to dense arrays or maps (?)
+
+# Kinetic energy sum_{k,n} kweights[k] * occ[k][n] * <ψnk| |B(G+k)|²/2 |ψnk>, B = 2π inv(lattice').
+# `lattice` is an explicit argument so that AD tools can differentiate with respect to it.
+function kinetic_energy(lattice, basis, ψ, occ)
+    recip_lattice = 2π * inv(lattice')
+    kinetic_energies = [[sum(abs2, recip_lattice * (G + kpt.coordinate)) / 2
+                         for G in G_vectors(kpt)] for kpt in basis.kpoints]
+    E = zero(Float64)
+    # Take the band count from ψ[ik] itself instead of assuming it equals that of ψ[1].
+    for ik in eachindex(basis.kpoints), iband in axes(ψ[ik], 2)
+        ψnk = @views ψ[ik][:, iband]
+        E += (basis.kweights[ik] * occ[ik][iband]
+              * real(dot(ψnk, kinetic_energies[ik] .* ψnk)))
+    end
+    E
+end
+kinetic_energy(lattice) = kinetic_energy(lattice, basis, scfres.ψ, scfres.occupation)
+
+@time E = kinetic_energy(lattice) # 0.438027 seconds (623.88 k allocations: 36.457 MiB, 99.96% compilation time)
+@btime kinetic_energy(lattice) # 49.123 μs (742 allocations: 169.05 KiB)
+
+# stress := diff E wrt lattice
+
+#===#
+# Check results and compile times on first call
+stresses = Dict()
+
+# works fine
+using ForwardDiff
+@time stresses[:ForwardDiff] = ForwardDiff.gradient(kinetic_energy, lattice) # 3.627630 seconds (5.99 M allocations: 363.981 MiB, 5.08% gc time, 98.69% compilation time)
+
+# works but long compile time and gives ComplexF64 results
+# hypothesis: slow compilation due to loops (and generators)
+using Zygote
+@time stresses[:Zygote] = Zygote.gradient(kinetic_energy, lattice) # 61.094425 seconds (63.31 M allocations: 3.715 GiB, 3.85% gc time, 67.43% compilation time)
+
+# works fine
+using ReverseDiff
+@time stresses[:ReverseDiff] = ReverseDiff.gradient(kinetic_energy, lattice) # 5.409118 seconds (9.60 M allocations: 516.091 MiB, 14.61% gc time, 89.56% compilation time)
+
+# sanity check
+using FiniteDiff
+@time stresses[:FiniteDiff] = FiniteDiff.finite_difference_gradient(kinetic_energy, lattice) # 2.606210 seconds (2.87 M allocations: 232.911 MiB, 19.92% gc time, 99.19% compilation time)
+
+stresses
+# Dict{Any, Any} with 4 entries:
+# :ForwardDiff => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
+# :FiniteDiff  => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
+# :Zygote      => (ComplexF64[0.27005-0.0im -0.27005-0.0im -0.27005-0.0im; -0.27005-0.0im 0.27005-0.0im -0.27005-0.0im; -0.27005-0.0im -0.27005-0.0im 0.27005-0.0im],)
+# :ReverseDiff => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
+
+@btime ForwardDiff.gradient(kinetic_energy, lattice) #    270.426 μs (   761 allocations:  1.07 MiB)
+@btime Zygote.gradient(kinetic_energy, lattice)      #  6.983 ms     ( 34765 allocations: 12.61 MiB)
+@btime ReverseDiff.gradient(kinetic_energy, lattice) # 15.376 ms     (415886 allocations: 16.42 MiB)
+@btime FiniteDiff.finite_difference_gradient(kinetic_energy, lattice) # 777.578 μs (13394 allocations: 2.97 MiB)

From 2074095340ca2ee459828feaf16535259344c8fd Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 7 Jun 2021 13:37:24 +0200
Subject: [PATCH 02/49] add stress-total with error messages of ForwardDiff, ReverseDiff, Zygote

---
 test/autodiff-stress/stress-total.jl | 188 +++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 test/autodiff-stress/stress-total.jl

diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
new file mode 100644
index 0000000000..86c72567dd
--- /dev/null
+++ b/test/autodiff-stress/stress-total.jl
@@ -0,0 +1,188 @@
+# Very basic setup, useful for testing
+using DFTK
+using Test
+
+a = 10.26
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms = [Si => [ones(3)/8, -ones(3)/8]]
+
+model = model_atomic(lattice, atoms, symmetries=false)
+kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+Ecut = 15          # kinetic energy cutoff in Hartree
+basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+
+scfres = self_consistent_field(basis, tol=1e-8)
+
+function compute_energy(scfres_ref, a)
+    lattice = a / 2 * [[0 1 1.];
+                       [1 0 1.];
+                       [1 1 0.]]
+    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+    atoms = [Si => [ones(3)/8, -ones(3)/8]]
+
+    model = model_atomic(lattice, atoms, symmetries=false)
+    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+    Ecut = 15           # kinetic energy cutoff in Hartree
+    basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+
+    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+    energies.total
+end
+
+compute_energy(scfres, 10.26)
+
+# Finite difference reference stress
+FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -1.4114474091964526
+
+###
+### Forward mode
+###
+
+using ForwardDiff
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
+# ERROR: LoadError: MethodError: no method matching svdvals!(::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}})
+# Closest candidates are:
+#   svdvals!(::SymTridiagonal) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/tridiag.jl:351
+#   svdvals!(::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:192
+#   svdvals!(::StridedMatrix{T}, ::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:498
+#   ...
+# Stacktrace:
+#  [1] svdvals(A::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}})
+#    @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:217
+#  [2] cond(A::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}, p::Int64)
+#    @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1462
+#  [3] cond
+#    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1461 [inlined]
+#  [4] Model(lattice::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}; n_electrons::Nothing, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}, magnetic_moments::Vector{Any}, terms::Vector{Any}, temperature::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}, smearing::Nothing, spin_polarization::Symbol, symmetries::Bool)
+#    @ DFTK ~/.julia/dev/DFTK.jl/src/Model.jl:106
+#  [5] model_atomic(lattice::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}; extra_terms::Vector{Any}, kwargs::Base.Iterators.Pairs{Symbol, Bool, Tuple{Symbol}, NamedTuple{(:symmetries,), Tuple{Bool}}})
+#    @ DFTK ~/.julia/dev/DFTK.jl/src/standard_models.jl:20
+#  [6] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1})
+#    @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:25
+#  [7] (::var"#15#16")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1})
+#    @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:45
+#  [8] derivative(f::var"#15#16", x::Float64)
+#    @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
+#  [9] top-level scope
+#    @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:45
+
+###
+### Reverse mode
+###
+
+using Zygote
+Zygote.gradient(a -> compute_energy(scfres, a), 10.26)
+# ERROR: LoadError: MethodError: no method matching zero(::String)
+# Closest candidates are:
+#   zero(::Union{Type{P}, P}) where P<:Dates.Period at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Dates/src/periods.jl:53
+#   zero(::FillArrays.Ones{T, N, Axes} where Axes) where {T, N} at /home/niku/.julia/packages/FillArrays/rPtlv/src/FillArrays.jl:537
+#   zero(::T) where T<:Dates.TimeType at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Dates/src/types.jl:423
+#   ...
+# Stacktrace:
+#   [1] pair_getfield
+#     @ ~/.julia/packages/Zygote/pM10l/src/lib/base.jl:134 [inlined]
+#   [2] #2040#back
+#     @ ~/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:59 [inlined]
+#   [3] Pullback
+#     @ ./pair.jl:59 [inlined]
+#   [4] (::typeof(∂(getindex)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#   [5] Pullback
+#     @ ./abstractdict.jl:66 [inlined]
+#   [6] (::typeof(∂(iterate)))(Δ::Tuple{Float64, Nothing})
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#   [7] Pullback
+#     @ ./reduce.jl:60 [inlined]
+#   [8] (::typeof(∂(_foldl_impl)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#   [9] Pullback
+#     @ ./reduce.jl:48 [inlined]
+#  [10] (::typeof(∂(foldl_impl)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [11] Pullback
+#     @ ./reduce.jl:44 [inlined]
+#  [12] (::typeof(∂(mapfoldl_impl)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [13] Pullback (repeats 2 times)
+#     @ ./reduce.jl:160 [inlined]
+#  [14] (::typeof(∂(mapfoldl)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [15] Pullback
+#     @ ./reduce.jl:287 [inlined]
+#  [16] (::typeof(∂(#mapreduce#218)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [17] Pullback
+#     @ ./reduce.jl:287 [inlined]
+#  [18] (::typeof(∂(mapreduce)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [19] Pullback
+#     @ ./reduce.jl:501 [inlined]
+#  [20] (::typeof(∂(#sum#221)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [21] Pullback
+#     @ ./reduce.jl:501 [inlined]
+#  [22] (::typeof(∂(sum)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [23] Pullback
+#     @ ./reduce.jl:528 [inlined]
+#  [24] (::typeof(∂(#sum#222)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [25] Pullback
+#     @ ./reduce.jl:528 [inlined]
+#  [26] (::typeof(∂(sum)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [27] Pullback
+#     @ ~/.julia/dev/DFTK.jl/src/energies.jl:38 [inlined]
+#  [28] (::typeof(∂(getproperty)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [29] Pullback
+#     @ ~/.julia/packages/ZygoteRules/OjfTt/src/ZygoteRules.jl:11 [inlined]
+#  [30] Pullback
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:31 [inlined]
+#  [31] (::typeof(∂(compute_energy)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [32] Pullback
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:96 [inlined]
+#  [33] (::typeof(∂(#19)))(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
+#  [34] (::Zygote.var"#41#42"{typeof(∂(#19))})(Δ::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface.jl:41
+#  [35] gradient(f::Function, args::Float64)
+#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface.jl:59
+#  [36] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:96
+
+
+using ReverseDiff
+ReverseDiff.gradient(a -> compute_energy(scfres, first(a)), [10.26])
+# ERROR: LoadError: MethodError: no method matching svdvals!(::Matrix{ReverseDiff.TrackedReal{Float64, Float64, Nothing}})
+# Closest candidates are:
+#   svdvals!(::SymTridiagonal) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/tridiag.jl:351
+#   svdvals!(::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:192
+#   svdvals!(::StridedMatrix{T}, ::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:498
+#   ...
+# Stacktrace:
+#   [1] svdvals(A::Matrix{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}})
+#     @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:217
+#   [2] cond(A::Matrix{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, p::Int64)
+#     @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1462
+#   [3] cond
+#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1461 [inlined]
+#   [4] Model(lattice::ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}; n_electrons::Nothing, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}, magnetic_moments::Vector{Any}, terms::Vector{Any}, temperature::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, smearing::Nothing, spin_polarization::Symbol, symmetries::Bool)
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/Model.jl:106
+#   [5] model_atomic(lattice::ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}; extra_terms::Vector{Any}, kwargs::Base.Iterators.Pairs{Symbol, Bool, Tuple{Symbol}, NamedTuple{(:symmetries,), Tuple{Bool}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/standard_models.jl:20
+#   [6] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:25
+#   [7] (::var"#23#24")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:160
+#   [8] ReverseDiff.GradientTape(f::var"#23#24", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/tape.jl:199
+#   [9] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
+#     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/gradients.jl:22
+#  [10] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:160
+

From cafed13ddc5d3f95d370fafcdb01869f3a61a23b Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 7 Jun 2021 13:47:07 +0200
Subject: [PATCH 03/49] add stress-total scalar test

---
 test/autodiff-stress/stress-total.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
index 86c72567dd..544e318c60 100644
--- a/test/autodiff-stress/stress-total.jl
+++ b/test/autodiff-stress/stress-total.jl
@@ -1,4 +1,3 @@
-# Very basic setup, useful for testing
 using DFTK
 using Test
 
@@ -34,8 +33,11 @@ end
 
 compute_energy(scfres, 10.26)
 
-# Finite difference reference stress
-FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -1.4114474091964526
+import FiniteDiff  # needed for the finite-difference reference value used below
+# Placeholder for the stress dE/da at lattice constant `a`; returns Inf until implemented.
+compute_stress(scfres_ref, a) = Inf # TODO implement
+# Marked broken while compute_stress is a stub, so the known mismatch does not fail the suite.
+@test_broken compute_stress(scfres, a) ≈ FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411
 
 ###
 ### Forward mode

From 38db38fd7ff559f562379541b38a3e428a895475 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 11:10:20 +0200
Subject: [PATCH 04/49] disable lattice cond check temporarily

---
 src/Model.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Model.jl b/src/Model.jl
index 59af645316..77c376642e 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -103,7 +103,9 @@ function Model(lattice::AbstractMatrix{T};
         norm(lattice[:, i]) == norm(lattice[i, :]) == 0 || error(
             "For 1D and 2D systems, the non-empty dimensions must come first")
     end
-    cond(lattice[1:d, 1:d]) > 1e-5 || @warn "Your lattice is badly conditioned, the computation is likely to fail."
+
+    ## temporarily disabled for ForwardDiff. TODO re-enable
+    # cond(lattice[1:d, 1:d]) > 1e-5 || @warn "Your lattice is badly conditioned, the computation is likely to fail."
 
     # Compute reciprocal lattice and volumes.
     # recall that the reciprocal lattice is the set of G vectors such

From 797bf3417e0eb9c3efca8cf9630823f0c19a01cf Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 11:32:51 +0200
Subject: [PATCH 05/49] add model_atomic_debug with Kinetic() only

---
 src/DFTK.jl            |  1 +
 src/standard_models.jl | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 78fa7881cf..8bcf808fd9 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -99,6 +99,7 @@ include("eigen/preconditioners.jl")
 include("eigen/diag.jl")
 
 export model_atomic
+export model_atomic_debug # TODO delete
 export model_DFT
 export model_PBE
 export model_LDA
diff --git a/src/standard_models.jl b/src/standard_models.jl
index e64d856d00..f99178714a 100644
--- a/src/standard_models.jl
+++ b/src/standard_models.jl
@@ -20,6 +20,24 @@ function model_atomic(lattice::AbstractMatrix, atoms::Vector; extra_terms=[], kw
     Model(lattice; atoms=atoms, terms=terms, kwargs...)
 end
 
+"""
+Convenience constructor for debugging purposes only.
+"""
+function model_atomic_debug(lattice::AbstractMatrix, atoms::Vector; extra_terms=[], kwargs...)
+    @assert !(:terms in keys(kwargs))
+    @assert !(:atoms in keys(kwargs))
+    terms = [Kinetic(),
+            #  AtomicLocal(),
+            #  AtomicNonlocal(),
+            #  Ewald(),
+            #  PspCorrection(),
+             extra_terms...]
+    if :temperature in keys(kwargs) && kwargs[:temperature] != 0
+        terms = [terms..., Entropy()]
+    end
+    Model(lattice; atoms=atoms, terms=terms, kwargs...)
+end
+
 
 """
 Build a DFT model from the specified atoms, with the specified functionals.

From 223dc240c8d30338f9af1355b5040878d883ddd4 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 11:33:55 +0200
Subject: [PATCH 06/49] stacktrace: no next_working_fft_size for Duals

---
 test/autodiff-stress/stress-kinetic-debug.jl | 85 ++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 test/autodiff-stress/stress-kinetic-debug.jl

diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-kinetic-debug.jl
new file mode 100644
index 0000000000..efe007ce2f
--- /dev/null
+++ b/test/autodiff-stress/stress-kinetic-debug.jl
@@ -0,0 +1,85 @@
+using DFTK
+using Test
+
+a = 10.26
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms = [Si => [ones(3)/8, -ones(3)/8]]
+
+model = model_atomic_debug(lattice, atoms, symmetries=false)
+kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+Ecut = 15          # kinetic energy cutoff in Hartree
+basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+
+# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
+
+# try a bogus tolerance for debugging
+scfres = self_consistent_field(basis, tol=1e9)
+
+function compute_energy(scfres_ref, a)
+    lattice = a / 2 * [[0 1 1.];
+                       [1 0 1.];
+                       [1 1 0.]]
+    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+    atoms = [Si => [ones(3)/8, -ones(3)/8]]
+
+    model = model_atomic_debug(lattice, atoms, symmetries=false)
+    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+    Ecut = 15           # kinetic energy cutoff in Hartree
+    basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+
+    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+    energies.total
+end
+
+compute_energy(scfres, 10.26)
+
+function compute_stress(scfres_ref, a)
+    Inf # TODO implement
+end
+@test compute_stress(scfres, a) ≈ FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411
+
+
+###
+### Forward mode
+###
+
+using ForwardDiff
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
+# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, ::Int64)
+# Closest candidates are:
+#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
+#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
+# Stacktrace:
+#   [1] _broadcast_getindex_evalf
+#     @ ./broadcast.jl:648 [inlined]
+#   [2] _broadcast_getindex
+#     @ ./broadcast.jl:631 [inlined]
+#   [3] getindex
+#     @ ./broadcast.jl:575 [inlined]
+#   [4] copy
+#     @ ./broadcast.jl:922 [inlined]
+#   [5] materialize
+#     @ ./broadcast.jl:883 [inlined]
+#   [6] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, fft_size::Vector{Int64}, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
+#   [7] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
+#   [8] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#   [9] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#  [10] PlaneWaveBasis(model::Model{ForwardDiff.        Dual{ForwardDiff.Tag{var"#7#8", Float64}, Floa        t64, 1}}, Ecut::Int64, kcoords::Vector{StaticA        rrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
+#  [11] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:31
+#  [13] (::var"#7#8")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:50
+#  [14] derivative(f::var"#7#8", x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
+#  [15] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:50

From abe7c7bc308fc864142c2cb38791205c4456ce85 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 11:48:00 +0200
Subject: [PATCH 07/49] stress-kinetic with generic fft forwarddiff works

---
 test/autodiff-stress/stress-kinetic-debug.jl | 48 +++-----------------
 1 file changed, 7 insertions(+), 41 deletions(-)

diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-kinetic-debug.jl
index efe007ce2f..d84be9a7c8 100644
--- a/test/autodiff-stress/stress-kinetic-debug.jl
+++ b/test/autodiff-stress/stress-kinetic-debug.jl
@@ -1,6 +1,10 @@
 using DFTK
 using Test
 
+# for generic FourierTransforms.jl (TODO replace by FFTW later)
+using DoubleFloats
+using GenericLinearAlgebra
+
 a = 10.26
 lattice = a / 2 * [[0 1 1.];
                    [1 0 1.];
@@ -36,50 +40,12 @@ end
 
 compute_energy(scfres, 10.26)
 
-function compute_stress(scfres_ref, a)
-    Inf # TODO implement
-end
-@test compute_stress(scfres, a) ≈ FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411
-
+import FiniteDiff
+FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -0.6579483620146331 
 
 ###
 ### Forward mode
 ###
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
-# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, ::Int64)
-# Closest candidates are:
-#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
-#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
-# Stacktrace:
-#   [1] _broadcast_getindex_evalf
-#     @ ./broadcast.jl:648 [inlined]
-#   [2] _broadcast_getindex
-#     @ ./broadcast.jl:631 [inlined]
-#   [3] getindex
-#     @ ./broadcast.jl:575 [inlined]
-#   [4] copy
-#     @ ./broadcast.jl:922 [inlined]
-#   [5] materialize
-#     @ ./broadcast.jl:883 [inlined]
-#   [6] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, fft_size::Vector{Int64}, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
-#   [7] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
-#   [8] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [9] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [10] PlaneWaveBasis(model::Model{ForwardDiff.        Dual{ForwardDiff.Tag{var"#7#8", Float64}, Floa        t64, 1}}, Ecut::Int64, kcoords::Vector{StaticA        rrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#  [11] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:31
-#  [13] (::var"#7#8")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#7#8", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:50
-#  [14] derivative(f::var"#7#8", x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
-#  [15] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-kinetic-debug.jl:50
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -0.6579483619526001

From 4861677df4b56d9556329f671d14584bc55ac355 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:06:54 +0200
Subject: [PATCH 08/49] move model term selection to call site

---
 src/DFTK.jl                                  |  1 -
 src/standard_models.jl                       | 18 ---------
 test/autodiff-stress/stress-kinetic-debug.jl | 42 ++++++++++----------
 3 files changed, 21 insertions(+), 40 deletions(-)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 8bcf808fd9..78fa7881cf 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -99,7 +99,6 @@ include("eigen/preconditioners.jl")
 include("eigen/diag.jl")
 
 export model_atomic
-export model_atomic_debug # TODO delete
 export model_DFT
 export model_PBE
 export model_LDA
diff --git a/src/standard_models.jl b/src/standard_models.jl
index f99178714a..e64d856d00 100644
--- a/src/standard_models.jl
+++ b/src/standard_models.jl
@@ -20,24 +20,6 @@ function model_atomic(lattice::AbstractMatrix, atoms::Vector; extra_terms=[], kw
     Model(lattice; atoms=atoms, terms=terms, kwargs...)
 end
 
-"""
-Convenience constructor for debugging purposes only.
-"""
-function model_atomic_debug(lattice::AbstractMatrix, atoms::Vector; extra_terms=[], kwargs...)
-    @assert !(:terms in keys(kwargs))
-    @assert !(:atoms in keys(kwargs))
-    terms = [Kinetic(),
-            #  AtomicLocal(),
-            #  AtomicNonlocal(),
-            #  Ewald(),
-            #  PspCorrection(),
-             extra_terms...]
-    if :temperature in keys(kwargs) && kwargs[:temperature] != 0
-        terms = [terms..., Entropy()]
-    end
-    Model(lattice; atoms=atoms, terms=terms, kwargs...)
-end
-
 
 """
 Build a DFT model from the specified atoms, with the specified functionals.
diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-kinetic-debug.jl
index d84be9a7c8..8c5a5d6679 100644
--- a/test/autodiff-stress/stress-kinetic-debug.jl
+++ b/test/autodiff-stress/stress-kinetic-debug.jl
@@ -5,35 +5,35 @@ using Test
 using DoubleFloats
 using GenericLinearAlgebra
 
-a = 10.26
-lattice = a / 2 * [[0 1 1.];
-                   [1 0 1.];
-                   [1 1 0.]]
-Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-atoms = [Si => [ones(3)/8, -ones(3)/8]]
-
-model = model_atomic_debug(lattice, atoms, symmetries=false)
-kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-Ecut = 15          # kinetic energy cutoff in Hartree
-basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
-
-# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
-
-# try a bogus tolerance for debugging
-scfres = self_consistent_field(basis, tol=1e9)
-
-function compute_energy(scfres_ref, a)
+function make_basis(a)
     lattice = a / 2 * [[0 1 1.];
                        [1 0 1.];
                        [1 1 0.]]
     Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
     atoms = [Si => [ones(3)/8, -ones(3)/8]]
-
-    model = model_atomic_debug(lattice, atoms, symmetries=false)
+    terms = [
+        Kinetic(),
+        # AtomicLocal(),
+        # AtomicNonlocal(),
+        # Ewald(),
+        # PspCorrection()
+    ]
+    model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 15           # kinetic energy cutoff in Hartree
+    Ecut = 15          # kinetic energy cutoff in Hartree
     basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+    return basis
+end
 
+a = 10.26
+basis = make_basis(a)
+
+# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
+# try a bogus tolerance for debugging
+scfres = self_consistent_field(basis, tol=1e9)
+
+function compute_energy(scfres_ref, a)
+    basis = make_basis(a)
     energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
     energies.total
 end

From 4bea5293b755a01d627cba8ea9b7d20e69579c80 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:12:03 +0200
Subject: [PATCH 09/49] generic fft: Kinetic, AtomicLocal, Ewald, Psp work

---
 test/autodiff-stress/stress-kinetic-debug.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-kinetic-debug.jl
index 8c5a5d6679..519b6211c5 100644
--- a/test/autodiff-stress/stress-kinetic-debug.jl
+++ b/test/autodiff-stress/stress-kinetic-debug.jl
@@ -13,10 +13,10 @@ function make_basis(a)
     atoms = [Si => [ones(3)/8, -ones(3)/8]]
     terms = [
         Kinetic(),
-        # AtomicLocal(),
+        AtomicLocal(),
         # AtomicNonlocal(),
-        # Ewald(),
-        # PspCorrection()
+        Ewald(),
+        PspCorrection()
     ]
     model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
@@ -41,11 +41,11 @@ end
 compute_energy(scfres, 10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -0.6579483620146331 
+FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188820518 
 
 ###
 ### Forward mode
 ###
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -0.6579483619526001
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188299548

From 30dbc98bced642d9d8d33891d85a08fc28075c47 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:17:10 +0200
Subject: [PATCH 10/49] make_basis: return PlaneWaveBasis directly

---
 test/autodiff-stress/stress-kinetic-debug.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-kinetic-debug.jl
index 519b6211c5..2c86c70276 100644
--- a/test/autodiff-stress/stress-kinetic-debug.jl
+++ b/test/autodiff-stress/stress-kinetic-debug.jl
@@ -21,8 +21,7 @@ function make_basis(a)
     model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
     Ecut = 15          # kinetic energy cutoff in Hartree
-    basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
-    return basis
+    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
 end
 
 a = 10.26

From a5660f8ecc3e3fd39135a3340d41bad61cc18425 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:32:24 +0200
Subject: [PATCH 11/49] use make_basis in stress-total

---
 test/autodiff-stress/stress-total.jl | 154 ++++++++++++++-------------
 1 file changed, 82 insertions(+), 72 deletions(-)

diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
index 544e318c60..57fbb33854 100644
--- a/test/autodiff-stress/stress-total.jl
+++ b/test/autodiff-stress/stress-total.jl
@@ -1,43 +1,30 @@
 using DFTK
-using Test
 
-a = 10.26
-lattice = a / 2 * [[0 1 1.];
-                   [1 0 1.];
-                   [1 1 0.]]
-Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-atoms = [Si => [ones(3)/8, -ones(3)/8]]
-
-model = model_atomic(lattice, atoms, symmetries=false)
-kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-Ecut = 15          # kinetic energy cutoff in Hartree
-basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
-
-scfres = self_consistent_field(basis, tol=1e-8)
-
-function compute_energy(scfres_ref, a)
+function make_basis(a)
     lattice = a / 2 * [[0 1 1.];
-                       [1 0 1.];
-                       [1 1 0.]]
+                    [1 0 1.];
+                    [1 1 0.]]
     Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
     atoms = [Si => [ones(3)/8, -ones(3)/8]]
-
     model = model_atomic(lattice, atoms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 15           # kinetic energy cutoff in Hartree
-    basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+    Ecut = 15          # kinetic energy cutoff in Hartree
+    PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+end
 
+a = 10.26
+scfres = self_consistent_field(make_basis(a), tol=1e-8)
+
+function compute_energy(scfres_ref, a)
+    basis = make_basis(a)
     energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
     energies.total
 end
 
 compute_energy(scfres, 10.26)
 
-function compute_stress(scfres_ref, a)
-    Inf # TODO implement
-end
-@test compute_stress(scfres, a) ≈ FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411
-
+import FiniteDiff
+fd_stress = FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a)
 
 ###
 ### Forward mode
@@ -45,31 +32,43 @@ end
 
 using ForwardDiff
 ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
-# ERROR: LoadError: MethodError: no method matching svdvals!(::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}})
+# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, ::Int64)
 # Closest candidates are:
-#   svdvals!(::SymTridiagonal) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/tridiag.jl:351
-#   svdvals!(::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:192
-#   svdvals!(::StridedMatrix{T}, ::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:498
-#   ...
+#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
+#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
 # Stacktrace:
-#  [1] svdvals(A::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}})
-#    @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:217
-#  [2] cond(A::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}, p::Int64)
-#    @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1462
-#  [3] cond
-#    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1461 [inlined]
-#  [4] Model(lattice::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}; n_electrons::Nothing, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}, magnetic_moments::Vector{Any}, terms::Vector{Any}, temperature::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}, smearing::Nothing, spin_polarization::Symbol, symmetries::Bool)
-#    @ DFTK ~/.julia/dev/DFTK.jl/src/Model.jl:106
-#  [5] model_atomic(lattice::Matrix{ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1}}, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}; extra_terms::Vector{Any}, kwargs::Base.Iterators.Pairs{Symbol, Bool, Tuple{Symbol}, NamedTuple{(:symmetries,), Tuple{Bool}}})
-#    @ DFTK ~/.julia/dev/DFTK.jl/src/standard_models.jl:20
-#  [6] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1})
-#    @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:25
-#  [7] (::var"#15#16")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#15#16", Float64}, Float64, 1})
-#    @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:45
-#  [8] derivative(f::var"#15#16", x::Float64)
-#    @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
-#  [9] top-level scope
-#    @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:45
+#   [1] macro expansion
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:125 [inlined]
+#   [2] _broadcast
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:99 [inlined]
+#   [3] copy
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:26 [inlined]
+#   [4] materialize
+#     @ ./broadcast.jl:883 [inlined]
+#   [5] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, fft_size::Nothing, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
+#   [6] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
+#   [7] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#   [8] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#   [9] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Nothing, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
+#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [11] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:13
+#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:20
+#  [13] (::var"#11#12")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
+#  [14] derivative(f::var"#11#12", x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
+#  [15] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
+# in expression starting at /home/niku/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
+
 
 ###
 ### Reverse mode
@@ -160,31 +159,42 @@ Zygote.gradient(a -> compute_energy(scfres, a), 10.26)
 
 using ReverseDiff
 ReverseDiff.gradient(a -> compute_energy(scfres, first(a)), [10.26])
-# ERROR: LoadError: MethodError: no method matching svdvals!(::Matrix{ReverseDiff.TrackedReal{Float64, Float64, Nothing}})
+# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, ::Int64)
 # Closest candidates are:
-#   svdvals!(::SymTridiagonal) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/tridiag.jl:351
-#   svdvals!(::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:192
-#   svdvals!(::StridedMatrix{T}, ::StridedMatrix{T}) where T<:Union{Float32, Float64, ComplexF32, ComplexF64} at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:498
-#   ...
+#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
+#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
 # Stacktrace:
-#   [1] svdvals(A::Matrix{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}})
-#     @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/svd.jl:217
-#   [2] cond(A::Matrix{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, p::Int64)
-#     @ LinearAlgebra /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1462
-#   [3] cond
-#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/dense.jl:1461 [inlined]
-#   [4] Model(lattice::ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}; n_electrons::Nothing, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}, magnetic_moments::Vector{Any}, terms::Vector{Any}, temperature::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, smearing::Nothing, spin_polarization::Symbol, symmetries::Bool)
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/Model.jl:106
-#   [5] model_atomic(lattice::ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}, atoms::Vector{Pair{ElementPsp, Vector{Vector{Float64}}}}; extra_terms::Vector{Any}, kwargs::Base.Iterators.Pairs{Symbol, Bool, Tuple{Symbol}, NamedTuple{(:symmetries,), Tuple{Bool}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/standard_models.jl:20
-#   [6] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:25
-#   [7] (::var"#23#24")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:160
-#   [8] ReverseDiff.GradientTape(f::var"#23#24", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#   [1] macro expansion
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:125 [inlined]
+#   [2] _broadcast
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:99 [inlined]
+#   [3] copy
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:26 [inlined]
+#   [4] materialize
+#     @ ./broadcast.jl:883 [inlined]
+#   [5] validate_or_compute_fft_size(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, fft_size::Nothing, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
+#   [6] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
+#   [7] (::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#   [8] timeit(f::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#   [9] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Nothing, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
+#  [10] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [11] make_basis(a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:13
+#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:20
+#  [13] (::var"#15#16")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
+#  [14] ReverseDiff.GradientTape(f::var"#15#16", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
 #     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/tape.jl:199
-#   [9] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
+#  [15] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
 #     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/gradients.jl:22
-#  [10] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:160
+#  [16] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
+# in expression starting at /home/niku/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
 

From 746aad66fcc98655a0e66da4b4db8f41d5b6d28a Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:37:59 +0200
Subject: [PATCH 12/49] rename stress-kinetic-debug to stress-forward-genericlinearalgebra

---
 ...ss-kinetic-debug.jl => stress-forward-genericlinearalgebra.jl} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/autodiff-stress/{stress-kinetic-debug.jl => stress-forward-genericlinearalgebra.jl} (100%)

diff --git a/test/autodiff-stress/stress-kinetic-debug.jl b/test/autodiff-stress/stress-forward-genericlinearalgebra.jl
similarity index 100%
rename from test/autodiff-stress/stress-kinetic-debug.jl
rename to test/autodiff-stress/stress-forward-genericlinearalgebra.jl

From cdad8eb706a45c1eb6cab5ef3525ae4a4bfb928b Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:42:24 +0200
Subject: [PATCH 13/49] stack trace: no next_working_fft_size for Dual

---
 test/autodiff-stress/stress-forward.jl | 84 ++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 test/autodiff-stress/stress-forward.jl

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
new file mode 100644
index 0000000000..91db8499b3
--- /dev/null
+++ b/test/autodiff-stress/stress-forward.jl
@@ -0,0 +1,84 @@
+using DFTK
+using Test
+
+function make_basis(a)
+    lattice = a / 2 * [[0 1 1.];
+                       [1 0 1.];
+                       [1 1 0.]]
+    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+    atoms = [Si => [ones(3)/8, -ones(3)/8]]
+    terms = [
+        Kinetic(),
+        # AtomicLocal(),
+        # AtomicNonlocal(),
+        # Ewald(),
+        # PspCorrection()
+    ]
+    model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
+    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
+    Ecut = 15          # kinetic energy cutoff in Hartree
+    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+end
+
+a = 10.26
+basis = make_basis(a)
+
+# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
+# try a bogus tolerance for debugging
+scfres = self_consistent_field(basis, tol=1e9)
+
+function compute_energy(scfres_ref, a)
+    basis = make_basis(a)
+    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+    energies.total
+end
+
+compute_energy(scfres, 10.26)
+
+import FiniteDiff
+FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188820518 
+
+###
+### Forward mode
+###
+
+using ForwardDiff
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188299548
+# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, ::Int64)
+# Closest candidates are:
+#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
+#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
+# Stacktrace:
+#   [1] _broadcast_getindex_evalf
+#     @ ./broadcast.jl:648 [inlined]
+#   [2] _broadcast_getindex
+#     @ ./broadcast.jl:631 [inlined]
+#   [3] getindex
+#     @ ./broadcast.jl:575 [inlined]
+#   [4] copy
+#     @ ./broadcast.jl:922 [inlined]
+#   [5] materialize
+#     @ ./broadcast.jl:883 [inlined]
+#   [6] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, fft_size::Vector{Int64}, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
+#   [7] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
+#   [8] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#   [9] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
+#  [11] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [12] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:20
+#  [13] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:31
+#  [14] (::var"#19#20")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
+#  [15] derivative(f::var"#19#20", x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
+#  [16] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
+

From 324918fc42a174ea35455eb4f06c90c5b1d8cb74 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 12:48:18 +0200
Subject: [PATCH 14/49] stack trace: no build_fft_plans for Dual

---
 src/fft.jl                             |  1 +
 test/autodiff-stress/stress-forward.jl | 42 +++++++++-----------------
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index b8690305df..53fb7defc8 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -166,3 +166,4 @@ end
 # for floating-point types natively supported by FFTW
 next_working_fft_size(::Type{Float32}, size) = size
 next_working_fft_size(::Type{Float64}, size) = size
+next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 91db8499b3..ea364793aa 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -43,42 +43,28 @@ FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) #
 ###
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188299548
-# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, ::Int64)
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
+# ERROR: LoadError: MethodError: no method matching build_fft_plans(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, ::Tuple{Int64, Int64, Int64})
 # Closest candidates are:
-#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
-#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
+#   build_fft_plans(::Union{Type{Float32}, Type{Float64}}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:154
 # Stacktrace:
-#   [1] _broadcast_getindex_evalf
-#     @ ./broadcast.jl:648 [inlined]
-#   [2] _broadcast_getindex
-#     @ ./broadcast.jl:631 [inlined]
-#   [3] getindex
-#     @ ./broadcast.jl:575 [inlined]
-#   [4] copy
-#     @ ./broadcast.jl:922 [inlined]
-#   [5] materialize
-#     @ ./broadcast.jl:883 [inlined]
-#   [6] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, fft_size::Vector{Int64}, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
-#   [7] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
-#   [8] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Int64})()
+#   [1] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:196 [inlined]
+#   [2] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Int64})()
 #     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [9] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#   [3] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
 #     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
+#   [4] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
 #     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#  [11] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#   [5] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
 #     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [12] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#   [6] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
 #     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:20
-#  [13] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#   [7] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
 #     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:31
-#  [14] (::var"#19#20")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#19#20", Float64}, Float64, 1})
+#   [8] (::var"#23#24")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
 #     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
-#  [15] derivative(f::var"#19#20", x::Float64)
+#   [9] derivative(f::var"#23#24", x::Float64)
 #     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
-#  [16] top-level scope
+#  [10] top-level scope
 #     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
-

From 9e772a5f6d74a86161ff1bb9296ce80f890badfb Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 13:00:30 +0200
Subject: [PATCH 15/49] rename stress-forward-genericfft

---
 ...rd-genericlinearalgebra.jl => stress-forward-genericfft.jl} | 3 +++
 1 file changed, 3 insertions(+)
 rename test/autodiff-stress/{stress-forward-genericlinearalgebra.jl => stress-forward-genericfft.jl} (92%)

diff --git a/test/autodiff-stress/stress-forward-genericlinearalgebra.jl b/test/autodiff-stress/stress-forward-genericfft.jl
similarity index 92%
rename from test/autodiff-stress/stress-forward-genericlinearalgebra.jl
rename to test/autodiff-stress/stress-forward-genericfft.jl
index 2c86c70276..93d0eaced0 100644
--- a/test/autodiff-stress/stress-forward-genericlinearalgebra.jl
+++ b/test/autodiff-stress/stress-forward-genericfft.jl
@@ -1,3 +1,6 @@
+# Hellmann-Feynman stress
+# via ForwardDiff & generic arithmetic (FourierTransforms.jl)
+# (disregarding performance)
 using DFTK
 using Test
 

From e7c410572109337b09e79d48dd197a97e54ed781 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 16:11:41 +0200
Subject: [PATCH 16/49] add generic-fft stress btimes

---
 src/fft.jl                                      |  1 +
 .../stress-forward-genericfft.jl                | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 53fb7defc8..144ff2a465 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,4 +1,5 @@
 import FFTW
+import ForwardDiff
 
 # returns the lengths of the bounding rectangle in reciprocal space
 # that encloses the sphere of radius Gmax
diff --git a/test/autodiff-stress/stress-forward-genericfft.jl b/test/autodiff-stress/stress-forward-genericfft.jl
index 93d0eaced0..7177a1a892 100644
--- a/test/autodiff-stress/stress-forward-genericfft.jl
+++ b/test/autodiff-stress/stress-forward-genericfft.jl
@@ -40,14 +40,17 @@ function compute_energy(scfres_ref, a)
     energies.total
 end
 
-compute_energy(scfres, 10.26)
+compute_energy(a) = compute_energy(scfres, a)
+compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188820518 
-
-###
-### Forward mode
-###
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -11.113131188820518 
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188299548
+ForwardDiff.derivative(compute_energy, 10.26) # -11.113131188299548
+
+using BenchmarkTools
+@btime compute_energy(10.26)                                           # 14.294 ms (60112 allocations: 9.65 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 29.582 ms (120228 allocations: 19.30 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 26.178 ms (70669 allocations: 14.51 MiB)
+

From 55125261676dc41b436df046e6b4f8f47dfd557e Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 19:52:48 +0200
Subject: [PATCH 17/49] add (a few) FFTW ForwardDiff rules

---
 src/fft.jl | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 1 deletion(-)

diff --git a/src/fft.jl b/src/fft.jl
index 144ff2a465..50cec1d51e 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,5 +1,4 @@
 import FFTW
-import ForwardDiff
 
 # returns the lengths of the bounding rectangle in reciprocal space
 # that encloses the sphere of radius Gmax
@@ -167,4 +166,123 @@ end
 # for floating-point types natively supported by FFTW
 next_working_fft_size(::Type{Float32}, size) = size
 next_working_fft_size(::Type{Float64}, size) = size
+
+
+#==============================================================================#
+#
+# ForwardDiff + FFTW zone
+#
+#==============================================================================#
+import ForwardDiff
+import AbstractFFTs
+
+# original PR by mcabbott: https://github.com/JuliaDiff/ForwardDiff.jl/pull/495
+# modified version: https://github.com/niklasschmitz/ForwardDiff.jl/blob/nfs/fft/src/fft.jl
+
+ForwardDiff.value(x::Complex{<:ForwardDiff.Dual}) =
+    Complex(x.re.value, x.im.value)
+
+ForwardDiff.partials(x::Complex{<:ForwardDiff.Dual}, n::Int) =
+    Complex(ForwardDiff.partials(x.re, n), ForwardDiff.partials(x.im, n))
+
+ForwardDiff.npartials(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = N
+ForwardDiff.npartials(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = N
+
+ForwardDiff.tagtype(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = T
+ForwardDiff.tagtype(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = T
+
+# AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = float.(x .+ 0im)
+AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.complexfloat.(x)
+AbstractFFTs.complexfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d) + 0im
+
+AbstractFFTs.realfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.realfloat.(x)
+AbstractFFTs.realfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d)
+
+for plan in [:plan_fft, :plan_ifft, :plan_bfft]
+    @eval begin
+
+        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
+
+        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x), region; kwargs...)
+
+    end
+end
+
+# rfft only accepts real arrays
+AbstractFFTs.plan_rfft(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+    AbstractFFTs.plan_rfft(ForwardDiff.value.(x), region; kwargs...)
+
+for plan in [:plan_irfft, :plan_brfft]  # these take an extra argument, only when complex?
+    @eval begin
+
+        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
+
+        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, d::Integer, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x), d, region; kwargs...)
+
+    end
+end
+
+for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
+    @eval begin
+
+        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
+            _apply_plan(p, x)
+
+        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
+            _apply_plan(p, x)
+
+    end
+end
+
+function _apply_plan(p::AbstractFFTs.Plan, x::AbstractArray)
+    xtil = p * ForwardDiff.value.(x)
+    dxtils = ntuple(ForwardDiff.npartials(eltype(x))) do n
+        p * ForwardDiff.partials.(x, n)
+    end
+    T = ForwardDiff.tagtype(eltype(x))
+    map(xtil, dxtils...) do val, parts...
+        Complex(
+            ForwardDiff.Dual{T}(real(val), map(real, parts)),
+            ForwardDiff.Dual{T}(imag(val), map(imag, parts)),
+        )
+    end
+end
+
+###
+### DFTK setup specific
+###
+
 next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
+
+_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
+
+# *** COPIED from fft_generic.jl *** TODO refactor
+# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
+# This is needed for some generic FFT implementations, which do not have in-place plans
+struct DummyInplace{opFFT}
+    fft::opFFT
+end
+LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
+LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
+
+import Base: *, \, length
+*(p::DummyInplace, X) = p.fft * X
+\(p::DummyInplace, X) = p.fft \ X
+length(p::DummyInplace) = length(p.fft)
+
+function build_fft_plans(T::Type{<:ForwardDiff.Dual}, fft_size)
+    tmp = Array{Complex{T}}(undef, fft_size...)
+    opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+    opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
+
+    ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
+    ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
+    # backward by inverting and stripping off normalizations
+    ipFFT, opFFT, ipBFFT, opBFFT
+end
+
+

From 1a5b54c08ea1d830294b57dc15281049f34f99a4 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 14 Jun 2021 19:54:30 +0200
Subject: [PATCH 18/49] stack trace: no mul!(...Dual, ...FFTW.cFFTWPlan,.)

---
 test/autodiff-stress/stress-forward.jl | 77 +++++++++++++++-----------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index ea364793aa..7e0885e436 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -1,5 +1,4 @@
 using DFTK
-using Test
 
 function make_basis(a)
     lattice = a / 2 * [[0 1 1.];
@@ -9,10 +8,10 @@ function make_basis(a)
     atoms = [Si => [ones(3)/8, -ones(3)/8]]
     terms = [
         Kinetic(),
-        # AtomicLocal(),
+        AtomicLocal(),
         # AtomicNonlocal(),
-        # Ewald(),
-        # PspCorrection()
+        Ewald(),
+        PspCorrection()
     ]
     model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
@@ -21,11 +20,10 @@ function make_basis(a)
 end
 
 a = 10.26
-basis = make_basis(a)
 
 # scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
 # try a bogus tolerance for debugging
-scfres = self_consistent_field(basis, tol=1e9)
+scfres = self_consistent_field(make_basis(a), tol=1e9)
 
 function compute_energy(scfres_ref, a)
     basis = make_basis(a)
@@ -33,38 +31,53 @@ function compute_energy(scfres_ref, a)
     energies.total
 end
 
-compute_energy(scfres, 10.26)
+compute_energy(a) = compute_energy(scfres, a)
+compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), 10.26) # -11.113131188820518 
-
-###
-### Forward mode
-###
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -11.113131188820518 
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
-# ERROR: LoadError: MethodError: no method matching build_fft_plans(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, ::Tuple{Int64, Int64, Int64})
+ForwardDiff.derivative(compute_energy, 10.26)
+# ERROR: LoadError: MethodError: no method matching mul!(::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::Bool, ::Bool)
 # Closest candidates are:
-#   build_fft_plans(::Union{Type{Float32}, Type{Float64}}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:154
+#   mul!(::AbstractArray, ::Number, ::AbstractArray, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:132
+#   mul!(::AbstractArray, ::AbstractArray, ::Number, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:140
+#   mul!(::Any, ::Any, ::Any) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/matmul.jl:274
+#   ...
 # Stacktrace:
-#   [1] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:196 [inlined]
-#   [2] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Int64})()
+#   [1] mul!
+#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/matmul.jl:275 [inlined]
+#   [2] mul!(y::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, x::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ AbstractFFTs ~/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:269
+#   [3] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
+#   [4] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
+#   [5] G_to_r
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
+#   [6] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
+#   [7] macro expansion
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
+#   [8] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
+#   [9] (::DFTK.var"#66#68"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
 #     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [3] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#  [10] timeit(f::DFTK.var"#66#68"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
 #     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#   [4] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Vector{Int64}, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#   [5] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#  [11] #PlaneWaveBasis#65
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
+#  [12] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
 #     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#   [6] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:20
-#   [7] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:31
-#   [8] (::var"#23#24")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#23#24", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
-#   [9] derivative(f::var"#23#24", x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
-#  [10] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:46
+#  [13] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
+#  [14] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
+#  [15] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
+#  [16] derivative(f::typeof(compute_energy), x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
+#  [17] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
+

From 42cb0763e78344763c3af21d9ad733fa4ad3f8f3 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 10:17:33 +0200
Subject: [PATCH 19/49] fwddiff FFTW: Kinetic, AtomicLocal, Ewald, Psp

---
 src/PlaneWaveBasis.jl                  |  15 ++-
 src/fft.jl                             |  10 +-
 test/autodiff-stress/stress-forward.jl | 122 ++++++++++++++++++++++++-
 3 files changed, 138 insertions(+), 9 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index de3014d359..6761a779fd 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -203,11 +203,18 @@ build_kpoints(basis::PlaneWaveBasis, kcoords) =
     # ψ(r) = sum_G c_G e^iGr / sqrt(Ω)
     # so that the ifft is normalized by 1/sqrt(Ω). It follows that the
     # fft must be normalized by sqrt(Ω) / length
-    ipFFT = ipFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
-    opFFT = opFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
-    ipIFFT = inv(ipFFT)
-    opIFFT = inv(opFFT)
 
+    ## normalization disabled for debugging (model.unit_cell_volume is a ForwardDiff.Dual, thus makes a real plan Dual which fails)
+    ## TODO re-enable normalization
+    # ipFFT = ipFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
+    # opFFT = opFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
+    # ipIFFT = inv(ipFFT)
+    # opIFFT = inv(opFFT)
+    ipFFT = ipFFT_unnormalized #* (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
+    opFFT = opFFT_unnormalized #* (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
+    ipIFFT = ipBFFT_unnormalized
+    opIFFT = opBFFT_unnormalized
+    
     # Setup kpoint basis sets
     !variational && @warn(
         "Non-variational calculations are experimental. " *
diff --git a/src/fft.jl b/src/fft.jl
index 50cec1d51e..fb96713e7a 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -229,12 +229,19 @@ end
 for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
     @eval begin
 
+        # TODO handle ForwardDiff.Dual scaling factors (perhaps lazy evaluation?)
+
         Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
             _apply_plan(p, x)
 
         Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
             _apply_plan(p, x)
 
+        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:ForwardDiff.Dual}) = 
+            (Y .= _apply_plan(p, X))
+        
+        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
+            (Y .= _apply_plan(p, X))
     end
 end
 
@@ -274,7 +281,7 @@ import Base: *, \, length
 \(p::DummyInplace, X) = p.fft \ X
 length(p::DummyInplace) = length(p.fft)
 
-function build_fft_plans(T::Type{<:ForwardDiff.Dual}, fft_size)
+function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
     tmp = Array{Complex{T}}(undef, fft_size...)
     opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
     opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
@@ -285,4 +292,3 @@ function build_fft_plans(T::Type{<:ForwardDiff.Dual}, fft_size)
     ipFFT, opFFT, ipBFFT, opBFFT
 end
 
-
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 7e0885e436..d78be8d4eb 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -35,11 +35,21 @@ compute_energy(a) = compute_energy(scfres, a)
 compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -11.113131188820518 
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
 
 using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26)
-# ERROR: LoadError: MethodError: no method matching mul!(::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::Bool, ::Bool)
+ForwardDiff.derivative(compute_energy, 10.26) # -2.948556665529993e9
+
+#===#
+# selected previous stack traces below.
+
+# ERROR: LoadError: MethodError: no method matching mul!(
+#    ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, 
+#    ::FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, 
+#    ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, 
+#    ::Bool, 
+#    ::Bool
+# )
 # Closest candidates are:
 #   mul!(::AbstractArray, ::Number, ::AbstractArray, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:132
 #   mul!(::AbstractArray, ::AbstractArray, ::Number, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:140
@@ -81,3 +91,109 @@ ForwardDiff.derivative(compute_energy, 10.26)
 #  [17] top-level scope
 #     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
 
+
+
+# ERROR: LoadError: MethodError: mul!(::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}) is ambiguous. Candidates:
+#   mul!(Y, p::AbstractFFTs.ScaledPlan, X::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N}) in DFTK at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:241
+#   mul!(Y, p::AbstractFFTs.Plan, X::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N}) in DFTK at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:241
+#   mul!(y::AbstractArray, p::AbstractFFTs.ScaledPlan, x::AbstractArray) in AbstractFFTs at /home/niku/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:269
+# Possible fix, define
+#   mul!(::AbstractArray, ::AbstractFFTs.ScaledPlan, ::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N})
+# Stacktrace:
+#   [1] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
+#   [2] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
+#   [3] G_to_r
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
+#   [4] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
+#   [5] macro expansion
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
+#   [6] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
+#   [7] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#   [8] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#   [9] #PlaneWaveBasis#76
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
+#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [11] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
+#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
+#  [13] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
+#  [14] derivative(f::typeof(compute_energy), x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
+#  [15] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
+
+
+
+# ERROR: LoadError: MethodError: no method matching Float64(::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+# Closest candidates are:
+#   (::Type{T})(::Real, ::RoundingMode) where T<:AbstractFloat at rounding.jl:200
+#   (::Type{T})(::T) where T<:Number at boot.jl:760
+#   (::Type{T})(::AbstractChar) where T<:Union{AbstractChar, Number} at char.jl:50
+#   ...
+# Stacktrace:
+#   [1] convert(#unused#::Type{Float64}, x::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Base ./number.jl:7
+#   [2] ComplexF64(re::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, im::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Base ./complex.jl:12
+#   [3] ComplexF64(z::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ Base ./complex.jl:36
+#   [4] convert(#unused#::Type{ComplexF64}, x::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ Base ./number.jl:7
+#   [5] setindex!(A::Array{ComplexF64, 3}, x::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, i1::Int64)
+#     @ Base ./array.jl:839
+#   [6] macro expansion
+#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:183 [inlined]
+#   [7] macro expansion
+#     @ ./simdloop.jl:77 [inlined]
+#   [8] rmul!
+#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:182 [inlined]
+#   [9] *(p::AbstractFFTs.ScaledPlan{
+#           ComplexF64, 
+#           FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, 
+#           ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}
+#         }, 
+#         x::Array{ComplexF64, 3})
+#     @ AbstractFFTs ~/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:249
+#  [10] _apply_plan(p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, x::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:249
+#  [11] mul!(Y::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, X::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:243
+#  [12] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
+#  [13] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
+#  [14] G_to_r
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
+#  [15] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
+#  [16] macro expansion
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
+#  [17] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
+#  [18] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#  [19] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#  [20] #PlaneWaveBasis#76
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
+#  [21] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
+#  [22] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
+#  [23] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
+#  [24] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
+#  [25] derivative(f::typeof(compute_energy), x::Float64)
+#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
+#  [26] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41

From 791a5a8638dc747669d8ef41c8e064436262609c Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 10:41:44 +0200
Subject: [PATCH 20/49] add fwddiff FFTW btimes

---
 test/autodiff-stress/stress-forward.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index d78be8d4eb..61339b447e 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -40,6 +40,11 @@ FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633
 using ForwardDiff
 ForwardDiff.derivative(compute_energy, 10.26) # -2.948556665529993e9
 
+using BenchmarkTools
+@btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
+
 #===#
 # selected previous stack traces below.
 

From 320ce60ed35cb0c11186869e383afdb1dd383093 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 11:00:23 +0200
Subject: [PATCH 21/49] add description header

---
 test/autodiff-stress/stress-forward.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 61339b447e..6c316d8599 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -1,3 +1,5 @@
+# Hellmann-Feynman stress
+# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
 using DFTK
 
 function make_basis(a)

From 57331b0d498756dc85322ef67c02daf781bc5318 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 11:44:17 +0200
Subject: [PATCH 22/49] update generic fft stress values

---
 test/autodiff-stress/stress-forward-genericfft.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/autodiff-stress/stress-forward-genericfft.jl b/test/autodiff-stress/stress-forward-genericfft.jl
index 7177a1a892..d41cb02057 100644
--- a/test/autodiff-stress/stress-forward-genericfft.jl
+++ b/test/autodiff-stress/stress-forward-genericfft.jl
@@ -2,7 +2,6 @@
 # via ForwardDiff & generic arithmetic (FourierTransforms.jl)
 # (disregarding performance)
 using DFTK
-using Test
 
 # for generic FourierTransforms.jl (TODO replace by FFTW later)
 using DoubleFloats
@@ -44,10 +43,10 @@ compute_energy(a) = compute_energy(scfres, a)
 compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -11.113131188820518 
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.9485566656257386e9 
 
 using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -11.113131188299548
+ForwardDiff.derivative(compute_energy, 10.26) # -2.9485566655301175e9
 
 using BenchmarkTools
 @btime compute_energy(10.26)                                           # 14.294 ms (60112 allocations: 9.65 MiB)

From 3da7d29d72b169d51b088792be7026f3370ec8de Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 13:22:39 +0200
Subject: [PATCH 23/49] del forward-genericfft in favor of separate branch

---
 .../stress-forward-genericfft.jl              | 55 -------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 test/autodiff-stress/stress-forward-genericfft.jl

diff --git a/test/autodiff-stress/stress-forward-genericfft.jl b/test/autodiff-stress/stress-forward-genericfft.jl
deleted file mode 100644
index d41cb02057..0000000000
--- a/test/autodiff-stress/stress-forward-genericfft.jl
+++ /dev/null
@@ -1,55 +0,0 @@
-# Hellmann-Feynman stress
-# via ForwardDiff & generic arithmetic (FourierTransforms.jl)
-# (disregarding performance)
-using DFTK
-
-# for generic FourierTransforms.jl (TODO replace by FFTW later)
-using DoubleFloats
-using GenericLinearAlgebra
-
-function make_basis(a)
-    lattice = a / 2 * [[0 1 1.];
-                       [1 0 1.];
-                       [1 1 0.]]
-    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-    atoms = [Si => [ones(3)/8, -ones(3)/8]]
-    terms = [
-        Kinetic(),
-        AtomicLocal(),
-        # AtomicNonlocal(),
-        Ewald(),
-        PspCorrection()
-    ]
-    model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
-    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 15          # kinetic energy cutoff in Hartree
-    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
-end
-
-a = 10.26
-basis = make_basis(a)
-
-# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
-# try a bogus tolerance for debugging
-scfres = self_consistent_field(basis, tol=1e9)
-
-function compute_energy(scfres_ref, a)
-    basis = make_basis(a)
-    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-    energies.total
-end
-
-compute_energy(a) = compute_energy(scfres, a)
-compute_energy(10.26)
-
-import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.9485566656257386e9 
-
-using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -2.9485566655301175e9
-
-using BenchmarkTools
-@btime compute_energy(10.26)                                           # 14.294 ms (60112 allocations: 9.65 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 29.582 ms (120228 allocations: 19.30 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 26.178 ms (70669 allocations: 14.51 MiB)
-

From 7a0dcf3a847c9a1c1d2f3c8aad6a7aadee05d19a Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 15 Jun 2021 13:39:52 +0200
Subject: [PATCH 24/49] update stack traces for Zygote, ReverseDiff

---
 test/autodiff-stress/stress-total.jl | 112 ++++++++++-----------------
 1 file changed, 40 insertions(+), 72 deletions(-)

diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
index 57fbb33854..7032bb56f6 100644
--- a/test/autodiff-stress/stress-total.jl
+++ b/test/autodiff-stress/stress-total.jl
@@ -9,7 +9,7 @@ function make_basis(a)
     model = model_atomic(lattice, atoms, symmetries=false)
     kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
     Ecut = 15          # kinetic energy cutoff in Hartree
-    PlaneWaveBasis(model, Ecut; kgrid=kgrid)
+    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32,32,32])
 end
 
 a = 10.26
@@ -31,44 +31,7 @@ fd_stress = FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres,
 ###
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26)
-# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, ::Int64)
-# Closest candidates are:
-#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
-#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
-# Stacktrace:
-#   [1] macro expansion
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:125 [inlined]
-#   [2] _broadcast
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:99 [inlined]
-#   [3] copy
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:26 [inlined]
-#   [4] materialize
-#     @ ./broadcast.jl:883 [inlined]
-#   [5] validate_or_compute_fft_size(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, fft_size::Nothing, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
-#   [6] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
-#   [7] (::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [8] timeit(f::DFTK.var"#62#64"{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#   [9] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Nothing, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [11] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:13
-#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:20
-#  [13] (::var"#11#12")(a::ForwardDiff.Dual{ForwardDiff.Tag{var"#11#12", Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
-#  [14] derivative(f::var"#11#12", x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/m7cm5/src/derivative.jl:14
-#  [15] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
-# in expression starting at /home/niku/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:41
-
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # NaN
 
 ###
 ### Reverse mode
@@ -159,42 +122,47 @@ Zygote.gradient(a -> compute_energy(scfres, a), 10.26)
 
 using ReverseDiff
 ReverseDiff.gradient(a -> compute_energy(scfres, first(a)), [10.26])
-# ERROR: LoadError: MethodError: no method matching next_working_fft_size(::Type{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, ::Int64)
-# Closest candidates are:
-#   next_working_fft_size(::Type{Float32}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:167
-#   next_working_fft_size(::Type{Float64}, ::Any) at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:168
+# ERROR: LoadError: UndefRefError: access to undefined reference
 # Stacktrace:
-#   [1] macro expansion
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:125 [inlined]
-#   [2] _broadcast
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:99 [inlined]
-#   [3] copy
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/broadcast.jl:26 [inlined]
-#   [4] materialize
-#     @ ./broadcast.jl:883 [inlined]
-#   [5] validate_or_compute_fft_size(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, fft_size::Nothing, Ecut::Int64, supersampling::Int64, variational::Bool, optimize_fft_size::Bool, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:139
-#   [6] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:193 [inlined]
-#   [7] (::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64})()
+#   [1] getindex
+#     @ ./array.jl:802 [inlined]
+#   [2] macro expansion
+#     @ ./multidimensional.jl:860 [inlined]
+#   [3] macro expansion
+#     @ ./cartesian.jl:64 [inlined]
+#   [4] macro expansion
+#     @ ./multidimensional.jl:855 [inlined]
+#   [5] _unsafe_getindex!
+#     @ ./multidimensional.jl:868 [inlined]
+#   [6] _unsafe_getindex(::IndexLinear, ::Array{Complex{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, 3}, ::Base.Slice{Base.OneTo{Int64}}, ::Int64, ::Int64)
+#     @ Base ./multidimensional.jl:846
+#   [7] _getindex
+#     @ ./multidimensional.jl:832 [inlined]
+#   [8] getindex
+#     @ ./abstractarray.jl:1170 [inlined]
+#   [9] generic_plan_fft(data::Array{Complex{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, 3})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft_generic.jl:84
+#  [10] build_fft_plans(T::Type, fft_size::Tuple{Int64, Int64, Int64})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft_generic.jl:41
+#  [11] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:196 [inlined]
+#  [12] (::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64})()
 #     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [8] timeit(f::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#  [13] timeit(f::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
 #     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#   [9] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64, kcoords::Vector{StaticArrays.SVector{3, Rational{Int64}}}, ksymops::Vector{Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}}, symmetries::Vector{Tuple{StaticArrays.SMatrix{3, 3, Int64, 9}, StaticArrays.SVector{3, Float64}}}; fft_size::Nothing, variational::Bool, optimize_fft_size::Bool, supersampling::Int64, kgrid::Vector{Int64}, kshift::Vector{Int64}, comm_kpts::MPI.Comm)
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#  [10] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
+#  [14] #PlaneWaveBasis#61
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
+#  [15] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
 #     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [11] make_basis(a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:13
-#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:20
-#  [13] (::var"#15#16")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
-#  [14] ReverseDiff.GradientTape(f::var"#15#16", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#  [16] make_basis(a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:12
+#  [17] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:19
+#  [18] (::var"#19#20")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:124
+#  [19] ReverseDiff.GradientTape(f::var"#19#20", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
 #     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/tape.jl:199
-#  [15] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
+#  [20] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
 #     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/gradients.jl:22
-#  [16] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
-# in expression starting at /home/niku/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:161
-
+#  [21] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:124

From e7630153e0214e75d8f6c20da253cad95da93092 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 17 Jun 2021 09:22:44 +0200
Subject: [PATCH 25/49] stack trace: one carbon

---
 test/autodiff-stress/stress-forward.jl | 234 +++++++------------------
 1 file changed, 63 insertions(+), 171 deletions(-)

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 6c316d8599..9e8d433c20 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -6,8 +6,8 @@ function make_basis(a)
     lattice = a / 2 * [[0 1 1.];
                        [1 0 1.];
                        [1 1 0.]]
-    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-    atoms = [Si => [ones(3)/8, -ones(3)/8]]
+    C = ElementPsp(:C, psp=load_psp("hgh/lda/c-q4.hgh"))
+    atoms = [C => [ones(3)/8]]
     terms = [
         Kinetic(),
         AtomicLocal(),
@@ -26,181 +26,73 @@ a = 10.26
 # scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
 # try a bogus tolerance for debugging
 scfres = self_consistent_field(make_basis(a), tol=1e9)
-
-function compute_energy(scfres_ref, a)
-    basis = make_basis(a)
-    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-    energies.total
-end
-
-compute_energy(a) = compute_energy(scfres, a)
-compute_energy(10.26)
-
-import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
-
-using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -2.948556665529993e9
-
-using BenchmarkTools
-@btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
-
-#===#
-# selected previous stack traces below.
-
-# ERROR: LoadError: MethodError: no method matching mul!(
-#    ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, 
-#    ::FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, 
-#    ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, 
-#    ::Bool, 
-#    ::Bool
-# )
-# Closest candidates are:
-#   mul!(::AbstractArray, ::Number, ::AbstractArray, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:132
-#   mul!(::AbstractArray, ::AbstractArray, ::Number, ::Number, ::Number) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:140
-#   mul!(::Any, ::Any, ::Any) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/matmul.jl:274
-#   ...
+# ┌ Warning: Mismatch in number of electrons
+# │   sum_ρ = 1080.0455760000316
+# │   sum_occupation = 4.0
+# └ @ DFTK ~/.julia/dev/DFTK.jl/src/densities.jl:32
+# n     Free energy       Eₙ-Eₙ₋₁     ρout-ρin   Diag
+# ---   ---------------   ---------   --------   ----
+#   1   -3819171908.212         NaN   2.49e+07    21.0 
+# ERROR: LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
 # Stacktrace:
-#   [1] mul!
-#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/matmul.jl:275 [inlined]
-#   [2] mul!(y::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, x::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ AbstractFFTs ~/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:269
-#   [3] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
-#   [4] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
-#   [5] G_to_r
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
-#   [6] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
-#   [7] macro expansion
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
-#   [8] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
-#   [9] (::DFTK.var"#66#68"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
+#   [1] error(s::String)
+#     @ Base ./error.jl:33
+#   [2] compute_occupation(basis::PlaneWaveBasis{Float64}, energies::Vector{Vector{Float64}}; temperature::Float64, smearing::DFTK.Smearing.None)
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/occupation.jl:77
+#   [3] compute_occupation(basis::PlaneWaveBasis{Float64}, energies::Vector{Vector{Float64}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/occupation.jl:16
+#   [4] next_density(ham::Hamiltonian; n_bands::Int64, ψ::Vector{Matrix{ComplexF64}}, n_ep_extra::Int64, eigensolver::Function, occupation_function::typeof(DFTK.compute_occupation), kwargs::Base.Iterators.Pairs{Symbol, Real, Tuple{Symbol, Symbol}, NamedTuple{(:miniter, :tol), Tuple{Int64, Float64}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:30
+#   [5] (::DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}})(ρin::Array{Float64, 4})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:98
+#   [6] (::DFTK.var"#487#490"{DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}})(x::Array{Float64, 4})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/scf_solvers.jl:18
+#   [7] (::NLSolversBase.var"#ff!#1"{DFTK.var"#487#490"{DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}}})(F::Array{Float64, 4}, x::Array{Float64, 4})
+#     @ NLSolversBase ~/.julia/packages/NLSolversBase/geyh3/src/objective_types/inplace_factory.jl:11
+#   [8] value!!(obj::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, F::Array{Float64, 4}, x::Array{Float64, 4})
+#     @ NLSolversBase ~/.julia/packages/NLSolversBase/geyh3/src/interface.jl:166
+#   [9] value!!
+#     @ ~/.julia/packages/NLSolversBase/geyh3/src/interface.jl:163 [inlined]
+#  [10] anderson_(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, beta::Int64, aa_start::Int64, droptol::Float64, cache::NLsolve.AndersonCache{Array{Float64, 4}, Array{Float64, 4}, Vector{Array{Float64, 4}}, Vector{Float64}, Matrix{Float64}, Matrix{Float64}})
+#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:73
+#  [11] anderson(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, beta::Int64, aa_start::Int64, droptol::Float64, cache::NLsolve.AndersonCache{Array{Float64, 4}, Array{Float64, 4}, Vector{Array{Float64, 4}}, Vector{Float64}, Matrix{Float64}, Matrix{Float64}})
+#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:203
+#  [12] anderson(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, m::Int64, beta::Int64, aa_start::Int64, droptol::Float64)
+#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:188
+#  [13] nlsolve(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}; method::Symbol, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, linesearch::LineSearches.Static, linsolve::NLsolve.var"#29#31", factor::Float64, autoscale::Bool, m::Int64, beta::Int64, aa_start::Int64, droptol::Float64)
+#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/nlsolve/nlsolve.jl:30
+#  [14] nlsolve(f::Function, initial_x::Array{Float64, 4}; method::Symbol, autodiff::Symbol, inplace::Bool, kwargs::Base.Iterators.Pairs{Symbol, Real, NTuple{5, Symbol}, NamedTuple{(:m, :xtol, :ftol, :show_trace, :iterations), Tuple{Int64, Float64, Float64, Bool, Int64}}})
+#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/nlsolve/nlsolve.jl:52
+#  [15] fp_solver
+#     @ ~/.julia/dev/DFTK.jl/src/scf/scf_solvers.jl:18 [inlined]
+#  [16] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:137 [inlined]
+#  [17] (::DFTK.var"#518#519"{Int64, Array{Float64, 4}, Int64, DFTK.var"#fp_solver#488"{DFTK.var"#fp_solver#486#489"{Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Int64, Symbol}}, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}})()
 #     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#  [10] timeit(f::DFTK.var"#66#68"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#  [18] timeit(f::DFTK.var"#518#519"{Int64, Array{Float64, 4}, Int64, DFTK.var"#fp_solver#488"{DFTK.var"#fp_solver#486#489"{Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Int64, Symbol}}, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}, to::TimerOutputs.TimerOutput, label::String)
 #     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [11] #PlaneWaveBasis#65
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
-#  [12] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [13] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
-#  [14] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
-#  [15] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
-#  [16] derivative(f::typeof(compute_energy), x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
-#  [17] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
+#  [19] self_consistent_field(basis::PlaneWaveBasis{Float64}; n_bands::Int64, ρ::Array{Float64, 4}, ψ::Nothing, tol::Float64, maxiter::Int64, solver::Function, eigensolver::Function, n_ep_extra::Int64, determine_diagtol::DFTK.var"#determine_diagtol#515"{Float64}, α::Float64, mixing::SimpleMixing, is_converged::DFTK.var"#is_converged#511"{Float64}, callback::DFTK.var"#callback#510", compute_consistent_energies::Bool, enforce_symmetry::Bool, occupation_function::typeof(DFTK.compute_occupation))
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
+#  [20] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
 
 
+# function compute_energy(scfres_ref, a)
+#     basis = make_basis(a)
+#     energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+#     energies.total
+# end
 
-# ERROR: LoadError: MethodError: mul!(::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, ::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, ::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}) is ambiguous. Candidates:
-#   mul!(Y, p::AbstractFFTs.ScaledPlan, X::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N}) in DFTK at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:241
-#   mul!(Y, p::AbstractFFTs.Plan, X::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N}) in DFTK at /home/niku/.julia/dev/DFTK.jl/src/fft.jl:241
-#   mul!(y::AbstractArray, p::AbstractFFTs.ScaledPlan, x::AbstractArray) in AbstractFFTs at /home/niku/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:269
-# Possible fix, define
-#   mul!(::AbstractArray, ::AbstractFFTs.ScaledPlan, ::AbstractArray{var"#s25", N} where {var"#s25"<:(Complex{var"#s24"} where var"#s24"<:ForwardDiff.Dual), N})
-# Stacktrace:
-#   [1] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
-#   [2] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
-#   [3] G_to_r
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
-#   [4] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
-#   [5] macro expansion
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
-#   [6] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
-#   [7] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#   [8] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#   [9] #PlaneWaveBasis#76
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
-#  [10] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [11] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
-#  [12] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
-#  [13] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
-#  [14] derivative(f::typeof(compute_energy), x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
-#  [15] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
+# compute_energy(a) = compute_energy(scfres, a)
+# compute_energy(10.26)
 
+# import FiniteDiff
+# FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
 
+# using ForwardDiff
+# ForwardDiff.derivative(compute_energy, 10.26) # -2.948556665529993e9
 
-# ERROR: LoadError: MethodError: no method matching Float64(::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-# Closest candidates are:
-#   (::Type{T})(::Real, ::RoundingMode) where T<:AbstractFloat at rounding.jl:200
-#   (::Type{T})(::T) where T<:Number at boot.jl:760
-#   (::Type{T})(::AbstractChar) where T<:Union{AbstractChar, Number} at char.jl:50
-#   ...
-# Stacktrace:
-#   [1] convert(#unused#::Type{Float64}, x::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Base ./number.jl:7
-#   [2] ComplexF64(re::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, im::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Base ./complex.jl:12
-#   [3] ComplexF64(z::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ Base ./complex.jl:36
-#   [4] convert(#unused#::Type{ComplexF64}, x::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ Base ./number.jl:7
-#   [5] setindex!(A::Array{ComplexF64, 3}, x::Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, i1::Int64)
-#     @ Base ./array.jl:839
-#   [6] macro expansion
-#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:183 [inlined]
-#   [7] macro expansion
-#     @ ./simdloop.jl:77 [inlined]
-#   [8] rmul!
-#     @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/generic.jl:182 [inlined]
-#   [9] *(p::AbstractFFTs.ScaledPlan{
-#           ComplexF64, 
-#           FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, 
-#           ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}
-#         }, 
-#         x::Array{ComplexF64, 3})
-#     @ AbstractFFTs ~/.julia/packages/AbstractFFTs/JebmH/src/definitions.jl:249
-#  [10] _apply_plan(p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, x::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:249
-#  [11] mul!(Y::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, p::AbstractFFTs.ScaledPlan{ComplexF64, FFTW.cFFTWPlan{ComplexF64, 1, false, 3, UnitRange{Int64}}, ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, X::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft.jl:243
-#  [12] G_to_r!(f_real::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true}, basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::SubArray{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3, Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 4}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, true})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:383
-#  [13] G_to_r(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, f_fourier::Array{Complex{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, 3}; assume_real::Bool)
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:414
-#  [14] G_to_r
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:410 [inlined]
-#  [15] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:93
-#  [16] macro expansion
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
-#  [17] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:246 [inlined]
-#  [18] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#  [19] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [20] #PlaneWaveBasis#76
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
-#  [21] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [22] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:19
-#  [23] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
-#  [24] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:34
-#  [25] derivative(f::typeof(compute_energy), x::Float64)
-#     @ ForwardDiff ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14
-#  [26] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:41
+# using BenchmarkTools
+# @btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
+# @btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
+# @btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)

From 34b5129304364210396e450d9fcb5389e1fce1e7 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 17 Jun 2021 10:42:26 +0200
Subject: [PATCH 26/49] stack trace: AtomicNonlocal NaN ForwardDiff

---
 test/autodiff-stress/stress-forward.jl | 151 ++++++++++++++-----------
 1 file changed, 86 insertions(+), 65 deletions(-)

diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 9e8d433c20..f8d7be7fcd 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -6,12 +6,12 @@ function make_basis(a)
     lattice = a / 2 * [[0 1 1.];
                        [1 0 1.];
                        [1 1 0.]]
-    C = ElementPsp(:C, psp=load_psp("hgh/lda/c-q4.hgh"))
-    atoms = [C => [ones(3)/8]]
+    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+    atoms = [Si => [ones(3)/8, -ones(3)/8]]
     terms = [
         Kinetic(),
         AtomicLocal(),
-        # AtomicNonlocal(),
+        AtomicNonlocal(),
         Ewald(),
         PspCorrection()
     ]
@@ -25,74 +25,95 @@ a = 10.26
 
 # scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
 # try a bogus tolerance for debugging
-scfres = self_consistent_field(make_basis(a), tol=1e9)
-# ┌ Warning: Mismatch in number of electrons
-# │   sum_ρ = 1080.0455760000316
-# │   sum_occupation = 4.0
-# └ @ DFTK ~/.julia/dev/DFTK.jl/src/densities.jl:32
-# n     Free energy       Eₙ-Eₙ₋₁     ρout-ρin   Diag
-# ---   ---------------   ---------   --------   ----
-#   1   -3819171908.212         NaN   2.49e+07    21.0 
-# ERROR: LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
-# Stacktrace:
-#   [1] error(s::String)
-#     @ Base ./error.jl:33
-#   [2] compute_occupation(basis::PlaneWaveBasis{Float64}, energies::Vector{Vector{Float64}}; temperature::Float64, smearing::DFTK.Smearing.None)
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/occupation.jl:77
-#   [3] compute_occupation(basis::PlaneWaveBasis{Float64}, energies::Vector{Vector{Float64}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/occupation.jl:16
-#   [4] next_density(ham::Hamiltonian; n_bands::Int64, ψ::Vector{Matrix{ComplexF64}}, n_ep_extra::Int64, eigensolver::Function, occupation_function::typeof(DFTK.compute_occupation), kwargs::Base.Iterators.Pairs{Symbol, Real, Tuple{Symbol, Symbol}, NamedTuple{(:miniter, :tol), Tuple{Int64, Float64}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:30
-#   [5] (::DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}})(ρin::Array{Float64, 4})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:98
-#   [6] (::DFTK.var"#487#490"{DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}})(x::Array{Float64, 4})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/scf/scf_solvers.jl:18
-#   [7] (::NLSolversBase.var"#ff!#1"{DFTK.var"#487#490"{DFTK.var"#fixpoint_map#520"{DataType, Int64, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}}})(F::Array{Float64, 4}, x::Array{Float64, 4})
-#     @ NLSolversBase ~/.julia/packages/NLSolversBase/geyh3/src/objective_types/inplace_factory.jl:11
-#   [8] value!!(obj::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, F::Array{Float64, 4}, x::Array{Float64, 4})
-#     @ NLSolversBase ~/.julia/packages/NLSolversBase/geyh3/src/interface.jl:166
-#   [9] value!!
-#     @ ~/.julia/packages/NLSolversBase/geyh3/src/interface.jl:163 [inlined]
-#  [10] anderson_(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, beta::Int64, aa_start::Int64, droptol::Float64, cache::NLsolve.AndersonCache{Array{Float64, 4}, Array{Float64, 4}, Vector{Array{Float64, 4}}, Vector{Float64}, Matrix{Float64}, Matrix{Float64}})
-#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:73
-#  [11] anderson(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, beta::Int64, aa_start::Int64, droptol::Float64, cache::NLsolve.AndersonCache{Array{Float64, 4}, Array{Float64, 4}, Vector{Array{Float64, 4}}, Vector{Float64}, Matrix{Float64}, Matrix{Float64}})
-#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:203
-#  [12] anderson(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, m::Int64, beta::Int64, aa_start::Int64, droptol::Float64)
-#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/solvers/anderson.jl:188
-#  [13] nlsolve(df::NLSolversBase.NonDifferentiable{Array{Float64, 4}, Array{Float64, 4}}, initial_x::Array{Float64, 4}; method::Symbol, xtol::Float64, ftol::Float64, iterations::Int64, store_trace::Bool, show_trace::Bool, extended_trace::Bool, linesearch::LineSearches.Static, linsolve::NLsolve.var"#29#31", factor::Float64, autoscale::Bool, m::Int64, beta::Int64, aa_start::Int64, droptol::Float64)
-#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/nlsolve/nlsolve.jl:30
-#  [14] nlsolve(f::Function, initial_x::Array{Float64, 4}; method::Symbol, autodiff::Symbol, inplace::Bool, kwargs::Base.Iterators.Pairs{Symbol, Real, NTuple{5, Symbol}, NamedTuple{(:m, :xtol, :ftol, :show_trace, :iterations), Tuple{Int64, Float64, Float64, Bool, Int64}}})
-#     @ NLsolve ~/.julia/packages/NLsolve/gJL1I/src/nlsolve/nlsolve.jl:52
-#  [15] fp_solver
-#     @ ~/.julia/dev/DFTK.jl/src/scf/scf_solvers.jl:18 [inlined]
-#  [16] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/scf/self_consistent_field.jl:137 [inlined]
-#  [17] (::DFTK.var"#518#519"{Int64, Array{Float64, 4}, Int64, DFTK.var"#fp_solver#488"{DFTK.var"#fp_solver#486#489"{Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Int64, Symbol}}, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#  [18] timeit(f::DFTK.var"#518#519"{Int64, Array{Float64, 4}, Int64, DFTK.var"#fp_solver#488"{DFTK.var"#fp_solver#486#489"{Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, Int64, Symbol}}, typeof(lobpcg_hyper), Int64, DFTK.var"#determine_diagtol#515"{Float64}, Float64, SimpleMixing, DFTK.var"#is_converged#511"{Float64}, DFTK.var"#callback#510", Bool, Bool, typeof(DFTK.compute_occupation), PlaneWaveBasis{Float64}}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [19] self_consistent_field(basis::PlaneWaveBasis{Float64}; n_bands::Int64, ρ::Array{Float64, 4}, ψ::Nothing, tol::Float64, maxiter::Int64, solver::Function, eigensolver::Function, n_ep_extra::Int64, determine_diagtol::DFTK.var"#determine_diagtol#515"{Float64}, α::Float64, mixing::SimpleMixing, is_converged::DFTK.var"#is_converged#511"{Float64}, callback::DFTK.var"#callback#510", compute_consistent_energies::Bool, enforce_symmetry::Bool, occupation_function::typeof(DFTK.compute_occupation))
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236
-#  [20] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:29
-
+scfres = self_consistent_field(make_basis(a), tol=1e-4)
 
-# function compute_energy(scfres_ref, a)
-#     basis = make_basis(a)
-#     energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-#     energies.total
-# end
+function compute_energy(scfres_ref, a)
+    basis = make_basis(a)
+    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+    energies.total
+end
 
-# compute_energy(a) = compute_energy(scfres, a)
-# compute_energy(10.26)
+compute_energy(a) = compute_energy(scfres, a)
+compute_energy(10.26)
 
-# import FiniteDiff
-# FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
+import FiniteDiff
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
 
-# using ForwardDiff
-# ForwardDiff.derivative(compute_energy, 10.26) # -2.948556665529993e9
+using ForwardDiff
+ForwardDiff.derivative(compute_energy, 10.26) # NaN
 
 # using BenchmarkTools
 # @btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
 # @btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
 # @btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
+
+#===#
+# debug NaN in AtomicNonlocal ForwardDiff
+
+# Bits for x86 FPU control word
+const FE_INVALID    = 0x1
+const FE_DIVBYZERO  = 0x4
+const FE_OVERFLOW   = 0x8
+const FE_UNDERFLOW  = 0x10
+const FE_INEXACT    = 0x20
+
+fpexceptions() = ccall(:fegetexcept, Cint, ())
+
+function setfpexceptions(f, mode)
+    prev = ccall(:feenableexcept, Cint, (Cint,), mode)
+    try
+        f()
+    finally
+        ccall(:fedisableexcept, Cint, (Cint,), mode & ~prev)
+    end
+end
+
+setfpexceptions(FE_DIVBYZERO) do
+    FiniteDiff.finite_difference_derivative(compute_energy, 10.26)
+end
+
+setfpexceptions(FE_DIVBYZERO) do
+    ForwardDiff.derivative(compute_energy, 10.26)  
+end
+# ERROR: LoadError: DivideError: integer division error
+# Stacktrace:
+#   [1] /
+#     @ ./math.jl:0 [inlined]
+#   [2] inv
+#     @ ./number.jl:217 [inlined]
+#   [3] sqrt
+#     @ ~/.julia/packages/ForwardDiff/QOqCN/src/dual.jl:203 [inlined]
+#   [4] macro expansion
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:225 [inlined]
+#   [5] _norm
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:213 [inlined]
+#   [6] norm
+#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:212 [inlined]
+#   [7] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:85
+#   [8] macro expansion
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
+#   [9] macro expansion
+#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:253 [inlined]
+#  [10] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
+#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
+#  [11] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
+#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
+#  [12] #PlaneWaveBasis#76
+#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
+#  [13] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
+#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:293
+#  [14] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:21
+#  [15] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:31
+#  [16] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:36
+#  [17] derivative
+#     @ ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14 [inlined]
+#  [18] (::var"#15#16")()
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:76
+#  [19] setfpexceptions(f::var"#15#16", mode::UInt8)
+#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:65
+#  [20] top-level scope
+#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:75

From 5cf5a2c7825028a2835fa8301a030e95ce51350c Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 17 Jun 2021 17:29:55 +0200
Subject: [PATCH 27/49] fix AtomicNonlocal NaN

---
 src/terms/nonlocal.jl                  |  9 ++-
 test/autodiff-stress/stress-forward.jl | 87 +++-----------------------
 2 files changed, 15 insertions(+), 81 deletions(-)

diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 1a15737cce..ed25b2c4b5 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -184,7 +184,14 @@ end
 Build form factors (Fourier transforms of projectors) for an atom centered at 0.
 """
 function build_form_factors(psp, qs)
-    qnorms = norm.(qs)
+
+    ## The AtomicNonlocal NaN boiled down to ForwardDiff + norm + StaticArrays at [0., 0., 0.]
+    ## https://github.com/JuliaDiff/ForwardDiff.jl/issues/243#issuecomment-369948031
+    ## specifically this happend because qs[1] is a Vec3 of zeros
+    ## TODO fix this more generally
+    # qnorms = norm.(qs)
+    qnorms = vcat(norm(Vector(qs[1])), norm.(@view qs[2:end]))
+
     T = real(eltype(qnorms))
     # Compute position-independent form factors
     form_factors = zeros(Complex{T}, length(qs), count_n_proj(psp))
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index f8d7be7fcd..f0c2a2d64f 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -22,10 +22,7 @@ function make_basis(a)
 end
 
 a = 10.26
-
-# scfres = self_consistent_field(basis, tol=1e-8) # LoadError: Unable to find non-fractional occupations that have the correct number of electrons. You should add a temperature.
-# try a bogus tolerance for debugging
-scfres = self_consistent_field(make_basis(a), tol=1e-4)
+scfres = self_consistent_field(make_basis(a), tol=1e-8)
 
 function compute_energy(scfres_ref, a)
     basis = make_basis(a)
@@ -37,83 +34,13 @@ compute_energy(a) = compute_energy(scfres, a)
 compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.948556665633414e9 
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.940653844187964e9 
 
 using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # NaN
-
-# using BenchmarkTools
-# @btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
-# @btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
-# @btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
-
-#===#
-# debug NaN in AtomicNonlocal ForwardDiff
+ForwardDiff.derivative(compute_energy, 10.26) # -2.940653844103271e9
 
-# Bits for x86 FPU control word
-const FE_INVALID    = 0x1
-const FE_DIVBYZERO  = 0x4
-const FE_OVERFLOW   = 0x8
-const FE_UNDERFLOW  = 0x10
-const FE_INEXACT    = 0x20
+using BenchmarkTools
+@btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
 
-fpexceptions() = ccall(:fegetexcept, Cint, ())
-
-function setfpexceptions(f, mode)
-    prev = ccall(:feenableexcept, Cint, (Cint,), mode)
-    try
-        f()
-    finally
-        ccall(:fedisableexcept, Cint, (Cint,), mode & ~prev)
-    end
-end
-
-setfpexceptions(FE_DIVBYZERO) do
-    FiniteDiff.finite_difference_derivative(compute_energy, 10.26)
-end
-
-setfpexceptions(FE_DIVBYZERO) do
-    ForwardDiff.derivative(compute_energy, 10.26)  
-end
-# ERROR: LoadError: DivideError: integer division error
-# Stacktrace:
-#   [1] /
-#     @ ./math.jl:0 [inlined]
-#   [2] inv
-#     @ ./number.jl:217 [inlined]
-#   [3] sqrt
-#     @ ~/.julia/packages/ForwardDiff/QOqCN/src/dual.jl:203 [inlined]
-#   [4] macro expansion
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:225 [inlined]
-#   [5] _norm
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:213 [inlined]
-#   [6] norm
-#     @ ~/.julia/packages/StaticArrays/NTbHj/src/linalg.jl:212 [inlined]
-#   [7] (::AtomicLocal)(basis::PlaneWaveBasis{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/terms/local.jl:85
-#   [8] macro expansion
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:190 [inlined]
-#   [9] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:253 [inlined]
-#  [10] (::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#  [11] timeit(f::DFTK.var"#77#79"{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [12] #PlaneWaveBasis#76
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
-#  [13] PlaneWaveBasis(model::Model{ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:293
-#  [14] make_basis(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:21
-#  [15] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:31
-#  [16] compute_energy(a::ForwardDiff.Dual{ForwardDiff.Tag{typeof(compute_energy), Float64}, Float64, 1})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:36
-#  [17] derivative
-#     @ ~/.julia/packages/ForwardDiff/QOqCN/src/derivative.jl:14 [inlined]
-#  [18] (::var"#15#16")()
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:76
-#  [19] setfpexceptions(f::var"#15#16", mode::UInt8)
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:65
-#  [20] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-forward.jl:75

From eb89f369b4a381b69b012e46b439156940f38d50 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Fri, 18 Jun 2021 00:30:16 +0200
Subject: [PATCH 28/49] fix fft normalization & dual-scaled plans

---
 src/PlaneWaveBasis.jl                  | 15 ++++-----------
 src/fft.jl                             |  6 ++++--
 test/autodiff-stress/stress-forward.jl |  5 ++---
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 6761a779fd..7c94a4a523 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -203,17 +203,10 @@ build_kpoints(basis::PlaneWaveBasis, kcoords) =
     # ψ(r) = sum_G c_G e^iGr / sqrt(Ω)
     # so that the ifft is normalized by 1/sqrt(Ω). It follows that the
     # fft must be normalized by sqrt(Ω) / length
-
-    ## normalization disabled for debugging (model.unit_cell_volume is a ForwardDiff.Dual, thus makes a real plan Dual which fails)
-    ## TODO re-enable normalization
-    # ipFFT = ipFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
-    # opFFT = opFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
-    # ipIFFT = inv(ipFFT)
-    # opIFFT = inv(opFFT)
-    ipFFT = ipFFT_unnormalized #* (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
-    opFFT = opFFT_unnormalized #* (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
-    ipIFFT = ipBFFT_unnormalized
-    opIFFT = opBFFT_unnormalized
+    ipFFT = ipFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(ipFFT_unnormalized))
+    opFFT = opFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
+    ipIFFT = inv(ipFFT)
+    opIFFT = inv(opFFT)
     
     # Setup kpoint basis sets
     !variational && @warn(
diff --git a/src/fft.jl b/src/fft.jl
index fb96713e7a..dc98f8d162 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -229,8 +229,6 @@ end
 for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
     @eval begin
 
-        # TODO handle ForwardDiff.Dual scaling factors (perhaps lazy evaluation?)
-
         Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
             _apply_plan(p, x)
 
@@ -259,6 +257,10 @@ function _apply_plan(p::AbstractFFTs.Plan, x::AbstractArray)
     end
 end
 
+function _apply_plan(p::AbstractFFTs.ScaledPlan{T,P,<:ForwardDiff.Dual}, x::AbstractArray) where {T,P}
+    _apply_plan(p.p, p.scale * x) # for when p.scale is Dual, need out-of-place
+end
+
 ###
 ### DFTK setup specific
 ###
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index f0c2a2d64f..803a0e9bc6 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -34,13 +34,12 @@ compute_energy(a) = compute_energy(scfres, a)
 compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -2.940653844187964e9 
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -1.4114477062973088 
 
 using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -2.940653844103271e9
+ForwardDiff.derivative(compute_energy, 10.26) # -1.4114477059240538
 
 using BenchmarkTools
 @btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
 @btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
 @btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
-

From cb1b96815c6102e2ed0e5e25c5112950c5f6ab0f Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 21 Jun 2021 15:28:17 +0200
Subject: [PATCH 29/49] add ForwardDiff norm of SVector workaround

---
 src/DFTK.jl                            |   2 +
 src/PlaneWaveBasis.jl                  |   2 +-
 src/fft.jl                             | 128 -----------------------
 src/forwarddiff_rules.jl               | 137 +++++++++++++++++++++++++
 src/terms/nonlocal.jl                  |   9 +-
 test/autodiff-stress/stress-forward.jl |   6 +-
 6 files changed, 144 insertions(+), 140 deletions(-)
 create mode 100644 src/forwarddiff_rules.jl

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 78fa7881cf..aa70c677aa 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -165,6 +165,8 @@ include("postprocess/chi0.jl")
 export compute_current
 include("postprocess/current.jl")
 
+include("forwarddiff_rules.jl")
+
 function __init__()
     # Use "@require" to only include fft_generic.jl once IntervalArithmetic or
     # DoubleFloats has been loaded (via a "using" or an "import").
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 7c94a4a523..de3014d359 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -207,7 +207,7 @@ build_kpoints(basis::PlaneWaveBasis, kcoords) =
     opFFT = opFFT_unnormalized * (sqrt(model.unit_cell_volume) / length(opFFT_unnormalized))
     ipIFFT = inv(ipFFT)
     opIFFT = inv(opFFT)
-    
+
     # Setup kpoint basis sets
     !variational && @warn(
         "Non-variational calculations are experimental. " *
diff --git a/src/fft.jl b/src/fft.jl
index dc98f8d162..b8690305df 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -166,131 +166,3 @@ end
 # for floating-point types natively supported by FFTW
 next_working_fft_size(::Type{Float32}, size) = size
 next_working_fft_size(::Type{Float64}, size) = size
-
-
-#==============================================================================#
-#
-# ForwardDiff + FFTW zone
-#
-#==============================================================================#
-import ForwardDiff
-import AbstractFFTs
-
-# original PR by mcabbott: https://github.com/JuliaDiff/ForwardDiff.jl/pull/495
-# modified version: https://github.com/niklasschmitz/ForwardDiff.jl/blob/nfs/fft/src/fft.jl
-
-ForwardDiff.value(x::Complex{<:ForwardDiff.Dual}) =
-    Complex(x.re.value, x.im.value)
-
-ForwardDiff.partials(x::Complex{<:ForwardDiff.Dual}, n::Int) =
-    Complex(ForwardDiff.partials(x.re, n), ForwardDiff.partials(x.im, n))
-
-ForwardDiff.npartials(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = N
-ForwardDiff.npartials(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = N
-
-ForwardDiff.tagtype(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = T
-ForwardDiff.tagtype(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = T
-
-# AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = float.(x .+ 0im)
-AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.complexfloat.(x)
-AbstractFFTs.complexfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d) + 0im
-
-AbstractFFTs.realfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.realfloat.(x)
-AbstractFFTs.realfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d)
-
-for plan in [:plan_fft, :plan_ifft, :plan_bfft]
-    @eval begin
-
-        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
-            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
-
-        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, region=1:ndims(x); kwargs...) =
-            AbstractFFTs.$plan(ForwardDiff.value.(x), region; kwargs...)
-
-    end
-end
-
-# rfft only accepts real arrays
-AbstractFFTs.plan_rfft(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
-    AbstractFFTs.plan_rfft(ForwardDiff.value.(x), region; kwargs...)
-
-for plan in [:plan_irfft, :plan_brfft]  # these take an extra argument, only when complex?
-    @eval begin
-
-        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
-            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
-
-        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, d::Integer, region=1:ndims(x); kwargs...) =
-            AbstractFFTs.$plan(ForwardDiff.value.(x), d, region; kwargs...)
-
-    end
-end
-
-for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
-    @eval begin
-
-        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
-            _apply_plan(p, x)
-
-        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
-            _apply_plan(p, x)
-
-        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:ForwardDiff.Dual}) = 
-            (Y .= _apply_plan(p, X))
-        
-        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
-            (Y .= _apply_plan(p, X))
-    end
-end
-
-function _apply_plan(p::AbstractFFTs.Plan, x::AbstractArray)
-    xtil = p * ForwardDiff.value.(x)
-    dxtils = ntuple(ForwardDiff.npartials(eltype(x))) do n
-        p * ForwardDiff.partials.(x, n)
-    end
-    T = ForwardDiff.tagtype(eltype(x))
-    map(xtil, dxtils...) do val, parts...
-        Complex(
-            ForwardDiff.Dual{T}(real(val), map(real, parts)),
-            ForwardDiff.Dual{T}(imag(val), map(imag, parts)),
-        )
-    end
-end
-
-function _apply_plan(p::AbstractFFTs.ScaledPlan{T,P,<:ForwardDiff.Dual}, x::AbstractArray) where {T,P}
-    _apply_plan(p.p, p.scale * x) # for when p.scale is Dual, need out-of-place
-end
-
-###
-### DFTK setup specific
-###
-
-next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
-
-_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
-
-# *** COPIED from fft_generic.jl *** TODO refactor
-# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
-# This is needed for some generic FFT implementations, which do not have in-place plans
-struct DummyInplace{opFFT}
-    fft::opFFT
-end
-LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
-LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
-
-import Base: *, \, length
-*(p::DummyInplace, X) = p.fft * X
-\(p::DummyInplace, X) = p.fft \ X
-length(p::DummyInplace) = length(p.fft)
-
-function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
-    opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
-
-    ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
-    ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
-    # backward by inverting and stripping off normalizations
-    ipFFT, opFFT, ipBFFT, opBFFT
-end
-
diff --git a/src/forwarddiff_rules.jl b/src/forwarddiff_rules.jl
new file mode 100644
index 0000000000..4e9df69bf3
--- /dev/null
+++ b/src/forwarddiff_rules.jl
@@ -0,0 +1,137 @@
+import ForwardDiff
+import AbstractFFTs
+
+# original PR by mcabbott: https://github.com/JuliaDiff/ForwardDiff.jl/pull/495
+# modified version: https://github.com/niklasschmitz/ForwardDiff.jl/blob/nfs/fft/src/fft.jl
+
+ForwardDiff.value(x::Complex{<:ForwardDiff.Dual}) =
+    Complex(x.re.value, x.im.value)
+
+ForwardDiff.partials(x::Complex{<:ForwardDiff.Dual}, n::Int) =
+    Complex(ForwardDiff.partials(x.re, n), ForwardDiff.partials(x.im, n))
+
+ForwardDiff.npartials(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = N
+ForwardDiff.npartials(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = N
+
+ForwardDiff.tagtype(x::Complex{<:ForwardDiff.Dual{T,V,N}}) where {T,V,N} = T
+ForwardDiff.tagtype(::Type{<:Complex{<:ForwardDiff.Dual{T,V,N}}}) where {T,V,N} = T
+
+# AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = float.(x .+ 0im)
+AbstractFFTs.complexfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.complexfloat.(x)
+AbstractFFTs.complexfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d) + 0im
+
+AbstractFFTs.realfloat(x::AbstractArray{<:ForwardDiff.Dual}) = AbstractFFTs.realfloat.(x)
+AbstractFFTs.realfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(ForwardDiff.Dual{T,float(V),N}, d)
+
+for plan in [:plan_fft, :plan_ifft, :plan_bfft]
+    @eval begin
+
+        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
+
+        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x), region; kwargs...)
+
+    end
+end
+
+# rfft only accepts real arrays
+AbstractFFTs.plan_rfft(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+    AbstractFFTs.plan_rfft(ForwardDiff.value.(x), region; kwargs...)
+
+for plan in [:plan_irfft, :plan_brfft]  # these take an extra argument, only when complex?
+    @eval begin
+
+        AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
+
+        AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, d::Integer, region=1:ndims(x); kwargs...) =
+            AbstractFFTs.$plan(ForwardDiff.value.(x), d, region; kwargs...)
+
+    end
+end
+
+for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
+    @eval begin
+
+        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
+            _apply_plan(p, x)
+
+        Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
+            _apply_plan(p, x)
+
+        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:ForwardDiff.Dual}) = 
+            (Y .= _apply_plan(p, X))
+        
+        LinearAlgebra.mul!(Y::AbstractArray, p::AbstractFFTs.$P, X::AbstractArray{<:Complex{<:ForwardDiff.Dual}}) =
+            (Y .= _apply_plan(p, X))
+    end
+end
+
+function _apply_plan(p::AbstractFFTs.Plan, x::AbstractArray)
+    xtil = p * ForwardDiff.value.(x)
+    dxtils = ntuple(ForwardDiff.npartials(eltype(x))) do n
+        p * ForwardDiff.partials.(x, n)
+    end
+    T = ForwardDiff.tagtype(eltype(x))
+    map(xtil, dxtils...) do val, parts...
+        Complex(
+            ForwardDiff.Dual{T}(real(val), map(real, parts)),
+            ForwardDiff.Dual{T}(imag(val), map(imag, parts)),
+        )
+    end
+end
+
+function _apply_plan(p::AbstractFFTs.ScaledPlan{T,P,<:ForwardDiff.Dual}, x::AbstractArray) where {T,P}
+    _apply_plan(p.p, p.scale * x) # for when p.scale is Dual, need out-of-place
+end
+
+###
+### DFTK setup specific
+###
+
+next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
+
+_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
+
+# *** COPIED from fft_generic.jl *** TODO refactor
+# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
+# This is needed for some generic FFT implementations, which do not have in-place plans
+struct DummyInplace{opFFT}
+    fft::opFFT
+end
+LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
+LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
+
+import Base: *, \, length
+*(p::DummyInplace, X) = p.fft * X
+\(p::DummyInplace, X) = p.fft \ X
+length(p::DummyInplace) = length(p.fft)
+
+function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
+    tmp = Array{Complex{T}}(undef, fft_size...)
+    opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+    opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
+
+    ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
+    ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
+    # backward by inverting and stripping off normalizations
+    ipFFT, opFFT, ipBFFT, opBFFT
+end
+
+###
+### other workarounds
+###
+
+# problem: ForwardDiff of norm of SVector gives NaN derivative at zero
+# https://github.com/JuliaMolSim/DFTK.jl/issues/443#issuecomment-864930410
+# solution: follow ChainRules custom frule for norm
+# https://github.com/JuliaDiff/ChainRules.jl/blob/52a0eeadf8d19bff491f224517b7b064ce1ba378/src/rulesets/LinearAlgebra/norm.jl#L5
+# TODO delete, once forward diff AD tools use ChainRules natively
+function LinearAlgebra.norm(x::SVector{S,<:ForwardDiff.Dual}) where {S}
+    T = ForwardDiff.tagtype(eltype(x))
+    dx = ForwardDiff.partials.(x)
+    y = norm(ForwardDiff.value.(x))
+    dy = real(dot(ForwardDiff.value.(x), dx)) * pinv(y)
+    ForwardDiff.Dual{T}(y, dy)
+end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index ed25b2c4b5..1a15737cce 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -184,14 +184,7 @@ end
 Build form factors (Fourier transforms of projectors) for an atom centered at 0.
 """
 function build_form_factors(psp, qs)
-
-    ## The AtomicNonlocal NaN boiled down to ForwardDiff + norm + StaticArrays at [0., 0., 0.]
-    ## https://github.com/JuliaDiff/ForwardDiff.jl/issues/243#issuecomment-369948031
-    ## specifically this happend because qs[1] is a Vec3 of zeros
-    ## TODO fix this more generally
-    # qnorms = norm.(qs)
-    qnorms = vcat(norm(Vector(qs[1])), norm.(@view qs[2:end]))
-
+    qnorms = norm.(qs)
     T = real(eltype(qnorms))
     # Compute position-independent form factors
     form_factors = zeros(Complex{T}, length(qs), count_n_proj(psp))
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
index 803a0e9bc6..a899020885 100644
--- a/test/autodiff-stress/stress-forward.jl
+++ b/test/autodiff-stress/stress-forward.jl
@@ -40,6 +40,6 @@ using ForwardDiff
 ForwardDiff.derivative(compute_energy, 10.26) # -1.4114477059240538
 
 using BenchmarkTools
-@btime compute_energy(10.26)                                           # 19.513 ms ( 60004 allocations:  8.15 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 39.317 ms (120012 allocations: 16.29 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 80.757 ms (543588 allocations: 31.91 MiB)
+@btime compute_energy(10.26)                                           # 14.253 ms (60220 allocations: 10.01 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 30.813 ms (120444 allocations: 20.01 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 67.657 ms (543847 allocations: 35.61 MiB)

From 6eafdefac5e90856e6bf3ba43631a4d4b4775d9f Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 21 Jun 2021 15:35:27 +0200
Subject: [PATCH 30/49] add total stress result of ForwardDiff

---
 test/autodiff-stress/stress-total.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
index 7032bb56f6..c25a15cadb 100644
--- a/test/autodiff-stress/stress-total.jl
+++ b/test/autodiff-stress/stress-total.jl
@@ -24,14 +24,14 @@ end
 compute_energy(scfres, 10.26)
 
 import FiniteDiff
-fd_stress = FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a)
+fd_stress = FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411445519652162
 
 ###
 ### Forward mode
 ###
 
 using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # NaN
+ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -1.4114455192624642
 
 ###
 ### Reverse mode

From b0dcf5e7e2b6cde7b97d1fd368aa99d8bf4053a1 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Mon, 28 Jun 2021 09:22:09 +0200
Subject: [PATCH 31/49] add model_DFT ForwardDiff stress

---
 src/forwarddiff_rules.jl                      | 13 +++++++
 .../stress-forward-modeldft.jl                | 38 +++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 test/autodiff-stress/stress-forward-modeldft.jl

diff --git a/src/forwarddiff_rules.jl b/src/forwarddiff_rules.jl
index 4e9df69bf3..f28f62ed26 100644
--- a/src/forwarddiff_rules.jl
+++ b/src/forwarddiff_rules.jl
@@ -68,6 +68,9 @@ for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
     end
 end
 
+LinearAlgebra.mul!(Y::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, p::AbstractFFTs.ScaledPlan{T,P,<:ForwardDiff.Dual}, X::AbstractArray{<:ComplexF64}) where {T,P} =
+    (Y .= _apply_plan(p, X))
+
 function _apply_plan(p::AbstractFFTs.Plan, x::AbstractArray)
     xtil = p * ForwardDiff.value.(x)
     dxtils = ntuple(ForwardDiff.npartials(eltype(x))) do n
@@ -119,6 +122,16 @@ function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.
     ipFFT, opFFT, ipBFFT, opBFFT
 end
 
+function r_to_G(basis::PlaneWaveBasis{T}, f_real::AbstractArray) where {T<:ForwardDiff.Dual}
+    f_fourier = similar(f_real, complex(T))
+    @assert length(size(f_real)) ∈ (3, 4)
+    # this exploits trailing index convention
+    for iσ = 1:size(f_real, 4)
+        @views r_to_G!(f_fourier[:, :, :, iσ], basis, f_real[:, :, :, iσ])
+    end
+    f_fourier
+end
+
 ###
 ### other workarounds
 ###
diff --git a/test/autodiff-stress/stress-forward-modeldft.jl b/test/autodiff-stress/stress-forward-modeldft.jl
new file mode 100644
index 0000000000..13f0107383
--- /dev/null
+++ b/test/autodiff-stress/stress-forward-modeldft.jl
@@ -0,0 +1,38 @@
+# Hellmann-Feynman stress
+# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
+using DFTK
+
+function make_basis(a)
+    lattice = a / 2 * [[0 1 1.];
+                       [1 0 1.];
+                       [1 1 0.]]
+    Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
+    atoms = [Si => [ones(3)/8, -ones(3)/8]]
+    model = model_DFT(lattice, atoms, []; symmetries=false)
+    kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
+    Ecut = 7          # kinetic energy cutoff in Hartree
+    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+end
+
+a = 10.26
+scfres = self_consistent_field(make_basis(a), tol=1e-4)
+
+function compute_energy(scfres_ref, a)
+    basis = make_basis(a)
+    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+    energies.total
+end
+
+compute_energy(a) = compute_energy(scfres, a)
+compute_energy(10.26)
+
+import FiniteDiff
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -0.4347657610031856 
+
+using ForwardDiff
+ForwardDiff.derivative(compute_energy, 10.26) # -0.434770331446876
+
+using BenchmarkTools
+@btime compute_energy(10.26)                                           # 101.814 ms (89326 allocations: 48.94 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 224.734 ms (178656 allocations: 97.87 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 340.684 ms (1556202 allocations: 146.99 MiB)

From 631b591bc78131e49d888c9442086590d161ae3a Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Wed, 30 Jun 2021 09:57:05 +0200
Subject: [PATCH 32/49] move forwarddiff rules to workarounds dir

---
 Project.toml                               | 4 +---
 src/DFTK.jl                                | 2 +-
 src/{ => workarounds}/forwarddiff_rules.jl | 0
 3 files changed, 2 insertions(+), 4 deletions(-)
 rename src/{ => workarounds}/forwarddiff_rules.jl (100%)

diff --git a/Project.toml b/Project.toml
index 095cf0cffe..3d5a7d9ae6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -50,6 +50,7 @@ ForwardDiff = "0.10"
 Interpolations = "0.12, 0.13"
 IterTools = "1"
 IterativeSolvers = "0.8, 0.9"
+JLD2 = "0.4 - 0.4.7"
 JSON = "0.21"
 Libxc = "0.3"
 LineSearches = "7"
@@ -73,9 +74,6 @@ UnitfulAtomic = "1"
 julia = "1.6"
 spglib_jll = "1.15"
 
-# Some world-age-related issues with JLD2 > 0.4.7 at the moment
-JLD2 = "0.4 - 0.4.7"
-
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
diff --git a/src/DFTK.jl b/src/DFTK.jl
index cdf60fc93b..195194d0ab 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -169,7 +169,7 @@ include("postprocess/chi0.jl")
 export compute_current
 include("postprocess/current.jl")
 
-include("forwarddiff_rules.jl")
+include("workarounds/forwarddiff_rules.jl")
 
 function __init__()
     # Use "@require" to only include fft_generic.jl once IntervalArithmetic or
diff --git a/src/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
similarity index 100%
rename from src/forwarddiff_rules.jl
rename to src/workarounds/forwarddiff_rules.jl

From 00078e39694f70558755ac51869656bee7285c0c Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Wed, 30 Jun 2021 10:27:56 +0200
Subject: [PATCH 33/49] add comments on r_to_G on duals

---
 src/workarounds/forwarddiff_rules.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index f28f62ed26..fb5f2f4735 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -122,6 +122,8 @@ function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.
     ipFFT, opFFT, ipBFFT, opBFFT
 end
 
+# PlaneWaveBasis{<:Dual} contains dual-scaled fft, which means that the result f_fourier 
+# must be able to hold complex dual numbers even if f_real is not dual
 function r_to_G(basis::PlaneWaveBasis{T}, f_real::AbstractArray) where {T<:ForwardDiff.Dual}
     f_fourier = similar(f_real, complex(T))
     @assert length(size(f_real)) ∈ (3, 4)

From b878050522353822bf3cd6a026766473f8911c77 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 6 Jul 2021 09:12:50 +0200
Subject: [PATCH 34/49] add lda_x

---
 test/autodiff-stress/stress-forward-modeldft.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/autodiff-stress/stress-forward-modeldft.jl b/test/autodiff-stress/stress-forward-modeldft.jl
index 13f0107383..15860bcaea 100644
--- a/test/autodiff-stress/stress-forward-modeldft.jl
+++ b/test/autodiff-stress/stress-forward-modeldft.jl
@@ -8,7 +8,7 @@ function make_basis(a)
                        [1 1 0.]]
     Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
     atoms = [Si => [ones(3)/8, -ones(3)/8]]
-    model = model_DFT(lattice, atoms, []; symmetries=false)
+    model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
     kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
     Ecut = 7          # kinetic energy cutoff in Hartree
     PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
@@ -27,12 +27,12 @@ compute_energy(a) = compute_energy(scfres, a)
 compute_energy(10.26)
 
 import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -0.4347657610031856 
+FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -1.215583767344458
 
 using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -0.434770331446876
+ForwardDiff.derivative(compute_energy, 10.26) # -1.2155837670108651
 
 using BenchmarkTools
-@btime compute_energy(10.26)                                           # 101.814 ms (89326 allocations: 48.94 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 224.734 ms (178656 allocations: 97.87 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 340.684 ms (1556202 allocations: 146.99 MiB)
+@btime compute_energy(10.26)                                           #  91.082 ms (  88919 allocations:  65.00 MiB)
+@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 244.758 ms ( 177842 allocations: 129.99 MiB)
+@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 206.069 ms (1555666 allocations: 177.43 MiB)

From 65aad15e2df50cd15c60ba0569eaea94c02ee38e Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 09:33:26 +0200
Subject: [PATCH 35/49] delete scratch files

---
 Project.toml                                  |   2 -
 .../stress-forward-modeldft.jl                |  38 ----
 test/autodiff-stress/stress-forward.jl        |  45 -----
 test/autodiff-stress/stress-kinetic.jl        |  76 --------
 test/autodiff-stress/stress-total.jl          | 168 ------------------
 5 files changed, 329 deletions(-)
 delete mode 100644 test/autodiff-stress/stress-forward-modeldft.jl
 delete mode 100644 test/autodiff-stress/stress-forward.jl
 delete mode 100644 test/autodiff-stress/stress-kinetic.jl
 delete mode 100644 test/autodiff-stress/stress-total.jl

diff --git a/Project.toml b/Project.toml
index f61f6ceade..a16a2e335d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -30,7 +30,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
-ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -38,7 +37,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 spglib_jll = "ac4a9f1e-bdb2-5204-990c-47c8b2f70d4e"
 
 [compat]
diff --git a/test/autodiff-stress/stress-forward-modeldft.jl b/test/autodiff-stress/stress-forward-modeldft.jl
deleted file mode 100644
index 15860bcaea..0000000000
--- a/test/autodiff-stress/stress-forward-modeldft.jl
+++ /dev/null
@@ -1,38 +0,0 @@
-# Hellmann-Feynman stress
-# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
-using DFTK
-
-function make_basis(a)
-    lattice = a / 2 * [[0 1 1.];
-                       [1 0 1.];
-                       [1 1 0.]]
-    Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
-    atoms = [Si => [ones(3)/8, -ones(3)/8]]
-    model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
-    kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 7          # kinetic energy cutoff in Hartree
-    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
-end
-
-a = 10.26
-scfres = self_consistent_field(make_basis(a), tol=1e-4)
-
-function compute_energy(scfres_ref, a)
-    basis = make_basis(a)
-    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-    energies.total
-end
-
-compute_energy(a) = compute_energy(scfres, a)
-compute_energy(10.26)
-
-import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -1.215583767344458
-
-using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -1.2155837670108651
-
-using BenchmarkTools
-@btime compute_energy(10.26)                                           #  91.082 ms (  88919 allocations:  65.00 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 244.758 ms ( 177842 allocations: 129.99 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 206.069 ms (1555666 allocations: 177.43 MiB)
diff --git a/test/autodiff-stress/stress-forward.jl b/test/autodiff-stress/stress-forward.jl
deleted file mode 100644
index a899020885..0000000000
--- a/test/autodiff-stress/stress-forward.jl
+++ /dev/null
@@ -1,45 +0,0 @@
-# Hellmann-Feynman stress
-# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
-using DFTK
-
-function make_basis(a)
-    lattice = a / 2 * [[0 1 1.];
-                       [1 0 1.];
-                       [1 1 0.]]
-    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-    atoms = [Si => [ones(3)/8, -ones(3)/8]]
-    terms = [
-        Kinetic(),
-        AtomicLocal(),
-        AtomicNonlocal(),
-        Ewald(),
-        PspCorrection()
-    ]
-    model = Model(lattice; atoms=atoms, terms=terms, symmetries=false)
-    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 15          # kinetic energy cutoff in Hartree
-    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
-end
-
-a = 10.26
-scfres = self_consistent_field(make_basis(a), tol=1e-8)
-
-function compute_energy(scfres_ref, a)
-    basis = make_basis(a)
-    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-    energies.total
-end
-
-compute_energy(a) = compute_energy(scfres, a)
-compute_energy(10.26)
-
-import FiniteDiff
-FiniteDiff.finite_difference_derivative(compute_energy, 10.26) # -1.4114477062973088 
-
-using ForwardDiff
-ForwardDiff.derivative(compute_energy, 10.26) # -1.4114477059240538
-
-using BenchmarkTools
-@btime compute_energy(10.26)                                           # 14.253 ms (60220 allocations: 10.01 MiB)
-@btime FiniteDiff.finite_difference_derivative(compute_energy, 10.26)  # 30.813 ms (120444 allocations: 20.01 MiB)
-@btime ForwardDiff.derivative(compute_energy, 10.26)                   # 67.657 ms (543847 allocations: 35.61 MiB)
diff --git a/test/autodiff-stress/stress-kinetic.jl b/test/autodiff-stress/stress-kinetic.jl
deleted file mode 100644
index 918146a545..0000000000
--- a/test/autodiff-stress/stress-kinetic.jl
+++ /dev/null
@@ -1,76 +0,0 @@
-# Very basic setup, useful for testing
-using DFTK
-using LinearAlgebra
-using BenchmarkTools
-
-a = 10.26  # Silicon lattice constant in Bohr
-lattice = a / 2 * [[0 1 1.];
-                   [1 0 1.];
-                   [1 1 0.]]
-Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-atoms = [Si => [ones(3)/8, -ones(3)/8]]
-
-model = model_LDA(lattice, atoms)
-kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-Ecut = 15          # kinetic energy cutoff in Hartree -- can increase to make G_vectors larger (larger solve time)
-basis = PlaneWaveBasis(model, Ecut; kgrid=kgrid)
-
-@time scfres = self_consistent_field(basis, tol=1e-8) # 75.068789 seconds (138.55 M allocations: 8.145 GiB, 4.59% gc time, 24.68% compilation time)
-
-# TODO try to rewrite for Zygote (performance optimizations)
-# e.g. translate loops to dense arrays or maps (?)
-
-function kinetic_energy(lattice, basis, ψ, occ)
-    recip_lattice = 2π * inv(lattice')
-    E = zero(Float64)
-    kinetic_energies = [[sum(abs2, recip_lattice * (G + kpt.coordinate)) / 2
-                         for G in  G_vectors(kpt)]
-                        for kpt in basis.kpoints]
-    for (ik, k) in enumerate(basis.kpoints)
-        for iband = 1:size(ψ[1], 2)
-            ψnk = @views ψ[ik][:, iband]
-            E += (basis.kweights[ik] * occ[ik][iband]
-                  * real(dot(ψnk, kinetic_energies[ik] .* ψnk)))
-        end
-    end
-    E
-end
-kinetic_energy(lattice) = kinetic_energy(lattice, basis, scfres.ψ, scfres.occupation)
-
-@time E = kinetic_energy(lattice) # 0.438027 seconds (623.88 k allocations: 36.457 MiB, 99.96% compilation time)
-@btime kinetic_energy(lattice) # 49.123 μs (742 allocations: 169.05 KiB)
-
-# stress := diff E wrt lattice
-
-#===#
-# Check results and compile times on first call
-stresses = Dict()
-
-# works fine
-using ForwardDiff
-@time stresses[:ForwardDiff] = ForwardDiff.gradient(kinetic_energy, lattice) # 3.627630 seconds (5.99 M allocations: 363.981 MiB, 5.08% gc time, 98.69% compilation time)
-
-# works but long compile time and gives ComplexF64 results
-# hypothesis: slow compilation due to loops (and generators)
-using Zygote
-@time stresses[:Zygote] = Zygote.gradient(kinetic_energy, lattice) # 61.094425 seconds (63.31 M allocations: 3.715 GiB, 3.85% gc time, 67.43% compilation time)
-
-# works fine
-using ReverseDiff
-@time stresses[:ReverseDiff] = ReverseDiff.gradient(kinetic_energy, lattice) # 5.409118 seconds (9.60 M allocations: 516.091 MiB, 14.61% gc time, 89.56% compilation time)
-
-# sanity check
-using FiniteDiff
-@time stresses[:FiniteDiff] = FiniteDiff.finite_difference_gradient(kinetic_energy, lattice) # 2.606210 seconds (2.87 M allocations: 232.911 MiB, 19.92% gc time, 99.19% compilation time)
-
-stresses
-# Dict{Any, Any} with 4 entries:
-# :ForwardDiff => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
-# :FiniteDiff  => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
-# :Zygote      => (ComplexF64[0.27005-0.0im -0.27005-0.0im -0.27005-0.0im; -0.27005-0.0im 0.27005-0.0im -0.27005-0.0im; -0.27005-0.0im -0.27005-0.0im 0.27005-0.0im],)
-# :ReverseDiff => [0.27005 -0.27005 -0.27005; -0.27005 0.27005 -0.27005; -0.27005 -0.27005 0.27005]
-
-@btime ForwardDiff.gradient(kinetic_energy, lattice) #    270.426 μs (   761 allocations:  1.07 MiB)
-@btime Zygote.gradient(kinetic_energy, lattice)      #  6.983 ms     ( 34765 allocations: 12.61 MiB)
-@btime ReverseDiff.gradient(kinetic_energy, lattice) # 15.376 ms     (415886 allocations: 16.42 MiB)
-@btime FiniteDiff.finite_difference_gradient(kinetic_energy, lattice) # 777.578 μs (13394 allocations: 2.97 MiB)
diff --git a/test/autodiff-stress/stress-total.jl b/test/autodiff-stress/stress-total.jl
deleted file mode 100644
index c25a15cadb..0000000000
--- a/test/autodiff-stress/stress-total.jl
+++ /dev/null
@@ -1,168 +0,0 @@
-using DFTK
-
-function make_basis(a)
-    lattice = a / 2 * [[0 1 1.];
-                    [1 0 1.];
-                    [1 1 0.]]
-    Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
-    atoms = [Si => [ones(3)/8, -ones(3)/8]]
-    model = model_atomic(lattice, atoms, symmetries=false)
-    kgrid = [1, 1, 1]  # k-point grid (Regular Monkhorst-Pack grid)
-    Ecut = 15          # kinetic energy cutoff in Hartree
-    PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32,32,32])
-end
-
-a = 10.26
-scfres = self_consistent_field(make_basis(a), tol=1e-8)
-
-function compute_energy(scfres_ref, a)
-    basis = make_basis(a)
-    energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
-    energies.total
-end
-
-compute_energy(scfres, 10.26)
-
-import FiniteDiff
-fd_stress = FiniteDiff.finite_difference_derivative(a -> compute_energy(scfres, a), a) # -1.411445519652162
-
-###
-### Forward mode
-###
-
-using ForwardDiff
-ForwardDiff.derivative(a -> compute_energy(scfres, a), 10.26) # -1.4114455192624642
-
-###
-### Reverse mode
-###
-
-using Zygote
-Zygote.gradient(a -> compute_energy(scfres, a), 10.26)
-# ERROR: LoadError: MethodError: no method matching zero(::String)
-# Closest candidates are:
-#   zero(::Union{Type{P}, P}) where P<:Dates.Period at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Dates/src/periods.jl:53
-#   zero(::FillArrays.Ones{T, N, Axes} where Axes) where {T, N} at /home/niku/.julia/packages/FillArrays/rPtlv/src/FillArrays.jl:537
-#   zero(::T) where T<:Dates.TimeType at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Dates/src/types.jl:423
-#   ...
-# Stacktrace:
-#   [1] pair_getfield
-#     @ ~/.julia/packages/Zygote/pM10l/src/lib/base.jl:134 [inlined]
-#   [2] #2040#back
-#     @ ~/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:59 [inlined]
-#   [3] Pullback
-#     @ ./pair.jl:59 [inlined]
-#   [4] (::typeof(∂(getindex)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#   [5] Pullback
-#     @ ./abstractdict.jl:66 [inlined]
-#   [6] (::typeof(∂(iterate)))(Δ::Tuple{Float64, Nothing})
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#   [7] Pullback
-#     @ ./reduce.jl:60 [inlined]
-#   [8] (::typeof(∂(_foldl_impl)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#   [9] Pullback
-#     @ ./reduce.jl:48 [inlined]
-#  [10] (::typeof(∂(foldl_impl)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [11] Pullback
-#     @ ./reduce.jl:44 [inlined]
-#  [12] (::typeof(∂(mapfoldl_impl)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [13] Pullback (repeats 2 times)
-#     @ ./reduce.jl:160 [inlined]
-#  [14] (::typeof(∂(mapfoldl)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [15] Pullback
-#     @ ./reduce.jl:287 [inlined]
-#  [16] (::typeof(∂(#mapreduce#218)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [17] Pullback
-#     @ ./reduce.jl:287 [inlined]
-#  [18] (::typeof(∂(mapreduce)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [19] Pullback
-#     @ ./reduce.jl:501 [inlined]
-#  [20] (::typeof(∂(#sum#221)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [21] Pullback
-#     @ ./reduce.jl:501 [inlined]
-#  [22] (::typeof(∂(sum)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [23] Pullback
-#     @ ./reduce.jl:528 [inlined]
-#  [24] (::typeof(∂(#sum#222)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [25] Pullback
-#     @ ./reduce.jl:528 [inlined]
-#  [26] (::typeof(∂(sum)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [27] Pullback
-#     @ ~/.julia/dev/DFTK.jl/src/energies.jl:38 [inlined]
-#  [28] (::typeof(∂(getproperty)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [29] Pullback
-#     @ ~/.julia/packages/ZygoteRules/OjfTt/src/ZygoteRules.jl:11 [inlined]
-#  [30] Pullback
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:31 [inlined]
-#  [31] (::typeof(∂(compute_energy)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [32] Pullback
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:96 [inlined]
-#  [33] (::typeof(∂(#19)))(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface2.jl:0
-#  [34] (::Zygote.var"#41#42"{typeof(∂(#19))})(Δ::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface.jl:41
-#  [35] gradient(f::Function, args::Float64)
-#     @ Zygote ~/.julia/packages/Zygote/pM10l/src/compiler/interface.jl:59
-#  [36] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:96
-
-
-using ReverseDiff
-ReverseDiff.gradient(a -> compute_energy(scfres, first(a)), [10.26])
-# ERROR: LoadError: UndefRefError: access to undefined reference
-# Stacktrace:
-#   [1] getindex
-#     @ ./array.jl:802 [inlined]
-#   [2] macro expansion
-#     @ ./multidimensional.jl:860 [inlined]
-#   [3] macro expansion
-#     @ ./cartesian.jl:64 [inlined]
-#   [4] macro expansion
-#     @ ./multidimensional.jl:855 [inlined]
-#   [5] _unsafe_getindex!
-#     @ ./multidimensional.jl:868 [inlined]
-#   [6] _unsafe_getindex(::IndexLinear, ::Array{Complex{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, 3}, ::Base.Slice{Base.OneTo{Int64}}, ::Int64, ::Int64)
-#     @ Base ./multidimensional.jl:846
-#   [7] _getindex
-#     @ ./multidimensional.jl:832 [inlined]
-#   [8] getindex
-#     @ ./abstractarray.jl:1170 [inlined]
-#   [9] generic_plan_fft(data::Array{Complex{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, 3})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft_generic.jl:84
-#  [10] build_fft_plans(T::Type, fft_size::Tuple{Int64, Int64, Int64})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/fft_generic.jl:41
-#  [11] macro expansion
-#     @ ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:196 [inlined]
-#  [12] (::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64})()
-#     @ DFTK ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:237
-#  [13] timeit(f::DFTK.var"#62#64"{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}, Bool, Bool, Int64, Vector{Int64}, Vector{Int64}, Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Int64}, to::TimerOutputs.TimerOutput, label::String)
-#     @ TimerOutputs ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:285
-#  [14] #PlaneWaveBasis#61
-#     @ ~/.julia/packages/TimerOutputs/ZmKD7/src/TimerOutput.jl:236 [inlined]
-#  [15] PlaneWaveBasis(model::Model{ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 2, Matrix{Float64}, Matrix{Float64}}}}, Ecut::Int64; kgrid::Vector{Int64}, kshift::Vector{Int64}, use_symmetry::Bool, kwargs::Base.Iterators.Pairs{Symbol, Vector{Int64}, Tuple{Symbol}, NamedTuple{(:fft_size,), Tuple{Vector{Int64}}}})
-#     @ DFTK ~/.julia/dev/DFTK.jl/src/PlaneWaveBasis.jl:286
-#  [16] make_basis(a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:12
-#  [17] compute_energy(scfres_ref::NamedTuple{(:ham, :basis, :energies, :converged, :ρ, :eigenvalues, :occupation, :εF, :n_iter, :n_ep_extra, :ψ, :diagonalization, :stage), Tuple{Hamiltonian, PlaneWaveBasis{Float64}, Energies{Float64}, Bool, Array{Float64, 4}, Vector{Vector{Float64}}, Vector{Vector{Float64}}, Float64, Int64, Int64, Vector{Matrix{ComplexF64}}, NamedTuple{(:λ, :X, :residual_norms, :iterations, :converged, :n_matvec), Tuple{Vector{Vector{Float64}}, Vector{Matrix{ComplexF64}}, Vector{Vector{Float64}}, Vector{Int64}, Bool, Int64}}, Symbol}}, a::ReverseDiff.TrackedReal{Float64, Float64, ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:19
-#  [18] (::var"#19#20")(a::ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}})
-#     @ Main ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:124
-#  [19] ReverseDiff.GradientTape(f::var"#19#20", input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}})
-#     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/tape.jl:199
-#  [20] gradient(f::Function, input::Vector{Float64}, cfg::ReverseDiff.GradientConfig{ReverseDiff.TrackedArray{Float64, Float64, 1, Vector{Float64}, Vector{Float64}}}) (repeats 2 times)
-#     @ ReverseDiff ~/.julia/packages/ReverseDiff/E4Tzn/src/api/gradients.jl:22
-#  [21] top-level scope
-#     @ ~/.julia/dev/DFTK.jl/test/autodiff-stress/stress-total.jl:124

From e8c9b5f20bc109b8ce7430cf3cea39cdab850156 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 09:45:58 +0200
Subject: [PATCH 36/49] add silicon stress testcase

---
 test/stresses.jl | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 test/stresses.jl

diff --git a/test/stresses.jl b/test/stresses.jl
new file mode 100644
index 0000000000..3e7da16827
--- /dev/null
+++ b/test/stresses.jl
@@ -0,0 +1,33 @@
+# Hellmann-Feynman stress
+# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
+using Test
+using DFTK
+using ForwardDiff
+import FiniteDiff
+
+@testset "ForwardDiff stresses on silicon" begin
+    function make_basis(a)
+        lattice = a / 2 * [[0 1 1.];
+                        [1 0 1.];
+                        [1 1 0.]]
+        Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
+        atoms = [Si => [ones(3)/8, -ones(3)/8]]
+        model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
+        kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
+        Ecut = 7          # kinetic energy cutoff in Hartree
+        PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+    end
+
+    function compute_energy(scfres_ref, a)
+        basis = make_basis(a)
+        energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+        energies.total
+    end
+
+    a = 10.26
+    scfres = self_consistent_field(make_basis(a), tol=1e-4)
+    compute_energy(a) = compute_energy(scfres, a)
+
+    ref = FiniteDiff.finite_difference_derivative(compute_energy, a)
+    @test isapprox(ForwardDiff.derivative(compute_energy, a), ref, atol=1e-4)
+end

From 80f30bdf448c8882c0aee069f5e90790f71b4fb9 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 09:51:57 +0200
Subject: [PATCH 37/49] revert project toml comment

---
 Project.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index a16a2e335d..c76ae01c6a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -48,7 +48,6 @@ ForwardDiff = "0.10"
 Interpolations = "0.12, 0.13"
 IterTools = "1"
 IterativeSolvers = "0.8, 0.9"
-JLD2 = "0.4 - 0.4.7"
 JSON = "0.21"
 Libxc = "0.3"
 LineSearches = "7"
@@ -72,6 +71,9 @@ UnitfulAtomic = "1"
 julia = "1.6"
 spglib_jll = "1.15"
 
+# Some world-age-related issues with JLD2 > 0.4.7 at the moment
+JLD2 = "0.4 - 0.4.7"
+
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"

From 21e39c9331bfb8a188fddcda30fa97587c12dafc Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 12:27:36 +0200
Subject: [PATCH 38/49] rm hardcoded fft_size

---
 test/stresses.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/stresses.jl b/test/stresses.jl
index 3e7da16827..8087c7c6ea 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -15,7 +15,7 @@ import FiniteDiff
         model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
         kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
         Ecut = 7          # kinetic energy cutoff in Hartree
-        PlaneWaveBasis(model, Ecut; kgrid=kgrid, fft_size=[32, 32, 32])
+        PlaneWaveBasis(model, Ecut; kgrid=kgrid)
     end
 
     function compute_energy(scfres_ref, a)

From badd501751bb831f74ba48a0b6f9a833b04823ef Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 12:42:25 +0200
Subject: [PATCH 39/49] tighten test tolerances

---
 test/stresses.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/stresses.jl b/test/stresses.jl
index 8087c7c6ea..dddfd49ac2 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -13,7 +13,7 @@ import FiniteDiff
         Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
         atoms = [Si => [ones(3)/8, -ones(3)/8]]
         model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
-        kgrid = [4, 4, 4] # k-point grid (Regular Monkhorst-Pack grid)
+        kgrid = [1, 1, 1] # k-point grid (Regular Monkhorst-Pack grid)
         Ecut = 7          # kinetic energy cutoff in Hartree
         PlaneWaveBasis(model, Ecut; kgrid=kgrid)
     end
@@ -25,9 +25,9 @@ import FiniteDiff
     end
 
     a = 10.26
-    scfres = self_consistent_field(make_basis(a), tol=1e-4)
+    scfres = self_consistent_field(make_basis(a), is_converged=DFTK.ScfConvergenceDensity(1e-13))
     compute_energy(a) = compute_energy(scfres, a)
 
     ref = FiniteDiff.finite_difference_derivative(compute_energy, a)
-    @test isapprox(ForwardDiff.derivative(compute_energy, a), ref, atol=1e-4)
+    @test isapprox(ForwardDiff.derivative(compute_energy, a), ref, atol=1e-8)
 end

From f377c5a3c7f351be7ee15590544a07bfb0ac476a Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 13:59:41 +0200
Subject: [PATCH 40/49] add spglib dual rule

---
 src/workarounds/forwarddiff_rules.jl | 5 +++++
 test/stresses.jl                     | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index fb5f2f4735..fc566a9169 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -134,6 +134,11 @@ function r_to_G(basis::PlaneWaveBasis{T}, f_real::AbstractArray) where {T<:Forwa
     f_fourier
 end
 
+# determine symmetry operations only from primal lattice values
+function spglib_get_symmetry(lattice::Matrix{<:ForwardDiff.Dual}, atoms, magnetic_moments=[]; kwargs...)
+    spglib_get_symmetry(ForwardDiff.value.(lattice), atoms, magnetic_moments; kwargs...)
+end
+
 ###
 ### other workarounds
 ###
diff --git a/test/stresses.jl b/test/stresses.jl
index dddfd49ac2..da5e823acb 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -12,7 +12,7 @@ import FiniteDiff
                         [1 1 0.]]
         Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
         atoms = [Si => [ones(3)/8, -ones(3)/8]]
-        model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn]; symmetries=false)
+        model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn])
         kgrid = [1, 1, 1] # k-point grid (Regular Monkhorst-Pack grid)
         Ecut = 7          # kinetic energy cutoff in Hartree
         PlaneWaveBasis(model, Ecut; kgrid=kgrid)

From d00aed51f3105cf9ffc8c33e8140bae0f9b9bee8 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 16:10:10 +0200
Subject: [PATCH 41/49] re-enable cond check

---
 src/Model.jl                         | 5 +++--
 src/workarounds/forwarddiff_rules.jl | 4 ++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index 304c3a625e..a5072c5fc9 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -104,8 +104,7 @@ function Model(lattice::AbstractMatrix{T};
             "For 1D and 2D systems, the non-empty dimensions must come first")
     end
 
-    ## temporarily disabled for ForwardDiff. TODO re-enable
-    # cond(lattice[1:d, 1:d]) > 1e-5 || @warn "Your lattice is badly conditioned, the computation is likely to fail."
+    _check_well_conditioned(lattice[1:d, 1:d]) || @warn "Your lattice is badly conditioned, the computation is likely to fail."
 
     # Compute reciprocal lattice and volumes.
     # recall that the reciprocal lattice is the set of G vectors such
@@ -214,3 +213,5 @@ function spin_components(spin_polarization::Symbol)
     spin_polarization == :full      && return (:undefined, )
 end
 spin_components(model::Model) = spin_components(model.spin_polarization)
+
+_check_well_conditioned(A; tol=1e5) = (cond(A) <= tol)
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index fc566a9169..e64d0c9e7e 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -139,6 +139,10 @@ function spglib_get_symmetry(lattice::Matrix{<:ForwardDiff.Dual}, atoms, magneti
     spglib_get_symmetry(ForwardDiff.value.(lattice), atoms, magnetic_moments; kwargs...)
 end
 
+function _check_well_conditioned(A::AbstractArray{<:ForwardDiff.Dual}; kwargs...)
+    _check_well_conditioned(ForwardDiff.value.(A); kwargs...)
+end
+
 ###
 ### other workarounds
 ###

From 23a77a8ffb48929bf36fbf735501a3c0d174b1cc Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 17:39:59 +0200
Subject: [PATCH 42/49] fix hellmann-feynman: recompute density from psi

---
 src/densities.jl | 12 ++++++++----
 test/stresses.jl | 22 +++++++++++++++++-----
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/densities.jl b/src/densities.jl
index 4d84d6de46..0f51999e15 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -46,7 +46,7 @@ grid `basis`, where the individual k-Points are occupied according to `occupatio
 `ψ` should be one coefficient matrix per k-Point. If the `Model` underlying the basis
 is not collinear the spin density is `nothing`.
 """
-@views @timing function compute_density(basis::PlaneWaveBasis, ψ, occupation)
+@views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation) where T
     n_k = length(basis.kpoints)
     n_spin = basis.model.n_spin_components
 
@@ -60,9 +60,10 @@ is not collinear the spin density is `nothing`.
     @assert n_k > 0
 
     # Allocate an accumulator for ρ in each thread for each spin component
-    ρaccus = [similar(view(ψ[1], :, 1), (basis.fft_size..., n_spin))
+    # TODO use something like T = promote_type(Array{eltype(basis)}, eltype(ψ), ...)
+    ρaccus = [similar(Array{complex(T)}, (basis.fft_size..., n_spin))
               for ithread in 1:Threads.nthreads()]
-
+    
     # TODO Better load balancing ... the workload per kpoint depends also on
     #      the number of symmetry operations. We know heuristically that the Gamma
     #      point (first k-Point) has least symmetry operations, so we will put
@@ -79,7 +80,10 @@ is not collinear the spin density is `nothing`.
 
     Threads.@threads for (ikpts, ρaccu) in collect(zip(kpt_per_thread, ρaccus))
         ρaccu .= 0
-        ρ_k = similar(ψ[1][:, 1], basis.fft_size)
+
+        # TODO use something like T = promote_type(Array{eltype(basis)}, eltype(ψ), ...)
+        ρ_k = similar(Array{complex(T)}, basis.fft_size)
+        
         for ik in ikpts
             kpt = basis.kpoints[ik]
             compute_partial_density!(ρ_k, basis, kpt, ψ[ik], occupation[ik])
diff --git a/test/stresses.jl b/test/stresses.jl
index da5e823acb..cc55c2137c 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -18,16 +18,28 @@ import FiniteDiff
         PlaneWaveBasis(model, Ecut; kgrid=kgrid)
     end
 
-    function compute_energy(scfres_ref, a)
+    function recompute_energy(a)
         basis = make_basis(a)
-        energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=scfres_ref.ρ)
+        scfres = self_consistent_field(basis, is_converged=DFTK.ScfConvergenceDensity(1e-13))
+        energies, H = energy_hamiltonian(basis, scfres.ψ, scfres.occupation; ρ=scfres.ρ)
+        energies.total
+    end
+
+    function hellmann_feynman_energy(scfres_ref, a)
+        basis = make_basis(a)
+        ρ = DFTK.compute_density(basis, scfres_ref.ψ, scfres_ref.occupation)
+        energies, H = energy_hamiltonian(basis, scfres_ref.ψ, scfres_ref.occupation; ρ=ρ)
         energies.total
     end
 
     a = 10.26
     scfres = self_consistent_field(make_basis(a), is_converged=DFTK.ScfConvergenceDensity(1e-13))
-    compute_energy(a) = compute_energy(scfres, a)
+    hellmann_feynman_energy(a) = hellmann_feynman_energy(scfres, a)
+
+    ref_recompute = FiniteDiff.finite_difference_derivative(recompute_energy, a)
+    ref_hf = FiniteDiff.finite_difference_derivative(hellmann_feynman_energy, a)
+    s_hf = ForwardDiff.derivative(hellmann_feynman_energy, a)
 
-    ref = FiniteDiff.finite_difference_derivative(compute_energy, a)
-    @test isapprox(ForwardDiff.derivative(compute_energy, a), ref, atol=1e-8)
+    @test isapprox(ref_hf, ref_recompute, atol=1e-8)
+    @test isapprox(s_hf, ref_hf, atol=1e-8)
 end

From a181ed2a34cfba798899c5e01509035de96a5c4e Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 8 Jul 2021 18:15:11 +0200
Subject: [PATCH 43/49] move FiniteDiff to test deps

---
 Project.toml     | 4 ++--
 test/runtests.jl | 4 ++++
 test/stresses.jl | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index c76ae01c6a..a30afbcd49 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,6 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -77,6 +76,7 @@ JLD2 = "0.4 - 0.4.7"
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
+FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 GenericLinearAlgebra = "14197337-ba66-59df-a3e3-ca00e7dcff7a"
 IntervalArithmetic = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
@@ -87,4 +87,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [targets]
-test = ["Test", "Aqua", "DoubleFloats", "GenericLinearAlgebra", "IntervalArithmetic", "Plots", "Random", "KrylovKit", "JLD2", "WriteVTK"]
+test = ["Test", "Aqua", "DoubleFloats", "FiniteDiff", "GenericLinearAlgebra", "IntervalArithmetic", "Plots", "Random", "KrylovKit", "JLD2", "WriteVTK"]
diff --git a/test/runtests.jl b/test/runtests.jl
index 89562c52ff..118d3cac11 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -107,5 +107,9 @@ Random.seed!(0)
         include("aqua.jl")
     end
 
+    if "all" in TAGS
+        include("stresses.jl")
+    end
+
     ("example" in TAGS) && include("runexamples.jl")
 end
diff --git a/test/stresses.jl b/test/stresses.jl
index cc55c2137c..5bd8bf902e 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -40,6 +40,6 @@ import FiniteDiff
     ref_hf = FiniteDiff.finite_difference_derivative(hellmann_feynman_energy, a)
     s_hf = ForwardDiff.derivative(hellmann_feynman_energy, a)
 
-    @test isapprox(ref_hf, ref_recompute, atol=1e-8)
+    @test isapprox(ref_hf, ref_recompute, atol=1e-4)
     @test isapprox(s_hf, ref_hf, atol=1e-8)
 end

From 3c57d6b6d11a8b9d602f16f1e71a7f1b198b9873 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 13 Jul 2021 10:57:13 +0200
Subject: [PATCH 44/49] apply suggestions from review I

---
 src/DFTK.jl                          |  5 +++--
 src/densities.jl                     |  2 +-
 src/workarounds/forwarddiff_rules.jl | 18 ++++--------------
 test/stresses.jl                     | 13 +++++++------
 4 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 2f9f72a408..a63c91da4d 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -172,8 +172,6 @@ include("postprocess/chi0.jl")
 export compute_current
 include("postprocess/current.jl")
 
-include("workarounds/forwarddiff_rules.jl")
-
 function __init__()
     # Use "@require" to only include fft_generic.jl once IntervalArithmetic or
     # DoubleFloats has been loaded (via a "using" or an "import").
@@ -181,6 +179,9 @@ function __init__()
     #
     # The global variable GENERIC_FFT_LOADED makes sure that things are only
     # included once.
+    @require ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" begin
+        include("workarounds/forwarddiff_rules.jl")
+    end
     @require IntervalArithmetic="d1acc4aa-44c8-5952-acd4-ba5d80a2a253" begin
         include("workarounds/intervals.jl")
         !isdefined(DFTK, :GENERIC_FFT_LOADED) && include("workarounds/fft_generic.jl")
diff --git a/src/densities.jl b/src/densities.jl
index 0f51999e15..a62a91d3ee 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -63,7 +63,7 @@ is not collinear the spin density is `nothing`.
     # TODO use something like T = promote_type(Array{eltype(basis)}, eltype(ψ), ...)
     ρaccus = [similar(Array{complex(T)}, (basis.fft_size..., n_spin))
               for ithread in 1:Threads.nthreads()]
-    
+
     # TODO Better load balancing ... the workload per kpoint depends also on
     #      the number of symmetry operations. We know heuristically that the Gamma
     #      point (first k-Point) has least symmetry operations, so we will put
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index e64d0c9e7e..226751e90c 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -2,10 +2,8 @@ import ForwardDiff
 import AbstractFFTs
 
 # original PR by mcabbott: https://github.com/JuliaDiff/ForwardDiff.jl/pull/495
-# modified version: https://github.com/niklasschmitz/ForwardDiff.jl/blob/nfs/fft/src/fft.jl
 
-ForwardDiff.value(x::Complex{<:ForwardDiff.Dual}) =
-    Complex(x.re.value, x.im.value)
+ForwardDiff.value(x::Complex{<:ForwardDiff.Dual}) = Complex(x.re.value, x.im.value)
 
 ForwardDiff.partials(x::Complex{<:ForwardDiff.Dual}, n::Int) =
     Complex(ForwardDiff.partials(x.re, n), ForwardDiff.partials(x.im, n))
@@ -25,13 +23,11 @@ AbstractFFTs.realfloat(d::ForwardDiff.Dual{T,V,N}) where {T,V,N} = convert(Forwa
 
 for plan in [:plan_fft, :plan_ifft, :plan_bfft]
     @eval begin
-
         AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
             AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
 
         AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, region=1:ndims(x); kwargs...) =
             AbstractFFTs.$plan(ForwardDiff.value.(x), region; kwargs...)
-
     end
 end
 
@@ -41,19 +37,16 @@ AbstractFFTs.plan_rfft(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x);
 
 for plan in [:plan_irfft, :plan_brfft]  # these take an extra argument, only when complex?
     @eval begin
-
         AbstractFFTs.$plan(x::AbstractArray{<:ForwardDiff.Dual}, region=1:ndims(x); kwargs...) =
             AbstractFFTs.$plan(ForwardDiff.value.(x) .+ 0im, region; kwargs...)
 
         AbstractFFTs.$plan(x::AbstractArray{<:Complex{<:ForwardDiff.Dual}}, d::Integer, region=1:ndims(x); kwargs...) =
             AbstractFFTs.$plan(ForwardDiff.value.(x), d, region; kwargs...)
-
     end
 end
 
 for P in [:Plan, :ScaledPlan]  # need ScaledPlan to avoid ambiguities
     @eval begin
-
         Base.:*(p::AbstractFFTs.$P, x::AbstractArray{<:ForwardDiff.Dual}) =
             _apply_plan(p, x)
 
@@ -89,9 +82,7 @@ function _apply_plan(p::AbstractFFTs.ScaledPlan{T,P,<:ForwardDiff.Dual}, x::Abst
     _apply_plan(p.p, p.scale * x) # for when p.scale is Dual, need out-of-place
 end
 
-###
-### DFTK setup specific
-###
+# DFTK setup specific
 
 next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
 
@@ -143,9 +134,8 @@ function _check_well_conditioned(A::AbstractArray{<:ForwardDiff.Dual}; kwargs...
     _check_well_conditioned(ForwardDiff.value.(A); kwargs...)
 end
 
-###
-### other workarounds
-###
+
+# other workarounds
 
 # problem: ForwardDiff of norm of SVector gives NaN derivative at zero
 # https://github.com/JuliaMolSim/DFTK.jl/issues/443#issuecomment-864930410
diff --git a/test/stresses.jl b/test/stresses.jl
index 5bd8bf902e..c5a2e7a6fe 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -1,20 +1,21 @@
-# Hellmann-Feynman stress
-# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
 using Test
 using DFTK
 using ForwardDiff
 import FiniteDiff
 
+# Hellmann-Feynman stress
+# via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
+
 @testset "ForwardDiff stresses on silicon" begin
     function make_basis(a)
         lattice = a / 2 * [[0 1 1.];
-                        [1 0 1.];
-                        [1 1 0.]]
+                           [1 0 1.];
+                           [1 1 0.]]
         Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
         atoms = [Si => [ones(3)/8, -ones(3)/8]]
         model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn])
-        kgrid = [1, 1, 1] # k-point grid (Regular Monkhorst-Pack grid)
-        Ecut = 7          # kinetic energy cutoff in Hartree
+        kgrid = [1, 1, 1]
+        Ecut = 7
         PlaneWaveBasis(model, Ecut; kgrid=kgrid)
     end
 

From f1ef7eec234448a2091ed13f645677ac9db52454 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Tue, 13 Jul 2021 11:59:20 +0200
Subject: [PATCH 45/49] avoid code duplication of DummyInplace

---
 src/DFTK.jl                          |  7 +++++--
 src/workarounds/dummy_inplace_fft.jl | 15 +++++++++++++++
 src/workarounds/fft_generic.jl       | 14 --------------
 src/workarounds/forwarddiff_rules.jl | 14 --------------
 4 files changed, 20 insertions(+), 30 deletions(-)
 create mode 100644 src/workarounds/dummy_inplace_fft.jl

diff --git a/src/DFTK.jl b/src/DFTK.jl
index a63c91da4d..670f766593 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -177,16 +177,19 @@ function __init__()
     # DoubleFloats has been loaded (via a "using" or an "import").
     # See https://github.com/JuliaPackaging/Requires.jl for details.
     #
-    # The global variable GENERIC_FFT_LOADED makes sure that things are only
-    # included once.
+    # The global variables GENERIC_FFT_LOADED and DUMMY_INPLACE_LOADED
+    # make sure that things are only included once.
     @require ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" begin
+        !isdefined(DFTK, :DUMMY_INPLACE_LOADED) && include("workarounds/dummy_inplace_fft.jl")
         include("workarounds/forwarddiff_rules.jl")
     end
     @require IntervalArithmetic="d1acc4aa-44c8-5952-acd4-ba5d80a2a253" begin
         include("workarounds/intervals.jl")
+        !isdefined(DFTK, :DUMMY_INPLACE_LOADED) && include("workarounds/dummy_inplace_fft.jl")
         !isdefined(DFTK, :GENERIC_FFT_LOADED) && include("workarounds/fft_generic.jl")
     end
     @require DoubleFloats="497a8b3b-efae-58df-a0af-a86822472b78" begin
+        !isdefined(DFTK, :DUMMY_INPLACE_LOADED) && include("workarounds/dummy_inplace_fft.jl")
         !isdefined(DFTK, :GENERIC_FFT_LOADED) && include("workarounds/fft_generic.jl")
     end
     @require Plots="91a5bcdd-55d7-5caf-9e0b-520d859cae80" include("plotting.jl")
diff --git a/src/workarounds/dummy_inplace_fft.jl b/src/workarounds/dummy_inplace_fft.jl
new file mode 100644
index 0000000000..0b2567b384
--- /dev/null
+++ b/src/workarounds/dummy_inplace_fft.jl
@@ -0,0 +1,15 @@
+# This is needed to flag that the dummy_inplace_fft.jl file has already been loaded
+const DUMMY_INPLACE_LOADED = true
+
+# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
+# This is needed for some generic FFT implementations, which do not have in-place plans
+struct DummyInplace{opFFT}
+    fft::opFFT
+end
+LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
+LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
+
+import Base: *, \, length
+*(p::DummyInplace, X) = p.fft * X
+\(p::DummyInplace, X) = p.fft \ X
+length(p::DummyInplace) = length(p.fft)
diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl
index 4da356b6ef..15e86a787d 100644
--- a/src/workarounds/fft_generic.jl
+++ b/src/workarounds/fft_generic.jl
@@ -90,17 +90,3 @@ function generic_plan_bfft(data::AbstractArray{T, 3}) where T
                     FourierTransforms.plan_bfft(data[1, :, 1]),
                     FourierTransforms.plan_bfft(data[1, 1, :])], T(1))
 end
-
-
-# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
-# This is needed for some generic FFT implementations, which do not have in-place plans
-struct DummyInplace{opFFT}
-    fft::opFFT
-end
-LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
-LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
-
-import Base: *, \, length
-*(p::DummyInplace, X) = p.fft * X
-\(p::DummyInplace, X) = p.fft \ X
-length(p::DummyInplace) = length(p.fft)
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index 226751e90c..a89201039b 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -88,20 +88,6 @@ next_working_fft_size(::Type{<:ForwardDiff.Dual}, size) = size
 
 _fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
 
-# *** COPIED from fft_generic.jl *** TODO refactor
-# A dummy wrapper around an out-of-place FFT plan to make it appear in-place
-# This is needed for some generic FFT implementations, which do not have in-place plans
-struct DummyInplace{opFFT}
-    fft::opFFT
-end
-LinearAlgebra.mul!(Y, p::DummyInplace, X) = (Y .= mul!(similar(X), p.fft, X))
-LinearAlgebra.ldiv!(Y, p::DummyInplace, X) = (Y .= ldiv!(similar(X), p.fft, X))
-
-import Base: *, \, length
-*(p::DummyInplace, X) = p.fft * X
-\(p::DummyInplace, X) = p.fft \ X
-length(p::DummyInplace) = length(p.fft)
-
 function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
     tmp = Array{Complex{T}}(undef, fft_size...)
     opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))

From 8be63da5d59ef4a8743d175e0ac71154662688ab Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 15 Jul 2021 11:41:01 +0200
Subject: [PATCH 46/49] use silicon numbers from testcases.jl

---
 test/stresses.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/stresses.jl b/test/stresses.jl
index c5a2e7a6fe..8f5ea65983 100644
--- a/test/stresses.jl
+++ b/test/stresses.jl
@@ -2,6 +2,7 @@ using Test
 using DFTK
 using ForwardDiff
 import FiniteDiff
+include("testcases.jl")
 
 # Hellmann-Feynman stress
 # via ForwardDiff & custom FFTW overloads on ForwardDiff.Dual
@@ -11,8 +12,8 @@ import FiniteDiff
         lattice = a / 2 * [[0 1 1.];
                            [1 0 1.];
                            [1 1 0.]]
-        Si = ElementPsp(:Si, psp=load_psp(:Si, functional="lda"))
-        atoms = [Si => [ones(3)/8, -ones(3)/8]]
+        Si = ElementPsp(silicon.atnum, psp=load_psp(silicon.psp))
+        atoms = [Si => silicon.positions]
         model = model_DFT(lattice, atoms, [:lda_x, :lda_c_vwn])
         kgrid = [1, 1, 1]
         Ecut = 7

From 6a821bd79a920dafe7b2001db6609d8aba9644c3 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 15 Jul 2021 13:33:54 +0200
Subject: [PATCH 47/49] use promote_type between basis and psi

---
 src/densities.jl | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/densities.jl b/src/densities.jl
index a62a91d3ee..efff8a7581 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -46,7 +46,7 @@ grid `basis`, where the individual k-Points are occupied according to `occupatio
 `ψ` should be one coefficient matrix per k-Point. If the `Model` underlying the basis
 is not collinear the spin density is `nothing`.
 """
-@views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation) where T
+@views @timing function compute_density(basis::PlaneWaveBasis, ψ, occupation)
     n_k = length(basis.kpoints)
     n_spin = basis.model.n_spin_components
 
@@ -60,8 +60,8 @@ is not collinear the spin density is `nothing`.
     @assert n_k > 0
 
     # Allocate an accumulator for ρ in each thread for each spin component
-    # TODO use something like T = promote_type(Array{eltype(basis)}, eltype(ψ), ...)
-    ρaccus = [similar(Array{complex(T)}, (basis.fft_size..., n_spin))
+    T = promote_type(eltype(basis), eltype(ψ[1]))
+    ρaccus = [similar(ψ[1], T, (basis.fft_size..., n_spin))
               for ithread in 1:Threads.nthreads()]
 
     # TODO Better load balancing ... the workload per kpoint depends also on
@@ -80,10 +80,7 @@ is not collinear the spin density is `nothing`.
 
     Threads.@threads for (ikpts, ρaccu) in collect(zip(kpt_per_thread, ρaccus))
         ρaccu .= 0
-
-        # TODO use something like T = promote_type(Array{eltype(basis)}, eltype(ψ), ...)
-        ρ_k = similar(Array{complex(T)}, basis.fft_size)
-        
+        ρ_k = similar(ψ[1], T, basis.fft_size)
         for ik in ikpts
             kpt = basis.kpoints[ik]
             compute_partial_density!(ρ_k, basis, kpt, ψ[ik], occupation[ik])

From 30ada7f9967b715576a246d4a70d509b30a3f5d4 Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 15 Jul 2021 13:51:06 +0200
Subject: [PATCH 48/49] wrap overlong @warn line in Model constructor

---
 src/Model.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Model.jl b/src/Model.jl
index 3ba425e352..c2d772726f 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -103,7 +103,8 @@ function Model(lattice::AbstractMatrix{T};
         norm(lattice[:, i]) == norm(lattice[i, :]) == 0 || error(
             "For 1D and 2D systems, the non-empty dimensions must come first")
     end
-    _check_well_conditioned(lattice[1:d, 1:d]) || @warn "Your lattice is badly conditioned, the computation is likely to fail."
+    _check_well_conditioned(lattice[1:d, 1:d]) || @warn (
+        "Your lattice is badly conditioned, the computation is likely to fail.")
 
     # Compute reciprocal lattice and volumes.
     # recall that the reciprocal lattice is the set of G vectors such

From cb466c1903f805af1008fb009472712cef51f8ea Mon Sep 17 00:00:00 2001
From: Niklas Schmitz <niklas.f.schmitz@gmail.com>
Date: Thu, 15 Jul 2021 13:58:09 +0200
Subject: [PATCH 49/49] use n_dim instead of d in lattice conditioning check

---
 src/Model.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Model.jl b/src/Model.jl
index c2d772726f..e5dfafb72d 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -103,7 +103,7 @@ function Model(lattice::AbstractMatrix{T};
         norm(lattice[:, i]) == norm(lattice[i, :]) == 0 || error(
             "For 1D and 2D systems, the non-empty dimensions must come first")
     end
-    _check_well_conditioned(lattice[1:d, 1:d]) || @warn (
+    _check_well_conditioned(lattice[1:n_dim, 1:n_dim]) || @warn (
         "Your lattice is badly conditioned, the computation is likely to fail.")
 
     # Compute reciprocal lattice and volumes.