Adjust to new ann interface #125

Merged: 8 commits, Apr 9, 2024
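The change that runs through every diff below is the new `initialparameters` calling convention that comes with AbstractNeuralNetworks 0.2: the layer (or chain) is now the first argument, followed by the backend and the element type, with the RNG and initializers passed as keywords. A minimal sketch of the old versus the new call, assuming `initialparameters` and `ResNet` are in scope via `using GeometricMachineLearning` and `CPU()` via KernelAbstractions, as in the test files:

```julia
using GeometricMachineLearning, KernelAbstractions  # assumed imports, mirroring the tests

model = ResNet(4, tanh)                        # constructor from src/layers/resnet.jl

# old interface (AbstractNeuralNetworks 0.1): backend and element type came first
# ps = initialparameters(CPU(), Float32, model)

# new interface (AbstractNeuralNetworks 0.2): the layer comes first
ps = initialparameters(model, CPU(), Float32)
```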
2 changes: 1 addition & 1 deletion .github/workflows/Latex.yml
@@ -10,7 +10,7 @@ jobs:
LatexDocs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
2 changes: 1 addition & 1 deletion Project.toml
@@ -29,7 +29,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
AbstractNeuralNetworks = "0.1"
AbstractNeuralNetworks = "0.2"
BandedMatrices = "0.17, 1"
ChainRules = "1"
ChainRulesCore = "1"
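The compat bump is what pulls the new interface into the environment; a hypothetical sketch of updating a local environment to match (not part of the diff):

```julia
using Pkg
Pkg.add(name="AbstractNeuralNetworks", version="0.2")  # matches the updated [compat] entry
Pkg.status("AbstractNeuralNetworks")                    # confirm the resolved version
```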
8 changes: 3 additions & 5 deletions docs/src/tikz/Makefile
@@ -1,5 +1,5 @@
all: pdf
$(MAKE) png
$(MAKE) png res1=150 res2=200 res3=100 res4=180
$(MAKE) logo
$(MAKE) clean

@@ -46,6 +46,8 @@ png:
pdftocairo -png -r $(res3) -transp -singlefile structs_visualization.pdf structs_visualization
pdftocairo -png -r $(res3) -transp -singlefile structs_visualization_dark.pdf structs_visualization_dark
pdftocairo -png -r $(res1) -transp -singlefile logo.pdf logo
pdftocairo -png -r 500 -transp -singlefile logo_with_name.pdf logo_with_name
pdftocairo -png -r 500 -transp -singlefile logo_with_name_dark.pdf logo_with_name_dark
pdftocairo -png -r $(res1) -transp -singlefile symplectic_autoencoder.pdf symplectic_autoencoder
pdftocairo -png -r $(res1) -transp -singlefile symplectic_autoencoder_dark.pdf symplectic_autoencoder_dark
pdftocairo -png -r $(res1) -transp -singlefile solution_manifold_2.pdf solution_manifold_2
@@ -55,10 +57,6 @@ png:
pdftocairo -png -r $(res1) -transp -singlefile tensor_dark.pdf tensor_dark
pdftocairo -png -r $(res4) -transp -singlefile tensor_sampling.pdf tensor_sampling
pdftocairo -png -r $(res4) -transp -singlefile tensor_sampling_dark.pdf tensor_sampling_dark
pdftocairo -png -r $(res4) -transp -singlefile skew_sym_visualization.pdf skew_sym_visualization
pdftocairo -png -r $(res4) -transp -singlefile skew_sym_visualization_dark.pdf skew_sym_visualization_dark
pdftocairo -png -r $(res1) -transp -singlefile vp_feedforward.pdf vp_feedforward
pdftocairo -png -r $(res1) -transp -singlefile vp_feedforward_dark.pdf vp_feedforward_dark

logo:
cp logo_with_name.png ../assets/logo.png
4 changes: 2 additions & 2 deletions docs/src/tutorials/grassmann_layer.md
@@ -48,7 +48,7 @@ const ε = 0.1 # entropic regularization. √ε is a length. #
const q = 1.0 # annealing parameter # hide
const Δ = 1.0 # characteristic domain size # hide
const s = ε # current scale: no annealing -> equals ε # hide
const tol = 1e-4 # marginal condition tolerance # hide
const tol = 1e-6 # marginal condition tolerance # hide
const crit_it = 20 # acceleration inference # hide
const p_η = 2

@@ -58,7 +58,7 @@ function compute_wasserstein_gradient(ensemble1::AT, ensemble2::AT) where AT<:Ab
V = SinkhornVariable(copy(ensemble1'), ones(number_of_particles1) / number_of_particles1)
W = SinkhornVariable(copy(ensemble2'), ones(number_of_particles2) / number_of_particles2)
params = SinkhornParameters(; ε=ε,q=1.0,Δ=1.0,s=s,tol=tol,crit_it=crit_it,p_η=p_η,sym=false,acc=true) # hide
S = SinkhornDivergence(V, W, c, params, true)
S = SinkhornDivergence(V, W, c, params; islog = true)
initialize_potentials!(S)
compute!(S)
value(S), x_gradient!(S, ∇c)'
4 changes: 2 additions & 2 deletions src/layers/attention_layer.jl
@@ -43,7 +43,7 @@ function parameterlength(d::Attention{M, M, true}) where M
M*(M-1)
end

function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::Attention{M, M, false}; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
function initialparameters(d::Attention{M, M, false}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
# transformations for queries and keys.
PQ_weight = KernelAbstractions.allocate(backend, T, M, M)
PK_weight = KernelAbstractions.allocate(backend, T, M, M)
@@ -52,7 +52,7 @@ function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::Atte
(PQ=PQ_weight, PK=PK_weight)
end

function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::Attention{M, M, true}; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
function initialparameters(d::Attention{M, M, true}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
# projections for queries, keys and vectors.
PQ_weight = rand(backend, rng, StiefelManifold{T}, M, M)
PK_weight = rand(backend, rng, StiefelManifold{T}, M, M)
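As an illustration, a minimal sketch of initializing an `Attention` layer under the new argument order; the constructor call is copied from the tests further down, and `CPU()` (KernelAbstractions) is an assumed choice of backend:

```julia
model = Attention(64, softmax, add_connection=true)  # constructor as used in the tests below
ps = initialparameters(model, CPU(), Float32)        # layer first, then backend and element type
# with Stiefel=true, ps.PQ and ps.PK are StiefelManifold matrices (see attention_setup.jl)
```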
2 changes: 1 addition & 1 deletion src/layers/bias_layer.jl
@@ -8,7 +8,7 @@
BiasLayer{M, M}()
end

function initialparameters(backend::Backend, ::Type{T}, ::BiasLayer{M, M}; rng::AbstractRNG = Random.default_rng(), init_bias = GlorotUniform()) where {M, T}
function initialparameters(::BiasLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_bias = GlorotUniform()) where {M, T}

Codecov (codecov/patch) warning: added line 11 in src/layers/bias_layer.jl is not covered by tests.
q_part = KernelAbstractions.zeros(backend, T, M÷2)
p_part = KernelAbstractions.zeros(backend, T, M÷2)
init_bias(rng, q_part)
2 changes: 1 addition & 1 deletion src/layers/classification.jl
@@ -17,7 +17,7 @@ function Classification(M::Integer, N::Integer, activation; average::Bool=false)
Classification{M, N, average, typeof(activation)}(activation)
end

function initialparameters(device::KernelAbstractions.Backend, T::Type, ::Classification{M, N}; rng::Random.AbstractRNG=Random.default_rng(), init_weight! = GlorotUniform()) where {M, N}
function initialparameters(::Classification{M, N}, device::KernelAbstractions.Backend, T::Type; rng::Random.AbstractRNG=Random.default_rng(), init_weight! = GlorotUniform()) where {M, N}
weight = KernelAbstractions.allocate(device, T, N, M)
init_weight!(rng, weight)
(weight=weight, )
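A short sketch of the updated call for the classification layer; the weight shape follows from the `allocate(device, T, N, M)` line above, and `σ` is assumed to be in scope as in the tests:

```julia
d = Classification(49, 10, σ)              # M = 49 input features, N = 10 classes
ps = initialparameters(d, CPU(), Float32)  # new argument order
size(ps.weight)                            # (10, 49), i.e. N × M
```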
2 changes: 1 addition & 1 deletion src/layers/grassmann_layer.jl
@@ -8,7 +8,7 @@ function GrassmannLayer(n::Integer, N::Integer, Retraction::AbstractRetraction=d
GrassmannLayer{n, N, typeof(Retraction)}()
end

function AbstractNeuralNetworks.initialparameters(backend::KernelAbstractions.Backend, ::Type{T}, d::GrassmannLayer{N,M}; rng::AbstractRNG=Random.default_rng()) where {M,N,T}
function AbstractNeuralNetworks.initialparameters(d::GrassmannLayer{N,M}, backend::KernelAbstractions.Backend, ::Type{T}; rng::AbstractRNG=Random.default_rng()) where {M,N,T}
(weight = N > M ? rand(backend, rng, GrassmannManifold{T}, N, M) : rand(backend, rng, GrassmannManifold{T}, M, N), )
end

4 changes: 2 additions & 2 deletions src/layers/multi_head_attention.jl
@@ -19,7 +19,7 @@ function parameterlength(d::MultiHeadAttention{M, M, true}) where M
Int(3*M^2 - 3*M*(M + d.n_heads)/(2*d.n_heads))
end

function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::MultiHeadAttention{M, M, false}; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
function initialparameters(d::MultiHeadAttention{M, M, false}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
# number of "hidden" dimension (dimension of projection)
Dₕ = M ÷ d.n_heads
# projections for queries, keys and values.
@@ -51,7 +51,7 @@ function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::Mult
end


function initialparameters(backend::KernelAbstractions.Backend, T::Type, d::MultiHeadAttention{M, M, true}; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
function initialparameters(d::MultiHeadAttention{M, M, true}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng(), initializer::AbstractNeuralNetworks.AbstractInitializer=GlorotUniform()) where {M}
# number of "hidden" dimension (dimension of projection)
Dₕ = M ÷ d.n_heads
# projections for queries, keys and vectors.
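A sketch with the constructor as used in the tests (system dimension first, then the number of heads); the per-head projection dimension `Dₕ = M ÷ n_heads` follows from the line above, so the dimension must be divisible by the number of heads:

```julia
model = MultiHeadAttention(64, 8)              # Dₕ = 64 ÷ 8 = 8 per head
ps = initialparameters(model, CPU(), Float32)  # layer first under the new interface
```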
2 changes: 1 addition & 1 deletion src/layers/psd_like_layer.jl
@@ -25,7 +25,7 @@ function parameterlength(::PSDLayer{M, N}) where {M, N}
M÷2*(N÷2 - (M÷2+1)÷2)
end

function initialparameters(backend::KernelAbstractions.Backend, T::Type, ::PSDLayer{M, N}, rng::AbstractRNG=Random.default_rng()) where {M, N}
function initialparameters(::PSDLayer{M, N}, backend::KernelAbstractions.Backend, T::Type; rng::AbstractRNG=Random.default_rng()) where {M, N}
(weight = N > M ? rand(backend, rng, StiefelManifold{T}, N÷2, M÷2) : rand(backend, rng, StiefelManifold{T}, M÷2, N÷2), )
end

2 changes: 1 addition & 1 deletion src/layers/resnet.jl
@@ -6,7 +6,7 @@ function ResNet(dim::IT, activation=identity; use_bias::Bool=true) where {IT<:In
return ResNet{dim, dim, use_bias, typeof(activation)}(activation)
end

function initialparameters(backend::KernelAbstractions.Backend, T::Type, ::ResNet{M, M, use_bias}; rng::Random.AbstractRNG=Random.default_rng(), init_weight = GlorotUniform(), init_bias = ZeroInitializer()) where {M, use_bias}
function initialparameters(::ResNet{M, M, use_bias}, backend::KernelAbstractions.Backend, T::Type; rng::Random.AbstractRNG=Random.default_rng(), init_weight = GlorotUniform(), init_bias = ZeroInitializer()) where {M, use_bias}
if use_bias
weight = KernelAbstractions.allocate(backend, T, M, M)
bias = KernelAbstractions.allocate(backend, T, M)
2 changes: 1 addition & 1 deletion src/layers/stiefel_layer.jl
@@ -8,7 +8,7 @@ function StiefelLayer(n::Integer, N::Integer; retraction::AbstractRetraction=def
StiefelLayer{n, N, typeof(retraction)}()
end

function AbstractNeuralNetworks.initialparameters(backend::KernelAbstractions.Backend, ::Type{T}, d::StiefelLayer{M,N}; rng::AbstractRNG=Random.default_rng()) where {M,N,T}
function AbstractNeuralNetworks.initialparameters(d::StiefelLayer{M,N}, backend::KernelAbstractions.Backend, ::Type{T}; rng::AbstractRNG=Random.default_rng()) where {M,N,T}
(weight = N > M ? rand(backend, rng, StiefelManifold{T}, N, M) : rand(backend, rng, StiefelManifold{T}, M, N), )
end
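A minimal sketch for the manifold layer; since `N > M` in this example, the branch above returns an `N × M` element of the Stiefel manifold:

```julia
layer = StiefelLayer(5, 10)                    # n = 5, N = 10
ps = initialparameters(layer, CPU(), Float64)  # new argument order
typeof(ps.weight) <: StiefelManifold           # true; ps.weight is 10 × 5
```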

66 changes: 0 additions & 66 deletions src/layers/symplectic_stiefel_layer.jl

This file was deleted.

6 changes: 3 additions & 3 deletions src/layers/sympnets.jl
@@ -148,7 +148,7 @@ function Gradient(dim::Int, dim2::Int=dim, activation = identity; full_grad::Boo
end
end

function initialparameters(backend::Backend, ::Type{T}, d::GradientLayer{M, M}; rng::AbstractRNG = Random.default_rng(), init_weight = GlorotUniform(), init_bias = ZeroInitializer(), init_scale = GlorotUniform()) where {M, T}
function initialparameters(d::GradientLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_weight = GlorotUniform(), init_bias = ZeroInitializer(), init_scale = GlorotUniform()) where {M, T}
K = KernelAbstractions.allocate(backend, T, d.second_dim÷2, M÷2)
b = KernelAbstractions.allocate(backend, T, d.second_dim÷2)
a = KernelAbstractions.allocate(backend, T, d.second_dim÷2)
@@ -158,13 +158,13 @@ function initialparameters(backend::Backend, ::Type{T}, d::GradientLayer{M, M};
return (weight=K, bias=b, scale=a)
end

function initialparameters(backend::Backend, ::Type{T}, ::ActivationLayer{M, M}; rng::AbstractRNG = Random.default_rng(), init_scale = GlorotUniform()) where {M, T}
function initialparameters(::ActivationLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_scale = GlorotUniform()) where {M, T}
a = KernelAbstractions.zeros(backend, T, M÷2)
init_scale(rng, a)
return (scale = a,)
end

function initialparameters(backend::Backend, ::Type{T}, ::LinearLayer{M, M}; rng::AbstractRNG = Random.default_rng(), init_weight = GlorotUniform()) where {M, T}
function initialparameters(::LinearLayer{M, M}, backend::Backend, ::Type{T}; rng::AbstractRNG = Random.default_rng(), init_weight = GlorotUniform()) where {M, T}
S = KernelAbstractions.allocate(backend, T, (M÷2)*(M÷2+1)÷2)
init_weight(rng, S)
(weight=SymmetricMatrix(S, M÷2), )
2 changes: 1 addition & 1 deletion test/attention_layer/apply_multi_head_attention.jl
@@ -11,7 +11,7 @@ function compare_attention_to_mha(N, batch_size=10, T=Float32)
model₃ = MultiHeadAttention(N, 1, add_connection=true)
model₄ = Attention(N, softmax, add_connection=true)

ps₂ = initialparameters(CPU(), T, model₂)
ps₂ = initialparameters(model₂, CPU(), T)
ps₁ = (PQ=(head_1=ps₂.PQ,), PK=(head_1=ps₂.PK,), PV=(head_1=typeof(ps₂.PK)(I(N)),))

mat = rand(T, N, N)
6 changes: 3 additions & 3 deletions test/attention_layer/attention_setup.jl
@@ -13,9 +13,9 @@ function attention_tests(N, T=Float32)
# same as model₁, but with the add connection
model₄ = Attention(N, Stiefel=false, add_connection=true)

ps₁ = initialparameters(CPU(), T, model₁)
ps₂ = initialparameters(CPU(), T, model₂)
ps₃ = initialparameters(CPU(), T, model₃)
ps₁ = initialparameters(model₁, CPU(), T)
ps₂ = initialparameters(model₂, CPU(), T)
ps₃ = initialparameters(model₃, CPU(), T)
@test typeof(ps₂.PQ) <: StiefelManifold
@test typeof(ps₂.PK) <: StiefelManifold

2 changes: 1 addition & 1 deletion test/data_loader/data_loader_optimization_step.jl
@@ -12,7 +12,7 @@ function test_data_loader(sys_dim, n_time_steps, n_params, T=Float32)

# first argument is sys_dim, second is number of heads, third is number of units
model = Transformer(dl.input_dim, 2, 1)
ps = initialparameters(CPU(), T, model)
ps = initialparameters(model, CPU(), T)
dx = Zygote.gradient(ps -> GeometricMachineLearning.loss(model, ps, dl), ps)[1]
ps_copy = deepcopy(ps)
o = Optimizer(GradientOptimizer(), ps)
2 changes: 1 addition & 1 deletion test/data_loader/mnist_utils.jl
@@ -64,7 +64,7 @@ function test_optimizer_for_classification_layer(; dim₁=28, dim₂=28, number_
activation_function(x) = tanh.(x)
model = Classification(patch_length * patch_length, 10, activation_function)

ps = initialparameters(CPU(), T, model)
ps = initialparameters(model, CPU(), T)
loss₁ = GeometricMachineLearning.loss(model, ps, dl)

opt = Optimizer(GradientOptimizer(), ps)
2 changes: 1 addition & 1 deletion test/data_loader/optimizer_functor_with_adam.jl
@@ -26,7 +26,7 @@ function test_optimizer_functor_with_adam(;T=Float32, dim₁=6, dim₂=6, n_imag
# input dim is dim₁ / patch_length * dim₂ / pach_length; the transformer is called with dim₁ / patch_length and two layers
model = Chain(Transformer(dl.input_dim, patch_length, 2; Stiefel=true), Classification(dl.input_dim, 10, σ))

ps = initialparameters(CPU(), Float32, model)
ps = initialparameters(model, CPU(), Float32)

loss₁ = GeometricMachineLearning.loss(model, ps, dl)

2 changes: 1 addition & 1 deletion test/layers/classification.jl
@@ -5,7 +5,7 @@ Random.seed!(1234)

function test_set_up_and_application(T=Float32, sys_dim=49, output_dim=10, seq_length=16, batch_size=32; average=false)
d = Classification(sys_dim, output_dim, σ, average=average)
ps = initialparameters(CPU(), T, d)
ps = initialparameters(d, CPU(), T)
output₁ = d(rand(T, sys_dim, seq_length), ps)
output₂ = d(rand(T, sys_dim, seq_length, batch_size), ps)
@test size(output₁) == (10, 1)
4 changes: 2 additions & 2 deletions test/layers/gradient_layer_tests.jl
@@ -7,7 +7,7 @@ Random.seed!(1234)

function test_gradient_layer_application(T, M, N, batch_size=10)
dummy_model = GradientLayerQ(M, N, tanh)
ps = initialparameters(CPU(), T, dummy_model)
ps = initialparameters(dummy_model, CPU(), T)

x = rand(T, M)
x_applied = dummy_model(x, ps)
@@ -22,7 +22,7 @@ end

function test_gradient_layer_derivative_and_update(T, M, N, batch_size=10)
dummy_model = Chain(GradientLayerP(M, N, tanh), GradientLayerQ(M, N, tanh))
ps = initialparameters(CPU(), T, dummy_model)
ps = initialparameters(dummy_model, CPU(), T)
o = Optimizer(AdamOptimizer(T(0.1), T(.9), T(0.999), T(3e-7)), ps)

# test for vector
4 changes: 2 additions & 2 deletions test/layers/manifold_layers.jl
@@ -5,7 +5,7 @@ Random.seed!(1234)

function stiefel_layer_test(T, M, N, tol=1f-1)
model = Chain(StiefelLayer(M, N), StiefelLayer(N, N))
ps = initialparameters(T, model)
ps = initialparameters(model, T)
o = Optimizer(AdamOptimizer(T(1f0), T(5f-1), T(5f-1), T(3f-7)),ps)

dx = ((weight=rand(T,N,M),),(weight=rand(T,N,N),))
@@ -20,7 +20,7 @@ end

function grassmann_layer_test(T, M, N, tol=1f-1)
model = Chain(GrassmannLayer(M, N), StiefelLayer(N, N))
ps = initialparameters(T, model)
ps = initialparameters(model, T)
o = Optimizer(AdamOptimizer(T(1f0), T(5f-1), T(5f-1), T(3f-7)),ps)

dx = ((weight=rand(T,N,M),),(weight=rand(T,N,N),))
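These tests use a shorter two-argument form, which the diff shows also switching to model-first; presumably the backend then falls back to a CPU default (an assumption, not shown in this diff). A minimal sketch:

```julia
model = Chain(StiefelLayer(5, 10), StiefelLayer(10, 10))
ps = initialparameters(model, Float32)  # convenience form: model and element type only
```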