Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cur variants #72

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
julia = ">= 1.10"
InteratomicPotentials = ">= 0.2.9"
julia = ">= 1.10"
113 changes: 113 additions & 0 deletions src/DimensionReduction/cur.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
The CUR matrix decomposition is a dimension reduction method. It approximates a given matrix by
selecting a few of its columns (C), a few of its rows (R), and a small intersection matrix (U),
such that the product of these three matrices closely approximates the original matrix.
This technique is particularly useful for dimensionality reduction in large datasets because it
retains a subset of the original data's structure, making it interpretable and efficient for
large-scale data analysis.

Three variants of CUR are implemented in PotentialLearning.jl: LinearTimeCUR, DEIMCUR, and LSCUR.

"""

# Holds the row and column indices selected by a CUR decomposition variant.
#
# NOTE(review): the original declared an unused type parameter `T<:Real`,
# which made `CUR(rows, cols)` unconstructible (no field determines `T`),
# and its outer constructor `CUR(rows, cols) = CUR(rows, cols)` recursed
# infinitely. The parameter and the redundant constructor are removed; the
# default constructor `CUR(rows, cols)` now works as callers expect.
struct CUR <: DimensionReducer
    rows::Vector{Int64}  # indices of the selected rows (R factor)
    cols::Vector{Int64}  # indices of the selected columns (C factor)
end

"""
    LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Select `k` row indices and `k` column indices of `A` using the linear-time
CUR sampling scheme: columns (rows) are drawn with replacement, with
probability proportional to their squared Euclidean norm relative to the
squared Frobenius norm of `A`.

Returns the tuple `(rows, cols)` of selected indices; duplicates are
possible since sampling is with replacement.
"""
function LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    m, n = size(A)

    # Squared Frobenius norm and column/row sampling probabilities.
    fsq_normA = norm(A)^2
    col_p = [norm(@view A[:, j])^2 / fsq_normA for j in 1:n]
    row_p = [norm(@view A[i, :])^2 / fsq_normA for i in 1:m]

    cols = Vector{Int64}(undef, k)
    rows = Vector{Int64}(undef, k)

    # NOTE(review): the original also assembled the rescaled C and R factor
    # matrices here, but only the index vectors are returned; that dead work
    # (two k-sized matrix allocations plus fills) is removed. The order of
    # `sample` calls is preserved so the RNG stream — and thus the selected
    # indices — are identical to the original.
    for i in 1:k
        cols[i] = sample(1:n, ProbabilityWeights(col_p))
        rows[i] = sample(1:m, ProbabilityWeights(row_p))
    end

    return rows, cols
end

"""
    DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Select `k` row indices and `k` column indices of `A` by applying the
discrete empirical interpolation method (DEIM) greedily to the leading `k`
left and right singular vectors of `A`. Returns the tuple `(rows, cols)`.
"""
function DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    u, _, v = svd(A)

    # Leading k singular vectors. Slicing copies, so U and V can be
    # deflated in place below without touching the factorization itself.
    U = u[:, 1:k]
    V = v[:, 1:k]

    rows = Vector{Int64}(undef, k)
    cols = Vector{Int64}(undef, k)

    for i in 1:k
        # Greedy DEIM pivot: position of the largest-magnitude entry of the
        # current (deflated) singular vector.
        # NOTE(review): the original searched all of abs.(U) for the column
        # maximum, which can return a row index from a *different* column
        # whenever values coincide; `argmax` over the column is the intent.
        rows[i] = argmax(abs.(@view U[:, i]))
        cols[i] = argmax(abs.(@view V[:, i]))

        if i < k
            # Deflate the remaining vectors so their entries at the chosen
            # pivot become zero, preventing re-selection of the same index.
            # NOTE(review): the original (a) left `@time` debug output in,
            # (b) wrote the product into slice copies (`U[:, i+1:k]`
            # allocates), so U and V were never actually updated, and
            # (c) overwrote instead of subtracting the projection. All
            # three are fixed via `@views` and five-argument
            # `mul!(C, A, B, -1, 1)`, i.e. `C .-= A*B`.
            U_p = pinv(U[rows[i], 1:i])'
            @views mul!(U[:, i+1:k], U[:, 1:i], U_p * U[rows[i], i+1:k]', -1, 1)

            V_p = pinv(V[cols[i], 1:i])'
            @views mul!(V[:, i+1:k], V[:, 1:i], V_p * V[cols[i], i+1:k]', -1, 1)
        end
    end

    return rows, cols
end

"""
    LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number}

Leverage-score column selection: sample `k` column indices of `A` with
probability proportional to the leverage scores computed from the leading
`k` right singular vectors. Returns `(F, idx)` where `F = A[:, idx]`.
"""
function LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number}
    n = size(A, 2)
    _, _, V = svd(A)

    # Squared entries of the leading-k right singular vectors; row j of V
    # corresponds to column j of A.
    Vsq = V[:, 1:k] .^ 2

    # Leverage score of each column of A.
    # NOTE(review): the original iterated `1:m` (row count of A) here, which
    # throws a BoundsError — or mis-weights — whenever A is not square;
    # columns are indexed `1:n`. Also removed a dead `zeros(T, m, k)`
    # preallocation of F and a duplicated `size(A)` call.
    prob = [sum(@view Vsq[j, :]) for j in 1:n]

    idx = Vector{Int64}(undef, k)
    for i in 1:k
        idx[i] = sample(1:n, ProbabilityWeights(prob))
    end

    return A[:, idx], idx
end

"""
    LSCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Leverage-score CUR: select `k` columns of `A` directly and `k` rows by
applying column selection to `transpose(A)`. Returns `(rows, cols)`.
"""
function LSCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    # NOTE(review): the original preallocated C and R buffers that were
    # immediately rebound by the calls below; the dead allocations are
    # removed. The selected sub-matrices are discarded since only the
    # index vectors are returned.
    _, cols = LSCUR_ColSelect(T, A, k)
    _, rows = LSCUR_ColSelect(T, Matrix(transpose(A)), k)

    return rows, cols
end
7 changes: 7 additions & 0 deletions src/DimensionReduction/dimension_reduction.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
abstract type DimensionReducer end

export DimensionReducer, PCA, PCAState, ActiveSubspace
export CUR, LinearTimeCUR, DEIMCUR, LSCUR
export fit, fit_transform, fit!, transform!, select_eigendirections

"""
fit(ds::DataSet, dr::DimensionReducer)

Expand All @@ -12,13 +15,15 @@ function compute_eigen(d::Vector{T}) where {T<:Vector{<:Real}}
Q = Symmetric(mean(di * di' for di in d))
eigen(Symmetric(Q))
end

# Select eigendirections of the covariance-like matrix of `d` whose
# residual spectral energy exceeds `tol`. Returns the full descending
# eigenvalue vector and the retained eigenvector columns.
function select_eigendirections(d::Vector{T}, tol::Float64) where {T<:Vector{<:Real}}
    vals, vecs = compute_eigen(d)
    # `eigen` returns ascending eigenvalues; flip to descending order.
    vals = reverse(vals)
    vecs = reverse(vecs; dims = 2)
    # Fraction of spectral energy remaining beyond each direction.
    residual = 1.0 .- cumsum(vals) ./ sum(vals)
    keep = residual .> tol
    return vals, vecs[:, keep]
end

function select_eigendirections(d::Vector{T}, tol::Int) where {T<:Vector{<:Real}}
λ, ϕ = compute_eigen(d)
λ, ϕ = λ[end:-1:1], ϕ[:, end:-1:1] # reorder
Expand All @@ -30,6 +35,8 @@ end
include("pca.jl")
include("pca_state.jl")
include("as.jl")
include("cur.jl")

"""
fit_transform(ds::DataSet, dr::DimensionReducer)

Expand Down
2 changes: 1 addition & 1 deletion src/PotentialLearning.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module PotentialLearning
using AtomsBase
using InteratomicPotentials
using Unitful, UnitfulAtomic
using LinearAlgebra, Statistics, Random, Distributions
using LinearAlgebra, Statistics, StatsBase, Random, Distributions
using StaticArrays
using OrderedCollections
using Flux
Expand Down
Loading