diff --git a/Project.toml b/Project.toml
index c2fe1a8e..6f32db60 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,11 +20,12 @@ ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-julia = ">= 1.10"
 InteratomicPotentials = ">= 0.2.9"
+julia = ">= 1.10"
diff --git a/src/DimensionReduction/cur.jl b/src/DimensionReduction/cur.jl
new file mode 100644
index 00000000..25ca492b
--- /dev/null
+++ b/src/DimensionReduction/cur.jl
@@ -0,0 +1,113 @@
+"""
+The CUR matrix decomposition is a dimension reduction method. It approximates a given matrix by
+selecting a few of its columns (C), a few of its rows (R), and a small intersection matrix (U),
+such that the product of these three matrices closely approximates the original matrix.
+This technique is particularly useful for dimensionality reduction in large datasets because it
+retains a subset of the original data's structure, making it interpretable and efficient for
+large-scale data analysis.
+
+Three variants of CUR are implemented in PotentialLearning.jl: LinearTimeCUR, DEIMCUR, and LSCUR.
+
+"""
+
+struct CUR{T<:Real} <: DimensionReducer
+    rows::Vector{Int64}
+    cols::Vector{Int64}
+end
+
+# Convenience constructor holding the selected row/column index sets.
+function CUR(rows::Vector{Int64}, cols::Vector{Int64})
+    CUR{Int64}(rows, cols) # construct the parametric type explicitly; `CUR(rows, cols)` here would recurse into this method forever
+end
+
+# Linear-time CUR: sample k rows and k columns with probability proportional
+# to their squared norms (Frieze/Kannan/Vempala-style length-squared sampling).
+function LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number}
+    m, n = size(A)
+    C = zeros(T, m, k)
+    R = zeros(T, k, n)
+
+    fsq_normA = norm(A)^2
+    colsq_norm = [ norm(A[:, j])^2 for j in range(1, n)]
+    rowsq_norm = [ norm(A[i, :])^2 for i in range(1, m)]
+
+    col_p = [ (colsq_norm[j]/fsq_normA) for j in range(1, n)]
+    row_p = [ (rowsq_norm[i]/fsq_normA) for i in range(1, m)]
+
+    # sample k cols and k rows of A weighted by their squared-norm probabilities
+    cols = Vector{Int64}(undef, k)
+    rows = Vector{Int64}(undef, k)
+
+    for i in range(1, k)
+        cols[i] = sample(1:n, ProbabilityWeights(col_p))
+        C[:, i] = A[:, cols[i]] ./ sqrt(k*col_p[cols[i]])
+        rows[i] = sample(1:m, ProbabilityWeights(row_p))
+        R[i, :] = A[rows[i], :] ./ sqrt(k*row_p[rows[i]])
+    end
+
+    return rows, cols
+end
+
+# DEIM-induced CUR: pivot on the largest-magnitude entry of each singular
+# vector, deflating the remaining vectors after every selection.
+function DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number}
+
+    m, n = size(A)
+    C = zeros(T, m, k)
+    R = zeros(T, k, n)
+    u, s, vh = svd(A)
+
+    U = u[:, 1:k]
+    V = vh[:, 1:k]
+
+    rows = Vector{Int64}(undef, k)
+    cols = Vector{Int64}(undef, k)
+
+    for i in range(1, k)
+        # pivot = index of the largest-magnitude entry of the i-th singular vector
+        rows[i] = argmax(abs.(U[:, i]))
+        cols[i] = argmax(abs.(V[:, i]))
+
+        U_p = pinv(U[rows[i], 1:i])'
+        # NOTE(review): write through a view — the original mul! wrote into a
+        # slice copy, so the deflation had no effect; confirm against DEIM-CUR reference
+        mul!(view(U, :, i+1:k), U[:, 1:i], U_p * U[rows[i],i+1:k]')
+
+        V_p = pinv(V[cols[i], 1:i])'
+        mul!(view(V, :, i+1:k), V[:, 1:i], V_p * V[cols[i],i+1:k]')
+    end
+
+    return rows, cols
+end
+
+# Leverage-score column selection: sample k columns with probability given by
+# the row norms of the top-k right singular vectors of A.
+function LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number}
+
+    m, n = size(A)
+    F = zeros(T, m, k)
+
+    u, s, vh = svd(A)
+
+    V = vh[:, 1:k]
+
+    V = (V.^2)
+
+    prob = [sum(V[i, :]) for i in range(1, n)] # V has n rows; iterating 1:m was out of bounds for m > n and mis-sized the weights for m < n
+
+    idx = Vector{Int64}(undef, k)
+
+    for i in range(1, k)
+        idx[i] = sample(1:n, ProbabilityWeights(prob))
+
+    end
+
+    F = A[:, idx]
+
+    return F, idx
+end
+
+# LSCUR: leverage-score CUR. Column selection on A gives the column indices;
+# column selection on transpose(A) gives the row indices.
+function LSCUR(A::Matrix{T}, k::Int64) where {T<:Number}
+
+    # NOTE(review): the original preallocated C = zeros(T, m, k) and
+    # R = zeros(T, k, n) here, but both were immediately rebound below (and
+    # R's preallocated shape did not match the returned factor) — removed.
+
+    C, cols = LSCUR_ColSelect(T, A, k)
+
+    R, rows = LSCUR_ColSelect(T, Matrix(transpose(A)), k)
+
+    return rows, cols
+end
diff --git a/src/DimensionReduction/dimension_reduction.jl b/src/DimensionReduction/dimension_reduction.jl
index 9137ccaa..7b56131f 100644
--- a/src/DimensionReduction/dimension_reduction.jl
+++ b/src/DimensionReduction/dimension_reduction.jl
@@ -1,6 +1,9 @@
 abstract type DimensionReducer end
+
 export DimensionReducer, PCA, PCAState, ActiveSubspace
+export CUR, LinearTimeCUR, DEIMCUR, LSCUR
 export fit, fit_transform, fit!, transform!, select_eigendirections
+
 """
     fit(ds::DataSet, dr::DimensionReducer)
 
@@ -12,6 +15,7 @@ function compute_eigen(d::Vector{T}) where {T<:Vector{<:Real}}
     Q = Symmetric(mean(di * di' for di in d))
     eigen(Symmetric(Q))
 end
+
 function select_eigendirections(d::Vector{T}, tol::Float64) where {T<:Vector{<:Real}}
     λ, ϕ = compute_eigen(d)
     λ, ϕ = λ[end:-1:1], ϕ[:, end:-1:1] # reorder
@@ -19,6 +23,7 @@ function select_eigendirections(d::Vector{T}, tol::Float64) where {T<:Vector{<:R
     W = ϕ[:, Σ.>tol]
     λ, W
 end
+
 function select_eigendirections(d::Vector{T}, tol::Int) where {T<:Vector{<:Real}}
     λ, ϕ = compute_eigen(d)
     λ, ϕ = λ[end:-1:1], ϕ[:, end:-1:1] # reorder
@@ -30,6 +35,8 @@ end
 include("pca.jl")
 include("pca_state.jl")
 include("as.jl")
+include("cur.jl")
+
 """
     fit_transform(ds::DataSet, dr::DimensionReducer)
diff --git a/src/PotentialLearning.jl b/src/PotentialLearning.jl
index 3c2fd79e..25eb0027 100644
--- a/src/PotentialLearning.jl
+++ b/src/PotentialLearning.jl
@@ -3,7 +3,7 @@ module PotentialLearning
 using AtomsBase
 using InteratomicPotentials
 using Unitful, UnitfulAtomic
-using LinearAlgebra, Statistics, Random, Distributions
+using LinearAlgebra, Statistics, StatsBase, Random, Distributions
 using StaticArrays
 using OrderedCollections
 using Flux