Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cur variants #72

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
julia = ">= 1.10"
InteratomicPotentials = ">= 0.2.9"
julia = ">= 1.10"
113 changes: 113 additions & 0 deletions src/DimensionReduction/cur.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
The CUR matrix decomposition is a dimension reduction method. It approximates a given matrix by
selecting a few of its columns (C), a few of its rows (R), and a small intersection matrix (U),
such that the product of these three matrices closely approximates the original matrix.
This technique is particularly useful for dimensionality reduction in large datasets because it
retains a subset of the original data's structure, making it interpretable and efficient for
large-scale data analysis.

Three variants of CUR are implemented in PotentialLearning.jl: LinearTimeCUR, DEIMCUR, and LSCUR.

"""

# Holds the row and column indices selected by a CUR decomposition variant.
#
# NOTE(review): the original declared an unused type parameter `T<:Real`,
# which made `CUR(rows, cols)` unconstructible (no field determines `T`),
# and its outer constructor `CUR(rows, cols) = CUR(rows, cols)` recursed
# infinitely. The parameter and the redundant constructor are removed; the
# default constructor `CUR(rows, cols)` now works as callers expect.
struct CUR <: DimensionReducer
    rows::Vector{Int64}  # indices of the selected rows (R factor)
    cols::Vector{Int64}  # indices of the selected columns (C factor)
end

"""
    LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Select `k` row indices and `k` column indices of `A` using the linear-time
CUR sampling scheme: columns (rows) are drawn with replacement, with
probability proportional to their squared Euclidean norm relative to the
squared Frobenius norm of `A`.

Returns the tuple `(rows, cols)` of selected indices; duplicates are
possible since sampling is with replacement.
"""
function LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    m, n = size(A)

    # Squared Frobenius norm and column/row sampling probabilities.
    fsq_normA = norm(A)^2
    col_p = [norm(@view A[:, j])^2 / fsq_normA for j in 1:n]
    row_p = [norm(@view A[i, :])^2 / fsq_normA for i in 1:m]

    cols = Vector{Int64}(undef, k)
    rows = Vector{Int64}(undef, k)

    # NOTE(review): the original also assembled the rescaled C and R factor
    # matrices here, but only the index vectors are returned; that dead work
    # (two k-sized matrix allocations plus fills) is removed. The order of
    # `sample` calls is preserved so the RNG stream — and thus the selected
    # indices — are identical to the original.
    for i in 1:k
        cols[i] = sample(1:n, ProbabilityWeights(col_p))
        rows[i] = sample(1:m, ProbabilityWeights(row_p))
    end

    return rows, cols
end

"""
    DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Select `k` row indices and `k` column indices of `A` by applying the
discrete empirical interpolation method (DEIM) greedily to the leading `k`
left and right singular vectors of `A`. Returns the tuple `(rows, cols)`.
"""
function DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    u, _, v = svd(A)

    # Leading k singular vectors. Slicing copies, so U and V can be
    # deflated in place below without touching the factorization itself.
    U = u[:, 1:k]
    V = v[:, 1:k]

    rows = Vector{Int64}(undef, k)
    cols = Vector{Int64}(undef, k)

    for i in 1:k
        # Greedy DEIM pivot: position of the largest-magnitude entry of the
        # current (deflated) singular vector.
        # NOTE(review): the original searched all of abs.(U) for the column
        # maximum, which can return a row index from a *different* column
        # whenever values coincide; `argmax` over the column is the intent.
        rows[i] = argmax(abs.(@view U[:, i]))
        cols[i] = argmax(abs.(@view V[:, i]))

        if i < k
            # Deflate the remaining vectors so their entries at the chosen
            # pivot become zero, preventing re-selection of the same index.
            # NOTE(review): the original (a) left `@time` debug output in,
            # (b) wrote the product into slice copies (`U[:, i+1:k]`
            # allocates), so U and V were never actually updated, and
            # (c) overwrote instead of subtracting the projection. All
            # three are fixed via `@views` and five-argument
            # `mul!(C, A, B, -1, 1)`, i.e. `C .-= A*B`.
            U_p = pinv(U[rows[i], 1:i])'
            @views mul!(U[:, i+1:k], U[:, 1:i], U_p * U[rows[i], i+1:k]', -1, 1)

            V_p = pinv(V[cols[i], 1:i])'
            @views mul!(V[:, i+1:k], V[:, 1:i], V_p * V[cols[i], i+1:k]', -1, 1)
        end
    end

    return rows, cols
end

"""
    LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number}

Leverage-score column selection: sample `k` column indices of `A` with
probability proportional to the leverage scores computed from the leading
`k` right singular vectors. Returns `(F, idx)` where `F = A[:, idx]`.
"""
function LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number}
    n = size(A, 2)
    _, _, V = svd(A)

    # Squared entries of the leading-k right singular vectors; row j of V
    # corresponds to column j of A.
    Vsq = V[:, 1:k] .^ 2

    # Leverage score of each column of A.
    # NOTE(review): the original iterated `1:m` (row count of A) here, which
    # throws a BoundsError — or mis-weights — whenever A is not square;
    # columns are indexed `1:n`. Also removed a dead `zeros(T, m, k)`
    # preallocation of F and a duplicated `size(A)` call.
    prob = [sum(@view Vsq[j, :]) for j in 1:n]

    idx = Vector{Int64}(undef, k)
    for i in 1:k
        idx[i] = sample(1:n, ProbabilityWeights(prob))
    end

    return A[:, idx], idx
end

"""
    LSCUR(A::Matrix{T}, k::Int64) where {T<:Number}

Leverage-score CUR: select `k` columns of `A` directly and `k` rows by
applying column selection to `transpose(A)`. Returns `(rows, cols)`.
"""
function LSCUR(A::Matrix{T}, k::Int64) where {T<:Number}
    # NOTE(review): the original preallocated C and R buffers that were
    # immediately rebound by the calls below; the dead allocations are
    # removed. The selected sub-matrices are discarded since only the
    # index vectors are returned.
    _, cols = LSCUR_ColSelect(T, A, k)
    _, rows = LSCUR_ColSelect(T, Matrix(transpose(A)), k)

    return rows, cols
end
7 changes: 7 additions & 0 deletions src/DimensionReduction/dimension_reduction.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
abstract type DimensionReducer end

export DimensionReducer, PCA, PCAState, ActiveSubspace
export CUR, LinearTimeCUR, DEIMCUR, LSCUR
export fit, fit_transform, fit!, transform!, select_eigendirections

"""
fit(ds::DataSet, dr::DimensionReducer)

Expand All @@ -12,13 +15,15 @@ function compute_eigen(d::Vector{T}) where {T<:Vector{<:Real}}
Q = Symmetric(mean(di * di' for di in d))
eigen(Symmetric(Q))
end

# Select eigendirections of the covariance-like matrix of `d` whose
# residual spectral energy exceeds `tol`. Returns the full descending
# eigenvalue vector and the retained eigenvector columns.
function select_eigendirections(d::Vector{T}, tol::Float64) where {T<:Vector{<:Real}}
    vals, vecs = compute_eigen(d)
    # `eigen` returns ascending eigenvalues; flip to descending order.
    vals = reverse(vals)
    vecs = reverse(vecs; dims = 2)
    # Fraction of spectral energy remaining beyond each direction.
    residual = 1.0 .- cumsum(vals) ./ sum(vals)
    keep = residual .> tol
    return vals, vecs[:, keep]
end

function select_eigendirections(d::Vector{T}, tol::Int) where {T<:Vector{<:Real}}
λ, ϕ = compute_eigen(d)
λ, ϕ = λ[end:-1:1], ϕ[:, end:-1:1] # reorder
Expand All @@ -30,6 +35,8 @@ end
include("pca.jl")
include("pca_state.jl")
include("as.jl")
include("cur.jl")

"""
fit_transform(ds::DataSet, dr::DimensionReducer)

Expand Down
2 changes: 1 addition & 1 deletion src/PotentialLearning.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module PotentialLearning
using AtomsBase
using InteratomicPotentials
using Unitful, UnitfulAtomic
using LinearAlgebra, Statistics, Random, Distributions
using LinearAlgebra, Statistics, StatsBase, Random, Distributions
using StaticArrays
using OrderedCollections
using Flux
Expand Down
Loading