From 6816ad4964cdd09eacb998990f87403d3cdb6210 Mon Sep 17 00:00:00 2001 From: rabab53 Date: Thu, 13 Jun 2024 06:15:56 +0300 Subject: [PATCH 1/2] add cur variants --- src/DimensionReduction/cur.jl | 119 ++++++++++++++++++ src/DimensionReduction/dimension_reduction.jl | 3 +- 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 src/DimensionReduction/cur.jl diff --git a/src/DimensionReduction/cur.jl b/src/DimensionReduction/cur.jl new file mode 100644 index 00000000..f32ef500 --- /dev/null +++ b/src/DimensionReduction/cur.jl @@ -0,0 +1,119 @@ +""" +The CUR matrix decomposition is a dimension reduction method. It approximates a given matrix by +selecting a few of its columns (C), a few of its rows (R), and a small intersection matrix (U), +such that the product of these three matrices closely approximates the original matrix. +This technique is particularly useful for dimensionality reduction in large datasets because it +retains a subset of the original data's structure, making it interpretable and efficient for +large-scale data analysis. + +Three variants of CUR are implemented in PotentialLearning.jl: LinearTimeCUR, DEIMCUR, and LSCUR. 
+ +""" +using StatsBase + +struct CUR{T<:Real} <: DimensionReducer + rows::Vector{Int64} + cols::Vector{Int64} +end + +function CUR(rows::Vector{Int64}, cols::Vector{Int64}) + CUR(rows, cols) +end + +function LinearTimeCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} + m, n = size(A) + C = zeros(T, m, k) + R = zeros(T, k, n) + + fsq_normA = norm(A)^2 + colsq_norm = [ norm(A[:, j])^2 for j in range(1, n)] + rowsq_norm = [ norm(A[i, :])^2 for i in range(1, m)] + + col_p = [ (colsq_norm[j]/fsq_normA) for j in range(1, n)] + row_p = [ (rowsq_norm[j]/fsq_normA) for j in range(1, m)] + + #computing C and R based on uniform random sampling of rows and cols of matrix A + cols = Vector{Int64}(undef, k) + rows = Vector{Int64}(undef, k) + + for i in range(1, k) + cols[i] = sample(1:n, ProbabilityWeights(col_p)) + C[:, i] = A[:, cols[i]] ./ sqrt(k*col_p[cols[i]]) + rows[i] = sample(1:m, ProbabilityWeights(row_p)) + R[i, :] = A[rows[i], :] ./ sqrt(k*row_p[rows[i]]) + end + cur.rows = rows + cur.cols = cols + return rows, cols +end + +function DEIMCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} + + m, n = size(A) + C = zeros(T, m, k) + R = zeros(T, k, n) + u, s, vh = svd(A) + + U = u[:, 1:k] + V = vh[:, 1:k] + + + rows = Vector{Int64}(undef, k) + cols = Vector{Int64}(undef, k) + + for i in range(1, k) + rows[i] = first.(Tuple.(findall(x -> x==maximum(abs.(U[:,i])), abs.(U))))[1] + cols[i] = first.(Tuple.(findall(x -> x==maximum(abs.(V[:,i])), abs.(V))))[1] + + @time U_p = pinv(U[rows[i], 1:i])' + @time mul!(U[:, i+1:k], U[:, 1:i], U_p * U[rows[i],i+1:k]') + + @time V_p = pinv(V[cols[i], 1:i])' + @time mul!(V[:, i+1:k], V[:, 1:i], V_p * V[cols[i],i+1:k]') + end + + cur.rows = rows + cur.cols = cols + return rows, cols +end + +function LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number} + + m, n = size(A) + F = zeros(T, m, k) + + m, n = size(A) + u, s, vh = svd(A) + + + V = vh[:, 1:k] + + V = (V.^2) + + prob = [sum(V[i, :]) for i in range(1, m)] + + 
idx = Vector{Int64}(undef, k) + + for i in range(1, k) + idx[i] = sample(1:n, ProbabilityWeights(prob)) + end + + F = A[:, idx] + + return F, idx +end + +function LSCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} + + m, n = size(A) + C = zeros(T, m, k) + R = zeros(T, k, n) + + C, cols = LSCUR_ColSelect(T, A, k) + + R, rows = LSCUR_ColSelect(T, Matrix(transpose(A)), k) + + cur.rows = rows + cur.cols = cols + return rows, cols +end \ No newline at end of file diff --git a/src/DimensionReduction/dimension_reduction.jl b/src/DimensionReduction/dimension_reduction.jl index 009b6fe7..212a0d2e 100644 --- a/src/DimensionReduction/dimension_reduction.jl +++ b/src/DimensionReduction/dimension_reduction.jl @@ -1,5 +1,5 @@ abstract type DimensionReducer end -export DimensionReducer, PCA, ActiveSubspace, fit, fit_transform, select_eigendirections +export DimensionReducer, PCA, ActiveSubspace, fit, fit_transform, select_eigendirections, CUR, LinearTimeCUR, DEIMCUR, LSCUR """ fit(ds::DataSet, dr::DimensionReducer) @@ -28,6 +28,7 @@ end include("pca.jl") include("as.jl") +include("cur.jl") """ fit_transform(ds::DataSet, dr::DimensionReducer) From 4bd3dfb4b7551a5e8c6c4b61c9b3330f88d437e3 Mon Sep 17 00:00:00 2001 From: rabab53 Date: Thu, 13 Jun 2024 06:45:58 +0300 Subject: [PATCH 2/2] change dec --- src/DimensionReduction/cur.jl | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/DimensionReduction/cur.jl b/src/DimensionReduction/cur.jl index f32ef500..3577429d 100644 --- a/src/DimensionReduction/cur.jl +++ b/src/DimensionReduction/cur.jl @@ -20,7 +20,7 @@ function CUR(rows::Vector{Int64}, cols::Vector{Int64}) CUR(rows, cols) end -function LinearTimeCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} +function LinearTimeCUR(A::Matrix{T}, k::Int64) where {T<:Number} m, n = size(A) C = zeros(T, m, k) R = zeros(T, k, n) @@ -42,12 +42,11 @@ function LinearTimeCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} rows[i] = sample(1:m, 
ProbabilityWeights(row_p)) R[i, :] = A[rows[i], :] ./ sqrt(k*row_p[rows[i]]) end - cur.rows = rows - cur.cols = cols + return rows, cols end -function DEIMCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} +function DEIMCUR(A::Matrix{T}, k::Int64) where {T<:Number} m, n = size(A) C = zeros(T, m, k) @@ -72,8 +71,6 @@ function DEIMCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} @time mul!(V[:, i+1:k], V[:, 1:i], V_p * V[cols[i],i+1:k]') end - cur.rows = rows - cur.cols = cols return rows, cols end @@ -103,7 +100,7 @@ function LSCUR_ColSelect(::Type{T}, A::Matrix{T}, k::Int64) where {T<:Number} return F, idx end -function LSCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} +function LSCUR(A::Matrix{T}, k::Int64) where {T<:Number} m, n = size(A) C = zeros(T, m, k) @@ -113,7 +110,5 @@ function LSCUR(cur::CUR, A::Matrix{T}, k::Int64) where {T<:Number} R, rows = LSCUR_ColSelect(T, Matrix(transpose(A)), k) - cur.rows = rows - cur.cols = cols return rows, cols end \ No newline at end of file