Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement entropy based feature selection #118

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ Imports:
purrr (>= 0.2.3),
rlang (>= 0.1.2),
tibble (>= 1.3.4),
tidyr (>= 0.7.1)
tidyr (>= 0.7.1),
Rcpp (>= 0.12.12),
RcppArmadillo (>= 0.8.100.1.0)
Suggests:
DBI (>= 0.7),
dbplyr (>= 1.1.0),
Expand All @@ -42,6 +44,9 @@ Suggests:
RSQLite (>= 2.0),
stringr (>= 1.2.0),
testthat (>= 1.0.2)
LinkingTo:
Rcpp (>= 0.12.12),
RcppArmadillo (>= 0.8.100.1.0)
VignetteBuilder: knitr
URL: https://github.com/cytomining/cytominer
BugReports: https://github.com/cytomining/cytominer/issues
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,23 @@ export(count_na_rows)
export(covariance)
export(drop_na_columns)
export(drop_na_rows)
export(entropy_feature_selection)
export(extract_subpopulations)
export(generalized_log)
export(generate_component_matrix)
export(normalize)
export(replicate_correlation)
export(score_features_sv_entropy)
export(singular_value_entropy)
export(sparse_random_projection)
export(svd_entropy)
export(transform)
export(variable_importance)
export(variable_select)
export(variance_threshold)
export(whiten)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp,evalCpp)
importFrom(Matrix,sparseMatrix)
importFrom(foreach,"%dopar%")
importFrom(magrittr,"%<>%")
Expand All @@ -32,3 +37,4 @@ importFrom(stats,rbinom)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(utils,find)
useDynLib(cytominer)
29 changes: 29 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' singular_value_entropy
#'
#' Calculate the entropy of a matrix singular values (SV) in the SVD decomposition, when the SVs summation is normalized to one.
#' @param A a matrix of any arbitrary size
#' @return entropy of the normalized singular values using log base 10
#'
#' @export
#'
singular_value_entropy <- function(A) {
.Call('_cytominer_singular_value_entropy', PACKAGE = 'cytominer', A)
}

#' score_features_sv_entropy
#'
#' Scores each feature based on the difference of normalized SVs entropy of the data
#' with and without the feature; the higher the difference, the more informative the
#' feature would be.
#' @param data a matrix which represents the dataset; columns and rows correspond to features and observations, respectively.
#' @return vector containing scores for all the features, in the same order as the columns of data are arranged
#'
#' @export
#'
score_features_sv_entropy <- function(data) {
.Call('_cytominer_score_features_sv_entropy', PACKAGE = 'cytominer', data)
}

4 changes: 4 additions & 0 deletions R/cytominer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#' @useDynLib cytominer
#' @importFrom Rcpp evalCpp
#' @exportPattern "^[[:alpha:]]+"
NULL
39 changes: 39 additions & 0 deletions R/entropy_feature_selection.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#' Feature selection based on redundancy
#' \code{entropy_feature_selection} is a feature selection method based on the entropy of data singular values
#'
#' @param population named tbl containing data, where columns and rows correspond to features (and/or metadata) and samples, respectively. Column names are assumed to be feature or metadata names.
#' @param variables vector containing the names of numerical variables (or features) in the population.
#' @param n_feature integer specifying number of features to be selected
#'
#' @importFrom magrittr %>%
#'
#' @return vector containing name of the features sorted based on their score, and the actual score values. Higher score means more informative feature.
#'
#' @examples
#' population <- tibble::data_frame(
#' AreaShape_MinorAxisLength = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18),
#' AreaShape_MajorAxisLength = c(35, 18, 22, 16, 9, 20, 11, 15, 18, 42),
#' AreaShape_Area = c(245, 151, 231, 179, 50, 112, 53, 73, 164, 529)
#' )
#' variables <- c("AreaShape_MinorAxisLength", "AreaShape_MajorAxisLength", "AreaShape_Area")
#' entropy_feature_selection(population, variables, 2)
#'
#' @export
entropy_feature_selection <- function(population, variables, n_feature) {

population_data <- population %>%
dplyr::select(dplyr::one_of(variables)) %>%
dplyr::collect() %>%
as.matrix()


# working with the matrix inner product; as it would be computationally more efficient for large number of samples
feat_inner_prods <- crossprod(population_data, population_data)

entropy_score <- score_features_sv_entropy(feat_inner_prods)

feat_rank <- order(entropy_score, decreasing = T)

return(list(features = colnames(population_data)[feat_rank[1:n_feature]],
entropy_score = entropy_score[feat_rank[1:n_feature]]))
}
3 changes: 3 additions & 0 deletions R/variable_select.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ variable_select <- function(population, variables, sample = NULL,
excluded <- correlation_threshold(variables, sample, ...)
} else if (operation == "drop_na_columns") {
excluded <- drop_na_columns(population, variables, ...)
} else if (operation == "entropy_based") {
included <- entropy_feature_selection(population, variables, ...)[["features"]]
excluded <- setdiff(variables, included)
} else {
error <- paste0("undefined operation `", operation, "'")

Expand Down
33 changes: 33 additions & 0 deletions man/entropy_feature_selection.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions man/score_features_sv_entropy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/singular_value_entropy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions src/Makevars
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

## optional
#CXX_STD = CXX11

PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
6 changes: 6 additions & 0 deletions src/Makevars.win
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

## optional
#CXX_STD = CXX11

PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
41 changes: 41 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <RcppArmadillo.h>
#include <Rcpp.h>

using namespace Rcpp;

// singular_value_entropy
double singular_value_entropy(arma::mat A);
RcppExport SEXP _cytominer_singular_value_entropy(SEXP ASEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
rcpp_result_gen = Rcpp::wrap(singular_value_entropy(A));
return rcpp_result_gen;
END_RCPP
}
// score_features_sv_entropy
NumericVector score_features_sv_entropy(NumericMatrix data);
RcppExport SEXP _cytominer_score_features_sv_entropy(SEXP dataSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< NumericMatrix >::type data(dataSEXP);
rcpp_result_gen = Rcpp::wrap(score_features_sv_entropy(data));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_cytominer_singular_value_entropy", (DL_FUNC) &_cytominer_singular_value_entropy, 1},
{"_cytominer_score_features_sv_entropy", (DL_FUNC) &_cytominer_score_features_sv_entropy, 1},
{NULL, NULL, 0}
};

RcppExport void R_init_cytominer(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
71 changes: 71 additions & 0 deletions src/entropy_feature_selection.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#include "RcppArmadillo.h"
// [[Rcpp::depends(RcppArmadillo)]]

#include <iostream>

using namespace Rcpp;
using namespace std;
using namespace arma;


//' singular_value_entropy
//'
//' Calculate the entropy of a matrix singular values (SV) in the SVD decomposition, when the SVs summation is normalized to one.
//' @param A a matrix of any arbitrary size
//' @return entropy of the normalized singular values using log base 10
//'
//' @export
//'
// [[Rcpp::export]]
double singular_value_entropy(arma::mat A) {

// calculate the svd
vec singular_values = svd(A);

// normalize relative values
vec singular_values_nrm = singular_values/sum(singular_values);

// calculate the entropy for values greater than 0 (to avoid log(0))
arma::vec sv_nonzero = singular_values_nrm.elem(find(singular_values_nrm > 0));
double sv_entropy = -sum(sv_nonzero % log(sv_nonzero))/log(10);

return sv_entropy;
}

//' score_features_sv_entropy
//'
//' Scores each feature based on the difference of normalized SVs entropy of the data
//' with and without the feature; the higher the difference, the more informative the
//' feature would be.
//' @param data a matrix which represents the dataset; columns and rows correspond to features and observations, respectively.
//' @return vector containing scores for all the features, in the same order as the columns of data are arranged
//'
//' @export
//'
// [[Rcpp::export]]
NumericVector score_features_sv_entropy(NumericMatrix data){

// convert into matrix (armadillo)
mat data_mat(data.begin(), data.nrow(), data.ncol(), false);

// total entropy
double sv_entropy_orig = singular_value_entropy(data_mat);

// vector of contribution to the entropy by on a leave-a-feature-out basis
NumericVector sv_entropy(data.nrow());

// for each feature calculate the contribution to the entropy by leaving that feature out
for(unsigned int i = 0; i < data.nrow(); i++){

mat data_mat_i = data_mat;

// remove the row and column i
data_mat_i.shed_row(i);
data_mat_i.shed_col(i);

double sv_entropy_i = singular_value_entropy(data_mat_i);
sv_entropy[i] = sv_entropy_orig - sv_entropy_i;
}

return(sv_entropy);
}
31 changes: 31 additions & 0 deletions tests/testthat/test-entropy_feature_selection.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
context("entropy_feature_selection")

test_that("`entropy_feature_selection` selects features based on the singular values entropy", {

set.seed(24)
xa <- rnorm(5)
xb <- rnorm(5)
ya <- rnorm(5)
yb <- rnorm(5)

data <-
rbind(
data.frame(g = "a", x = xa, y = ya, z = xa + ya),
data.frame(g = "b", x = xb, y = yb, z = xb + yb)
)

db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")

RSQLite::initExtension(db)

data <- dplyr::copy_to(db, data)

expect_equal(
sort(entropy_feature_selection(population = data, variables = c("x", "y", "z"), n_feature = 2)[["features"]],
decreasing = F),
c("x", "y")
)

DBI::dbDisconnect(db)

})