diff --git a/DESCRIPTION b/DESCRIPTION index d1440cf..c48e1f5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,9 @@ Imports: purrr (>= 0.2.3), rlang (>= 0.1.2), tibble (>= 1.3.4), - tidyr (>= 0.7.1) + tidyr (>= 0.7.1), + Rcpp (>= 0.12.12), + RcppArmadillo (>= 0.8.100.1.0) Suggests: DBI (>= 0.7), dbplyr (>= 1.1.0), @@ -42,6 +44,9 @@ Suggests: RSQLite (>= 2.0), stringr (>= 1.2.0), testthat (>= 1.0.2) +LinkingTo: + Rcpp (>= 0.12.12), + RcppArmadillo (>= 0.8.100.1.0) VignetteBuilder: knitr URL: https://github.com/cytomining/cytominer BugReports: https://github.com/cytomining/cytominer/issues diff --git a/NAMESPACE b/NAMESPACE index 9c3a06c..73a49c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,11 +6,14 @@ export(count_na_rows) export(covariance) export(drop_na_columns) export(drop_na_rows) +export(entropy_feature_selection) export(extract_subpopulations) export(generalized_log) export(generate_component_matrix) export(normalize) export(replicate_correlation) +export(score_features_sv_entropy) +export(singular_value_entropy) export(sparse_random_projection) export(svd_entropy) export(transform) @@ -18,6 +21,8 @@ export(variable_importance) export(variable_select) export(variance_threshold) export(whiten) +exportPattern("^[[:alpha:]]+") +importFrom(Rcpp,evalCpp) importFrom(Matrix,sparseMatrix) importFrom(foreach,"%dopar%") importFrom(magrittr,"%<>%") @@ -32,3 +37,4 @@ importFrom(stats,rbinom) importFrom(stats,sd) importFrom(stats,setNames) importFrom(utils,find) +useDynLib(cytominer) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..77f6a61 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,29 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' singular_value_entropy +#' +#' Calculate the entropy of a matrix singular values (SV) in the SVD decomposition, when the SVs summation is normalized to one. +#' @param A a matrix of any arbitrary size +#' @return entropy of the normalized singular values using log base 10 +#' +#' @export +#' +singular_value_entropy <- function(A) { + .Call('_cytominer_singular_value_entropy', PACKAGE = 'cytominer', A) +} + +#' score_features_sv_entropy +#' +#' Scores each feature based on the difference of normalized SVs entropy of the data +#' with and without the feature; the higher the difference, the more informative the +#' feature would be. +#' @param data a matrix which represents the dataset; columns and rows correspond to features and observations, respectively. +#' @return vector containing scores for all the features, in the same order as the columns of data are arranged +#' +#' @export +#' +score_features_sv_entropy <- function(data) { + .Call('_cytominer_score_features_sv_entropy', PACKAGE = 'cytominer', data) +} + diff --git a/R/cytominer.R b/R/cytominer.R new file mode 100644 index 0000000..61c4222 --- /dev/null +++ b/R/cytominer.R @@ -0,0 +1,4 @@ +#' @useDynLib cytominer +#' @importFrom Rcpp evalCpp +#' @exportPattern "^[[:alpha:]]+" +NULL diff --git a/R/entropy_feature_selection.R b/R/entropy_feature_selection.R new file mode 100644 index 0000000..6c258d1 --- /dev/null +++ b/R/entropy_feature_selection.R @@ -0,0 +1,39 @@ +#' Feature selection based on redundancy +#' \code{entropy_feature_selection} is a feature selection method based on the entropy of data singular values +#' +#' @param population named tbl containing data, where columns and rows correspond to features (and/or metadata) and samples, respectively. Column names are assumed to be feature or metadata names. +#' @param variables vector containing the names of numerical variables (or features) in the population. +#' @param n_feature integer specifying number of features to be selected +#' +#' @importFrom magrittr %>% +#' +#' @return vector containing name of the features sorted based on their score, and the actual score values. Higher score means more informative feature. +#' +#' @examples +#' population <- tibble::data_frame( +#' AreaShape_MinorAxisLength = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), +#' AreaShape_MajorAxisLength = c(35, 18, 22, 16, 9, 20, 11, 15, 18, 42), +#' AreaShape_Area = c(245, 151, 231, 179, 50, 112, 53, 73, 164, 529) +#' ) +#' variables <- c("AreaShape_MinorAxisLength", "AreaShape_MajorAxisLength", "AreaShape_Area") +#' entropy_feature_selection(population, variables, 2) +#' +#' @export +entropy_feature_selection <- function(population, variables, n_feature) { + + population_data <- population %>% + dplyr::select(dplyr::one_of(variables)) %>% + dplyr::collect() %>% + as.matrix() + + + # working with the matrix inner product; as it would be computationally more efficient for large number of samples + feat_inner_prods <- crossprod(population_data, population_data) + + entropy_score <- score_features_sv_entropy(feat_inner_prods) + + feat_rank <- order(entropy_score, decreasing = T) + + return(list(features = colnames(population_data)[feat_rank[1:n_feature]], + entropy_score = entropy_score[feat_rank[1:n_feature]])) +} diff --git a/R/variable_select.R b/R/variable_select.R index 671acbe..567ed84 100644 --- a/R/variable_select.R +++ b/R/variable_select.R @@ -50,6 +50,9 @@ variable_select <- function(population, variables, sample = NULL, excluded <- correlation_threshold(variables, sample, ...) } else if (operation == "drop_na_columns") { excluded <- drop_na_columns(population, variables, ...) + } else if (operation == "entropy_based") { + included <- entropy_feature_selection(population, variables, ...)[["features"]] + excluded <- setdiff(variables, included) } else { error <- paste0("undefined operation `", operation, "'") diff --git a/man/entropy_feature_selection.Rd b/man/entropy_feature_selection.Rd new file mode 100644 index 0000000..ec5bc4c --- /dev/null +++ b/man/entropy_feature_selection.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/entropy_feature_selection.R +\name{entropy_feature_selection} +\alias{entropy_feature_selection} +\title{Feature selection based on redundancy +\code{entropy_feature_selection} is a feature selection method based on the entropy of data singular values} +\usage{ +entropy_feature_selection(population, variables, n_feature) +} +\arguments{ +\item{population}{named tbl containing data, where columns and rows correspond to features (and/or metadata) and samples, respectively. Column names are assumed to be feature or metadata names.} + +\item{variables}{vector containing the names of numerical variables (or features) in the population.} + +\item{n_feature}{integer specifying number of features to be selected} +} +\value{ +vector containing name of the features sorted based on their score, and the actual score values. Higher score means more informative feature. +} +\description{ +Feature selection based on redundancy +\code{entropy_feature_selection} is a feature selection method based on the entropy of data singular values +} +\examples{ +population <- tibble::data_frame( + AreaShape_MinorAxisLength = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), + AreaShape_MajorAxisLength = c(35, 18, 22, 16, 9, 20, 11, 15, 18, 42), + AreaShape_Area = c(245, 151, 231, 179, 50, 112, 53, 73, 164, 529) + ) +variables <- c("AreaShape_MinorAxisLength", "AreaShape_MajorAxisLength", "AreaShape_Area") +entropy_feature_selection(population, variables, 2) + +} diff --git a/man/score_features_sv_entropy.Rd b/man/score_features_sv_entropy.Rd new file mode 100644 index 0000000..797694b --- /dev/null +++ b/man/score_features_sv_entropy.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{score_features_sv_entropy} +\alias{score_features_sv_entropy} +\title{score_features_sv_entropy} +\usage{ +score_features_sv_entropy(data) +} +\arguments{ +\item{data}{a matrix which represents the dataset; columns and rows correspond to features and observations, respectively.} +} +\value{ +vector containing scores for all the features, in the same order as the columns of data are arranged +} +\description{ +Scores each feature based on the difference of normalized SVs entropy of the data +with and without the feature; the higher the difference, the more informative the +feature would be. +} diff --git a/man/singular_value_entropy.Rd b/man/singular_value_entropy.Rd new file mode 100644 index 0000000..48b85c7 --- /dev/null +++ b/man/singular_value_entropy.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{singular_value_entropy} +\alias{singular_value_entropy} +\title{singular_value_entropy} +\usage{ +singular_value_entropy(A) +} +\arguments{ +\item{A}{a matrix of any arbitrary size} +} +\value{ +entropy of the normalized singular values using log base 10 +} +\description{ +Calculate the entropy of a matrix singular values (SV) in the SVD decomposition, when the SVs summation is normalized to one. +} diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..d26f652 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,6 @@ + +## optional +#CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win new file mode 100644 index 0000000..d26f652 --- /dev/null +++ b/src/Makevars.win @@ -0,0 +1,6 @@ + +## optional +#CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..414419c --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,41 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include +#include + +using namespace Rcpp; + +// singular_value_entropy +double singular_value_entropy(arma::mat A); +RcppExport SEXP _cytominer_singular_value_entropy(SEXP ASEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); + rcpp_result_gen = Rcpp::wrap(singular_value_entropy(A)); + return rcpp_result_gen; +END_RCPP +} +// score_features_sv_entropy +NumericVector score_features_sv_entropy(NumericMatrix data); +RcppExport SEXP _cytominer_score_features_sv_entropy(SEXP dataSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericMatrix >::type data(dataSEXP); + rcpp_result_gen = Rcpp::wrap(score_features_sv_entropy(data)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_cytominer_singular_value_entropy", (DL_FUNC) &_cytominer_singular_value_entropy, 1}, + {"_cytominer_score_features_sv_entropy", (DL_FUNC) &_cytominer_score_features_sv_entropy, 1}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_cytominer(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/entropy_feature_selection.cpp b/src/entropy_feature_selection.cpp new file mode 100644 index 0000000..c65e11b --- /dev/null +++ b/src/entropy_feature_selection.cpp @@ -0,0 +1,71 @@ +#include "RcppArmadillo.h" +// [[Rcpp::depends(RcppArmadillo)]] + +#include + +using namespace Rcpp; +using namespace std; +using namespace arma; + + +//' singular_value_entropy +//' +//' Calculate the entropy of a matrix singular values (SV) in the SVD decomposition, when the SVs summation is normalized to one. +//' @param A a matrix of any arbitrary size +//' @return entropy of the normalized singular values using log base 10 +//' +//' @export +//' +// [[Rcpp::export]] +double singular_value_entropy(arma::mat A) { + + // calculate the svd + vec singular_values = svd(A); + + // normalize relative values + vec singular_values_nrm = singular_values/sum(singular_values); + + // calculate the entropy for values greater than 0 (to avoid log(0)) + arma::vec sv_nonzero = singular_values_nrm.elem(find(singular_values_nrm > 0)); + double sv_entropy = -sum(sv_nonzero % log(sv_nonzero))/log(10); + + return sv_entropy; +} + +//' score_features_sv_entropy +//' +//' Scores each feature based on the difference of normalized SVs entropy of the data +//' with and without the feature; the higher the difference, the more informative the +//' feature would be. +//' @param data a matrix which represents the dataset; columns and rows correspond to features and observations, respectively. +//' @return vector containing scores for all the features, in the same order as the columns of data are arranged +//' +//' @export +//' +// [[Rcpp::export]] +NumericVector score_features_sv_entropy(NumericMatrix data){ + + // convert into matrix (armadillo) + mat data_mat(data.begin(), data.nrow(), data.ncol(), false); + + // total entropy + double sv_entropy_orig = singular_value_entropy(data_mat); + + // vector of contribution to the entropy by on a leave-a-feature-out basis + NumericVector sv_entropy(data.nrow()); + + // for each feature calculate the contribution to the entropy by leaving that feature out + for(unsigned int i = 0; i < data.nrow(); i++){ + + mat data_mat_i = data_mat; + + // remove the row and column i + data_mat_i.shed_row(i); + data_mat_i.shed_col(i); + + double sv_entropy_i = singular_value_entropy(data_mat_i); + sv_entropy[i] = sv_entropy_orig - sv_entropy_i; + } + + return(sv_entropy); +} diff --git a/tests/testthat/test-entropy_feature_selection.R b/tests/testthat/test-entropy_feature_selection.R new file mode 100644 index 0000000..ec5debe --- /dev/null +++ b/tests/testthat/test-entropy_feature_selection.R @@ -0,0 +1,31 @@ +context("entropy_feature_selection") + +test_that("`entropy_feature_selection` selects features based on the singular values entropy", { + + set.seed(24) + xa <- rnorm(5) + xb <- rnorm(5) + ya <- rnorm(5) + yb <- rnorm(5) + + data <- + rbind( + data.frame(g = "a", x = xa, y = ya, z = xa + ya), + data.frame(g = "b", x = xb, y = yb, z = xb + yb) + ) + + db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") + + RSQLite::initExtension(db) + + data <- dplyr::copy_to(db, data) + + expect_equal( + sort(entropy_feature_selection(population = data, variables = c("x", "y", "z"), n_feature = 2)[["features"]], + decreasing = F), + c("x", "y") + ) + + DBI::dbDisconnect(db) + +})