diff --git a/NAMESPACE b/NAMESPACE index 04894e3..064ca89 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(SetNaTo) export(SplitColType) export(create_report) export(drop_columns) +export(dummify) export(group_category) export(plot_bar) export(plot_boxplot) @@ -28,6 +29,7 @@ export(split_columns) import(data.table) import(ggplot2) import(gridExtra) +import(reshape2) importFrom(networkD3,diagonalNetwork) importFrom(networkD3,radialNetwork) importFrom(rmarkdown,render) diff --git a/NEWS.md b/NEWS.md index bb36c04..c26cf6d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,14 @@ # Changelog ### DataExplorer 0.5.0.9000 +#### New Features +* [#54](https://github.com/boxuancui/DataExplorer/issues/54): Extracted `dummify` from `plot_correlation` as a new function. + #### Enhancements * `plot_missing`: * Percentage text labels from output plot now has 2 decimals to prevent small percentages from being truncated to 0%. * Added example to quickly drop columns with too many missing values. +* Added `.ignoreCat` to helper. --- @@ -54,7 +58,7 @@ * [#37](https://github.com/boxuancui/DataExplorer/issues/37): Changed all `cat()` to `message()`. * [#38](https://github.com/boxuancui/DataExplorer/issues/38): Added option to order bars in `BarDiscrete`. * [#39](https://github.com/boxuancui/DataExplorer/issues/39): Extended `SetNaTo` to discrete features. -* Added more examples in README file. +* Added more examples to **README.md**. --- @@ -64,11 +68,11 @@ * [#29](https://github.com/boxuancui/DataExplorer/issues/29): Added `DropVar` to quickly drop variables by either name or column position. #### Bug Fixes -* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of contrasts from `model.matrix`. +* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of full rank matrix from `model.matrix`. #### Enhancements * [#11](https://github.com/boxuancui/DataExplorer/issues/11): Functions with return values will now match the input class and set it back. -* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for **num_all_missing** in `SplitColType`. +* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for `num_all_missing` in `SplitColType`. * [#23](https://github.com/boxuancui/DataExplorer/issues/23): Added additional measures (in addition to frequency) to `CollapseCategory`. * [#26](https://github.com/boxuancui/DataExplorer/issues/26): Removed density estimation section from report template. * [#31](https://github.com/boxuancui/DataExplorer/issues/31): Added flexibility to name the new category in `CollapseCategory`. @@ -121,5 +125,5 @@ * Features with all missing values will be ignored. * Switched position between continuous and discrete features in report template. * Renamed package name to **DataExplorer**. -* Added `NEWS.md`. +* Added **NEWS.md**. * Removed `BoxplotContinuous`. diff --git a/R/drop_columns.r b/R/drop_columns.r index b72bd92..84ad679 100644 --- a/R/drop_columns.r +++ b/R/drop_columns.r @@ -33,7 +33,7 @@ drop_columns <- function(data, ind) { if (!is.data.table(data)) stop("Please change your input data class to data.table!") - data[, (ind) := NULL] + data[, (ind) := NULL][] } DropVar <- function(data, ind) { diff --git a/R/dummify.r b/R/dummify.r new file mode 100644 index 0000000..6fb854f --- /dev/null +++ b/R/dummify.r @@ -0,0 +1,74 @@ +#' Dummify discrete features to binary columns +#' +#' Data dummification is also known as one hot encoding or feature binarization. It turns each category to a distinct column with binary (numeric) values. +#' @param data input data, in either \link{data.frame} or \link{data.table} format. +#' @param maxcat maximum categories allowed for each discrete feature. The default is 50. +#' @keywords dummify +#' @note This is different from \link{model.matrix}, where the latter aims to create a full rank matrix for regression-like use cases. If your intention is to create a design matrix, use \link{model.matrix} instead. +#' @return dummified dataset (discrete features only) preserving original features. However, column order might be different. +#' @import data.table +#' @import reshape2 +#' @export +#' @examples +#' ## Dummify iris dataset +#' str(dummify(iris)) +#' +#' ## Dummify diamonds dataset ignoring features with more than 5 categories +#' data("diamonds", package = "ggplot2") +#' str(dummify(diamonds, maxcat = 5)) + +dummify <- function(data, maxcat = 50L) { + ## Declare variable first to pass R CMD check + discrete_id <- NULL + ## Check if input is data.table + is_data_table <- is.data.table(data) + ## Detect input data class + data_class <- class(data) + ## Set data to data.table + if (!is.data.table(data)) {data <- data.table(data)} + ## Split data + split_data <- split_columns(data) + continuous <- split_data$continuous + ## Scan feature type + if (split_data$num_discrete > 0) { + discrete <- split_data$discrete + ## Get number of categories for each feature + ind <- .ignoreCat(discrete, maxcat) + n_true_discrete <- split_data$num_discrete - length(ind) + if (all(split_data$num_discrete, length(ind), !n_true_discrete)) { + warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!") + final_data <- data + } else { + if (n_true_discrete > 0) { + if (length(ind) > 0) { + message(length(ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ind), ": ", as.numeric(ind), " categories\n")) + } + ## Calculate categorical correlation and melt into tidy data format + discrete[, discrete_id := .I] + discrete_pivot <- Reduce( + function(x, y) {merge(x, y, by = "discrete_id")}, + c( + list(discrete[, c("discrete_id", names(ind)), with = FALSE]), + lapply(names(discrete)[!(names(discrete) %in% c("discrete_id", names(ind)))], function(x) { + dcast.data.table(discrete, discrete_id ~ make.names(paste0(x, "_", get(x))), length, value.var = "discrete_id") + }) + ) + ) + drop_columns(discrete_pivot, "discrete_id") + if (split_data$num_continuous == 0) { + final_data <- discrete_pivot + } else { + final_data <- cbind(continuous, discrete_pivot) + } + } + } + } else { + warning("No discrete features found! Nothing is dummified!") + final_data <- continuous + } + + ## Set data class back to original + if (!is_data_table) {class(final_data) <- data_class} + ## Set return object + return(final_data) +} diff --git a/R/helper.r b/R/helper.r new file mode 100644 index 0000000..d3608b8 --- /dev/null +++ b/R/helper.r @@ -0,0 +1,12 @@ +#' Truncate category +#' +#' Output index and name for features that will be ignored +#' @param data input data object. +#' @param maxcat maximum categories allowed for each discrete feature. +#' @return a named vector containing indices of features to be ignored. +#' @import data.table +.ignoreCat <- function(dt, maxcat) { + if (!is.data.table(dt)) {dt <- data.table(dt)} + n_cat <- sapply(dt, function(x) {length(unique(x))}) + n_cat[which(n_cat > maxcat)] +} diff --git a/R/plot_correlation.r b/R/plot_correlation.r index 3d2437d..7461a8c 100644 --- a/R/plot_correlation.r +++ b/R/plot_correlation.r @@ -17,16 +17,15 @@ #' # load diamonds dataset from ggplot2 #' data("diamonds", package = "ggplot2") #' -#' # Plot correlation heatmap with all columns +#' # Plot correlation heatmap #' plot_correlation(diamonds) -#' # Plot correlation heatmap with discrete features only -#' plot_correlation(diamonds, type = "d") -#' # Plot correlation heatmap with continuous features only +#' plot_correlation(diamonds, maxcat = 5) #' plot_correlation(diamonds, type = "c") +#' plot_correlation(diamonds, type = "d") plot_correlation <- function(data, type = c("all", "discrete", "continuous"), maxcat = 20, title = NULL, ...) { ## Declare variable first to pass R CMD check - Var1 <- Var2 <- value <- discrete_id <- NULL + Var1 <- Var2 <- value <- NULL ## Set data to data.table if (!is.data.table(data)) {data <- data.table(data)} ## Split data @@ -39,43 +38,27 @@ plot_correlation <- function(data, type = c("all", "discrete", "continuous"), ma } if (col_type %in% c("all", "discrete")) { if ((col_type == "discrete") & (split_data$num_discrete == 0)) stop("No discrete features found!") - discrete <- split_data$discrete - ## Get number of categories for each feature - n_cat <- sapply(discrete, function(x) {length(unique(x))}) - ign_ind <- which(n_cat > maxcat) - n_true_discrete <- split_data$num_discrete - length(ign_ind) - if (all(split_data$num_discrete, length(ign_ind), !n_true_discrete)) warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!") - if (n_true_discrete > 0) { - if (length(ign_ind) > 0) { - set(discrete, j = ign_ind, value = NULL) - message(length(ign_ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ign_ind), ": ", n_cat[ign_ind], " categories\n")) - } - ## Calculate categorical correlation and melt into tidy data format - discrete[, discrete_id := seq(nrow(discrete))] - discrete_pivot <- Reduce( - function(x, y) {merge(x, y, by = "discrete_id")}, - lapply(names(discrete)[names(discrete) != "discrete_id"], function(x) { - dcast.data.table(discrete, discrete_id ~ paste0(x, "_", get(x)), length, value.var = "discrete_id") - }) - ) - discrete_pivot[, discrete_id := NULL] - } + raw_discrete <- split_data$discrete + ind <- .ignoreCat(raw_discrete, maxcat) + n_true_discrete <- split_data$num_discrete - length(ind) + discrete <- dummify(raw_discrete, maxcat = maxcat) + if (length(ind)) drop_columns(discrete, names(ind)) } if (col_type == "all") { if (all(nrow(continuous), n_true_discrete)) { - all_data <- cbind(continuous, discrete_pivot) + all_data <- cbind(continuous, discrete) } else if (nrow(continuous)) { all_data <- continuous } else if (n_true_discrete) { - all_data <- discrete_pivot + all_data <- discrete } else { stop("No data to plot!") } } ## Calculate correlation and melt into tidy data format - final_data <- switch(col_type, "all" = all_data, "discrete" = discrete_pivot, "continuous" = continuous) + final_data <- switch(col_type, "all" = all_data, "discrete" = discrete, "continuous" = continuous) plot_data <- reshape2::melt(cor(final_data, ...)) ## Create ggplot object plot <- ggplot(plot_data, aes(x = Var1, y = Var2, fill = value)) + diff --git a/man/dummify.Rd b/man/dummify.Rd new file mode 100644 index 0000000..fae6b31 --- /dev/null +++ b/man/dummify.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dummify.r +\name{dummify} +\alias{dummify} +\title{Dummify discrete features to binary columns} +\usage{ +dummify(data, maxcat = 50L) +} +\arguments{ +\item{data}{input data, in either \link{data.frame} or \link{data.table} format.} + +\item{maxcat}{maximum categories allowed for each discrete feature. The default is 50.} +} +\value{ +dummified dataset (discrete features only) preserving original features. However, column order might be different. +} +\description{ +Data dummification is also known as one hot encoding or feature binarization. It turns each category to a distinct column with binary (numeric) values. +} +\note{ +This is different from \link{model.matrix}, where the latter aims to create a full rank matrix for regression-like use cases. If your intention is to create a design matrix, use \link{model.matrix} instead. +} +\examples{ +## Dummify iris dataset +str(dummify(iris)) + +## Dummify diamonds dataset ignoring features with more than 5 categories +data("diamonds", package = "ggplot2") +str(dummify(diamonds, maxcat = 5)) +} +\keyword{dummify} diff --git a/man/plot_correlation.Rd b/man/plot_correlation.Rd index 8c48d97..c33e0b4 100644 --- a/man/plot_correlation.Rd +++ b/man/plot_correlation.Rd @@ -30,11 +30,10 @@ For discrete features, the function first dummifies all categories, then calcula # load diamonds dataset from ggplot2 data("diamonds", package = "ggplot2") -# Plot correlation heatmap with all columns +# Plot correlation heatmap plot_correlation(diamonds) -# Plot correlation heatmap with discrete features only -plot_correlation(diamonds, type = "d") -# Plot correlation heatmap with continuous features only +plot_correlation(diamonds, maxcat = 5) plot_correlation(diamonds, type = "c") +plot_correlation(diamonds, type = "d") } \keyword{plot_correlation} diff --git a/tests/testthat/test-dummify.r b/tests/testthat/test-dummify.r new file mode 100644 index 0000000..8be2d24 --- /dev/null +++ b/tests/testthat/test-dummify.r @@ -0,0 +1,30 @@ +context("dummify") +data("diamonds", package = "ggplot2") + +test_that("test return object class", { + expect_equal(class(diamonds), class(dummify(diamonds, maxcat = 5))) + expect_equal(class(iris), class(dummify(iris))) + expect_is(dummify(data.table("D" = letters[1:5])), "data.table") +}) + +test_that("test messages and warnings", { + expect_message(dummify(diamonds, maxcat = 5)) + expect_warning(dummify(iris, maxcat = 2)) + expect_warning(dummify(airquality)) +}) + +test_that("test feature count", { + expect_equal(ncol(dummify(diamonds)), 27L) + expect_equal(ncol(dummify(diamonds, maxcat = 5)), 14L) + expect_equal(ncol(dummify(data.table("A" = letters[1:5]))), 5L) + expect_equal(ncol(dummify(data.table("A" = letters[1:5], "B" = letters[6:10]))), 10L) +}) + +test_that("test binary outcome", { + expect_equal(max(dummify(data.table("A" = letters[1:5]))), 1L) + expect_equal(min(dummify(data.table("A" = letters[1:5]))), 0L) +}) + +test_that("test continuous features", { + expect_equivalent(split_columns(diamonds)$continuous, split_columns(dummify(diamonds))$continuous[, 1:7]) +}) diff --git a/tests/testthat/test-helper.r b/tests/testthat/test-helper.r new file mode 100644 index 0000000..27f1b22 --- /dev/null +++ b/tests/testthat/test-helper.r @@ -0,0 +1,19 @@ +context("helper") + +test_that(".ignoreCat", { + set.seed(1) + dt <- data.table( + "a" = as.factor(rep(1, 10)), + "b" = as.factor(sample.int(2, 10, replace = TRUE)), + "c" = as.factor(sample.int(5, 10, replace = TRUE)), + "d" = as.factor(sample.int(10)) + ) + expect_equal(as.numeric(.ignoreCat(dt, 0)), c(1L, 2L, 5L, 10L)) + expect_equal(as.numeric(.ignoreCat(dt, 1)), c(2L, 5L, 10L)) + expect_equal(as.numeric(.ignoreCat(dt, 2)), c(5L, 10L)) + expect_equal(as.numeric(.ignoreCat(dt, 5)), 10L) + expect_equal(names(.ignoreCat(dt, 0)), letters[1L:4L]) + expect_equal(names(.ignoreCat(dt, 1)), letters[2L:4L]) + expect_equal(names(.ignoreCat(dt, 2)), letters[3L:4L]) + expect_equal(names(.ignoreCat(dt, 5)), letters[4L]) +}) diff --git a/tests/testthat/test-plot-correlation.r b/tests/testthat/test-plot-correlation.r index 5640989..a18bf6b 100644 --- a/tests/testthat/test-plot-correlation.r +++ b/tests/testthat/test-plot-correlation.r @@ -1,7 +1,13 @@ context("plot correlation heatmap") +data(diamonds, package = "ggplot2") test_that("test maximum categories for discrete features", { - data(diamonds, package = "ggplot2") expect_message(plot_correlation(diamonds, type = "d", maxcat = 5)) expect_silent(plot_correlation(diamonds, type = "d")) }) + +test_that("test error messages", { + expect_error(plot_correlation(split_columns(diamonds)$continuous, type = "d")) + expect_error(plot_correlation(split_columns(diamonds)$discrete, type = "c")) + expect_error(suppressWarnings(plot_correlation(split_columns(diamonds)$discrete, maxcat = 2))) +})