Completed #54

boxuancui · Mar 21, 2018 · 0cae1f5 · 0cae1f5
1 parent 07d4b97
commit 0cae1f5
Show file tree

Hide file tree

Showing 11 changed files with 199 additions and 39 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(SetNaTo)
 export(SplitColType)
 export(create_report)
 export(drop_columns)
+export(dummify)
 export(group_category)
 export(plot_bar)
 export(plot_boxplot)
@@ -28,6 +29,7 @@ export(split_columns)
 import(data.table)
 import(ggplot2)
 import(gridExtra)
+import(reshape2)
 importFrom(networkD3,diagonalNetwork)
 importFrom(networkD3,radialNetwork)
 importFrom(rmarkdown,render)

diff --git a/NEWS.md b/NEWS.md
@@ -1,10 +1,14 @@
 # Changelog
 
 ### DataExplorer 0.5.0.9000
+#### New Features
+* [#54](https://github.com/boxuancui/DataExplorer/issues/54): Extracted `dummify` from `plot_correlation` as a new function.
+
 #### Enhancements
 * `plot_missing`:
 	* Percentage text labels from output plot now has 2 decimals to prevent small percentages from being truncated to 0%.
 	* Added example to quickly drop columns with too many missing values.
+* Added `.ignoreCat` to helper.
 
 ---
 
@@ -54,7 +58,7 @@
 * [#37](https://github.com/boxuancui/DataExplorer/issues/37): Changed all `cat()` to `message()`.
 * [#38](https://github.com/boxuancui/DataExplorer/issues/38): Added option to order bars in `BarDiscrete`.
 * [#39](https://github.com/boxuancui/DataExplorer/issues/39): Extended `SetNaTo` to discrete features.
-* Added more examples in README file.
+* Added more examples to **README.md**.
 
 ---
 
@@ -64,11 +68,11 @@
 * [#29](https://github.com/boxuancui/DataExplorer/issues/29): Added `DropVar` to quickly drop variables by either name or column position.
 
 #### Bug Fixes
-* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of contrasts from `model.matrix`.
+* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of full rank matrix from `model.matrix`.
 
 #### Enhancements
 * [#11](https://github.com/boxuancui/DataExplorer/issues/11): Functions with return values will now match the input class and set it back.
-* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for **num_all_missing** in `SplitColType`.
+* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for `num_all_missing` in `SplitColType`.
 * [#23](https://github.com/boxuancui/DataExplorer/issues/23): Added additional measures (in addition to frequency) to `CollapseCategory`.
 * [#26](https://github.com/boxuancui/DataExplorer/issues/26): Removed density estimation section from report template.
 * [#31](https://github.com/boxuancui/DataExplorer/issues/31): Added flexibility to name the new category in `CollapseCategory`.
@@ -121,5 +125,5 @@
 * Features with all missing values will be ignored.
 * Switched position between continuous and discrete features in report template.
 * Renamed package name to **DataExplorer**.
-* Added `NEWS.md`.
+* Added **NEWS.md**.
 * Removed `BoxplotContinuous`.
diff --git a/R/drop_columns.r b/R/drop_columns.r
@@ -33,7 +33,7 @@
 
 drop_columns <- function(data, ind) {
   if (!is.data.table(data)) stop("Please change your input data class to data.table!")
-  data[, (ind) := NULL]
+  data[, (ind) := NULL][]
 }
 
 DropVar <- function(data, ind) {

diff --git a/R/dummify.r b/R/dummify.r
@@ -0,0 +1,74 @@
+#' Dummify discrete features to binary columns
+#'
+#' Data dummification is also known as one hot encoding or feature binarization. It turns each category to a distinct column with binary (numeric) values.
+#' @param data input data, in either \link{data.frame} or \link{data.table} format.
+#' @param maxcat maximum categories allowed for each discrete feature. The default is 50.
+#' @keywords dummify
+#' @note This is different from \link{model.matrix}, where the latter aims to create a full rank matrix for regression-like use cases. If your intention is to create a design matrix, use \link{model.matrix} instead.
+#' @return dummified dataset (discrete features only) preserving original features. However, column order might be different.
+#' @import data.table
+#' @import reshape2
+#' @export
+#' @examples
+#' ## Dummify iris dataset
+#' str(dummify(iris))
+#'
+#' ## Dummify diamonds dataset ignoring features with more than 5 categories
+#' data("diamonds", package = "ggplot2")
+#' str(dummify(diamonds, maxcat = 5))
+
+dummify <- function(data, maxcat = 50L) {
+  ## Declare variable first to pass R CMD check
+  discrete_id <- NULL
+  ## Check if input is data.table
+  is_data_table <- is.data.table(data)
+  ## Detect input data class
+  data_class <- class(data)
+  ## Set data to data.table
+  if (!is.data.table(data)) {data <- data.table(data)}
+  ## Split data
+  split_data <- split_columns(data)
+  continuous <- split_data$continuous
+  ## Scan feature type
+  if (split_data$num_discrete > 0) {
+    discrete <- split_data$discrete
+    ## Get number of categories for each feature
+    ind <- .ignoreCat(discrete, maxcat)
+    n_true_discrete <- split_data$num_discrete - length(ind)
+    if (all(split_data$num_discrete, length(ind), !n_true_discrete)) {
+      warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
+      final_data <- data
+    } else {
+      if (n_true_discrete > 0) {
+        if (length(ind) > 0) {
+          message(length(ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ind), ": ", as.numeric(ind), " categories\n"))
+        }
+        ## Calculate categorical correlation and melt into tidy data format
+        discrete[, discrete_id := .I]
+        discrete_pivot <- Reduce(
+          function(x, y) {merge(x, y, by = "discrete_id")},
+          c(
+            list(discrete[, c("discrete_id", names(ind)), with = FALSE]),
+            lapply(names(discrete)[!(names(discrete) %in% c("discrete_id", names(ind)))], function(x) {
+              dcast.data.table(discrete, discrete_id ~ make.names(paste0(x, "_", get(x))), length, value.var = "discrete_id")
+            })
+          )
+        )
+        drop_columns(discrete_pivot, "discrete_id")
+        if (split_data$num_continuous == 0) {
+          final_data <- discrete_pivot
+        } else {
+          final_data <- cbind(continuous, discrete_pivot)
+        }
+      }
+    }
+  } else {
+    warning("No discrete features found! Nothing is dummified!")
+    final_data <- continuous
+  }
+
+  ## Set data class back to original
+  if (!is_data_table) {class(final_data) <- data_class}
+  ## Set return object
+  return(final_data)
+}
diff --git a/R/helper.r b/R/helper.r
@@ -0,0 +1,12 @@
+#' Truncate category
+#'
+#' Output index and name for features that will be ignored
+#' @param data input data object.
+#' @param maxcat maximum categories allowed for each discrete feature.
+#' @return a named vector containing indices of features to be ignored.
+#' @import data.table
+.ignoreCat <- function(dt, maxcat) {
+  if (!is.data.table(dt)) {dt <- data.table(dt)}
+  n_cat <- sapply(dt, function(x) {length(unique(x))})
+  n_cat[which(n_cat > maxcat)]
+}
diff --git a/R/plot_correlation.r b/R/plot_correlation.r
@@ -17,16 +17,15 @@
 #' # load diamonds dataset from ggplot2
 #' data("diamonds", package = "ggplot2")
 #'
-#' # Plot correlation heatmap with all columns
+#' # Plot correlation heatmap
 #' plot_correlation(diamonds)
-#' # Plot correlation heatmap with discrete features only
-#' plot_correlation(diamonds, type = "d")
-#' # Plot correlation heatmap with continuous features only
+#' plot_correlation(diamonds, maxcat = 5)
 #' plot_correlation(diamonds, type = "c")
+#' plot_correlation(diamonds, type = "d")
 
 plot_correlation <- function(data, type = c("all", "discrete", "continuous"), maxcat = 20, title = NULL, ...) {
   ## Declare variable first to pass R CMD check
-  Var1 <- Var2 <- value <- discrete_id <- NULL
+  Var1 <- Var2 <- value <- NULL
   ## Set data to data.table
   if (!is.data.table(data)) {data <- data.table(data)}
   ## Split data
@@ -39,43 +38,27 @@ plot_correlation <- function(data, type = c("all", "discrete", "continuous"), ma
   }
   if (col_type %in% c("all", "discrete")) {
     if ((col_type == "discrete") & (split_data$num_discrete == 0)) stop("No discrete features found!")
-    discrete <- split_data$discrete
-    ## Get number of categories for each feature
-    n_cat <- sapply(discrete, function(x) {length(unique(x))})
-    ign_ind <- which(n_cat > maxcat)
-    n_true_discrete <- split_data$num_discrete - length(ign_ind)
-    if (all(split_data$num_discrete, length(ign_ind), !n_true_discrete)) warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
-    if (n_true_discrete > 0) {
-      if (length(ign_ind) > 0) {
-        set(discrete, j = ign_ind, value = NULL)
-        message(length(ign_ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ign_ind), ": ", n_cat[ign_ind], " categories\n"))
-      }
-      ## Calculate categorical correlation and melt into tidy data format
-      discrete[, discrete_id := seq(nrow(discrete))]
-      discrete_pivot <- Reduce(
-        function(x, y) {merge(x, y, by = "discrete_id")},
-        lapply(names(discrete)[names(discrete) != "discrete_id"], function(x) {
-          dcast.data.table(discrete, discrete_id ~ paste0(x, "_", get(x)), length, value.var = "discrete_id")
-        })
-      )
-      discrete_pivot[, discrete_id := NULL]
-    }
+    raw_discrete <- split_data$discrete
+    ind <- .ignoreCat(raw_discrete, maxcat)
+    n_true_discrete <- split_data$num_discrete - length(ind)
+    discrete <- dummify(raw_discrete, maxcat = maxcat)
+    if (length(ind)) drop_columns(discrete, names(ind))
   }
 
   if (col_type == "all") {
     if (all(nrow(continuous), n_true_discrete)) {
-      all_data <- cbind(continuous, discrete_pivot)
+      all_data <- cbind(continuous, discrete)
     } else if (nrow(continuous)) {
       all_data <- continuous
     } else if (n_true_discrete) {
-      all_data <- discrete_pivot
+      all_data <- discrete
     } else {
       stop("No data to plot!")
     }
   }
 
   ## Calculate correlation and melt into tidy data format
-  final_data <- switch(col_type, "all" = all_data, "discrete" = discrete_pivot, "continuous" = continuous)
+  final_data <- switch(col_type, "all" = all_data, "discrete" = discrete, "continuous" = continuous)
   plot_data <- reshape2::melt(cor(final_data, ...))
   ## Create ggplot object
   plot <- ggplot(plot_data, aes(x = Var1, y = Var2, fill = value)) +

diff --git a/man/dummify.Rd b/man/dummify.Rd
diff --git a/man/plot_correlation.Rd b/man/plot_correlation.Rd
diff --git a/tests/testthat/test-dummify.r b/tests/testthat/test-dummify.r
@@ -0,0 +1,30 @@
+context("dummify")
+data("diamonds", package = "ggplot2")
+
+test_that("test return object class", {
+  expect_equal(class(diamonds), class(dummify(diamonds, maxcat = 5)))
+  expect_equal(class(iris), class(dummify(iris)))
+  expect_is(dummify(data.table("D" = letters[1:5])), "data.table")
+})
+
+test_that("test messages and warnings", {
+  expect_message(dummify(diamonds, maxcat = 5))
+  expect_warning(dummify(iris, maxcat = 2))
+  expect_warning(dummify(airquality))
+})
+
+test_that("test feature count", {
+  expect_equal(ncol(dummify(diamonds)), 27L)
+  expect_equal(ncol(dummify(diamonds, maxcat = 5)), 14L)
+  expect_equal(ncol(dummify(data.table("A" = letters[1:5]))), 5L)
+  expect_equal(ncol(dummify(data.table("A" = letters[1:5], "B" = letters[6:10]))), 10L)
+})
+
+test_that("test binary outcome", {
+  expect_equal(max(dummify(data.table("A" = letters[1:5]))), 1L)
+  expect_equal(min(dummify(data.table("A" = letters[1:5]))), 0L)
+})
+
+test_that("test continuous features", {
+  expect_equivalent(split_columns(diamonds)$continuous, split_columns(dummify(diamonds))$continuous[, 1:7])
+})
diff --git a/tests/testthat/test-helper.r b/tests/testthat/test-helper.r
@@ -0,0 +1,19 @@
+context("helper")
+
+test_that(".ignoreCat", {
+  set.seed(1)
+  dt <- data.table(
+    "a" = as.factor(rep(1, 10)),
+    "b" = as.factor(sample.int(2, 10, replace = TRUE)),
+    "c" = as.factor(sample.int(5, 10, replace = TRUE)),
+    "d" = as.factor(sample.int(10))
+  )
+  expect_equal(as.numeric(.ignoreCat(dt, 0)), c(1L, 2L, 5L, 10L))
+  expect_equal(as.numeric(.ignoreCat(dt, 1)), c(2L, 5L, 10L))
+  expect_equal(as.numeric(.ignoreCat(dt, 2)), c(5L, 10L))
+  expect_equal(as.numeric(.ignoreCat(dt, 5)), 10L)
+  expect_equal(names(.ignoreCat(dt, 0)), letters[1L:4L])
+  expect_equal(names(.ignoreCat(dt, 1)), letters[2L:4L])
+  expect_equal(names(.ignoreCat(dt, 2)), letters[3L:4L])
+  expect_equal(names(.ignoreCat(dt, 5)), letters[4L])
+})
diff --git a/tests/testthat/test-plot-correlation.r b/tests/testthat/test-plot-correlation.r
@@ -1,7 +1,13 @@
 context("plot correlation heatmap")
+data(diamonds, package = "ggplot2")
 
 test_that("test maximum categories for discrete features", {
-  data(diamonds, package = "ggplot2")
   expect_message(plot_correlation(diamonds, type = "d", maxcat = 5))
   expect_silent(plot_correlation(diamonds, type = "d"))
 })
+
+test_that("test error messages", {
+  expect_error(plot_correlation(split_columns(diamonds)$continuous, type = "d"))
+  expect_error(plot_correlation(split_columns(diamonds)$discrete, type = "c"))
+  expect_error(suppressWarnings(plot_correlation(split_columns(diamonds)$discrete, maxcat = 2)))
+})