Skip to content

Commit

Permalink
Completed #54
Browse files Browse the repository at this point in the history
  • Loading branch information
boxuancui committed Mar 21, 2018
1 parent 07d4b97 commit 0cae1f5
Show file tree
Hide file tree
Showing 11 changed files with 199 additions and 39 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export(SetNaTo)
export(SplitColType)
export(create_report)
export(drop_columns)
export(dummify)
export(group_category)
export(plot_bar)
export(plot_boxplot)
Expand All @@ -28,6 +29,7 @@ export(split_columns)
import(data.table)
import(ggplot2)
import(gridExtra)
import(reshape2)
importFrom(networkD3,diagonalNetwork)
importFrom(networkD3,radialNetwork)
importFrom(rmarkdown,render)
Expand Down
12 changes: 8 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# Changelog

### DataExplorer 0.5.0.9000
#### New Features
* [#54](https://github.com/boxuancui/DataExplorer/issues/54): Extracted `dummify` from `plot_correlation` as a new function.

#### Enhancements
* `plot_missing`:
* Percentage text labels from output plot now has 2 decimals to prevent small percentages from being truncated to 0%.
* Added example to quickly drop columns with too many missing values.
* Added `.ignoreCat` to helper.

---

Expand Down Expand Up @@ -54,7 +58,7 @@
* [#37](https://github.com/boxuancui/DataExplorer/issues/37): Changed all `cat()` to `message()`.
* [#38](https://github.com/boxuancui/DataExplorer/issues/38): Added option to order bars in `BarDiscrete`.
* [#39](https://github.com/boxuancui/DataExplorer/issues/39): Extended `SetNaTo` to discrete features.
* Added more examples in README file.
* Added more examples to **README.md**.

---

Expand All @@ -64,11 +68,11 @@
* [#29](https://github.com/boxuancui/DataExplorer/issues/29): Added `DropVar` to quickly drop variables by either name or column position.

#### Bug Fixes
* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of contrasts from `model.matrix`.
* [#24](https://github.com/boxuancui/DataExplorer/issues/24): `CorrelationDiscrete` now displays all factor levels instead of full rank matrix from `model.matrix`.

#### Enhancements
* [#11](https://github.com/boxuancui/DataExplorer/issues/11): Functions with return values will now match the input class and set it back.
* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for **num_all_missing** in `SplitColType`.
* [#22](https://github.com/boxuancui/DataExplorer/issues/22): Added documentation for `num_all_missing` in `SplitColType`.
* [#23](https://github.com/boxuancui/DataExplorer/issues/23): Added additional measures (in addition to frequency) to `CollapseCategory`.
* [#26](https://github.com/boxuancui/DataExplorer/issues/26): Removed density estimation section from report template.
* [#31](https://github.com/boxuancui/DataExplorer/issues/31): Added flexibility to name the new category in `CollapseCategory`.
Expand Down Expand Up @@ -121,5 +125,5 @@
* Features with all missing values will be ignored.
* Switched position between continuous and discrete features in report template.
* Renamed package name to **DataExplorer**.
* Added `NEWS.md`.
* Added **NEWS.md**.
* Removed `BoxplotContinuous`.
2 changes: 1 addition & 1 deletion R/drop_columns.r
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

drop_columns <- function(data, ind) {
if (!is.data.table(data)) stop("Please change your input data class to data.table!")
data[, (ind) := NULL]
data[, (ind) := NULL][]
}

DropVar <- function(data, ind) {
Expand Down
74 changes: 74 additions & 0 deletions R/dummify.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' Dummify discrete features to binary columns
#'
#' Data dummification is also known as one hot encoding or feature binarization. It turns each category to a distinct column with binary (numeric) values.
#' @param data input data, in either \link{data.frame} or \link{data.table} format.
#' @param maxcat maximum categories allowed for each discrete feature. The default is 50.
#' @keywords dummify
#' @note This is different from \link{model.matrix}, where the latter aims to create a full rank matrix for regression-like use cases. If your intention is to create a design matrix, use \link{model.matrix} instead.
#' @return dummified dataset (discrete features only) preserving original features. However, column order might be different.
#' @import data.table
#' @import reshape2
#' @export
#' @examples
#' ## Dummify iris dataset
#' str(dummify(iris))
#'
#' ## Dummify diamonds dataset ignoring features with more than 5 categories
#' data("diamonds", package = "ggplot2")
#' str(dummify(diamonds, maxcat = 5))

dummify <- function(data, maxcat = 50L) {
## Declare variable first to pass R CMD check
discrete_id <- NULL
## Check if input is data.table
is_data_table <- is.data.table(data)
## Detect input data class
data_class <- class(data)
## Set data to data.table
if (!is.data.table(data)) {data <- data.table(data)}
## Split data
split_data <- split_columns(data)
continuous <- split_data$continuous
## Scan feature type
if (split_data$num_discrete > 0) {
discrete <- split_data$discrete
## Get number of categories for each feature
ind <- .ignoreCat(discrete, maxcat)
n_true_discrete <- split_data$num_discrete - length(ind)
if (all(split_data$num_discrete, length(ind), !n_true_discrete)) {
warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
final_data <- data
} else {
if (n_true_discrete > 0) {
if (length(ind) > 0) {
message(length(ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ind), ": ", as.numeric(ind), " categories\n"))
}
## Calculate categorical correlation and melt into tidy data format
discrete[, discrete_id := .I]
discrete_pivot <- Reduce(
function(x, y) {merge(x, y, by = "discrete_id")},
c(
list(discrete[, c("discrete_id", names(ind)), with = FALSE]),
lapply(names(discrete)[!(names(discrete) %in% c("discrete_id", names(ind)))], function(x) {
dcast.data.table(discrete, discrete_id ~ make.names(paste0(x, "_", get(x))), length, value.var = "discrete_id")
})
)
)
drop_columns(discrete_pivot, "discrete_id")
if (split_data$num_continuous == 0) {
final_data <- discrete_pivot
} else {
final_data <- cbind(continuous, discrete_pivot)
}
}
}
} else {
warning("No discrete features found! Nothing is dummified!")
final_data <- continuous
}

## Set data class back to original
if (!is_data_table) {class(final_data) <- data_class}
## Set return object
return(final_data)
}
12 changes: 12 additions & 0 deletions R/helper.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#' Truncate category
#'
#' Output index and name for features that will be ignored
#' @param data input data object.
#' @param maxcat maximum categories allowed for each discrete feature.
#' @return a named vector containing indices of features to be ignored.
#' @import data.table
.ignoreCat <- function(dt, maxcat) {
if (!is.data.table(dt)) {dt <- data.table(dt)}
n_cat <- sapply(dt, function(x) {length(unique(x))})
n_cat[which(n_cat > maxcat)]
}
41 changes: 12 additions & 29 deletions R/plot_correlation.r
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,15 @@
#' # load diamonds dataset from ggplot2
#' data("diamonds", package = "ggplot2")
#'
#' # Plot correlation heatmap with all columns
#' # Plot correlation heatmap
#' plot_correlation(diamonds)
#' # Plot correlation heatmap with discrete features only
#' plot_correlation(diamonds, type = "d")
#' # Plot correlation heatmap with continuous features only
#' plot_correlation(diamonds, maxcat = 5)
#' plot_correlation(diamonds, type = "c")
#' plot_correlation(diamonds, type = "d")

plot_correlation <- function(data, type = c("all", "discrete", "continuous"), maxcat = 20, title = NULL, ...) {
## Declare variable first to pass R CMD check
Var1 <- Var2 <- value <- discrete_id <- NULL
Var1 <- Var2 <- value <- NULL
## Set data to data.table
if (!is.data.table(data)) {data <- data.table(data)}
## Split data
Expand All @@ -39,43 +38,27 @@ plot_correlation <- function(data, type = c("all", "discrete", "continuous"), ma
}
if (col_type %in% c("all", "discrete")) {
if ((col_type == "discrete") & (split_data$num_discrete == 0)) stop("No discrete features found!")
discrete <- split_data$discrete
## Get number of categories for each feature
n_cat <- sapply(discrete, function(x) {length(unique(x))})
ign_ind <- which(n_cat > maxcat)
n_true_discrete <- split_data$num_discrete - length(ign_ind)
if (all(split_data$num_discrete, length(ign_ind), !n_true_discrete)) warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
if (n_true_discrete > 0) {
if (length(ign_ind) > 0) {
set(discrete, j = ign_ind, value = NULL)
message(length(ign_ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ign_ind), ": ", n_cat[ign_ind], " categories\n"))
}
## Calculate categorical correlation and melt into tidy data format
discrete[, discrete_id := seq(nrow(discrete))]
discrete_pivot <- Reduce(
function(x, y) {merge(x, y, by = "discrete_id")},
lapply(names(discrete)[names(discrete) != "discrete_id"], function(x) {
dcast.data.table(discrete, discrete_id ~ paste0(x, "_", get(x)), length, value.var = "discrete_id")
})
)
discrete_pivot[, discrete_id := NULL]
}
raw_discrete <- split_data$discrete
ind <- .ignoreCat(raw_discrete, maxcat)
n_true_discrete <- split_data$num_discrete - length(ind)
discrete <- dummify(raw_discrete, maxcat = maxcat)
if (length(ind)) drop_columns(discrete, names(ind))
}

if (col_type == "all") {
if (all(nrow(continuous), n_true_discrete)) {
all_data <- cbind(continuous, discrete_pivot)
all_data <- cbind(continuous, discrete)
} else if (nrow(continuous)) {
all_data <- continuous
} else if (n_true_discrete) {
all_data <- discrete_pivot
all_data <- discrete
} else {
stop("No data to plot!")
}
}

## Calculate correlation and melt into tidy data format
final_data <- switch(col_type, "all" = all_data, "discrete" = discrete_pivot, "continuous" = continuous)
final_data <- switch(col_type, "all" = all_data, "discrete" = discrete, "continuous" = continuous)
plot_data <- reshape2::melt(cor(final_data, ...))
## Create ggplot object
plot <- ggplot(plot_data, aes(x = Var1, y = Var2, fill = value)) +
Expand Down
31 changes: 31 additions & 0 deletions man/dummify.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions man/plot_correlation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions tests/testthat/test-dummify.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
context("dummify")
data("diamonds", package = "ggplot2")

test_that("test return object class", {
expect_equal(class(diamonds), class(dummify(diamonds, maxcat = 5)))
expect_equal(class(iris), class(dummify(iris)))
expect_is(dummify(data.table("D" = letters[1:5])), "data.table")
})

test_that("test messages and warnings", {
expect_message(dummify(diamonds, maxcat = 5))
expect_warning(dummify(iris, maxcat = 2))
expect_warning(dummify(airquality))
})

test_that("test feature count", {
expect_equal(ncol(dummify(diamonds)), 27L)
expect_equal(ncol(dummify(diamonds, maxcat = 5)), 14L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5]))), 5L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5], "B" = letters[6:10]))), 10L)
})

test_that("test binary outcome", {
expect_equal(max(dummify(data.table("A" = letters[1:5]))), 1L)
expect_equal(min(dummify(data.table("A" = letters[1:5]))), 0L)
})

test_that("test continuous features", {
expect_equivalent(split_columns(diamonds)$continuous, split_columns(dummify(diamonds))$continuous[, 1:7])
})
19 changes: 19 additions & 0 deletions tests/testthat/test-helper.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
context("helper")

test_that(".ignoreCat", {
set.seed(1)
dt <- data.table(
"a" = as.factor(rep(1, 10)),
"b" = as.factor(sample.int(2, 10, replace = TRUE)),
"c" = as.factor(sample.int(5, 10, replace = TRUE)),
"d" = as.factor(sample.int(10))
)
expect_equal(as.numeric(.ignoreCat(dt, 0)), c(1L, 2L, 5L, 10L))
expect_equal(as.numeric(.ignoreCat(dt, 1)), c(2L, 5L, 10L))
expect_equal(as.numeric(.ignoreCat(dt, 2)), c(5L, 10L))
expect_equal(as.numeric(.ignoreCat(dt, 5)), 10L)
expect_equal(names(.ignoreCat(dt, 0)), letters[1L:4L])
expect_equal(names(.ignoreCat(dt, 1)), letters[2L:4L])
expect_equal(names(.ignoreCat(dt, 2)), letters[3L:4L])
expect_equal(names(.ignoreCat(dt, 5)), letters[4L])
})
8 changes: 7 additions & 1 deletion tests/testthat/test-plot-correlation.r
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
context("plot correlation heatmap")
data(diamonds, package = "ggplot2")

test_that("test maximum categories for discrete features", {
data(diamonds, package = "ggplot2")
expect_message(plot_correlation(diamonds, type = "d", maxcat = 5))
expect_silent(plot_correlation(diamonds, type = "d"))
})

test_that("test error messages", {
expect_error(plot_correlation(split_columns(diamonds)$continuous, type = "d"))
expect_error(plot_correlation(split_columns(diamonds)$discrete, type = "c"))
expect_error(suppressWarnings(plot_correlation(split_columns(diamonds)$discrete, maxcat = 2)))
})

0 comments on commit 0cae1f5

Please sign in to comment.