Skip to content

Commit

Permalink
Completed #77
Browse files Browse the repository at this point in the history
  • Loading branch information
boxuancui committed Oct 14, 2018
1 parent 9c8e2a4 commit 8cc19ac
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 94 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
### DataExplorer 0.6.1.9000
#### Enhancements
* [#42](https://github.com/boxuancui/DataExplorer/issues/42): Applied S3 methods for plotting functions.
* [#77](https://github.com/boxuancui/DataExplorer/issues/77): `dummify` now works on selected columns.
* [#78](https://github.com/boxuancui/DataExplorer/issues/78): All ggplot objects from `plot_*` are now invisibly returned. As a result, extracted `profile_missing` from `plot_missing` for missing value profiles.
* [#83](https://github.com/boxuancui/DataExplorer/issues/83): Removed all deprecated functions.
* [#85](https://github.com/boxuancui/DataExplorer/issues/85): Users can now specify number of rows/columns for plot page layout.
Expand Down
139 changes: 80 additions & 59 deletions R/dummify.r
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
#'
#' Data dummification is also known as one hot encoding or feature binarization. It turns each category to a distinct column with binary (numeric) values.
#' @param data input data
#' @param maxcat maximum categories allowed for each discrete feature. The default is 50.
#' @param maxcat maximum categories allowed for each discrete feature. Default is 50.
#' @param select names of selected features to be dummified. Default is \code{NULL}.
#' @keywords dummify
#' @details Continuous features will be ignored if added in \code{select}.
#' @details \code{select} features will be ignored if categories exceed \code{maxcat}.
#' @note This is different from \link{model.matrix}, where the latter aims to create a full rank matrix for regression-like use cases. If your intention is to create a design matrix, use \link{model.matrix} instead.
#' @return dummified dataset (discrete features only) preserving original features. However, column order might be different.
#' @import data.table
Expand All @@ -15,64 +18,82 @@
#' ## Dummify diamonds dataset ignoring features with more than 5 categories
#' data("diamonds", package = "ggplot2")
#' str(dummify(diamonds, maxcat = 5))
#' str(dummify(diamonds, select = c("cut", "color")))

dummify <- function(data, maxcat = 50L) {
## Declare variable first to pass R CMD check
discrete_id <- NULL
## Check if input is data.table
is_data_table <- is.data.table(data)
## Detect input data class
data_class <- class(data)
## Set data to data.table
if (!is.data.table(data)) data <- data.table(data)
## Split data
split_data <- split_columns(data)
continuous <- split_data$continuous
## Scan feature type
if (split_data$num_discrete > 0) {
discrete <- split_data$discrete
## Get number of categories for each feature
ind <- .ignoreCat(discrete, maxcat)
n_true_discrete <- split_data$num_discrete - length(ind)
if (all(split_data$num_discrete, length(ind), !n_true_discrete)) {
warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
final_data <- data
} else {
if (n_true_discrete > 0) {
if (length(ind) > 0) {
message(length(ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ind), ": ", as.numeric(ind), " categories\n"))
}
## Set key for discrete features
discrete[, discrete_id := .I]
## Join ignored and valid discrete features based on key
discrete_pivot <- Reduce(
function(x, y) {merge(x, y, by = "discrete_id")},
c(
## Get ignored discrete features
list(discrete[, c("discrete_id", names(ind)), with = FALSE]),
## Pivot valid discrete features
lapply(names(discrete)[!(names(discrete) %in% c("discrete_id", names(ind)))], function(x) {
dcast.data.table(discrete, discrete_id ~ make.names(paste0(x, "_", get(x))), length, value.var = "discrete_id")
})
)
)
drop_columns(discrete_pivot, "discrete_id")
if (split_data$num_continuous == 0) {
final_data <- discrete_pivot
} else {
final_data <- cbind(continuous, discrete_pivot)
}
}
}
} else {
warning("No discrete features found! Nothing is dummified!")
final_data <- continuous
}
dummify <- function(data, maxcat = 50L, select = NULL) {
## Declare variable first to pass R CMD check
discrete_id <- NULL
## Check if input is data.table
is_data_table <- is.data.table(data)
## Detect input data class
data_class <- class(data)
## Set data to data.table
if (!is.data.table(data)) data <- data.table(data)
## Split data
split_data <- split_columns(data)
continuous <- split_data$continuous
## Check select in continuous features
if (any(select %in% names(continuous))) {
cont_ind <- which(names(continuous) %in% select)
warning("Ignored the following continuous features in `select`:\n", paste0("\t* ", names(continuous)[cont_ind], "\n"))
## Remove continuous names from select
if (setequal(select, names(continuous)[cont_ind])) {
select <- NULL
} else {
select <- setdiff(select, names(continuous)[cont_ind])
}
}

## Set data class back to original
if (!is_data_table) {
class(final_data) <- data_class
}
## Set return object
return(final_data)
if (split_data$num_discrete > 0) {
discrete <- split_data$discrete
## Count valid features
ind <- .ignoreCat(discrete, maxcat)
if (!is.null(select)) {
## Remove maxcat names from select
true_discrete_names <- setdiff(select, names(ind))
} else {
## Remove maxcat names from discrete
true_discrete_names <- setdiff(names(discrete), names(ind))
}
n_true_discrete <- length(true_discrete_names)
ignored_discrete_names <- setdiff(names(discrete), true_discrete_names)

if (all(split_data$num_discrete, length(ind), !n_true_discrete)) {
warning("Ignored all discrete features since `maxcat` set to ", maxcat, " categories!")
final_data <- data
} else {
if (n_true_discrete > 0) {
if (length(ind) > 0) {
message(length(ind), " features with more than ", maxcat, " categories ignored!\n", paste0(names(ind), ": ", as.numeric(ind), " categories\n"))
}
## Set key for discrete features
discrete[, discrete_id := .I]
## Join ignored and valid discrete features based on key
discrete_pivot <- Reduce(
function(x, y) {merge(x, y, by = "discrete_id")},
c(
## Get ignored discrete features
list(discrete[, c("discrete_id", ignored_discrete_names), with = FALSE]),
## Pivot valid discrete features
lapply(true_discrete_names, function(x) {
dcast.data.table(discrete, discrete_id ~ make.names(paste0(x, "_", get(x))), length, value.var = "discrete_id")
})
)
)
drop_columns(discrete_pivot, "discrete_id")
if (split_data$num_continuous == 0) {
final_data <- discrete_pivot
} else {
final_data <- cbind(continuous, discrete_pivot)
}
}
}
} else {
warning("No discrete features found! Nothing is dummified!")
final_data <- continuous
}

## Set data class back to original
if (!is_data_table) class(final_data) <- data_class
final_data
}
6 changes: 2 additions & 4 deletions R/internal.r
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
#' Output index and name for features that will be ignored
#' @param dt input data object.
#' @param maxcat maximum categories allowed for each discrete feature.
#' @return a named vector containing indices of features to be ignored.
#' @return a named vector containing number of categories for to-be-ignored features.
#' @import data.table
.ignoreCat <- function(dt, maxcat) {
if (!is.data.table(dt)) dt <- data.table(dt)
n_cat <- sapply(dt, function(x) {
length(unique(x))
})
n_cat <- vapply(dt, function(x) {length(unique(x))}, 0)
n_cat[which(n_cat > maxcat)]
}

Expand Down
34 changes: 22 additions & 12 deletions R/introduce.r
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' Describe basic information for input data.
#' @param data input data
#' @keywords introduce
#' @return Describe basic information in a \link{data.frame}:
#' @return Describe basic information in input data class:
#' \itemize{
#' \item{rows: number of rows}
#' \item{columns: number of columns}
Expand All @@ -21,15 +21,25 @@
#' introduce(mtcars)

introduce <- function(data) {
split_data <- split_columns(data)
data.frame(
"rows" = nrow(data),
"columns" = ncol(data),
"discrete_columns" = split_data[["num_discrete"]],
"continuous_columns" = split_data[["num_continuous"]],
"all_missing_columns" = split_data[["num_all_missing"]],
"total_missing_values" = sum(is.na(data)),
"total_observations" = nrow(data) * ncol(data),
"memory_usage" = as.numeric(object.size(data))
)
## Check and set to data.table
is_data_table <- is.data.table(data)
data_class <- class(data)
if (!is.data.table(data)) data <- data.table(data)

split_data <- split_columns(data)

output <- data.table(
"rows" = nrow(data),
"columns" = ncol(data),
"discrete_columns" = split_data[["num_discrete"]],
"continuous_columns" = split_data[["num_continuous"]],
"all_missing_columns" = split_data[["num_all_missing"]],
"total_missing_values" = sum(is.na(data)),
"total_observations" = nrow(data) * ncol(data),
"memory_usage" = as.numeric(object.size(data))
)

## Set data class back to original
if (!is_data_table) class(output) <- data_class
output
}
2 changes: 1 addition & 1 deletion man/dot-ignoreCat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions man/dummify.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/introduce.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 16 additions & 13 deletions tests/testthat/test-dummify.r
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,32 @@ context("dummify")
data("diamonds", package = "ggplot2")

test_that("test return object class", {
expect_equal(class(diamonds), class(dummify(diamonds, maxcat = 5)))
expect_equal(class(iris), class(dummify(iris)))
expect_is(dummify(data.table("D" = letters[1:5])), "data.table")
expect_equal(class(diamonds), class(dummify(diamonds, maxcat = 5L)))
expect_equal(class(iris), class(dummify(iris)))
expect_is(dummify(data.table("D" = letters[1:5])), "data.table")
})

test_that("test messages and warnings", {
expect_message(dummify(diamonds, maxcat = 5))
expect_warning(dummify(iris, maxcat = 2))
expect_warning(dummify(airquality))
expect_message(dummify(diamonds, maxcat = 5L))
expect_warning(dummify(iris, maxcat = 2L))
expect_warning(dummify(airquality))
expect_warning(dummify(iris, select = c("Sepal.Length")))
})

test_that("test feature count", {
expect_equal(ncol(dummify(diamonds)), 27L)
expect_equal(ncol(dummify(diamonds, maxcat = 5)), 14L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5]))), 5L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5], "B" = letters[6:10]))), 10L)
expect_equal(ncol(dummify(diamonds)), 27L)
expect_equal(ncol(dummify(diamonds, maxcat = 5L)), 14L)
expect_equal(ncol(dummify(diamonds, select = c("cut", "price"))), 14L)
expect_equal(ncol(dummify(diamonds, maxcat = 5L, select = c("cut", "color", "clarity"))), 14L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5]))), 5L)
expect_equal(ncol(dummify(data.table("A" = letters[1:5], "B" = letters[6:10]))), 10L)
})

test_that("test binary outcome", {
expect_equal(max(dummify(data.table("A" = letters[1:5]))), 1L)
expect_equal(min(dummify(data.table("A" = letters[1:5]))), 0L)
expect_equal(max(dummify(data.table("A" = letters[1:5]))), 1L)
expect_equal(min(dummify(data.table("A" = letters[1:5]))), 0L)
})

test_that("test continuous features", {
expect_equivalent(split_columns(diamonds)$continuous, split_columns(dummify(diamonds))$continuous[, 1:7])
expect_equivalent(split_columns(diamonds)$continuous, split_columns(dummify(diamonds))$continuous[, 1:7])
})
11 changes: 9 additions & 2 deletions vignettes/dataexplorer-intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to DataExplorer}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
\usepackage[utf8]{inputenc}
---

```{r setup, include=FALSE}
Expand All @@ -18,7 +18,14 @@ library(ggplot2)
library(nycflights13)
library(networkD3)
opts_chunk$set(fig.width = 6, fig.height = 6, fig.align = "center", warning = FALSE)
opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 6,
fig.height = 6,
fig.align = "center",
warning = FALSE
)
```

<script src="d3.min.js"></script>
Expand Down

0 comments on commit 8cc19ac

Please sign in to comment.