Skip to content

Commit

Permalink
Completed #76
Browse files Browse the repository at this point in the history
  • Loading branch information
boxuancui committed Oct 15, 2018
1 parent 897a5d9 commit 4d2c449
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 4 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(plot_boxplot)
export(plot_correlation)
export(plot_density)
export(plot_histogram)
export(plot_intro)
export(plot_missing)
export(plot_prcomp)
export(plot_scatterplot)
Expand All @@ -25,7 +26,9 @@ importFrom(networkD3,radialNetwork)
importFrom(parallel,detectCores)
importFrom(parallel,mclapply)
importFrom(rmarkdown,render)
importFrom(scales,comma)
importFrom(scales,percent)
importFrom(stats,complete.cases)
importFrom(stats,cor)
importFrom(stats,prcomp)
importFrom(stats,reorder)
Expand Down
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Changelog

### DataExplorer 0.6.1.9000
#### New Features
* [#76](https://github.com/boxuancui/DataExplorer/issues/76): Added `plot_intro` to visualize results of `introduce`.

#### Enhancements
* [#42](https://github.com/boxuancui/DataExplorer/issues/42): Applied S3 methods for plotting functions.
* [#77](https://github.com/boxuancui/DataExplorer/issues/77): `dummify` now works on selected columns.
Expand All @@ -17,7 +20,7 @@
### DataExplorer 0.6.1
#### Enhancements
* Updated vignette for better clarity.
* Added better error handler for `plot_prcomp`.
* [#71](https://github.com/boxuancui/DataExplorer/issues/71): Added better error handler for `plot_prcomp`.

#### Bug Fixes
* [#69](https://github.com/boxuancui/DataExplorer/issues/69): Fixed bug causing `create_report` failure (specifically from `plot_prcomp`) when `y` is specified.
Expand Down
5 changes: 4 additions & 1 deletion R/introduce.r
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
#' \item{continuous_columns: number of continuous columns}
#' \item{all_missing_columns: number of columns with everything missing}
#' \item{total_missing_values: number of missing observations}
#' \item{complete_rows: number of rows without missing values. See \link{complete.cases}.}
#' \item{total_observations: total number of observations}
#' \item{memory_usage: estimated memory allocation in bytes. See \link{object.size}}
#' \item{memory_usage: estimated memory allocation in bytes. See \link{object.size}.}
#' }
#' @import data.table
#' @importFrom stats complete.cases
#' @importFrom utils object.size
#' @export introduce
#' @examples
Expand All @@ -35,6 +37,7 @@ introduce <- function(data) {
"continuous_columns" = split_data[["num_continuous"]],
"all_missing_columns" = split_data[["num_all_missing"]],
"total_missing_values" = sum(is.na(data)),
"complete_rows" = sum(complete.cases(data)),
"total_observations" = nrow(data) * ncol(data),
"memory_usage" = as.numeric(object.size(data))
)
Expand Down
57 changes: 57 additions & 0 deletions R/plot_intro.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#' Plot introduction
#'
#' Plot basic information (from \link{introduce}) for input data.
#' @param data input data
#' @param title plot title. If \code{NULL} (the default), the title is set to the estimated memory usage of \code{data}.
#' @param ggtheme complete ggplot2 themes. The default is \link{theme_gray}.
#' @param theme_config a list of configurations to be passed to \link{theme}.
#' @return invisibly return the ggplot object
#' @keywords plot_intro
#' @details To change default font family and size, you may pass \code{base_size} and \code{base_family} to \code{ggtheme} options, e.g., \code{ggtheme = theme_gray(base_size = 15, base_family = "serif")}.
#' @details \code{theme_config} argument expects all inputs to be wrapped in a list object, e.g., to change the text color: \code{theme_config = list("text" = element_text(color = "blue"))}
#' @import ggplot2
#' @importFrom stats reorder
#' @importFrom scales comma percent
#' @export plot_intro
#' @seealso \link{introduce}
#' @examples
#' plot_intro(airquality)
#' plot_intro(iris)

plot_intro <- function(data, title = NULL, ggtheme = theme_gray(), theme_config = list()) {
  ## Declare variable first to pass R CMD check
  id <- dimension <- variable <- value <- NULL
  ## Get intro data
  intro <- introduce(data)
  ## Tag the raw byte count as "object_size" so that format() can pick readable units
  memory_usage <- intro[["memory_usage"]]
  class(memory_usage) <- "object_size"
  memory_usage_string <- format(memory_usage, units = "auto")
  ## Express every metric as a proportion of its own dimension (columns, rows or observations)
  intro2 <- data.table(
    "id" = seq.int(5L),
    "dimension" = c(rep("column", 3L), "row", "observation"),
    "variable" = c("Discrete Columns", "Continuous Columns", "All Missing Columns", "Complete Rows", "Missing Observations"),
    "value" = c(
      intro[["discrete_columns"]] / intro[["columns"]],
      intro[["continuous_columns"]] / intro[["columns"]],
      intro[["all_missing_columns"]] / intro[["columns"]],
      intro[["complete_rows"]] / intro[["rows"]],
      intro[["total_missing_values"]] / intro[["total_observations"]]
    )
  )
  ## Create ggplot object
  ## Bars are ordered by descending id on the x axis, which is then flipped to horizontal bars
  output <- ggplot(intro2, aes(x = reorder(variable, -id), y = value, fill = dimension)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = percent(value))) +
    scale_y_continuous(labels = percent) +
    scale_fill_discrete("Dimension") +
    coord_flip() +
    labs(x = "Metrics", y = "Value")
  ## Use memory usage as the title unless the caller supplied one
  ## (plain if/else: the condition is a scalar, so vectorised ifelse() is not appropriate)
  plot_title <- if (is.null(title)) {
    paste("Memory Usage:", memory_usage_string)
  } else {
    title
  }
  ## Plot object
  class(output) <- c("single", class(output))
  plotDataExplorer(
    obj = output,
    title = plot_title,
    ggtheme = ggtheme,
    theme_config = theme_config
  )
}
1 change: 1 addition & 0 deletions R/plot_missing.r
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#'
#' # Plot with customized theme components
#' plot_missing(airquality, theme_config = list("legend.position" = c("top")))

plot_missing <- function(data, title = NULL, ggtheme = theme_gray(), theme_config = list("legend.position" = c("bottom"))) {
## Declare variable first to pass R CMD check
pct_missing <- NULL
Expand Down
3 changes: 2 additions & 1 deletion inst/rmd_template/report.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,11 @@ if ("introduce" %in% names(report_config)) {
format(memory_usage, unit = "auto")
)
)
## Render content
cat("### Basic Statistics", fill = TRUE)
kable(intro_df)
## Plot introduction
do_call("plot_intro")
}
```

Expand Down
3 changes: 2 additions & 1 deletion man/introduce.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions man/plot_intro.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions vignettes/dataexplorer-intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ introduce(final_data)
kable(introduce(final_data), format.args = list(big.mark = ","))
```

To visualize the table above (with some light analysis):

```{r eda-plot-intro}
plot_intro(final_data)
```

### Missing values
Real-world data is messy. After running the basic descriptive statistics, you might be interested in the missing data profile. You can simply use the `plot_missing` function for this.

Expand Down

0 comments on commit 4d2c449

Please sign in to comment.