Skip to content

Commit

Permalink
fixing factor issue for alluvial_model_response
Browse files Browse the repository at this point in the history
  • Loading branch information
erblast committed Jun 11, 2019
1 parent c09fdf8 commit 9a8e350
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 14 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: easyalluvial
Title: Generate Alluvial Plots with a Single Line of Code
Version: 0.2.0
Version: 0.2.0.900
Authors@R: person( "Bjoern", "Koneswarakantha", role = c("aut","cre"), email = "[email protected]", comment = c(ORCID = " 0000-0003-4585-7799") )
URL: https://github.com/erblast/easyalluvial
Description: Alluvial plots are similar to sankey diagrams and visualise categorical data
Expand Down
73 changes: 60 additions & 13 deletions R/alluvial_model_response.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,10 @@ tidy_imp = function(imp, df, .f = max){
#'@param degree integer, number of top important variables to select. For
#' plotting more than 4 will result in two many flows and the alluvial plot
#' will not be very readable, Default: 4
#'@param bins integer, number of bins for numeric variables, increasing this
#' number might result in too many flows, Default: 5
#'@param bins integer, number of bins for numeric variables, and maximum number
#' of levels for factor variables, increasing this number might result in too
#' many flows, Default: 5
#'@param max_levels integer, maximum number of levels per factor variable, Default: 10
#'@return data frame
#'@details this model visualisation approach follows the "visualising the model
#' in the dataspace" principle as described in Wickham H, Cook D, Hofmann H
Expand All @@ -139,7 +141,7 @@ tidy_imp = function(imp, df, .f = max){
#'@export
#'@seealso \code{\link[easyalluvial]{alluvial_wide}},
#' \code{\link[easyalluvial]{manip_bin_numerics}}
get_data_space = function(df, imp, degree = 4, bins = 5){
get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){

degree = check_degree(degree, imp, df)

Expand All @@ -152,17 +154,62 @@ get_data_space = function(df, imp, degree = 4, bins = 5){
df_top = select(df, one_of(imp_top$vars) )

numerics_top = names( select_if( df_top, is.numeric ) )

factors_top = names( select_if( df_top, is.factor) )

# generate warning if number of levels is > max_levels
n_levels = select(df, one_of(factors_top) ) %>%
summarise_all( ~ length( levels(.) ) ) %>%
unlist()

most_levels = n_levels[ which(n_levels == max(n_levels)) ]

if(most_levels > max_levels){
warning( paste('factor', names(most_levels), 'contains', most_levels
, 'levels. Data space will only be created for top', max_levels
, 'levels. Adjust this behaviour using the `max_levels` parameter ') )
}

summarise_top = function(x, agg){
if(is.numeric(x)){
return( agg(x) )
}
else{
return( x[1] )
}
}

# variant 1
df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
distinct() %>%
mutate_if( ~ ! is.ordered(.), ~ as.numeric( as.character(.) ) ) %>%
bind_rows( summarise_all(df_top, max) ) %>%
bind_rows( summarise_all(df_top, min) ) %>%
mutate_if( ~ ! is.ordered(.), as.factor ) %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
mutate_if( is.factor, fct_lump, n = bins )

if(! is_empty(numerics_top)){

factors_top = NULL

df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
mutate_at( vars( one_of(numerics_top) ), ~ as.numeric( as.character(.) ) ) %>%
bind_rows( summarise_all(df_top, summarise_top, agg = max) ) %>%
bind_rows( summarise_all(df_top, summarise_top, agg = min) ) %>%
mutate_at( vars( one_of(numerics_top) ), as.factor ) %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )

}else{
df_facs = df_top %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )
}

# make sure levels are the same as in input data
for(fac in factors_top){
missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ]
df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels)
}

# variant 2
# df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins) %>%
Expand Down
29 changes: 29 additions & 0 deletions data-raw/create_titanic.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
data("titanic_train",package="titanic")
library(tidyverse)
str(titanic_train)

d <- titanic_train %>% as_tibble %>%
mutate(title=str_replace_all(string = Name, # extract title as general feature
pattern = "^[[:alpha:][:space:]'-]+,\\s+(the\\s)?(\\w+)\\..+",
replacement = "\\2")) %>%
mutate(title=str_trim(title),
title=case_when(title %in% c('Mlle','Ms')~'Miss', # normalize some titles
title=='Mme'~ 'Mrs',
title %in% c('Capt','Don','Major','Sir','Jonkheer', 'Col')~'Sir',
title %in% c('Dona', 'Lady', 'Countess')~'Lady',
TRUE~title)) %>%
mutate(title=as_factor(title),
Survived=factor(Survived,levels = c(0,1),labels=c("no","yes")),
Sex=as_factor(Sex),
Pclass=factor(Pclass,ordered = T)) %>%
group_by(title) %>% # impute Age by median in current title
mutate(Age=replace_na(Age,replace = median(Age,na.rm = T))) %>% ungroup
table(d$title,d$Sex) # look on title distribution
caret::nearZeroVar(x = d,saveMetrics = T) # search and drop some unusefull features (PassengerId,Name,Ticket)
d <- d %>% select_at(vars(-c(PassengerId,Name,Ticket)))

titanic_fac = d %>%
mutate_if(is.character, as.factor) %>%
select( - Cabin, - Fare, - Age, -SibSp, - Parch)

usethis::use_data( titanic_fac, overwrite = TRUE )
Binary file added data/titanic.rda
Binary file not shown.
Binary file added data/titanic_fac.rda
Binary file not shown.
Binary file modified tests/testthat/Rplots.pdf
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/testthat/test_alluvial_model_response.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,19 @@ test_that('alluvial_model_response'

vdiffr::expect_doppelganger('model_response_cat_multi', p)

# all factors

set.seed(0)
m = randomForest::randomForest( Survived ~ ., titanic_fac)
imp = m$importance

expect_warning( {dspace = get_data_space(titanic_fac, imp, degree = 3, max_levels = 5)} )

expect_true( nrow(dspace) == 30 )

pred = predict(m, newdata = dspace,type = 'response')
p = alluvial_model_response(pred, dspace, imp, degree = 3)

})

test_that('alluvial_model_response_caret'
Expand Down

0 comments on commit 9a8e350

Please sign in to comment.