diff --git a/R/alluvial_model_response.R b/R/alluvial_model_response.R index 2621891..42fed31 100644 --- a/R/alluvial_model_response.R +++ b/R/alluvial_model_response.R @@ -157,16 +157,20 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){ factors_top = names( select_if( df_top, is.factor) ) # generate warning if number of levels is > max_levels - n_levels = select(df, one_of(factors_top) ) %>% + if( ! is_empty(factors_top) ){ + + n_levels = select(df, one_of(factors_top) ) %>% summarise_all( ~ length( levels(.) ) ) %>% unlist() - most_levels = n_levels[ which(n_levels == max(n_levels)) ] + most_levels = n_levels[ which(n_levels == max(n_levels)) ] + - if(most_levels > max_levels){ - warning( paste('factor', names(most_levels), 'contains', most_levels - , 'levels. Data space will only be created for top', max_levels - , 'levels. Adjust this behaviour using the `max_levels` parameter ') ) + if(most_levels[1] > max_levels){ + warning( paste('factor', names(most_levels), 'contains', most_levels + , 'levels. Data space will only be created for top', max_levels + , 'levels. Adjust this behaviour using the `max_levels` parameter ') ) + } } summarise_top = function(x, agg){ @@ -178,12 +182,9 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){ } } - # variant 1 - - if(! is_empty(numerics_top)){ + # mix factors and numerics in df_top ----------------------------------------------------------- + if( ! is_empty(numerics_top) & ! is_empty(factors_top) ){ - factors_top = NULL - df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>% mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>% distinct() %>% @@ -195,37 +196,41 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){ mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>% filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>% mutate_at(vars(one_of(factors_top)), fct_drop ) + + # only factors --------------------------------------------------------------------------------- + }else if( ! is_empty(factors_top) ){ - }else{ df_facs = df_top %>% mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>% distinct() %>% tidyr::complete( !!! map( names(df_top) , as.name) ) %>% filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>% mutate_at(vars(one_of(factors_top)), fct_drop ) + } - # make sure levels are the same as in input data - for(fac in factors_top){ - missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ] - df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels) + # make sure levels are the same as in input data --------------------------------------------- + if( ! is_empty(factors_top) ){ + for(fac in factors_top){ + missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ] + df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels) + } } - # variant 2 - # df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins) %>% - # distinct() %>% - # mutate_if( ~ ! is.ordered(.), ~ as.numeric( as.character(.) ) ) %>% - # #summary() - # filter_all( all_vars( . > min(.) ) ) %>% - # filter_all( all_vars( . < max(.) ) ) %>% - # # summary() - # bind_rows( summarise_all(df_top, max) ) %>% - # bind_rows( summarise_all(df_top, min) ) %>% - # mutate_if( ~ ! is.ordered(.), as.factor ) %>% - # tidyr::complete( !!! map( names(df_top) , as.name) ) %>% - # mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>% - # mutate_if( is.factor, fct_lump, n = bins ) + # only numerics ------------------------------------------------------------------------------ + if( ! is_empty(numerics_top) ){ + + df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>% + distinct() %>% + mutate_at( vars( one_of(numerics_top) ), ~ as.numeric( as.character(.) ) ) %>% + bind_rows( summarise_all(df_top, summarise_top, agg = max) ) %>% + bind_rows( summarise_all(df_top, summarise_top, agg = min) ) %>% + mutate_at( vars( one_of(numerics_top) ), as.factor ) %>% + tidyr::complete( !!! map( names(df_top) , as.name) ) %>% + mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) + } + # Add remaining features as mode or median ----------------------------------------- mode = function(x) { ux = unique(x) ux[which.max(tabulate(match(x, ux)))] diff --git a/R/data.R b/R/data.R index 9d6089c..5f772ee 100644 --- a/R/data.R +++ b/R/data.R @@ -34,7 +34,6 @@ - #'mtcars dataset with cyl, vs, am ,gear, carb as factor variables and car model #'names as id #'@format A data frame with 32 rows and 12 variables @@ -54,4 +53,23 @@ #'\item{ids}{car model name} #'} #'@source datasets -"mtcars2" \ No newline at end of file +"mtcars2" + + +#'titanic data set' +#'@format A data frame with 891 rows and 10 variables +#'\describe{ +#' +#'\item{Survived}{Survived} +#'\item{Pclass}{Pclass} +#'\item{Sex}{Sex} +#'\item{Age}{Age} +#'\item{SibSp}{SibSp} +#'\item{Parch}{Parch} +#'\item{Fare}{Fare} +#'\item{Cabin}{Cabin} +#'\item{Embarked}{Embarked} +#'\item{title}{title} +#'} +#'@source datasets +"titanic" diff --git a/data-raw/create_titanic.R b/data-raw/create_titanic.R index 604d3c8..0694942 100644 --- a/data-raw/create_titanic.R +++ b/data-raw/create_titanic.R @@ -1,6 +1,5 @@ data("titanic_train",package="titanic") library(tidyverse) -str(titanic_train) d <- titanic_train %>% as_tibble %>% mutate(title=str_replace_all(string = Name, # extract title as general feature @@ -22,8 +21,6 @@ table(d$title,d$Sex) # look on title distribution caret::nearZeroVar(x = d,saveMetrics = T) # search and drop some unusefull features (PassengerId,Name,Ticket) d <- d %>% select_at(vars(-c(PassengerId,Name,Ticket))) -titanic_fac = d %>% - mutate_if(is.character, as.factor) %>% - select( - Cabin, - Fare, - Age, -SibSp, - Parch) +titanic = d -usethis::use_data( titanic_fac, overwrite = TRUE ) +usethis::use_data( titanic, overwrite = TRUE ) diff --git a/data/titanic.rda b/data/titanic.rda index fccf56c..787b85c 100644 Binary files a/data/titanic.rda and b/data/titanic.rda differ diff --git a/data/titanic_fac.rda b/data/titanic_fac.rda deleted file mode 100644 index ee81abe..0000000 Binary files a/data/titanic_fac.rda and /dev/null differ diff --git a/man/get_data_space.Rd b/man/get_data_space.Rd index f1cf5e4..c63a885 100644 --- a/man/get_data_space.Rd +++ b/man/get_data_space.Rd @@ -4,7 +4,7 @@ \alias{get_data_space} \title{calculate data space} \usage{ -get_data_space(df, imp, degree = 4, bins = 5) +get_data_space(df, imp, degree = 4, bins = 5, max_levels = 10) } \arguments{ \item{df}{dataframe, training data} @@ -17,8 +17,11 @@ corresponding variable names as found in training data.} plotting more than 4 will result in two many flows and the alluvial plot will not be very readable, Default: 4} -\item{bins}{integer, number of bins for numeric variables, increasing this -number might result in too many flows, Default: 5} +\item{bins}{integer, number of bins for numeric variables, and maximum number +of levels for factor variables, increasing this number might result in too +many flows, Default: 5} + +\item{max_levels}{integer, maximum number of levels per factor variable, Default: 10} } \value{ data frame diff --git a/man/titanic.Rd b/man/titanic.Rd new file mode 100644 index 0000000..8bb11fe --- /dev/null +++ b/man/titanic.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{titanic} +\alias{titanic} +\title{titanic data set'} +\format{A data frame with 891 rows and 10 variables +\describe{ + +\item{Survived}{Survived} +\item{Pclass}{Pclass} +\item{Sex}{Sex} +\item{Age}{Age} +\item{SibSp}{SibSp} +\item{Parch}{Parch} +\item{Fare}{Fare} +\item{Cabin}{Cabin} +\item{Embarked}{Embarked} +\item{title}{title} +}} +\source{ +datasets +} +\usage{ +titanic +} +\description{ +titanic data set' +} +\keyword{datasets} diff --git a/tests/figs/alluvial-model-response/all-facs-caret.svg b/tests/figs/alluvial-model-response/all-facs-caret.svg new file mode 100644 index 0000000..417fc61 --- /dev/null +++ b/tests/figs/alluvial-model-response/all-facs-caret.svg @@ -0,0 +1,280 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +no + +yes + +male +y: 0.09 +n: 0.85 + +female +y: 0.91 +n: 0.15 + +Mr +y: 0.09 +n: 0.15 + +Mrs +y: 0.14 +n: 0.12 + +Miss +y: 0.14 +n: 0.12 + +Master +y: 0.18 +n: 0.08 + +Sir +y: 0.09 +n: 0.15 + +Rev +y: 0.09 +n: 0.15 + +Dr +y: 0.14 +n: 0.12 + +Lady +y: 0.14 +n: 0.12 + +1 +y: 0.41 +n: 0.27 + +2 +y: 0.41 +n: 0.27 + +3 +y: 0.18 +n: 0.46 + + + + + +0 +10 +20 +30 +40 +50 + + + + + + + + + + +pred +Sex +title +Pclass +count +Presented Variables account for 100 % of Variable Importance +Model Response Plot + diff --git a/tests/figs/alluvial-model-response/all-nums-caret.svg b/tests/figs/alluvial-model-response/all-nums-caret.svg new file mode 100644 index 0000000..dbb702d --- /dev/null +++ b/tests/figs/alluvial-model-response/all-nums-caret.svg @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +LL +(93.2,151] + +ML +(151,216] + +M +(216,275] + +MH +(275,335] + +HH +(335,397] + +1.513 +HH: 0 +MH: 0.06 +M: 0.19 +ML: 0.31 +LL: 0.5 + +2.04 +HH: 0 +MH: 0.06 +M: 0.19 +ML: 0.31 +LL: 0.5 + +3.44 +HH: 0.14 +MH: 0.19 +M: 0.35 +ML: 0.17 +LL: 0 + +5.3 +HH: 0.43 +MH: 0.34 +M: 0.13 +ML: 0.11 +LL: 0 + +5.424 +HH: 0.43 +MH: 0.34 +M: 0.13 +ML: 0.11 +LL: 0 + +52 +HH: 0 +MH: 0.12 +M: 0.13 +ML: 0.31 +LL: 0.5 + +78.5 +HH: 0 +MH: 0.12 +M: 0.13 +ML: 0.31 +LL: 0.5 + +150 +HH: 0.29 +MH: 0.12 +M: 0.35 +ML: 0.17 +LL: 0 + +245 +HH: 0.36 +MH: 0.31 +M: 0.19 +ML: 0.11 +LL: 0 + +335 +HH: 0.36 +MH: 0.31 +M: 0.19 +ML: 0.11 +LL: 0 + +10.4 +HH: 0.57 +MH: 0.28 +M: 0.13 +ML: 0.11 +LL: 0 + +14.7 +HH: 0.43 +MH: 0.22 +M: 0.26 +ML: 0.11 +LL: 0 + +19.45 +HH: 0 +MH: 0.25 +M: 0.29 +ML: 0.11 +LL: 0.33 + +30.4 +HH: 0 +MH: 0.12 +M: 0.16 +ML: 0.33 +LL: 0.33 + +33.9 +HH: 0 +MH: 0.12 +M: 0.16 +ML: 0.33 +LL: 0.33 + + + + + +0 +40 +80 +120 + + + + + + + + +pred +wt +hp +mpg +count +Presented Variables account for 81.6 % of Variable Importance +Model Response Plot +Variables not shown have been set to median or mode: drat: 3.695; qsec: 17.71 + diff --git a/tests/figs/alluvial-model-response/model-response-all-facs.svg b/tests/figs/alluvial-model-response/model-response-all-facs.svg new file mode 100644 index 0000000..84b2577 --- /dev/null +++ b/tests/figs/alluvial-model-response/model-response-all-facs.svg @@ -0,0 +1,203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +no + +yes + +Mr +y: 0.12 +n: 0.31 + +Mrs +y: 0.24 +n: 0.15 + +Miss +y: 0.24 +n: 0.15 + +Master +y: 0.29 +n: 0.08 + +Sir +y: 0.12 +n: 0.31 + +male +y: 0.24 +n: 0.85 + +female +y: 0.76 +n: 0.15 + +1 +y: 0.47 +n: 0.15 + +2 +y: 0.35 +n: 0.31 + +3 +y: 0.18 +n: 0.54 + + + + + +0 +10 +20 +30 + + + + + + + + +pred +title +Sex +Pclass +count +Presented Variables account for 100 % of Variable Importance +Model Response Plot + diff --git a/tests/figs/alluvial-model-response/model-response-all-nums.svg b/tests/figs/alluvial-model-response/model-response-all-nums.svg new file mode 100644 index 0000000..92e710e --- /dev/null +++ b/tests/figs/alluvial-model-response/model-response-all-nums.svg @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +LL +(121,170] + +ML +(170,216] + +M +(216,271] + +MH +(271,322] + +HH +(322,373] + +52 +HH: 0 +MH: 0.08 +M: 0.18 +ML: 0.32 +LL: 0.5 + +78.5 +HH: 0 +MH: 0.08 +M: 0.18 +ML: 0.32 +LL: 0.5 + +150 +HH: 0.21 +MH: 0.15 +M: 0.38 +ML: 0.12 +LL: 0 + +245 +HH: 0.42 +MH: 0.35 +M: 0.12 +ML: 0.12 +LL: 0 + +335 +HH: 0.37 +MH: 0.35 +M: 0.15 +ML: 0.12 +LL: 0 + +10.4 +HH: 0.42 +MH: 0.35 +M: 0.12 +ML: 0.12 +LL: 0 + +14.7 +HH: 0.37 +MH: 0.19 +M: 0.26 +ML: 0.12 +LL: 0 + +19.45 +HH: 0.21 +MH: 0.15 +M: 0.32 +ML: 0.06 +LL: 0.33 + +30.4 +HH: 0 +MH: 0.15 +M: 0.15 +ML: 0.35 +LL: 0.33 + +33.9 +HH: 0 +MH: 0.15 +M: 0.15 +ML: 0.35 +LL: 0.33 + +1.513 +HH: 0 +MH: 0.12 +M: 0.18 +ML: 0.29 +LL: 0.5 + +2.04 +HH: 0 +MH: 0.15 +M: 0.15 +ML: 0.29 +LL: 0.5 + +3.44 +HH: 0.16 +MH: 0.19 +M: 0.32 +ML: 0.18 +LL: 0 + +5.3 +HH: 0.42 +MH: 0.27 +M: 0.18 +ML: 0.12 +LL: 0 + +5.424 +HH: 0.42 +MH: 0.27 +M: 0.18 +ML: 0.12 +LL: 0 + + + + + +0 +40 +80 +120 + + + + + + + + +pred +hp +mpg +wt +count +Presented Variables account for 65.8 % of Variable Importance +Model Response Plot +Variables not shown have been set to median or mode: drat: 3.695; qsec: 17.71 + diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf index be2ac70..307661d 100644 Binary files a/tests/testthat/Rplots.pdf and b/tests/testthat/Rplots.pdf differ diff --git a/tests/testthat/test_alluvial_model_response.R b/tests/testthat/test_alluvial_model_response.R index eb4b0f0..2899539 100644 --- a/tests/testthat/test_alluvial_model_response.R +++ b/tests/testthat/test_alluvial_model_response.R @@ -168,20 +168,40 @@ test_that('alluvial_model_response' vdiffr::expect_doppelganger('model_response_cat_multi', p) - # all factors + # all factors ----------------------------------------------- set.seed(0) - m = randomForest::randomForest( Survived ~ ., titanic_fac) + + df = titanic %>% + select_if(is.factor) + + set.seed(0) + m = randomForest::randomForest( Survived ~ ., df) imp = m$importance - expect_warning( {dspace = get_data_space(titanic_fac, imp, degree = 3, max_levels = 5)} ) + expect_warning( {dspace = get_data_space(df, imp, degree = 3, max_levels = 5)} ) expect_true( nrow(dspace) == 30 ) pred = predict(m, newdata = dspace,type = 'response') p = alluvial_model_response(pred, dspace, imp, degree = 3) + vdiffr::expect_doppelganger('model_response_all_facs', p) -}) + # all numerics ---------------------------------------------- + + set.seed(0) + df = select(mtcars2, -ids) %>% + select_if(is.numeric) + + m = randomForest::randomForest( disp ~ ., df) + imp = m$importance + dspace = get_data_space(df, imp, degree = 3) + + pred = predict(m, newdata = dspace) + + p = alluvial_model_response(pred, dspace, imp, degree = 3) + vdiffr::expect_doppelganger('model_response_all_nums', p) + }) test_that('alluvial_model_response_caret' , { @@ -222,7 +242,26 @@ test_that('alluvial_model_response_caret' p = alluvial_model_response_caret(train, degree = 3) vdiffr::expect_doppelganger('model_response_caret_cat_multi', p) - + # all facs + set.seed(1) + + df = titanic %>% + select_if( is.factor ) + + train = caret::train( Survived ~ ., df, method = 'rf',trControl = caret::trainControl(method = 'none'), importance = T ) + p = alluvial_model_response_caret(train, degree = 3) + vdiffr::expect_doppelganger('all_facs_caret', p) + + # all nums + set.seed(1) + + df = select(mtcars2, -ids) %>% + select_if( is.numeric ) + + train = caret::train( disp ~ ., df, method = 'rf',trControl = caret::trainControl(method = 'none'), importance = T ) + p = alluvial_model_response_caret(train, degree = 3) + vdiffr::expect_doppelganger('all_nums_caret', p) + }) test_that('params_bin_numeric_pred',{