diff --git a/R/alluvial_model_response.R b/R/alluvial_model_response.R
index 2621891..42fed31 100644
--- a/R/alluvial_model_response.R
+++ b/R/alluvial_model_response.R
@@ -157,16 +157,20 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
factors_top = names( select_if( df_top, is.factor) )
# generate warning if number of levels is > max_levels
- n_levels = select(df, one_of(factors_top) ) %>%
+ if( ! is_empty(factors_top) ){
+
+ n_levels = select(df, one_of(factors_top) ) %>%
summarise_all( ~ length( levels(.) ) ) %>%
unlist()
- most_levels = n_levels[ which(n_levels == max(n_levels)) ]
+ most_levels = n_levels[ which(n_levels == max(n_levels)) ]
+
- if(most_levels > max_levels){
- warning( paste('factor', names(most_levels), 'contains', most_levels
- , 'levels. Data space will only be created for top', max_levels
- , 'levels. Adjust this behaviour using the `max_levels` parameter ') )
+ if(most_levels[1] > max_levels){
+ warning( paste('factor', names(most_levels), 'contains', most_levels
+ , 'levels. Data space will only be created for top', max_levels
+ , 'levels. Adjust this behaviour using the `max_levels` parameter ') )
+ }
}
summarise_top = function(x, agg){
@@ -178,12 +182,9 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
}
}
- # variant 1
-
- if(! is_empty(numerics_top)){
+ # mix factors and numerics in df_top -----------------------------------------------------------
+ if( ! is_empty(numerics_top) & ! is_empty(factors_top) ){
- factors_top = NULL
-
df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
@@ -195,37 +196,41 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )
+
+ # only factors ---------------------------------------------------------------------------------
+ }else if( ! is_empty(factors_top) ){
- }else{
df_facs = df_top %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )
+
}
- # make sure levels are the same as in input data
- for(fac in factors_top){
- missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ]
- df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels)
+ # make sure levels are the same as in input data ---------------------------------------------
+ if( ! is_empty(factors_top) ){
+ for(fac in factors_top){
+ missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ]
+ df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels)
+ }
}
- # variant 2
- # df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins) %>%
- # distinct() %>%
- # mutate_if( ~ ! is.ordered(.), ~ as.numeric( as.character(.) ) ) %>%
- # #summary()
- # filter_all( all_vars( . > min(.) ) ) %>%
- # filter_all( all_vars( . < max(.) ) ) %>%
- # # summary()
- # bind_rows( summarise_all(df_top, max) ) %>%
- # bind_rows( summarise_all(df_top, min) ) %>%
- # mutate_if( ~ ! is.ordered(.), as.factor ) %>%
- # tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
- # mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
- # mutate_if( is.factor, fct_lump, n = bins )
+ # only numerics ------------------------------------------------------------------------------
+ if( ! is_empty(numerics_top) ){
+
+ df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
+ distinct() %>%
+ mutate_at( vars( one_of(numerics_top) ), ~ as.numeric( as.character(.) ) ) %>%
+ bind_rows( summarise_all(df_top, summarise_top, agg = max) ) %>%
+ bind_rows( summarise_all(df_top, summarise_top, agg = min) ) %>%
+ mutate_at( vars( one_of(numerics_top) ), as.factor ) %>%
+ tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
+ mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) )
+ }
+ # Add remaining features as mode or median -----------------------------------------
mode = function(x) {
ux = unique(x)
ux[which.max(tabulate(match(x, ux)))]
diff --git a/R/data.R b/R/data.R
index 9d6089c..5f772ee 100644
--- a/R/data.R
+++ b/R/data.R
@@ -34,7 +34,6 @@
-
#'mtcars dataset with cyl, vs, am ,gear, carb as factor variables and car model
#'names as id
#'@format A data frame with 32 rows and 12 variables
@@ -54,4 +53,23 @@
#'\item{ids}{car model name}
#'}
#'@source datasets
-"mtcars2"
\ No newline at end of file
+"mtcars2"
+
+
+#'titanic data set'
+#'@format A data frame with 891 rows and 10 variables
+#'\describe{
+#'
+#'\item{Survived}{Survived}
+#'\item{Pclass}{Pclass}
+#'\item{Sex}{Sex}
+#'\item{Age}{Age}
+#'\item{SibSp}{SibSp}
+#'\item{Parch}{Parch}
+#'\item{Fare}{Fare}
+#'\item{Cabin}{Cabin}
+#'\item{Embarked}{Embarked}
+#'\item{title}{title}
+#'}
+#'@source datasets
+"titanic"
diff --git a/data-raw/create_titanic.R b/data-raw/create_titanic.R
index 604d3c8..0694942 100644
--- a/data-raw/create_titanic.R
+++ b/data-raw/create_titanic.R
@@ -1,6 +1,5 @@
data("titanic_train",package="titanic")
library(tidyverse)
-str(titanic_train)
d <- titanic_train %>% as_tibble %>%
mutate(title=str_replace_all(string = Name, # extract title as general feature
@@ -22,8 +21,6 @@ table(d$title,d$Sex) # look on title distribution
caret::nearZeroVar(x = d,saveMetrics = T) # search and drop some unusefull features (PassengerId,Name,Ticket)
d <- d %>% select_at(vars(-c(PassengerId,Name,Ticket)))
-titanic_fac = d %>%
- mutate_if(is.character, as.factor) %>%
- select( - Cabin, - Fare, - Age, -SibSp, - Parch)
+titanic = d
-usethis::use_data( titanic_fac, overwrite = TRUE )
+usethis::use_data( titanic, overwrite = TRUE )
diff --git a/data/titanic.rda b/data/titanic.rda
index fccf56c..787b85c 100644
Binary files a/data/titanic.rda and b/data/titanic.rda differ
diff --git a/data/titanic_fac.rda b/data/titanic_fac.rda
deleted file mode 100644
index ee81abe..0000000
Binary files a/data/titanic_fac.rda and /dev/null differ
diff --git a/man/get_data_space.Rd b/man/get_data_space.Rd
index f1cf5e4..c63a885 100644
--- a/man/get_data_space.Rd
+++ b/man/get_data_space.Rd
@@ -4,7 +4,7 @@
\alias{get_data_space}
\title{calculate data space}
\usage{
-get_data_space(df, imp, degree = 4, bins = 5)
+get_data_space(df, imp, degree = 4, bins = 5, max_levels = 10)
}
\arguments{
\item{df}{dataframe, training data}
@@ -17,8 +17,11 @@ corresponding variable names as found in training data.}
plotting more than 4 will result in two many flows and the alluvial plot
will not be very readable, Default: 4}
-\item{bins}{integer, number of bins for numeric variables, increasing this
-number might result in too many flows, Default: 5}
+\item{bins}{integer, number of bins for numeric variables, and maximum number
+of levels for factor variables, increasing this number might result in too
+many flows, Default: 5}
+
+\item{max_levels}{integer, maximum number of levels per factor variable, Default: 10}
}
\value{
data frame
diff --git a/man/titanic.Rd b/man/titanic.Rd
new file mode 100644
index 0000000..8bb11fe
--- /dev/null
+++ b/man/titanic.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{titanic}
+\alias{titanic}
+\title{titanic data set'}
+\format{A data frame with 891 rows and 10 variables
+\describe{
+
+\item{Survived}{Survived}
+\item{Pclass}{Pclass}
+\item{Sex}{Sex}
+\item{Age}{Age}
+\item{SibSp}{SibSp}
+\item{Parch}{Parch}
+\item{Fare}{Fare}
+\item{Cabin}{Cabin}
+\item{Embarked}{Embarked}
+\item{title}{title}
+}}
+\source{
+datasets
+}
+\usage{
+titanic
+}
+\description{
+titanic data set'
+}
+\keyword{datasets}
diff --git a/tests/figs/alluvial-model-response/all-facs-caret.svg b/tests/figs/alluvial-model-response/all-facs-caret.svg
new file mode 100644
index 0000000..417fc61
--- /dev/null
+++ b/tests/figs/alluvial-model-response/all-facs-caret.svg
@@ -0,0 +1,280 @@
+
+
diff --git a/tests/figs/alluvial-model-response/all-nums-caret.svg b/tests/figs/alluvial-model-response/all-nums-caret.svg
new file mode 100644
index 0000000..dbb702d
--- /dev/null
+++ b/tests/figs/alluvial-model-response/all-nums-caret.svg
@@ -0,0 +1,573 @@
+
+
diff --git a/tests/figs/alluvial-model-response/model-response-all-facs.svg b/tests/figs/alluvial-model-response/model-response-all-facs.svg
new file mode 100644
index 0000000..84b2577
--- /dev/null
+++ b/tests/figs/alluvial-model-response/model-response-all-facs.svg
@@ -0,0 +1,203 @@
+
+
diff --git a/tests/figs/alluvial-model-response/model-response-all-nums.svg b/tests/figs/alluvial-model-response/model-response-all-nums.svg
new file mode 100644
index 0000000..92e710e
--- /dev/null
+++ b/tests/figs/alluvial-model-response/model-response-all-nums.svg
@@ -0,0 +1,573 @@
+
+
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
index be2ac70..307661d 100644
Binary files a/tests/testthat/Rplots.pdf and b/tests/testthat/Rplots.pdf differ
diff --git a/tests/testthat/test_alluvial_model_response.R b/tests/testthat/test_alluvial_model_response.R
index eb4b0f0..2899539 100644
--- a/tests/testthat/test_alluvial_model_response.R
+++ b/tests/testthat/test_alluvial_model_response.R
@@ -168,20 +168,40 @@ test_that('alluvial_model_response'
vdiffr::expect_doppelganger('model_response_cat_multi', p)
- # all factors
+ # all factors -----------------------------------------------
set.seed(0)
- m = randomForest::randomForest( Survived ~ ., titanic_fac)
+
+ df = titanic %>%
+ select_if(is.factor)
+
+ set.seed(0)
+ m = randomForest::randomForest( Survived ~ ., df)
imp = m$importance
- expect_warning( {dspace = get_data_space(titanic_fac, imp, degree = 3, max_levels = 5)} )
+ expect_warning( {dspace = get_data_space(df, imp, degree = 3, max_levels = 5)} )
expect_true( nrow(dspace) == 30 )
pred = predict(m, newdata = dspace,type = 'response')
p = alluvial_model_response(pred, dspace, imp, degree = 3)
+ vdiffr::expect_doppelganger('model_response_all_facs', p)
-})
+ # all numerics ----------------------------------------------
+
+ set.seed(0)
+ df = select(mtcars2, -ids) %>%
+ select_if(is.numeric)
+
+ m = randomForest::randomForest( disp ~ ., df)
+ imp = m$importance
+ dspace = get_data_space(df, imp, degree = 3)
+
+ pred = predict(m, newdata = dspace)
+
+ p = alluvial_model_response(pred, dspace, imp, degree = 3)
+ vdiffr::expect_doppelganger('model_response_all_nums', p)
+ })
test_that('alluvial_model_response_caret'
, {
@@ -222,7 +242,26 @@ test_that('alluvial_model_response_caret'
p = alluvial_model_response_caret(train, degree = 3)
vdiffr::expect_doppelganger('model_response_caret_cat_multi', p)
-
+ # all facs
+ set.seed(1)
+
+ df = titanic %>%
+ select_if( is.factor )
+
+ train = caret::train( Survived ~ ., df, method = 'rf',trControl = caret::trainControl(method = 'none'), importance = T )
+ p = alluvial_model_response_caret(train, degree = 3)
+ vdiffr::expect_doppelganger('all_facs_caret', p)
+
+ # all nums
+ set.seed(1)
+
+ df = select(mtcars2, -ids) %>%
+ select_if( is.numeric )
+
+ train = caret::train( disp ~ ., df, method = 'rf',trControl = caret::trainControl(method = 'none'), importance = T )
+ p = alluvial_model_response_caret(train, degree = 3)
+ vdiffr::expect_doppelganger('all_nums_caret', p)
+
})
test_that('params_bin_numeric_pred',{