Skip to content

Commit

Permalink
fix for #13
Browse files Browse the repository at this point in the history
  • Loading branch information
erblast committed Jun 13, 2019
1 parent 9a8e350 commit 303d87f
Show file tree
Hide file tree
Showing 13 changed files with 1,766 additions and 45 deletions.
65 changes: 35 additions & 30 deletions R/alluvial_model_response.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,16 +157,20 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
factors_top = names( select_if( df_top, is.factor) )

# generate warning if number of levels is > max_levels
n_levels = select(df, one_of(factors_top) ) %>%
if( ! is_empty(factors_top) ){

n_levels = select(df, one_of(factors_top) ) %>%
summarise_all( ~ length( levels(.) ) ) %>%
unlist()

most_levels = n_levels[ which(n_levels == max(n_levels)) ]
most_levels = n_levels[ which(n_levels == max(n_levels)) ]


if(most_levels > max_levels){
warning( paste('factor', names(most_levels), 'contains', most_levels
, 'levels. Data space will only be created for top', max_levels
, 'levels. Adjust this behaviour using the `max_levels` parameter ') )
if(most_levels[1] > max_levels){
warning( paste('factor', names(most_levels), 'contains', most_levels
, 'levels. Data space will only be created for top', max_levels
, 'levels. Adjust this behaviour using the `max_levels` parameter ') )
}
}

summarise_top = function(x, agg){
Expand All @@ -178,12 +182,9 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
}
}

# variant 1

if(! is_empty(numerics_top)){
# mix factors and numerics in df_top -----------------------------------------------------------
if( ! is_empty(numerics_top) & ! is_empty(factors_top) ){

factors_top = NULL

df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
Expand All @@ -195,37 +196,41 @@ get_data_space = function(df, imp, degree = 4, bins = 5, max_levels = 10){
mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )

# only factors ---------------------------------------------------------------------------------
}else if( ! is_empty(factors_top) ){

}else{
df_facs = df_top %>%
mutate_if( is.factor, fct_lump, n = max_levels, other_level = 'easyalluvial_factor_cap' ) %>%
distinct() %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
filter_at( vars(one_of(factors_top)), ~ . != 'easyalluvial_factor_cap') %>%
mutate_at(vars(one_of(factors_top)), fct_drop )

}

# make sure levels are the same as in input data
for(fac in factors_top){
missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ]
df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels)
# make sure levels are the same as in input data ---------------------------------------------
if( ! is_empty(factors_top) ){
for(fac in factors_top){
missing_levels = levels(df[[fac]])[ ! levels(df[[fac]]) %in% levels(df_facs[[fac]]) ]
df_facs[[fac]] = fct_expand(df_facs[[fac]], missing_levels)
}
}

# variant 2
# df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins) %>%
# distinct() %>%
# mutate_if( ~ ! is.ordered(.), ~ as.numeric( as.character(.) ) ) %>%
# #summary()
# filter_all( all_vars( . > min(.) ) ) %>%
# filter_all( all_vars( . < max(.) ) ) %>%
# # summary()
# bind_rows( summarise_all(df_top, max) ) %>%
# bind_rows( summarise_all(df_top, min) ) %>%
# mutate_if( ~ ! is.ordered(.), as.factor ) %>%
# tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
# mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) ) %>%
# mutate_if( is.factor, fct_lump, n = bins )
# only numerics ------------------------------------------------------------------------------
if( ! is_empty(numerics_top) ){

df_facs = manip_bin_numerics(df_top, bin_labels = 'median', bins = bins-2) %>%
distinct() %>%
mutate_at( vars( one_of(numerics_top) ), ~ as.numeric( as.character(.) ) ) %>%
bind_rows( summarise_all(df_top, summarise_top, agg = max) ) %>%
bind_rows( summarise_all(df_top, summarise_top, agg = min) ) %>%
mutate_at( vars( one_of(numerics_top) ), as.factor ) %>%
tidyr::complete( !!! map( names(df_top) , as.name) ) %>%
mutate_at( vars(one_of(numerics_top)), function(x) as.numeric( as.character(x)) )
}

# Add remaining features as mode or median -----------------------------------------
mode = function(x) {
ux = unique(x)
ux[which.max(tabulate(match(x, ux)))]
Expand Down
22 changes: 20 additions & 2 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@




#'mtcars dataset with cyl, vs, am ,gear, carb as factor variables and car model
#'names as id
#'@format A data frame with 32 rows and 12 variables
Expand All @@ -54,4 +53,23 @@
#'\item{ids}{car model name}
#'}
#'@source datasets
"mtcars2"
"mtcars2"


#'titanic data set'
#'@format A data frame with 891 rows and 10 variables
#'\describe{
#'
#'\item{Survived}{Survived}
#'\item{Pclass}{Pclass}
#'\item{Sex}{Sex}
#'\item{Age}{Age}
#'\item{SibSp}{SibSp}
#'\item{Parch}{Parch}
#'\item{Fare}{Fare}
#'\item{Cabin}{Cabin}
#'\item{Embarked}{Embarked}
#'\item{title}{title}
#'}
#'@source datasets
"titanic"
7 changes: 2 additions & 5 deletions data-raw/create_titanic.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
data("titanic_train",package="titanic")
library(tidyverse)
str(titanic_train)

d <- titanic_train %>% as_tibble %>%
mutate(title=str_replace_all(string = Name, # extract title as general feature
Expand All @@ -22,8 +21,6 @@ table(d$title,d$Sex) # look on title distribution
caret::nearZeroVar(x = d,saveMetrics = T) # search and drop some unusefull features (PassengerId,Name,Ticket)
d <- d %>% select_at(vars(-c(PassengerId,Name,Ticket)))

titanic_fac = d %>%
mutate_if(is.character, as.factor) %>%
select( - Cabin, - Fare, - Age, -SibSp, - Parch)
titanic = d

usethis::use_data( titanic_fac, overwrite = TRUE )
usethis::use_data( titanic, overwrite = TRUE )
Binary file modified data/titanic.rda
Binary file not shown.
Binary file removed data/titanic_fac.rda
Binary file not shown.
9 changes: 6 additions & 3 deletions man/get_data_space.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions man/titanic.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 303d87f

Please sign in to comment.