diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 40680d19..a056b7c2 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,7 +31,7 @@ jobs: version: '1' arch: x64 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 with: version: ${{ matrix.version }} diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 8afcca93..4594d229 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -10,7 +10,7 @@ jobs: steps: - uses: julia-actions/setup-julia@latest with: - version: 1.3 + version: 1.6 - name: Pkg.add("CompatHelper") run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - name: CompatHelper.main diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml index 6ad069bd..599d907e 100644 --- a/.github/workflows/Docs.yml +++ b/.github/workflows/Docs.yml @@ -10,7 +10,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: version: '1.6' diff --git a/Project.toml b/Project.toml index 5cc8bf4b..784acf1d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = ["jeremiedb "] -version = "0.16.1" +version = "0.16.2" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/README.md b/README.md index fcbe6a98..868eb775 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,9 @@ preds = m(x_train) ### DataFrames input -When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. +When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. 
Alternatively, `fnames` kwarg can be used to specify the variables to be used as features. -`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables. +`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels categorical variables. ```julia dtrain = DataFrame(x_train, :auto) diff --git a/benchmarks/Yahoo-LTRC.jl b/benchmarks/Yahoo-LTRC.jl index 736caca2..6ef81c7b 100644 --- a/benchmarks/Yahoo-LTRC.jl +++ b/benchmarks/Yahoo-LTRC.jl @@ -59,10 +59,26 @@ x_train = dtrain[:x][:, .!drop_cols] x_eval = deval[:x][:, .!drop_cols] x_test = dtest[:x][:, .!drop_cols] +# x_train_miss = x_train .== 0 +# x_eval_miss = x_eval .== 0 +# x_test_miss = x_test .== 0 + +# x_train[x_train.==0] .= 0.5 +# x_eval[x_eval.==0] .= 0.5 +# x_test[x_test.==0] .= 0.5 + +# x_train = hcat(x_train, x_train_miss) +# x_eval = hcat(x_eval, x_eval_miss) +# x_test = hcat(x_test, x_test_miss) + q_train = dtrain[:q] q_eval = deval[:q] q_test = dtest[:q] +y_train = dtrain[:y]; +y_eval = deval[:y]; +y_test = dtest[:y]; + ##################################### # mse regression ##################################### @@ -98,12 +114,12 @@ p_test = m_mse(x_test); test_df = DataFrame(p=p_test, y=y_test, q=q_test) test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg") ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5) -@info "ndcg_test MSE" ndcg_test +@info "MSE - test data - MSE model" mean((p_test .- y_test) .^ 2) +@info "NDCG - test data - MSE model" ndcg_test ##################################### # logistic 
regression ##################################### - max_rank = 4 y_train = dtrain[:y] ./ max_rank y_eval = deval[:y] ./ max_rank @@ -145,49 +161,59 @@ p_test = m_logloss(x_test); test_df = DataFrame(p=p_test, y=y_test, q=q_test) test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg") ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5) -@info "ndcg_test LogLoss" ndcg_test - +@info "NDCG - test data - LogLoss model" ndcg_test ##################################### # logistic regression on DataFrame ##################################### +target_name = "y" df_train = DataFrame(x_train, :auto) -df_train.y = dtrain[:y] +df_train.y = dtrain[:y] ./ 4 df_train.q = dtrain[:q] df_eval = DataFrame(x_eval, :auto) -df_eval.y = deval[:y] +df_eval.y = deval[:y] ./ 4 df_eval.q = deval[:q] df_test = DataFrame(x_test, :auto) -df_test.y = dtest[:y] +df_test.y = dtest[:y] ./ 4 df_test.q = dtest[:q] function rank_target_norm(y::AbstractVector) out = similar(y) if minimum(y) == maximum(y) - # out .= 0.75 - out .= 0.75 + out .= 0.5 else - # out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y)) - out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5 - + out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y)) end return out end -df_train = transform!( - groupby(df_train, "q"), - "y" => rank_target_norm => "y") +function percent_rank(x::AbstractVector{T}) where {T} + return tiedrank(x) / (length(x) + 1) +end + +feature_names_raw = setdiff(names(df_train), ["y", "q"]) +feature_names_rel = feature_names_raw .* "_rel" -df_eval = transform!( - groupby(df_eval, "q"), - "y" => rank_target_norm => "y") +transform!(df_train, feature_names_raw .=> percent_rank .=> feature_names_rel) +transform!(df_eval, feature_names_raw .=> percent_rank .=> feature_names_rel) +transform!(df_test, feature_names_raw .=> percent_rank .=> feature_names_rel) -df_test = transform!( - groupby(df_test, "q"), - "y" => rank_target_norm => "y") +feature_names = setdiff(names(df_train), ["y", 
"q"]) + +# df_train = transform!( +# groupby(df_train, "q"), +# "y" => rank_target_norm => "y") + +# df_eval = transform!( +# groupby(df_eval, "q"), +# "y" => rank_target_norm => "y") + +# df_test = transform!( +# groupby(df_test, "q"), +# "y" => rank_target_norm => "y") minimum(df_eval.y) maximum(df_eval.y) @@ -195,7 +221,7 @@ maximum(df_eval.y) config = EvoTreeRegressor( nrounds=6000, loss=:logloss, - eta=0.005, + eta=0.01, nbins=64, max_depth=11, rowsample=0.9, @@ -205,8 +231,8 @@ config = EvoTreeRegressor( @time m_logloss_df, logger_logloss_df = fit_evotree( config, df_train; - target_name="y", - fnames=setdiff(names(df_train), ["y", "q"]), + target_name, + fnames=feature_names_raw, deval=df_eval, early_stopping_rounds=200, print_every_n=50, @@ -214,14 +240,9 @@ config = EvoTreeRegressor( return_logger=true ); -# use the original y since NDCG is scale sensitive -y_train = dtrain[:y] -y_eval = deval[:y] -y_test = dtest[:y] - m_logloss_df.info p_test_df = m_logloss_df(df_test); -p_test_mat = m_logloss_df(x_test); +# p_test_mat = m_logloss_df(x_test); EvoTrees.importance(m_logloss_df) @@ -229,4 +250,6 @@ p_test = m_logloss_df(df_test); test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q]) test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg") ndcg_test = mean(test_df_agg.ndcg) -@info "ndcg_test LogLoss DF" ndcg_test +# ndcg_test = 0.8022558972243291 +# ndcg_test = 0.8020754563069513 +@info "NDCG - test data - LogLoss DF model" ndcg_test diff --git a/docs/src/index.md b/docs/src/index.md index 5a3c8ca0..116c3c27 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -52,9 +52,9 @@ m = fit_evotree(config; x_train, y_train) preds = m(x_train) ``` -### DataFrames and Tables input +### Tables and DataFrames input -When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. 
+When using a `Tables` compatible input such as `DataFrames`, features with element type `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used. `Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables. @@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu"); p = m(dtrain; device="gpu") ``` - ## Reproducibility EvoTrees models trained on cpu can be fully reproducible. @@ -107,6 +106,57 @@ Note that in presence of multiple identical or very highly correlated features, At the moment, there's no reproducibility guarantee on GPU, although this may change in the future. +## Missing values + +### Features + +EvoTrees does not handle features having missing values. Proper preprocessing of the data is therefore needed (and a general good practice regardless of the ML model used). + +This includes situations where values may be all non-missing, but where the `eltype` is of the form `Union{Missing,Float64}`. 
A conversion of the type using `identity` is recommended: + +```julia +julia> x = Vector{Union{Missing, Float64}}([1, 2]) +2-element Vector{Union{Missing, Float64}}: + 1.0 + 2.0 + +julia> identity.(x) +2-element Vector{Float64}: + 1.0 + 2.0 +``` + +For dealing with numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing: + +```julia +transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing) +``` + +Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or using a more sophisticated approach such as predictions from another model): + +```julia +transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat); +``` + +For unordered categorical variables, recoding the missing values into a non-missing level is sufficient: +```julia +julia> x = categorical(["a", "b", missing]) +3-element CategoricalArray{Union{Missing, String},1,UInt32}: + "a" + "b" + missing + +julia> x = recode(x, missing => "missing value") +3-element CategoricalArray{String,1,UInt32}: + "a" + "b" + "missing value" +``` + +### Target + +The target variable must have an element type `<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue`, `Integer`, `String` and `Char` are supported. + ## Save/Load ```julia diff --git a/docs/src/tutorials/logistic-regression-titanic.md b/docs/src/tutorials/logistic-regression-titanic.md index 1910c3ec..c7ff636c 100644 --- a/docs/src/tutorials/logistic-regression-titanic.md +++ b/docs/src/tutorials/logistic-regression-titanic.md @@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe A first step in data processing is to prepare the input features in a model compatible format. -EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`. +EvoTrees' Tables API supports input that are either `Real` (incl. 
`Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables. A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`. -For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing. -Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model). +For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing. +Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or a more sophisticated approach such as predictions from another model). ```julia # convert string feature to Categorical diff --git a/src/fit-utils.jl b/src/fit-utils.jl index b4bae526..f0fb0b93 100644 --- a/src/fit-utils.jl +++ b/src/fit-utils.jl @@ -2,9 +2,10 @@ get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T} get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG()) -Get the braking points of the feature data. +Get the histogram breaking points of the feature data. 
""" function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T} + @assert T <: Real nobs = min(size(X, 1), 1000 * nbins) idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true) nfeats = size(X, 2) @@ -80,6 +81,8 @@ function binarize(df; fnames, edges) x_bin[:, j] .= levelcode.(col) elseif eltype(col) <: Real x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col) + else + @error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))" end end return x_bin diff --git a/src/gpu/init.jl b/src/gpu/init.jl index e4661182..3c3d682d 100644 --- a/src/gpu/init.jl +++ b/src/gpu/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) elseif L == LogisticMLE + @assert eltype(y_train) <: Real K 
= 2 y = T.(y_train) μ = [mean(y), log(std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = [mean(y)] diff --git a/src/init.jl b/src/init.jl index a6c8d299..be421e6b 100644 --- a/src/init.jl +++ b/src/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) elseif L == LogisticMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = [mean(y)] diff --git a/test/MLJ.jl b/test/MLJ.jl index 740b32c2..906503cf 100644 --- a/test/MLJ.jl +++ b/test/MLJ.jl @@ -121,12 +121,10 @@ 
fit!(mach, rows=train, verbosity=1) pred_train = predict(mach, selectrows(X, train)) pred_train_mode = predict_mode(mach, selectrows(X, train)) -cross_entropy(pred_train, selectrows(y, train)) |> mean sum(pred_train_mode .== y[train]) / length(y[train]) pred_test = predict(mach, selectrows(X, test)) pred_test_mode = predict_mode(mach, selectrows(X, test)) -cross_entropy(pred_test, selectrows(y, test)) |> mean sum(pred_test_mode .== y[test]) / length(y[test]) pred_test_mode = predict_mode(mach, selectrows(X, test)) diff --git a/test/missings.jl b/test/missings.jl new file mode 100644 index 00000000..d4edddb8 --- /dev/null +++ b/test/missings.jl @@ -0,0 +1,123 @@ +using Statistics +using StatsBase: sample +using EvoTrees: sigmoid, logit +using EvoTrees: check_args, check_parameter +using CategoricalArrays +using DataFrames +using Random: seed! + +# prepare a dataset +seed!(123) +nobs = 1_000 +x_num = rand(nobs) .* 5 +lvls = ["a", "b", "c"] +x_cat = categorical(rand(lvls, nobs), levels=lvls, ordered=false) +x_bool = rand(Bool, nobs) + +x_num_m1 = Vector{Union{Missing,Float64}}(copy(x_num)) +x_num_m2 = Vector{Any}(copy(x_num)) +lvls_m1 = ["a", "b", "c", missing] +x_cat_m1 = categorical(rand(lvls_m1, nobs), levels=lvls) +x_bool_m1 = Vector{Union{Missing,Bool}}(copy(x_bool)) + +# train-eval split +is = collect(1:nobs) +i_sample = sample(is, nobs, replace=false) +train_size = 0.8 +i_train = i_sample[1:floor(Int, train_size * nobs)] +i_eval = i_sample[floor(Int, train_size * nobs)+1:end] + +# target var +y_tot = sin.(x_num) .* 0.5 .+ 0.5 +y_tot = logit(y_tot) + randn(nobs) +y_tot = sigmoid(y_tot) +target_name = "y" +y_tot = sigmoid(y_tot) +y_tot_m1 = allowmissing(y_tot) +y_tot_m1[1] = missing + +config = EvoTreeRegressor( + loss=:linear, + nrounds=100, + nbins=16, + lambda=0.5, + gamma=0.1, + eta=0.05, + max_depth=3, + min_weight=1.0, + rowsample=0.5, + colsample=1.0, + rng=123, +) + +@testset "DataFrames - missing features" begin + + df_tot = DataFrame(x_num=x_num, 
x_bool=x_bool, x_cat=x_cat, y=y_tot) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + model = fit_evotree( + config, + dtrain; + target_name) + + @test model.info[:fnames] == [:x_num, :x_bool, :x_cat] + + # keep only fnames with eltype <: Real or Categorical + df_tot = DataFrame(x_num=x_num, x_num_m1=x_num_m1, x_num_m2=x_num_m2, + x_cat_m1=x_cat_m1, x_bool_m1=x_bool_m1, y=y_tot) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + model = fit_evotree( + config, + dtrain; + target_name, + deval) + + @test model.info[:fnames] == [:x_num] + + model = fit_evotree( + config, + dtrain; + target_name, + fnames=[:x_num]) + + @test model.info[:fnames] == [:x_num] + + # specifying features with missing values should error + @test_throws AssertionError fit_evotree( + config, + dtrain; + deval, + fnames=[:x_num, :x_num_m1, :x_num_m2, :x_cat_m1, :x_bool_m1], + target_name) + +end + +@testset "DataFrames - missing in target errors" begin + + df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot_m1) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + @test_throws AssertionError fit_evotree( + config, + dtrain; + target_name) + +end + +@testset "Matrix - missing features" begin + + x_tot = allowmissing(hcat(x_num_m1)) + @test_throws AssertionError fit_evotree( + config; + x_train=x_tot, + y_train=y_tot) + + x_tot = Matrix{Any}(hcat(x_num_m2)) + @test_throws AssertionError fit_evotree( + config; + x_train=x_tot, + y_train=y_tot) + +end + diff --git a/test/runtests.jl b/test/runtests.jl index 891f6d34..a4228a49 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,7 @@ using Test include("oblivious.jl") include("tables.jl") include("monotonic.jl") + include("missings.jl") end @testset "MLJ" begin