
Missings #258

Merged · 4 commits · Oct 5, 2023
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
version: '1'
arch: x64
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
2 changes: 1 addition & 1 deletion .github/workflows/CompatHelper.yml
@@ -10,7 +10,7 @@ jobs:
steps:
- uses: julia-actions/setup-julia@latest
with:
version: 1.3
version: 1.6
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main
2 changes: 1 addition & 1 deletion .github/workflows/Docs.yml
@@ -10,7 +10,7 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.1"
version = "0.16.2"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
4 changes: 2 additions & 2 deletions README.md
@@ -103,9 +103,9 @@ preds = m(x_train)

### DataFrames input

When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a `DataFrame` as input, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the variables to be used as features.

`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using a `≤` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
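
As an illustration, a minimal sketch (using CategoricalArrays, with made-up column names) of preparing ordered and unordered categorical features before fitting:

```julia
using DataFrames, CategoricalArrays

df = DataFrame(
    height = [1.6, 1.8, 1.7],            # Real: used as a numerical feature
    size = ["small", "large", "medium"], # String: convert to Categorical to be used as a feature
    group = ["a", "b", "a"],
)

# ordered categorical: splits use the `≤` rule on the level order
transform!(df, :size => (x -> categorical(x; levels=["small", "medium", "large"], ordered=true)) => :size)
# unordered categorical: splits use `==`
transform!(df, :group => categorical => :group)
```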

```julia
dtrain = DataFrame(x_train, :auto)
```
85 changes: 54 additions & 31 deletions benchmarks/Yahoo-LTRC.jl
@@ -59,10 +59,26 @@ x_train = dtrain[:x][:, .!drop_cols]
x_eval = deval[:x][:, .!drop_cols]
x_test = dtest[:x][:, .!drop_cols]

# x_train_miss = x_train .== 0
# x_eval_miss = x_eval .== 0
# x_test_miss = x_test .== 0

# x_train[x_train.==0] .= 0.5
# x_eval[x_eval.==0] .= 0.5
# x_test[x_test.==0] .= 0.5

# x_train = hcat(x_train, x_train_miss)
# x_eval = hcat(x_eval, x_eval_miss)
# x_test = hcat(x_test, x_test_miss)

q_train = dtrain[:q]
q_eval = deval[:q]
q_test = dtest[:q]

y_train = dtrain[:y];
y_eval = deval[:y];
y_test = dtest[:y];

#####################################
# mse regression
#####################################
@@ -98,12 +114,12 @@ p_test = m_mse(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test MSE" ndcg_test
@info "MSE - test data - MSE model" mean((p_test .- y_test) .^ 2)
@info "NDCG - test data - MSE model" ndcg_test

#####################################
# logistic regression
#####################################

max_rank = 4
y_train = dtrain[:y] ./ max_rank
y_eval = deval[:y] ./ max_rank
@@ -145,57 +161,67 @@ p_test = m_logloss(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test LogLoss" ndcg_test

@info "NDCG - test data - LogLoss model" ndcg_test

#####################################
# logistic regression on DataFrame
#####################################
target_name = "y"

df_train = DataFrame(x_train, :auto)
df_train.y = dtrain[:y]
df_train.y = dtrain[:y] ./ 4
df_train.q = dtrain[:q]

df_eval = DataFrame(x_eval, :auto)
df_eval.y = deval[:y]
df_eval.y = deval[:y] ./ 4
df_eval.q = deval[:q]

df_test = DataFrame(x_test, :auto)
df_test.y = dtest[:y]
df_test.y = dtest[:y] ./ 4
df_test.q = dtest[:q]

function rank_target_norm(y::AbstractVector)
out = similar(y)
if minimum(y) == maximum(y)
# out .= 0.75
out .= 0.75
out .= 0.5
else
# out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5

out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
end
return out
end

df_train = transform!(
groupby(df_train, "q"),
"y" => rank_target_norm => "y")
function percent_rank(x::AbstractVector{T}) where {T}
return tiedrank(x) / (length(x) + 1)
end

feature_names_raw = setdiff(names(df_train), ["y", "q"])
feature_names_rel = feature_names_raw .* "_rel"

df_eval = transform!(
groupby(df_eval, "q"),
"y" => rank_target_norm => "y")
transform!(df_train, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_eval, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_test, feature_names_raw .=> percent_rank .=> feature_names_rel)

df_test = transform!(
groupby(df_test, "q"),
"y" => rank_target_norm => "y")
feature_names = setdiff(names(df_train), ["y", "q"])

# df_train = transform!(
# groupby(df_train, "q"),
# "y" => rank_target_norm => "y")

# df_eval = transform!(
# groupby(df_eval, "q"),
# "y" => rank_target_norm => "y")

# df_test = transform!(
# groupby(df_test, "q"),
# "y" => rank_target_norm => "y")

minimum(df_eval.y)
maximum(df_eval.y)

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.005,
eta=0.01,
nbins=64,
max_depth=11,
rowsample=0.9,
@@ -205,28 +231,25 @@ config = EvoTreeRegressor(
@time m_logloss_df, logger_logloss_df = fit_evotree(
config,
df_train;
target_name="y",
fnames=setdiff(names(df_train), ["y", "q"]),
target_name,
fnames=feature_names_raw,
deval=df_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

m_logloss_df.info
p_test_df = m_logloss_df(df_test);
p_test_mat = m_logloss_df(x_test);
# p_test_mat = m_logloss_df(x_test);

EvoTrees.importance(m_logloss_df)

p_test = m_logloss_df(df_test);
test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss DF" ndcg_test
# ndcg_test = 0.8022558972243291
# ndcg_test = 0.8020754563069513
@info "NDCG - test data - LogLoss DF model" ndcg_test
56 changes: 53 additions & 3 deletions docs/src/index.md
@@ -52,9 +52,9 @@ m = fit_evotree(config; x_train, y_train)
preds = m(x_train)
```

### DataFrames and Tables input
### Tables and DataFrames input

When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a `Tables`-compatible input such as a `DataFrame`, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used.

`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using a `≤` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
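
For instance, a minimal sketch of selecting input features explicitly through the `fnames` kwarg (assuming columns `x1`, `x2` and `x3` exist in `dtrain`):

```julia
m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x2", "x3"])
```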

@@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu")
```


## Reproducibility

EvoTrees models trained on cpu can be fully reproducible.
@@ -107,6 +106,57 @@ Note that in presence of multiple identical or very highly correlated features,

At the moment, there's no reproducibility guarantee on GPU, although this may change in the future.
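
As a minimal sketch, assuming the `rng` keyword seeds the model's stochastic sampling (such as `rowsample`), two cpu trainings with the same seed should produce identical predictions:

```julia
config = EvoTreeRegressor(nrounds=100, rowsample=0.5, rng=123)
m1 = fit_evotree(config; x_train, y_train)
m2 = fit_evotree(config; x_train, y_train)
@assert m1(x_train) == m2(x_train)
```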

## Missing values

### Features

EvoTrees does not handle features with missing values. Proper preprocessing of the data is therefore needed (and is good general practice regardless of the ML model used).

This includes situations where all values are non-missing but the `eltype` is of the form `Union{Missing,Float64}`. A conversion of the element type using `identity` is recommended:

```julia
julia> x = Vector{Union{Missing, Float64}}([1, 2])
2-element Vector{Union{Missing, Float64}}:
1.0
2.0

julia> identity.(x)
2-element Vector{Float64}:
1.0
2.0
```
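
The same narrowing can be applied to every column of a `DataFrame` at once; a sketch using `mapcols` (columns still holding actual `missing` values keep their `Union` eltype and must be imputed first, as shown below):

```julia
using DataFrames

# each column with no remaining `missing` values gets a narrowed eltype
df = mapcols(col -> identity.(col), df)
```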

For dealing with numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing:

```julia
transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing)
```

Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or using a more sophisticated approach such as predictions from another model):

```julia
transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat);
```

For unordered categorical variables, recoding `missing` into a non-missing level is sufficient:
```julia
julia> x = categorical(["a", "b", missing])
3-element CategoricalArray{Union{Missing, String},1,UInt32}:
"a"
"b"
missing

julia> x = recode(x, missing => "missing value")
3-element CategoricalArray{String,1,UInt32}:
"a"
"b"
"missing value"
```

### Target

The target variable must have an element type `<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue`, `Integer`, `String` and `Char` are supported.
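
As a hypothetical example (with made-up data; `x_train` is assumed to hold the matching features), a `String` target is valid for `EvoTreeClassifier` only, while a regression target must first be narrowed to a `Real` eltype:

```julia
# classification: String labels are accepted directly by EvoTreeClassifier
config = EvoTreeClassifier(nrounds=10)
m = fit_evotree(config; x_train, y_train=["yes", "no", "yes", "no"])

# regression: narrow a Union{Missing,Float64} eltype once no missing values remain
# (`y_raw` is a hypothetical vector with no remaining `missing` values)
y_train = identity.(y_raw)
```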

## Save/Load

6 changes: 3 additions & 3 deletions docs/src/tutorials/logistic-regression-titanic.md
@@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe

A first step in data processing is to prepare the input features in a model compatible format.

EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`.
EvoTrees' Tables API supports inputs that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables.
A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`.

For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing.
Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model).
For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing.
Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or using a more sophisticated approach such as predictions from another model).

```julia
# convert string feature to Categorical
transform!(df, :Sex => categorical => :Sex)
```
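
Following the indicator-plus-imputation pattern just described, a sketch of the `Age` handling (column names taken from the Titanic dataframe):

```julia
using Statistics

# flag which Age values were missing, then impute them with the median age
transform!(df, :Age => ByRow(ismissing) => :Age_ismissing)
transform!(df, :Age => (x -> coalesce.(x, median(skipmissing(x)))) => :Age)
```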
5 changes: 4 additions & 1 deletion src/fit-utils.jl
@@ -2,9 +2,10 @@
get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())

Get the braking points of the feature data.
Get the histogram breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T}
@assert T <: Real
nobs = min(size(X, 1), 1000 * nbins)
idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
@@ -80,6 +81,8 @@
x_bin[:, j] .= levelcode.(col)
elseif eltype(col) <: Real
x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col)
else
@error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))"

Check warning on line 85 in src/fit-utils.jl

View check run for this annotation

Codecov / codecov/patch

src/fit-utils.jl#L85

Added line #L85 was not covered by tests
end
end
return x_bin
9 changes: 8 additions & 1 deletion src/gpu/init.jl
@@ -8,11 +8,13 @@

target_levels = nothing
if L == Logistic
@assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1
K = 1
y = T.(y_train)
μ = [logit(mean(y))]
!isnothing(offset) && (offset .= logit.(offset))
elseif L in [Poisson, Gamma, Tweedie]
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = fill(log(mean(y)), 1)
@@ -21,26 +23,31 @@
if eltype(y_train) <: CategoricalValue
target_levels = CategoricalArrays.levels(y_train)
y = UInt32.(CategoricalArrays.levelcode.(y_train))
else
elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char
target_levels = sort(unique(y_train))
yc = CategoricalVector(y_train, levels=target_levels)
y = UInt32.(CategoricalArrays.levelcode.(yc))
else
@error "Invalid target eltype: $(eltype(y_train))"

Check warning on line 31 in src/gpu/init.jl

View check run for this annotation

Codecov / codecov/patch

src/gpu/init.jl#L31

Added line #L31 was not covered by tests
end
K = length(target_levels)
μ = T.(log.(proportions(y, UInt32(1):UInt32(K))))
μ .-= maximum(μ)
!isnothing(offset) && (offset .= log.(offset))
elseif L == GaussianMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y))]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
elseif L == LogisticMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y) * sqrt(3) / π)]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
else
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = [mean(y)]