From b399207b025dc37ded5a7c806ae5aecb00821d1b Mon Sep 17 00:00:00 2001
From: "jeremie.desgagne.bouchard"
Date: Sun, 1 Oct 2023 22:05:37 -0400
Subject: [PATCH 1/3] ranking tests

---
 benchmarks/Yahoo-LTRC.jl | 85 +++++++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/benchmarks/Yahoo-LTRC.jl b/benchmarks/Yahoo-LTRC.jl
index 736caca2..6ef81c7b 100644
--- a/benchmarks/Yahoo-LTRC.jl
+++ b/benchmarks/Yahoo-LTRC.jl
@@ -59,10 +59,26 @@ x_train = dtrain[:x][:, .!drop_cols]
 x_eval = deval[:x][:, .!drop_cols]
 x_test = dtest[:x][:, .!drop_cols]
 
+# x_train_miss = x_train .== 0
+# x_eval_miss = x_eval .== 0
+# x_test_miss = x_test .== 0
+
+# x_train[x_train.==0] .= 0.5
+# x_eval[x_eval.==0] .= 0.5
+# x_test[x_test.==0] .= 0.5
+
+# x_train = hcat(x_train, x_train_miss)
+# x_eval = hcat(x_eval, x_eval_miss)
+# x_test = hcat(x_test, x_test_miss)
+
 q_train = dtrain[:q]
 q_eval = deval[:q]
 q_test = dtest[:q]
 
+y_train = dtrain[:y];
+y_eval = deval[:y];
+y_test = dtest[:y];
+
 #####################################
 # mse regression
 #####################################
@@ -98,12 +114,12 @@ p_test = m_mse(x_test);
 test_df = DataFrame(p=p_test, y=y_test, q=q_test)
 test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
 ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
-@info "ndcg_test MSE" ndcg_test
+@info "MSE - test data - MSE model" mean((p_test .- y_test) .^ 2)
+@info "NDCG - test data - MSE model" ndcg_test
 
 #####################################
 # logistic regression
 #####################################
-
 max_rank = 4
 y_train = dtrain[:y] ./ max_rank
 y_eval = deval[:y] ./ max_rank
@@ -145,49 +161,59 @@ p_test = m_logloss(x_test);
 test_df = DataFrame(p=p_test, y=y_test, q=q_test)
 test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
 ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
-@info "ndcg_test LogLoss" ndcg_test
-
+@info "NDCG - test data - LogLoss model" ndcg_test
 
 #####################################
 # logistic regression on DataFrame
 #####################################
+target_name = "y"
+
 df_train = DataFrame(x_train, :auto)
-df_train.y = dtrain[:y]
+df_train.y = dtrain[:y] ./ 4
 df_train.q = dtrain[:q]
 
 df_eval = DataFrame(x_eval, :auto)
-df_eval.y = deval[:y]
+df_eval.y = deval[:y] ./ 4
 df_eval.q = deval[:q]
 
 df_test = DataFrame(x_test, :auto)
-df_test.y = dtest[:y]
+df_test.y = dtest[:y] ./ 4
 df_test.q = dtest[:q]
 
 function rank_target_norm(y::AbstractVector)
     out = similar(y)
     if minimum(y) == maximum(y)
-        # out .= 0.75
-        out .= 0.75
+        out .= 0.5
     else
-        # out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
-        out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5
-
+        out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
     end
     return out
 end
 
-df_train = transform!(
-    groupby(df_train, "q"),
-    "y" => rank_target_norm => "y")
+function percent_rank(x::AbstractVector{T}) where {T}
+    return tiedrank(x) / (length(x) + 1)
+end
+
+feature_names_raw = setdiff(names(df_train), ["y", "q"])
+feature_names_rel = feature_names_raw .* "_rel"
 
-df_eval = transform!(
-    groupby(df_eval, "q"),
-    "y" => rank_target_norm => "y")
+transform!(df_train, feature_names_raw .=> percent_rank .=> feature_names_rel)
+transform!(df_eval, feature_names_raw .=> percent_rank .=> feature_names_rel)
+transform!(df_test, feature_names_raw .=> percent_rank .=> feature_names_rel)
 
-df_test = transform!(
-    groupby(df_test, "q"),
-    "y" => rank_target_norm => "y")
+feature_names = setdiff(names(df_train), ["y", "q"])
+
+# df_train = transform!(
+#     groupby(df_train, "q"),
+#     "y" => rank_target_norm => "y")
+
+# df_eval = transform!(
+#     groupby(df_eval, "q"),
+#     "y" => rank_target_norm => "y")
+
+# df_test = transform!(
+#     groupby(df_test, "q"),
+#     "y" => rank_target_norm => "y")
 
 minimum(df_eval.y)
 maximum(df_eval.y)
@@ -195,7 +221,7 @@ maximum(df_eval.y)
 config = EvoTreeRegressor(
     nrounds=6000,
     loss=:logloss,
-    eta=0.005,
+    eta=0.01,
     nbins=64,
     max_depth=11,
     rowsample=0.9,
@@ -205,8 +231,8 @@ config = EvoTreeRegressor(
 @time m_logloss_df, logger_logloss_df = fit_evotree(
     config,
     df_train;
-    target_name="y",
-    fnames=setdiff(names(df_train), ["y", "q"]),
+    target_name,
+    fnames=feature_names_raw,
    deval=df_eval,
     early_stopping_rounds=200,
     print_every_n=50,
@@ -214,14 +240,9 @@ config = EvoTreeRegressor(
     return_logger=true
 );
 
-# use the original y since NDCG is scale sensitive
-y_train = dtrain[:y]
-y_eval = deval[:y]
-y_test = dtest[:y]
-
 m_logloss_df.info
 p_test_df = m_logloss_df(df_test);
-p_test_mat = m_logloss_df(x_test);
+# p_test_mat = m_logloss_df(x_test);
 
 EvoTrees.importance(m_logloss_df)
@@ -229,4 +250,6 @@ p_test = m_logloss_df(df_test);
 test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
 test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
 ndcg_test = mean(test_df_agg.ndcg)
-@info "ndcg_test LogLoss DF" ndcg_test
+# ndcg_test = 0.8022558972243291
+# ndcg_test = 0.8020754563069513
+@info "NDCG - test data - LogLoss DF model" ndcg_test
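Patch 1 above scores every model with the same per-query pattern: group test-set predictions by query `q`, compute `ndcg` within each group, and average the result. The sketch below makes that pattern self-contained. The benchmark's own `ndcg` is defined earlier in Yahoo-LTRC.jl and is not part of this diff, so the definition here is only an illustrative stand-in.

```julia
# Self-contained sketch of the per-query NDCG aggregation used above.
# The `ndcg` definition is a plausible stand-in, not the benchmark's own.
using DataFrames, Statistics

function ndcg(p::AbstractVector, y::AbstractVector, k=10)
    k = min(k, length(p))
    order_p = partialsortperm(p, 1:k, rev=true)  # ranking induced by the model
    order_y = partialsortperm(y, 1:k, rev=true)  # ideal ranking from true labels
    gains(idx) = sum((2 .^ y[idx] .- 1) ./ log2.((1:k) .+ 1))
    idcg = gains(order_y)
    return idcg == 0 ? 1.0 : gains(order_p) / idcg
end

test_df = DataFrame(p=rand(100), y=rand(0:4, 100), q=rand(1:10, 100))
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
```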
From a48de3cb0f33725c884a33c3f6dd0010af4db30e Mon Sep 17 00:00:00 2001
From: "jeremie.desgagne.bouchard"
Date: Wed, 4 Oct 2023 22:49:25 -0400
Subject: [PATCH 2/3] document support/handling of missings + tests

---
 .github/workflows/CI.yml                     |   2 +-
 .github/workflows/CompatHelper.yml           |   2 +-
 .github/workflows/Docs.yml                   |   2 +-
 Project.toml                                 |   2 +-
 README.md                                    |   4 +-
 docs/src/index.md                            |  55 +++++++-
 .../tutorials/logistic-regression-titanic.md |   6 +-
 src/fit-utils.jl                             |   5 +-
 src/gpu/init.jl                              |   9 +-
 src/init.jl                                  |   9 +-
 test/MLJ.jl                                  |   2 -
 test/missings.jl                             | 123 ++++++++++++++++++
 test/runtests.jl                             |   1 +
 13 files changed, 206 insertions(+), 16 deletions(-)
 create mode 100644 test/missings.jl

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 40680d19..a056b7c2 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
         version: '1'
         arch: x64
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v1
         with:
           version: ${{ matrix.version }}
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 8afcca93..4594d229 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -10,7 +10,7 @@ jobs:
     steps:
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.3
+          version: 1.6
       - name: Pkg.add("CompatHelper")
         run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
       - name: CompatHelper.main
diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml
index 6ad069bd..599d907e 100644
--- a/.github/workflows/Docs.yml
+++ b/.github/workflows/Docs.yml
@@ -10,7 +10,7 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1.6'
diff --git a/Project.toml b/Project.toml
index 5cc8bf4b..784acf1d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb "]
-version = "0.16.1"
+version = "0.16.2"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
diff --git a/README.md b/README.md
index fcbe6a98..868eb775 100644
--- a/README.md
+++ b/README.md
@@ -103,9 +103,9 @@ preds = m(x_train)
 
 ### DataFrames input
 
-When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
+When using a DataFrame as input, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the variables to be used as features.
 
-`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
+`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using the `≤` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
 
 ```julia
 dtrain = DataFrame(x_train, :auto)
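The README paragraph patched above distinguishes numerical, ordered categorical, unordered categorical, and `Bool` feature columns. A minimal sketch of what each looks like in practice, with made-up column names and a random target; only the `fit_evotree` Tables API shown in the README is assumed.

```julia
# Illustrative feature columns for the Tables/DataFrames API; names and the
# target below are invented for the example.
using DataFrames, CategoricalArrays, EvoTrees

nobs = 200
dtrain = DataFrame(
    x_num = randn(nobs),                                        # numerical feature
    x_ord = categorical(rand(["low", "mid", "high"], nobs),
        levels=["low", "mid", "high"], ordered=true),           # ordered: split with `≤`
    x_cat = categorical(rand(["red", "green", "blue"], nobs)),  # unordered: split with `==`
    x_bool = rand(Bool, nobs),                                  # treated as 2-level categorical
)
dtrain.y = randn(nobs)

config = EvoTreeRegressor(nrounds=10)
m = fit_evotree(config, dtrain; target_name="y")
```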
"fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/README.md b/README.md index fcbe6a98..868eb775 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,9 @@ preds = m(x_train) ### DataFrames input -When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. +When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used to specify the variables to be used as features. -`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables. +`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels categorical variables. ```julia dtrain = DataFrame(x_train, :auto) diff --git a/docs/src/index.md b/docs/src/index.md index 5a3c8ca0..28eee7ad 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -52,7 +52,7 @@ m = fit_evotree(config; x_train, y_train) preds = m(x_train) ``` -### DataFrames and Tables input +### Tables and DataFrames input When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. @@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu"); p = m(dtrain; device="gpu") ``` - ## Reproducibility EvoTrees models trained on cpu can be fully reproducible. @@ -107,6 +106,58 @@ Note that in presence of multiple identical or very highly correlated features, At the moment, there's no reproducibility guarantee on GPU, although this may change in the future. +## Missing values + +### Features + +EvoTrees does not handle features having missing values. Proper preprocessing of the data is therefore needed (and a general good practice regardless of the ML model used). + +This includes situations where values may be all non-missing, but where the `eltype` is the form `Union{Missing,Float64}`. 
diff --git a/docs/src/tutorials/logistic-regression-titanic.md b/docs/src/tutorials/logistic-regression-titanic.md
index 1910c3ec..c7ff636c 100644
--- a/docs/src/tutorials/logistic-regression-titanic.md
+++ b/docs/src/tutorials/logistic-regression-titanic.md
@@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe
 
 A first step in data processing is to prepare the input features in a model compatible format.
 
-EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`.
+EvoTrees' Tables API supports inputs that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables.
 
 A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`.
 
-For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing.
-Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model).
+For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing.
+Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or a more sophisticated approach such as predictions from another model).
 
 ```julia
 # convert string feature to Categorical
diff --git a/src/fit-utils.jl b/src/fit-utils.jl
index b4bae526..f0fb0b93 100644
--- a/src/fit-utils.jl
+++ b/src/fit-utils.jl
@@ -2,9 +2,10 @@
     get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
     get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
 
-Get the braking points of the feature data.
+Get the histogram breakpoints of the feature data.
 """
 function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T}
+    @assert T <: Real
     nobs = min(size(X, 1), 1000 * nbins)
     idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
     nfeats = size(X, 2)
@@ -80,6 +81,8 @@ function binarize(df; fnames, edges)
             x_bin[:, j] .= levelcode.(col)
         elseif eltype(col) <: Real
             x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col)
+        else
+            @error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))"
         end
     end
     return x_bin
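The `get_edges`/`binarize` pair patched above implements the binning stage: each feature is discretized against per-feature edges, and the new `@assert`/`@error` paths reject non-`Real` columns before binning. A simplified, self-contained illustration of the edges-then-`searchsortedfirst` idea; the real implementation also samples rows, handles categorical columns, and stores compact integer codes.

```julia
# Quantile edges for one numerical feature, then bin assignment.
using Statistics

x = randn(1000)                              # one numerical feature
nbins = 8
edges = quantile(x, (1:nbins-1) ./ nbins)    # nbins-1 interior breakpoints
x_bin = searchsortedfirst.(Ref(edges), x)    # maps each value into a bin in 1:nbins
extrema(x_bin)                               # (1, 8)
```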
""" function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T} + @assert T <: Real nobs = min(size(X, 1), 1000 * nbins) idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true) nfeats = size(X, 2) @@ -80,6 +81,8 @@ function binarize(df; fnames, edges) x_bin[:, j] .= levelcode.(col) elseif eltype(col) <: Real x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col) + else + @error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))" end end return x_bin diff --git a/src/gpu/init.jl b/src/gpu/init.jl index e4661182..3c3d682d 100644 --- a/src/gpu/init.jl +++ b/src/gpu/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) elseif L == LogisticMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = [mean(y)] diff --git a/src/init.jl b/src/init.jl index a6c8d299..be421e6b 100644 --- a/src/init.jl +++ b/src/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] 
diff --git a/test/MLJ.jl b/test/MLJ.jl
index 740b32c2..906503cf 100644
--- a/test/MLJ.jl
+++ b/test/MLJ.jl
@@ -121,12 +121,10 @@ fit!(mach, rows=train, verbosity=1)
 
 pred_train = predict(mach, selectrows(X, train))
 pred_train_mode = predict_mode(mach, selectrows(X, train))
-cross_entropy(pred_train, selectrows(y, train)) |> mean
 sum(pred_train_mode .== y[train]) / length(y[train])
 
 pred_test = predict(mach, selectrows(X, test))
 pred_test_mode = predict_mode(mach, selectrows(X, test))
-cross_entropy(pred_test, selectrows(y, test)) |> mean
 sum(pred_test_mode .== y[test]) / length(y[test])
 
 pred_test_mode = predict_mode(mach, selectrows(X, test))
diff --git a/test/missings.jl b/test/missings.jl
new file mode 100644
index 00000000..d4edddb8
--- /dev/null
+++ b/test/missings.jl
@@ -0,0 +1,123 @@
+using Statistics
+using StatsBase: sample
+using EvoTrees: sigmoid, logit
+using EvoTrees: check_args, check_parameter
+using CategoricalArrays
+using DataFrames
+using Random: seed!
+
+# prepare a dataset
+seed!(123)
+nobs = 1_000
+x_num = rand(nobs) .* 5
+lvls = ["a", "b", "c"]
+x_cat = categorical(rand(lvls, nobs), levels=lvls, ordered=false)
+x_bool = rand(Bool, nobs)
+
+x_num_m1 = Vector{Union{Missing,Float64}}(copy(x_num))
+x_num_m2 = Vector{Any}(copy(x_num))
+lvls_m1 = ["a", "b", "c", missing]
+x_cat_m1 = categorical(rand(lvls_m1, nobs), levels=lvls)
+x_bool_m1 = Vector{Union{Missing,Bool}}(copy(x_bool))
+
+# train-eval split
+is = collect(1:nobs)
+i_sample = sample(is, nobs, replace=false)
+train_size = 0.8
+i_train = i_sample[1:floor(Int, train_size * nobs)]
+i_eval = i_sample[floor(Int, train_size * nobs)+1:end]
+
+# target var
+y_tot = sin.(x_num) .* 0.5 .+ 0.5
+y_tot = logit(y_tot) + randn(nobs)
+y_tot = sigmoid(y_tot)
+target_name = "y"
+
+y_tot_m1 = allowmissing(y_tot)
+y_tot_m1[1] = missing
+
+config = EvoTreeRegressor(
+    loss=:linear,
+    nrounds=100,
+    nbins=16,
+    lambda=0.5,
+    gamma=0.1,
+    eta=0.05,
+    max_depth=3,
+    min_weight=1.0,
+    rowsample=0.5,
+    colsample=1.0,
+    rng=123,
+)
+
+@testset "DataFrames - missing features" begin
+
+    df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name)
+
+    @test model.info[:fnames] == [:x_num, :x_bool, :x_cat]
+
+    # keep only fnames <: Real or Categorical
+    df_tot = DataFrame(x_num=x_num, x_num_m1=x_num_m1, x_num_m2=x_num_m2,
+        x_cat_m1=x_cat_m1, x_bool_m1=x_bool_m1, y=y_tot)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name,
+        deval)
+
+    @test model.info[:fnames] == [:x_num]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name,
+        fnames=[:x_num])
+
+    @test model.info[:fnames] == [:x_num]
+
+    # specifying features with missings should error
+    @test_throws AssertionError fit_evotree(
+        config,
+        dtrain;
+        deval,
+        fnames=[:x_num, :x_num_m1, :x_num_m2, :x_cat_m1, :x_bool_m1],
+        target_name)
+
+end
+
+@testset "DataFrames - missing in target errors" begin
+
+    df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot_m1)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    @test_throws AssertionError fit_evotree(
+        config,
+        dtrain;
+        target_name)
+
+end
+
+@testset "Matrix - missing features" begin
+
+    x_tot = allowmissing(hcat(x_num_m1))
+    @test_throws AssertionError fit_evotree(
+        config;
+        x_train=x_tot,
+        y_train=y_tot)
+
+    x_tot = Matrix{Any}(hcat(x_num_m2))
+    @test_throws AssertionError fit_evotree(
+        config;
+        x_train=x_tot,
+        y_train=y_tot)
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 891f6d34..a4228a49 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,6 +12,7 @@ using Test
     include("oblivious.jl")
     include("tables.jl")
     include("monotonic.jl")
+    include("missings.jl")
 end
 
 @testset "MLJ" begin
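Seen from the user side, the new tests pin down this behavior: a feature matrix whose eltype merely allows `missing` now fails the `@assert T <: Real` added in `get_edges`, even when no value is actually missing, and narrowing the eltype restores training. A sketch with made-up data:

```julia
# Loose Union eltype is rejected; a narrowed eltype trains fine.
using EvoTrees

x_raw = Vector{Union{Missing,Float64}}(rand(100))   # no missing values, loose eltype
x_train = hcat(x_raw)                               # 100×1 Matrix{Union{Missing,Float64}}
y_train = rand(100)
config = EvoTreeRegressor(nrounds=10)

# fit_evotree(config; x_train, y_train)             # throws AssertionError
x_train = identity.(x_train)                        # eltype narrows to Float64
m = fit_evotree(config; x_train, y_train)           # trains fine
```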
From 50c56914446198cc5fb2e4cdad183f198a0c5c19 Mon Sep 17 00:00:00 2001
From: "jeremie.desgagne.bouchard"
Date: Wed, 4 Oct 2023 23:05:09 -0400
Subject: [PATCH 3/3] typos

---
 docs/src/index.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 28eee7ad..116c3c27 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -54,7 +54,7 @@
 
 ### Tables and DataFrames input
 
-When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
+When using a `Tables` compatible input such as `DataFrames`, features with element type `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
 
 `Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
 
@@ -155,8 +155,7 @@
 
 ### Target
 
-Target variable must have its elements type `<:Real`. Only exception is for `EvoTreeClassifier` for which `CategoricalValue` and `String` are also supported.
-
+Target variable must have its element type `<:Real`. Only exception is for `EvoTreeClassifier` for which `CategoricalValue`, `Integer`, `String` and `Char` are supported.
 
 ## Save/Load