
Commit

Merge pull request #252 from Evovest/stochastic
Stochastic
jeremiedb authored Aug 18, 2023
2 parents 46e9caa + 82a7c8d commit d693ae1
Showing 13 changed files with 541 additions and 103 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.0"
version = "0.16.1"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
293 changes: 293 additions & 0 deletions benchmarks/Yahoo-LTRC.jl
@@ -0,0 +1,293 @@
using Revise
using CSV
using DataFrames
using EvoTrees
using StatsBase: sample, tiedrank
using Statistics
using Random: seed!
# using GLMakie


# data is C14 - Yahoo! Learning to Rank Challenge
# data can be obtained through a request to https://webscope.sandbox.yahoo.com/

using AWS: AWSCredentials, AWSConfig, @service
@service S3
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")

function read_libsvm(raw::Vector{UInt8}; has_query=false)

io = IOBuffer(raw)
lines = readlines(io)

nobs = length(lines)
nfeats = 0 # initialize number of features

y = zeros(Float64, nobs)

if has_query
offset = 2 # offset for feature idx: y + query entries
q = zeros(Int, nobs)
else
offset = 1 # offset for feature idx: y
end

vals = [Float64[] for _ in 1:nobs]
feats = [Int[] for _ in 1:nobs]

for i in eachindex(lines)
line = lines[i]
line_split = split(line, " ")

y[i] = parse(Int, line_split[1])
has_query ? q[i] = parse(Int, split(line_split[2], ":")[2]) : nothing

n = length(line_split) - offset
lfeats = zeros(Int, n)
lvals = zeros(Float64, n)
@inbounds for jdx in 1:n
ls = split(line_split[jdx+offset], ":")
lvals[jdx] = parse(Float64, ls[2])
lfeats[jdx] = parse(Int, ls[1])
lfeats[jdx] > nfeats ? nfeats = lfeats[jdx] : nothing
end
vals[i] = lvals
feats[i] = lfeats
end

x = zeros(Float64, nobs, nfeats)
@inbounds for i in 1:nobs
@inbounds for jdx in 1:length(feats[i])
j = feats[i][jdx]
val = vals[i][jdx]
x[i, j] = val
end
end

if has_query
return (x=x, y=y, q=q)
else
return (x=x, y=y)
end
end
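
# Quick sanity check of the parser on a tiny hand-written libsvm snippet
# (synthetic example, unrelated to the Yahoo data):
sample_raw = Vector{UInt8}(codeunits("2 qid:1 1:0.5 3:1.0\n0 qid:1 2:0.25\n"))
sample = read_libsvm(sample_raw; has_query=true)
@assert sample.y == [2.0, 0.0]
@assert sample.q == [1, 1]
@assert size(sample.x) == (2, 3)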

function read_libsvm_aws(file::String; has_query=false, aws_config=AWSConfig())
raw = S3.get_object(
"jeremiedb",
file,
Dict("response-content-type" => "application/octet-stream");
aws_config
)
return read_libsvm(raw; has_query)
end

function ndcg(p, y, k=10)
    k = min(k, length(p))
    discounts = log2.((1:k) .+ 1)

    # DCG over the top-k documents as ranked by the predictions
    p_order = partialsortperm(p, 1:k, rev=true)
    dcg = sum((2 .^ y[p_order] .- 1) ./ discounts)

    # ideal DCG: top-k relevances sorted in their best possible order
    y_order = partialsortperm(y, 1:k, rev=true)
    idcg = sum((2 .^ y[y_order] .- 1) ./ discounts)

    return idcg == 0 ? 1.0 : dcg / idcg
end

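# quick sanity check on a hand-crafted ranking: a perfect ordering would score 1.0,
# while this particular prediction order should score roughly 0.75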
p = [6, 5, 4, 3, 2, 1, 0, -1] .+ 100
y = [3, 2, 3, 0, 1, 2, 3, 2]
ndcg(p, y, 6)

@time dtrain = read_libsvm_aws("share/data/yahoo-ltrc/set1.train.txt"; has_query=true, aws_config)
@time deval = read_libsvm_aws("share/data/yahoo-ltrc/set1.valid.txt"; has_query=true, aws_config)
@time dtest = read_libsvm_aws("share/data/yahoo-ltrc/set1.test.txt"; has_query=true, aws_config)

colsums_train = map(sum, eachcol(dtrain[:x]))
# colsums_eval = map(sum, eachcol(deval[:x]))
colsums_test = map(sum, eachcol(dtest[:x]))

sum(colsums_train .== 0)
sum(colsums_test .== 0)
@assert all((colsums_train .== 0) .== (colsums_test .== 0))
drop_cols = colsums_train .== 0

x_train = dtrain[:x][:, .!drop_cols]
x_eval = deval[:x][:, .!drop_cols]
x_test = dtest[:x][:, .!drop_cols]

q_train = dtrain[:q]
q_eval = deval[:q]
q_test = dtest[:q]

#####################################
# mse regression
#####################################

y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

config = EvoTreeRegressor(
nrounds=6000,
loss=:mse,
eta=0.02,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

# @time m = fit_evotree(config; x_train, y_train, print_every_n=25);
@time m_mse, logger_mse = fit_evotree(
config;
x_train=x_train,
y_train=y_train,
x_eval=x_eval,
y_eval=y_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:mse,
return_logger=true
);

p_test = m_mse(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test MSE" ndcg_test

#####################################
# logistic regression
#####################################

y_train = (dtrain[:y] .+ 1) ./ 6
y_eval = (deval[:y] .+ 1) ./ 6
y_test = (dtest[:y] .+ 1) ./ 6

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.02,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

@time m_logloss, logger_logloss = fit_evotree(
config;
x_train=x_train,
y_train=y_train,
x_eval=x_eval,
y_eval=y_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

# p_eval = m(x_eval);
# eval_df = DataFrame(p = p_eval, y = y_eval, q = q_eval)
# eval_df_agg = combine(groupby(eval_df, "q"), ["p", "y"] => ndcg => "ndcg")
# ndcg_eval = mean(eval_df_agg.ndcg)

p_test = m_logloss(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss" ndcg_test


#####################################
# logistic regression on DataFrame
#####################################

df_train = DataFrame(x_train, :auto)
df_train.y = dtrain[:y]
df_train.q = dtrain[:q]

df_eval = DataFrame(x_eval, :auto)
df_eval.y = deval[:y]
df_eval.q = deval[:q]

df_test = DataFrame(x_test, :auto)
df_test.y = dtest[:y]
df_test.q = dtest[:q]

function rank_target_norm(y::AbstractVector)
    out = similar(y)
    if minimum(y) == maximum(y)
        # all documents in the group share the same relevance
        out .= 0.75
    else
        # map relevance onto the [0.5, 1.0] range within each group
        out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5
    end
    return out
end
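
# quick check of the per-query normalisation on synthetic relevance grades:
# non-constant groups are mapped onto [0.5, 1.0]; constant groups are set to 0.75
@assert rank_target_norm([0.0, 2.0, 4.0]) == [0.5, 0.75, 1.0]
@assert rank_target_norm([1.0, 1.0]) == [0.75, 0.75]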

df_train = transform!(
groupby(df_train, "q"),
"y" => rank_target_norm => "y")

df_eval = transform!(
groupby(df_eval, "q"),
"y" => rank_target_norm => "y")

df_test = transform!(
groupby(df_test, "q"),
"y" => rank_target_norm => "y")

minimum(df_eval.y)
maximum(df_eval.y)

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.005,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

@time m_logloss_df, logger_logloss_df = fit_evotree(
config,
df_train;
target_name="y",
fnames=setdiff(names(df_train), ["y", "q"]),
deval=df_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

m_logloss_df.info
p_test_df = m_logloss_df(df_test);
p_test_mat = m_logloss_df(x_test);

EvoTrees.importance(m_logloss_df)

p_test = m_logloss_df(df_test);
test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss DF" ndcg_test
72 changes: 63 additions & 9 deletions docs/src/index.md
@@ -30,29 +30,83 @@ A model configuration must first be defined, using one of the model constructors:
- [`EvoTreeCount`](@ref)
- [`EvoTreeMLE`](@ref)

Then fitting can be performed using [`fit_evotree`](@ref). This function supports additional arguments to provide eval data in order to track out of sample metrics and perform early stopping. Look at the docs for more details on available hyper-parameters for each of the above constructors and other options for training.
Then fitting can be performed using [`fit_evotree`](@ref). Two broad input methods are supported: Matrix-based and Tables-based. Optional kwargs can be used to provide eval data on which to track an eval metric and perform early stopping. Look at the docs for more details on available hyper-parameters for each of the above constructors and other options for training.

Predictions are obtained by passing the features data to the model. The model acts as a functor, i.e. it is a struct holding the fitted trees that can be called as a function to generate predictions for the given features.


### Matrix features input

```julia
using EvoTrees

config = EvoTreeRegressor(
loss=:linear,
loss=:mse,
nrounds=100,
max_depth=6,
max_depth=6,
nbins=32,
eta=0.1,
lambda=0.1,
gamma=0.1,
min_weight=1.0,
rowsample=0.5,
colsample=0.8)
eta=0.1)

x_train, y_train = rand(1_000, 10), rand(1_000)
m = fit_evotree(config; x_train, y_train)
preds = m(x_train)
```

### DataFrames and Tables input

When using a Tables-compatible input such as a DataFrame, columns with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the feature names explicitly.

`Categorical` features are treated accordingly by the algorithm. Ordered variables are treated like numerical features, using a `<` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.

```julia
dtrain = DataFrame(x_train, :auto)
dtrain.y .= y_train
m = fit_evotree(config, dtrain; target_name="y");
m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);
```
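
A minimal sketch of mixing numerical, unordered, and ordered categorical features (the column names and data below are made up for illustration, and assume CategoricalArrays.jl is available):

```julia
using DataFrames, CategoricalArrays, EvoTrees

n = 1_000
df = DataFrame(
    x_num = randn(n),
    x_cat = categorical(rand(["red", "green", "blue"], n)),  # unordered: split with `==`
    x_ord = categorical(rand(["low", "mid", "high"], n); levels=["low", "mid", "high"], ordered=true),  # ordered: split with `<`
    y = randn(n),
)

config = EvoTreeRegressor(nrounds=100)
m = fit_evotree(config, df; target_name="y")
p = m(df)
```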


### GPU Training

If running on a CUDA-enabled machine, training and inference on GPU can be triggered through the `device` kwarg:

```julia
m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu")
```


## Reproducibility

EvoTrees models trained on cpu can be fully reproducible.

Models of the gradient boosting family typically involve some stochasticity.
In EvoTrees, this primarily concerns the 2 subsampling parameters `rowsample` and `colsample`. The other stochastic operation happens at model initialisation, when features are binarized to allow for fast histogram construction: a random subsample of `1_000 * nbins` observations is used to compute the break points.

These random parts of the algorithm can be deterministically reproduced on cpu by specifying an `rng` to the model constructor. `rng` can be an `Int` (ex: `123`) or a random number generator (ex: `Random.Xoshiro(123)`).
If no `rng` is specified, `123` is used by default. When an `Int` `rng` is used, a `Random.MersenneTwister` generator is created by EvoTrees's constructor. Otherwise, the provided random generator is used.
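
Both forms below are valid ways to seed a model configuration (a minimal sketch; the seed value is arbitrary):

```julia
using Random

config_int = EvoTreeRegressor(rowsample=0.5, rng=123)                  # Int seed: a MersenneTwister(123) is created internally
config_gen = EvoTreeRegressor(rowsample=0.5, rng=Random.Xoshiro(123))  # explicit generator, used as provided
```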

As a consequence, the following `m1` and `m2` models will be identical:

```julia
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m1 = fit_evotree(config, df; target_name="y");
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m2 = fit_evotree(config, df; target_name="y");
```

However, the following `m1` and `m2` models won't be identical, since stochasticity is involved through `rowsample` and the random generator held in `config` isn't reset between the fits:

```julia
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m1 = fit_evotree(config, df; target_name="y");
m2 = fit_evotree(config, df; target_name="y");
```

Note that in the presence of multiple identical or very highly correlated features, the model may not be reproducible if the features are permuted: when 2 features provide identical gains, the first one is selected. Therefore, if that identity relationship doesn't hold on new data, models trained on different feature orderings can produce different predictions.

At the moment, there's no reproducibility guarantee on GPU, although this may change in the future.

## Save/Load


2 comments on commit d693ae1

@jeremiedb
Member Author


@JuliaRegistrator register()

@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/89910

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.16.1 -m "<description of version>" d693ae1a3812216e449ea519c4a208a7940a1c19
git push origin v0.16.1
