
Commit

Merge pull request #252 from Evovest/stochastic
Stochastic
jeremiedb authored Aug 18, 2023
2 parents 46e9caa + 82a7c8d commit d693ae1
Showing 13 changed files with 541 additions and 103 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.0"
version = "0.16.1"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
293 changes: 293 additions & 0 deletions benchmarks/Yahoo-LTRC.jl
@@ -0,0 +1,293 @@
using Revise
using CSV
using DataFrames
using EvoTrees
using StatsBase: sample, tiedrank
using Statistics
using Random: seed!
# using GLMakie


# data is C14 - Yahoo! Learning to Rank Challenge
# data can be obtained through a request to https://webscope.sandbox.yahoo.com/

using AWS: AWSCredentials, AWSConfig, @service
@service S3
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")

function read_libsvm(raw::Vector{UInt8}; has_query=false)

io = IOBuffer(raw)
lines = readlines(io)

nobs = length(lines)
nfeats = 0 # initialize number of features

y = zeros(Float64, nobs)

if has_query
offset = 2 # offset for feature idx: y + query entries
q = zeros(Int, nobs)
else
offset = 1 # offset for feature idx: y
end

vals = [Float64[] for _ in 1:nobs]
feats = [Int[] for _ in 1:nobs]

for i in eachindex(lines)
line = lines[i]
line_split = split(line, " ")

y[i] = parse(Int, line_split[1])
has_query ? q[i] = parse(Int, split(line_split[2], ":")[2]) : nothing

n = length(line_split) - offset
lfeats = zeros(Int, n)
lvals = zeros(Float64, n)
@inbounds for jdx in 1:n
ls = split(line_split[jdx+offset], ":")
lvals[jdx] = parse(Float64, ls[2])
lfeats[jdx] = parse(Int, ls[1])
lfeats[jdx] > nfeats ? nfeats = lfeats[jdx] : nothing
end
vals[i] = lvals
feats[i] = lfeats
end

x = zeros(Float64, nobs, nfeats)
@inbounds for i in 1:nobs
@inbounds for jdx in 1:length(feats[i])
j = feats[i][jdx]
val = vals[i][jdx]
x[i, j] = val
end
end

if has_query
return (x=x, y=y, q=q)
else
return (x=x, y=y)
end
end
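
# Quick sanity check of the parser on a tiny hand-written libsvm snippet
# (synthetic example, unrelated to the Yahoo data):
sample_raw = Vector{UInt8}(codeunits("2 qid:1 1:0.5 3:1.0\n0 qid:1 2:0.25\n"))
sample = read_libsvm(sample_raw; has_query=true)
@assert sample.y == [2.0, 0.0]
@assert sample.q == [1, 1]
@assert size(sample.x) == (2, 3)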

function read_libsvm_aws(file::String; has_query=false, aws_config=AWSConfig())
raw = S3.get_object(
"jeremiedb",
file,
Dict("response-content-type" => "application/octet-stream");
aws_config
)
return read_libsvm(raw; has_query)
end

function ndcg(p, y, k=10)
    k = min(k, length(p))
    discounts = log2.((1:k) .+ 1)

    # DCG over the top-k documents as ranked by the predictions
    p_order = partialsortperm(p, 1:k, rev=true)
    dcg = sum((2 .^ y[p_order] .- 1) ./ discounts)

    # ideal DCG: top-k relevances sorted in their best possible order
    y_order = partialsortperm(y, 1:k, rev=true)
    idcg = sum((2 .^ y[y_order] .- 1) ./ discounts)

    return idcg == 0 ? 1.0 : dcg / idcg
end

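# quick sanity check on a hand-crafted ranking: a perfect ordering would score 1.0,
# while this particular prediction order should score roughly 0.75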
p = [6, 5, 4, 3, 2, 1, 0, -1] .+ 100
y = [3, 2, 3, 0, 1, 2, 3, 2]
ndcg(p, y, 6)

@time dtrain = read_libsvm_aws("share/data/yahoo-ltrc/set1.train.txt"; has_query=true, aws_config)
@time deval = read_libsvm_aws("share/data/yahoo-ltrc/set1.valid.txt"; has_query=true, aws_config)
@time dtest = read_libsvm_aws("share/data/yahoo-ltrc/set1.test.txt"; has_query=true, aws_config)

colsums_train = map(sum, eachcol(dtrain[:x]))
# colsums_eval = map(sum, eachcol(deval[:x]))
colsums_test = map(sum, eachcol(dtest[:x]))

sum(colsums_train .== 0)
sum(colsums_test .== 0)
@assert all((colsums_train .== 0) .== (colsums_test .== 0))
drop_cols = colsums_train .== 0

x_train = dtrain[:x][:, .!drop_cols]
x_eval = deval[:x][:, .!drop_cols]
x_test = dtest[:x][:, .!drop_cols]

q_train = dtrain[:q]
q_eval = deval[:q]
q_test = dtest[:q]

#####################################
# mse regression
#####################################

y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

config = EvoTreeRegressor(
nrounds=6000,
loss=:mse,
eta=0.02,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

# @time m = fit_evotree(config; x_train, y_train, print_every_n=25);
@time m_mse, logger_mse = fit_evotree(
config;
x_train=x_train,
y_train=y_train,
x_eval=x_eval,
y_eval=y_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:mse,
return_logger=true
);

p_test = m_mse(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test MSE" ndcg_test

#####################################
# logistic regression
#####################################

y_train = (dtrain[:y] .+ 1) ./ 6
y_eval = (deval[:y] .+ 1) ./ 6
y_test = (dtest[:y] .+ 1) ./ 6

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.02,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

@time m_logloss, logger_logloss = fit_evotree(
config;
x_train=x_train,
y_train=y_train,
x_eval=x_eval,
y_eval=y_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

# p_eval = m(x_eval);
# eval_df = DataFrame(p = p_eval, y = y_eval, q = q_eval)
# eval_df_agg = combine(groupby(eval_df, "q"), ["p", "y"] => ndcg => "ndcg")
# ndcg_eval = mean(eval_df_agg.ndcg)

p_test = m_logloss(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss" ndcg_test


#####################################
# logistic regression on DataFrame
#####################################

df_train = DataFrame(x_train, :auto)
df_train.y = dtrain[:y]
df_train.q = dtrain[:q]

df_eval = DataFrame(x_eval, :auto)
df_eval.y = deval[:y]
df_eval.q = deval[:q]

df_test = DataFrame(x_test, :auto)
df_test.y = dtest[:y]
df_test.q = dtest[:q]

function rank_target_norm(y::AbstractVector)
    out = similar(y)
    if minimum(y) == maximum(y)
        # all documents in the group share the same relevance
        out .= 0.75
    else
        # map relevance onto the [0.5, 1.0] range within each group
        out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5
    end
    return out
end
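
# quick check of the per-query normalisation on synthetic relevance grades:
# non-constant groups are mapped onto [0.5, 1.0]; constant groups are set to 0.75
@assert rank_target_norm([0.0, 2.0, 4.0]) == [0.5, 0.75, 1.0]
@assert rank_target_norm([1.0, 1.0]) == [0.75, 0.75]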

df_train = transform!(
groupby(df_train, "q"),
"y" => rank_target_norm => "y")

df_eval = transform!(
groupby(df_eval, "q"),
"y" => rank_target_norm => "y")

df_test = transform!(
groupby(df_test, "q"),
"y" => rank_target_norm => "y")

minimum(df_eval.y)
maximum(df_eval.y)

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.005,
nbins=64,
max_depth=11,
rowsample=0.9,
colsample=0.9,
)

@time m_logloss_df, logger_logloss_df = fit_evotree(
config,
df_train;
target_name="y",
fnames=setdiff(names(df_train), ["y", "q"]),
deval=df_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

m_logloss_df.info
p_test_df = m_logloss_df(df_test);
p_test_mat = m_logloss_df(x_test);

EvoTrees.importance(m_logloss_df)

p_test = m_logloss_df(df_test);
test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss DF" ndcg_test
72 changes: 63 additions & 9 deletions docs/src/index.md
@@ -30,29 +30,83 @@ A model configuration must first be defined, using one of the model constructors:
- [`EvoTreeCount`](@ref)
- [`EvoTreeMLE`](@ref)

Then fitting can be performed using [`fit_evotree`](@ref). This function supports additional arguments to provide eval data in order to track out of sample metrics and perform early stopping. Look at the docs for more details on available hyper-parameters for each of the above constructors and other options for training.
Then fitting can be performed using [`fit_evotree`](@ref). Two broad input methods are supported: Matrix-based and Tables-based. Optional kwargs can be used to provide eval data on which to track an eval metric and perform early stopping. Look at the docs for more details on available hyper-parameters for each of the above constructors and other options for training.

Predictions are obtained by passing the features data to the model. The model acts as a functor, i.e. it is a struct holding the fitted trees that can be called as a function to generate predictions for the given features.


### Matrix features input

```julia
using EvoTrees

config = EvoTreeRegressor(
loss=:linear,
loss=:mse,
nrounds=100,
max_depth=6,
max_depth=6,
nbins=32,
eta=0.1,
lambda=0.1,
gamma=0.1,
min_weight=1.0,
rowsample=0.5,
colsample=0.8)
eta=0.1)

x_train, y_train = rand(1_000, 10), rand(1_000)
m = fit_evotree(config; x_train, y_train)
preds = m(x_train)
```

### DataFrames and Tables input

When using a Tables-compatible input such as a DataFrame, columns with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the feature names explicitly.

`Categorical` features are treated accordingly by the algorithm. Ordered variables are treated like numerical features, using a `<` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.

```julia
dtrain = DataFrame(x_train, :auto)
dtrain.y .= y_train
m = fit_evotree(config, dtrain; target_name="y");
m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);
```
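
A minimal sketch of mixing numerical, unordered, and ordered categorical features (the column names and data below are made up for illustration, and assume CategoricalArrays.jl is available):

```julia
using DataFrames, CategoricalArrays, EvoTrees

n = 1_000
df = DataFrame(
    x_num = randn(n),
    x_cat = categorical(rand(["red", "green", "blue"], n)),  # unordered: split with `==`
    x_ord = categorical(rand(["low", "mid", "high"], n); levels=["low", "mid", "high"], ordered=true),  # ordered: split with `<`
    y = randn(n),
)

config = EvoTreeRegressor(nrounds=100)
m = fit_evotree(config, df; target_name="y")
p = m(df)
```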


### GPU Training

If running on a CUDA-enabled machine, training and inference on GPU can be triggered through the `device` kwarg:

```julia
m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu")
```


## Reproducibility

EvoTrees models trained on cpu can be fully reproducible.

Models of the gradient boosting family typically involve some stochasticity.
In EvoTrees, this primarily concerns the 2 subsampling parameters `rowsample` and `colsample`. The other stochastic operation happens at model initialisation, when features are binarized to allow for fast histogram construction: a random subsample of `1_000 * nbins` observations is used to compute the break points.

These random parts of the algorithm can be deterministically reproduced on cpu by specifying an `rng` to the model constructor. `rng` can be an `Int` (ex: `123`) or a random number generator (ex: `Random.Xoshiro(123)`).
If no `rng` is specified, `123` is used by default. When an `Int` `rng` is used, a `Random.MersenneTwister` generator is created by EvoTrees's constructor. Otherwise, the provided random generator is used.
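
Both forms below are valid ways to seed a model configuration (a minimal sketch; the seed value is arbitrary):

```julia
using Random

config_int = EvoTreeRegressor(rowsample=0.5, rng=123)                  # Int seed: a MersenneTwister(123) is created internally
config_gen = EvoTreeRegressor(rowsample=0.5, rng=Random.Xoshiro(123))  # explicit generator, used as provided
```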

As a consequence, the following `m1` and `m2` models will be identical:

```julia
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m1 = fit_evotree(config, df; target_name="y");
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m2 = fit_evotree(config, df; target_name="y");
```

However, the following `m1` and `m2` models won't be identical, since stochasticity is involved through `rowsample` and the random generator held in `config` isn't reset between the fits:

```julia
config = EvoTreeRegressor(rowsample=0.5, rng=123)
m1 = fit_evotree(config, df; target_name="y");
m2 = fit_evotree(config, df; target_name="y");
```

Note that in the presence of multiple identical or very highly correlated features, the model may not be reproducible if the features are permuted: when 2 features provide identical gains, the first one is selected. Therefore, if that identity relationship doesn't hold on new data, models trained on different feature orderings can produce different predictions.

At the moment, there's no reproducibility guarantee on GPU, although this may change in the future.

## Save/Load


2 comments on commit d693ae1

@jeremiedb
Member Author


@JuliaRegistrator register()

@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/89910

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.16.1 -m "<description of version>" d693ae1a3812216e449ea519c4a208a7940a1c19
git push origin v0.16.1
