Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shuffling #250

Merged
merged 3 commits into from
Aug 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.15.2"
version = "0.16.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/regressor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ import CUDA
### perf depth
# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s
# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s

#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)

nobs = Int(1e6)
num_feat = Int(100)
nrounds = 200
Expand Down
144 changes: 144 additions & 0 deletions experiments/shuffling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
using DataFrames
using Distributions
using EvoTrees
using LinearAlgebra
using GLM
using Random

# Synthetic data: 9 features in three groups of three plus a target `y`.
# Within a group the features are almost perfectly correlated (off-diagonal
# covariance 1 - δ, diagonal 1), while the groups are uncorrelated with each
# other. The covariance of each group with the target is 0.4, 0.5 and 0.6
# (0.8, 1.0 and 1.2 times the 0.5 entries of `y`); the target variance is 1.
δ = 1.0e-6
b = fill(1.0 - δ, 3, 3) + δ * I   # within-group covariance block
z = zeros(3, 3)                   # cross-group covariance block
y = fill(0.5, 3)                  # base feature/target covariance
cov_mat = [
    b z z 0.8*y
    z b z y
    z z b 1.2*y
    0.8*y' y' 1.2*y' 1.0
]
# Only a covariance matrix is supplied, i.e. the zero-mean constructor is used.
dist = MvNormal(cov_mat)

# Fixed seed so that the sampled data set is reproducible across runs.
Random.seed!(1)
mat = rand(dist, 10_000);   # one observation per column

# One row per observation, columns named x1, ..., x9, y.
col_names = vcat(string.("x", 1:9), "y")
df = DataFrame(transpose(mat), col_names);
target_name = "y"

#################################
# Tables API: same data, two different seeds
#################################
# Fit on the DataFrame with seed 123 and display the feature importances.
config = EvoTreeRegressor(seed=123)
m1 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m1)

# Same fit with seed 124, to see how the importances move with the seed.
config = EvoTreeRegressor(seed=124)
m2 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m2)

# Fitting on a table whose columns are in reverse order does not return the
# same result as the fits above - numerical rounding error?
df2 = df[!, reverse(1:10)]
config = EvoTreeRegressor()
m3 = fit_evotree(config, df2; target_name="y", verbosity=0);
EvoTrees.importance(m3)

# Manual check on column permutations: call the lower-level init / grow steps
# directly so that the binned data, the bin edges and a single tree can be
# compared between the original table and the column-reversed one.

# Reference model on the original column order.
config = EvoTreeRegressor(max_depth=4)
m1, cache1 = EvoTrees.init(config, df; target_name=target_name);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

# Same procedure on the table with its 10 columns reversed.
df2 = df[!, reverse(1:10)];
config = EvoTreeRegressor(max_depth=4)
m2, cache2 = EvoTrees.init(config, df2; target_name=target_name);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

# Binned features and edges should match once the 9 feature columns of the
# second cache are flipped back.
# NOTE(review): assumes features keep the table's column order - confirm.
all(cache1.x_bin .== cache2.x_bin[:, reverse(1:9)])
all(cache1.edges .== cache2.edges[reverse(1:9)])

# Inspect the second tree of each model side by side.
# NOTE(review): presumably trees[1] is the initial/bias tree - confirm.
m1.trees[2]
m2.trees[2]

# Split features chosen by that tree.
m1.trees[2].feat
m2.trees[2].feat

# Split bins chosen by that tree, converted to Int for display.
Int.(m1.trees[2].cond_bin)
Int.(m2.trees[2].cond_bin)


# Fit on the original table with 100 rounds, eta = 0.05 and colsample = 1.0,
# then display the feature importances.
config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0)
m3 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m3)

#################################
# Tables API: repeated fits with colsample = 0.5
#################################
# The same config object is used for both fits so that the two sets of
# importances can be compared directly.
config = EvoTreeRegressor(colsample=0.5)

m1 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config, df; target_name="y", verbosity=0);
EvoTrees.importance(m2)

#################################
# Matrix API: repeated fits on the raw matrix
#################################
# `mat` stores one observation per column: rows 1-9 are the features and
# row 10 is the target. The features are flipped to observations-by-features.
x_train = permutedims(mat[1:9, :])
y_train = mat[10, :]

# Fit twice with the same default config and compare the importances.
config = EvoTreeRegressor()

m1 = fit_evotree(config; x_train, y_train, verbosity=0);
EvoTrees.importance(m1)

m2 = fit_evotree(config; x_train, y_train, verbosity=0);
EvoTrees.importance(m2)

# Linear-model reference on the same matrix data, using GLM's matrix interface.
using GLM
x_train = Matrix(transpose(mat[1:9, :]))   # observations x features
y_train = mat[10, :]                       # target is row 10 of `mat`
lm(x_train, y_train)

#################################
# Matrix debug API: explicit init / grow on the raw matrix
#################################
x_train = permutedims(mat[1:9, :])   # observations x features
y_train = mat[10, :]                 # target is row 10 of `mat`

# Build the model and its cache by hand, grow it on CPU, and repeat with the
# same config to compare the resulting importances.
config = EvoTreeRegressor()

m1, cache1 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU)
EvoTrees.importance(m1)

m2, cache2 = EvoTrees.init(config, x_train, y_train);
EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU)
EvoTrees.importance(m2)

# MLJ integration check: use an EvoTreeRegressor as a base learner inside an
# MLJ `Stack` with a linear metalearner and 2-fold CV resampling.
using MLJ, EvoTrees, MLJLinearModels

# Note: this rebinds the global `y` defined at the top of the script.
X, y = make_regression()

model = Stack(;
    metalearner=LinearRegressor(),
    resampling=CV(nfolds=2),
    tree=EvoTreeRegressor(),
)

mach = machine(model, X, y)
fit!(mach)
20 changes: 10 additions & 10 deletions src/eval.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function mse(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * (p[1, i] - y[i])^2
end
return sum(eval) / sum(w)
Expand All @@ -20,7 +20,7 @@ function mae(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * abs(p[1, i] - y[i])
end
return sum(eval) / sum(w)
Expand All @@ -33,7 +33,7 @@ function logloss(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = sigmoid(p[1, i])
eval[i] = w[i] * (-y[i] * log(pred) + (y[i] - 1) * log(1 - pred))
end
Expand All @@ -48,7 +48,7 @@ function mlogloss(
kwargs...
) where {T}
K = size(p, 1)
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
isum = zero(T)
@inbounds for k in 1:K
isum += exp(p[k, i])
Expand All @@ -65,7 +65,7 @@ function poisson(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] = w[i] * 2 * (y[i] * (log(y[i]) - log(pred)) + pred - y[i])
end
Expand All @@ -79,7 +79,7 @@ function gamma(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] = w[i] * 2 * (log(pred / y[i]) + y[i] / pred - 1)
end
Expand All @@ -94,7 +94,7 @@ function tweedie(
kwargs...
) where {T}
rho = T(1.5)
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
pred = exp(p[1, i])
eval[i] =
w[i] *
Expand All @@ -114,7 +114,7 @@ function gaussian_mle(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = -w[i] * (p[2, i] + (y[i] - p[1, i])^2 / (2 * exp(2 * p[2, i])))
end
return sum(eval) / sum(w)
Expand All @@ -127,7 +127,7 @@ function logistic_mle(
eval::AbstractVector;
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] = w[i] * (log(1 / 4 * sech(exp(-p[2, i]) * (y[i] - p[1, i]))^2) - p[2, i])
end
return sum(eval) / sum(w)
Expand All @@ -141,7 +141,7 @@ function wmae(
alpha=0.5,
kwargs...
) where {T}
@threads :static for i in eachindex(y)
@threads for i in eachindex(y)
eval[i] =
w[i] * (
alpha * max(y[i] - p[1, i], zero(T)) +
Expand Down
18 changes: 9 additions & 9 deletions src/fit-utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ Get the breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
nobs = min(size(X, 1), 1000 * nbins)
idx = rand(rng, 1:size(X, 1), nobs)
idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
edges = Vector{Vector{T}}(undef, nfeats)
featbins = Vector{UInt8}(undef, nfeats)
feattypes = Vector{Bool}(undef, nfeats)
@threads :static for j in 1:size(X, 2)
@threads for j in 1:size(X, 2)
edges[j] = quantile(view(X, idx, j), (1:nbins-1) / nbins)
if length(edges[j]) == 1
edges[j] = [minimum(view(X, idx, j))]
Expand All @@ -25,12 +25,12 @@ end
function get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
_nobs = length(Tables.getcolumn(df, 1))
nobs = min(_nobs, 1000 * nbins)
idx = rand(rng, 1:_nobs, nobs)
idx = sample(rng, 1:_nobs, nobs, replace=false, ordered=true)
edges = Vector{Any}([Vector{eltype(Tables.getcolumn(df, col))}() for col in fnames])
nfeats = length(fnames)
featbins = Vector{UInt8}(undef, nfeats)
feattypes = Vector{Bool}(undef, nfeats)
@threads :static for j in eachindex(fnames)
@threads for j in eachindex(fnames)
col = view(Tables.getcolumn(df, fnames[j]), idx)
if eltype(col) <: Bool
edges[j] = [false, true]
Expand Down Expand Up @@ -63,7 +63,7 @@ Transform feature data into a UInt8 binarized matrix.
"""
function binarize(X::AbstractMatrix; fnames, edges)
x_bin = zeros(UInt8, size(X))
@threads :static for j in axes(X, 2)
@threads for j in axes(X, 2)
x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), view(X, :, j))
end
return x_bin
Expand All @@ -72,7 +72,7 @@ end
function binarize(df; fnames, edges)
nobs = length(Tables.getcolumn(df, 1))
x_bin = zeros(UInt8, nobs, length(fnames))
@threads :static for j in eachindex(fnames)
@threads for j in eachindex(fnames)
col = Tables.getcolumn(df, fnames[j])
if eltype(col) <: Bool
x_bin[:, j] .= col .+ 1
Expand Down Expand Up @@ -232,7 +232,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L<:GradientRegression}
@threads :static for j in js
@threads for j in js
@inbounds @simd for i in is
bin = x_bin[i, j]
hist[j][1, bin] += ∇[1, i]
Expand All @@ -255,7 +255,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L<:MLE2P}
@threads :static for j in js
@threads for j in js
@inbounds @simd for i in is
bin = x_bin[i, j]
hist[j][1, bin] += ∇[1, i]
Expand All @@ -280,7 +280,7 @@ function update_hist!(
is::AbstractVector,
js::AbstractVector,
) where {L}
@threads :static for j in js
@threads for j in js
@inbounds for i in is
bin = x_bin[i, j]
@inbounds @simd for k in axes(∇, 1)
Expand Down
4 changes: 2 additions & 2 deletions src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ function grow_tree!(
update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js)
end
end
@threads :static for n ∈ sort(n_current)
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end
Expand Down Expand Up @@ -215,7 +215,7 @@ function grow_otree!(
update_hist!(L, nodes[n].h, ∇, x_bin, nodes[n].is, js)
end
end
@threads :static for n ∈ n_current
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

Expand Down
4 changes: 2 additions & 2 deletions src/gpu/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ function grow_tree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads :static for n ∈ sort(n_current)
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end
Expand Down Expand Up @@ -217,7 +217,7 @@ function grow_otree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads :static for n ∈ n_current
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

Expand Down
Loading