Skip to content

Commit

Permalink
Merge pull request #244 from Evovest/perf-depth
Browse files Browse the repository at this point in the history
Perf depth
  • Loading branch information
jeremiedb authored Aug 3, 2023
2 parents 5eabb3a + e5aad60 commit 897ffca
Show file tree
Hide file tree
Showing 17 changed files with 309 additions and 105 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.15.1"
version = "0.15.2"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,27 @@ Code to reproduce is available in [`benchmarks/regressor.jl`](https://github.com/
- Julia: v1.9.1.
- Algorithms
- XGBoost: v2.3.0 (Using the `hist` algorithm).
- EvoTrees: v0.15.0.
- EvoTrees: v0.15.2.

### Training:

| Dimensions / Algo | XGBoost CPU | EvoTrees CPU | XGBoost GPU | EvoTrees GPU |
|---------------------|:-----------:|:------------:|:-----------:|:------------:|
| 100K x 100 | 2.33s | 1.09s | 0.90s | 2.72s |
| 500K x 100 | 10.7s | 2.96s | 1.84s | 3.65s |
| 1M x 100 | 20.9s | 6.48s | 3.10s | 4.45s |
| 5M x 100 | 108s | 35.8s | 12.9s | 12.7s |
| 10M x 100 | 216s | 71.6s | 25.5s | 23.0s |
| 100K x 100 | 2.34s | 1.01s | 0.90s | 2.61s |
| 500K x 100 | 10.7s | 3.95s | 1.84s | 3.41s |
| 1M x 100 | 21.1s | 6.57s | 3.10s | 4.47s |
| 5M x 100 | 108s | 36.1s | 12.9s | 12.5s |
| 10M x 100 | 218s | 72.6s | 25.5s | 23.0s |

### Inference:

| Dimensions / Algo | XGBoost CPU | EvoTrees CPU | XGBoost GPU | EvoTrees GPU |
|---------------------|:------------:|:------------:|:-----------:|:------------:|
| 100K x 100 | 0.151s | 0.053s | NA | 0.036s |
| 500K x 100 | 0.628s | 0.276s | NA | 0.169s |
| 1M x 100 | 1.26s | 0.558s | NA | 0.334s |
| 100K x 100 | 0.151s | 0.058s | NA | 0.045s |
| 500K x 100 | 0.647s | 0.248s | NA | 0.172s |
| 1M x 100 | 1.26s | 0.573s | NA | 0.327s |
| 5M x 100 | 6.04s | 2.87s | NA | 1.66s |
| 10M x 100 | 12.4s | 5.71s | NA | 3.31s |
| 10M x 100 | 12.4s | 5.71s | NA | 3.40s |

## MLJ Integration

Expand Down
21 changes: 13 additions & 8 deletions benchmarks/regressor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,21 @@ using BenchmarkTools
using Random: seed!
import CUDA

### v.0.15.1
# desktop | 1e6 | depth 11 | cpu: 37.2s
# desktop | 10e6 | depth 11 | cpu

### perf depth
# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s
# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s
nobs = Int(1e6)
num_feat = Int(100)
nrounds = 200
max_depth = 6
tree_type = "binary"
T = Float64
nthread = Base.Threads.nthreads()
@info "testing with: $nobs observations | $num_feat features. nthread: $nthread | tree_type : $tree_type"
@info "testing with: $nobs observations | $num_feat features. nthread: $nthread | tree_type : $tree_type | max_depth : $max_depth"
seed!(123)
x_train = rand(T, nobs, num_feat)
y_train = rand(T, size(x_train, 1))
Expand All @@ -37,7 +45,7 @@ end
@info "train"
params_xgb = Dict(
:num_round => nrounds,
:max_depth => 5,
:max_depth => max_depth - 1,
:eta => 0.05,
:objective => loss_xgb,
:print_every_n => 5,
Expand Down Expand Up @@ -98,7 +106,7 @@ params_evo = EvoTreeRegressor(;
lambda=0.0,
gamma=0.0,
eta=0.05,
max_depth=6,
max_depth=max_depth,
min_weight=1.0,
rowsample=0.5,
colsample=0.5,
Expand All @@ -117,14 +125,11 @@ device = "cpu"
# @time m_evo = fit_evotree(params_evo; x_train, y_train, device, verbosity, print_every_n=100);
@info "train - eval"
@time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_eval=y_train, metric=metric_evo, device, verbosity, print_every_n=100);
# using Plots
# plot(m_evo, 2)

@time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_eval=y_train, metric=metric_evo, device, verbosity, print_every_n=100);
@info "predict"
@time pred_evo = m_evo(x_train);
@time pred_evo = m_evo(x_train);
@btime m_evo($x_train);
# @btime m_evo($x_train);

@info "EvoTrees GPU"
device = "gpu"
Expand All @@ -139,4 +144,4 @@ CUDA.@time m_evo = fit_evotree(params_evo; x_train, y_train, x_eval=x_train, y_e
@info "predict"
CUDA.@time pred_evo = m_evo(x_train; device);
CUDA.@time pred_evo = m_evo(x_train; device);
@btime m_evo($x_train; device);
# @btime m_evo($x_train; device);
Binary file modified docs/src/assets/regression-sinus-binary.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
129 changes: 129 additions & 0 deletions experiments/depth-debug.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
using Statistics
using StatsBase:sample
using Base.Threads:@threads
using BenchmarkTools
using Revise
using EvoTrees
using Profile

nobs = Int(1e6)
num_feat = Int(100)
nrounds = 200
nthread = Base.Threads.nthreads()
x_train = rand(nobs, num_feat)
y_train = rand(size(x_train, 1))

config = EvoTreeRegressor(;
loss=:mse,
nrounds=200,
lambda=0.0,
gamma=0.0,
eta=0.05,
max_depth=10,
min_weight=1.0,
rowsample=0.5,
colsample=0.5,
nbins=64,
tree_type="binary",
rng=123
)

################################
# high-level
################################
_device = EvoTrees.GPU
@time EvoTrees.fit_evotree(config; x_train, y_train, device = "gpu")

@time m, cache = EvoTrees.init(config, x_train, y_train);
@time EvoTrees.grow_evotree!(m, cache, config)
@btime EvoTrees.grow_evotree!(m, cache, config)

Profile.clear()
# Profile.init()
Profile.init(n = 10^5, delay = 0.01)
# @profile m, cache = EvoTrees.init(config, x_train, y_train);
@profile EvoTrees.grow_evotree!(m, cache, config)
Profile.print()

################################
# mid-level
################################
@time m, cache = EvoTrees.init(config, x_train, y_train);
@time EvoTrees.grow_evotree!(m, cache, config)
# compute gradients
@time m, cache = EvoTrees.init(config, x_train, y_train);
@time EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, config)
# subsample rows
@time cache.nodes[1].is = EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, config.rowsample, config.rng)
# subsample cols
EvoTrees.sample!(config.rng, cache.js_, cache.js, replace=false, ordered=true)
L = EvoTrees._get_struct_loss(m)
# instantiate a tree then grow it
tree = EvoTrees.Tree{L,1}(config.max_depth)
grow! = config.tree_type == "oblivious" ? EvoTrees.grow_otree! : EvoTrees.grow_tree!
@time EvoTrees.grow_tree!(
tree,
cache.nodes,
config,
cache.∇,
cache.edges,
cache.js,
cache.out,
cache.left,
cache.right,
cache.x_bin,
cache.feattypes,
cache.monotone_constraints
)

using ProfileView
ProfileView.@profview EvoTrees.grow_tree!(
tree,
cache.nodes,
config,
cache.∇,
cache.edges,
cache.js,
cache.out,
cache.left,
cache.right,
cache.x_bin,
cache.feattypes,
cache.monotone_constraints
)

################################
# end mid-level
################################


@time m_evo = grow_tree!(params_evo; x_train, y_train, device, print_every_n=100);

@info "train - no eval"
@time m_evo = fit_evotree(params_evo; x_train, y_train, device, print_every_n=100);


offset = 0
feat = 15
cond_bin = 32
@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14);
@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14);
@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset)

offset = 0
feat = 15
cond_bin = 32
lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset)
offset = 0
feat = 14
cond_bin = 12
lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset)
offset = + length(lid1)
feat = 14
cond_bin = 12
lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset)

lid1_ = deepcopy(lid1)



67 changes: 67 additions & 0 deletions experiments/learnAPI.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
using EvoTrees

module LearnAPI

abstract type Config end
abstract type Learner end
abstract type Model end

function fit(config::Config; kwargs...)
return nothing
end
function fit(config::Config, data; kwargs...)
return nothing
end
function init(config::Config, data; kwargs...)
return nothing
end
# function fit!(learner::Learner)
# return nothing
# end

function predict(model::Model, x)
return x
end
function predict!(p, model::Model, x)
return nothing
end

function isiterative(m) end

end #module

struct EvoLearner
params
end

# 1 args fit: all needed supplemental info passed through kwargs: risk of having fragmentation of naming convention, hard to follow
m = LearnAPI.fit(config::Config; kwargs)
m = LearnAPI.fit(config::EvoTrees.EvoTypes; x_train=xt, y_train=yt)
m = LearnAPI.fit(config::EvoTrees.EvoTypes; x_train=xt, y_train=yt, x_eval=xe, y_eval=ye)

# 2 args fit: forces the notion of input data on which training is performed. May facilitates dispatch/specialisation on various supported data typees
m = LearnAPI.fit(config::Config, data; kwargs)
m = LearnAPI.fit(config::EvoTrees.EvoTypes, (x_train, y_train))
m = LearnAPI.fit(config::EvoTrees.EvoTypes, (x_train, y_train); x_eval=xe, y_eval=ye)
m = LearnAPI.fit(config::EvoTrees.EvoTypes, df::DataFrame)

# Iterative models
import .LearnAPI: isiterative
LearnAPI.isiterative(m::EvoTree) = true

# 2 args model initialization
# Here a EvoTreeLearner is returned: a comprehensive struct that includes the config, the model, and cache/state
m = LearnAPI.init(config::Config, data::DataFrame; kwargs)
m = LearnAPI.init(config::EvoTrees.EvoTypes, df::DataFrame; x_eval=xe, y_eval=ye)

LearnAPI.fit!(m::EvoTree)
LearnAPI.fit!(m::EvoTree, data)

# LearnAPI.fit!(m, config::EvoTrees.EvoTypes; kwargs)
LearnAPI.predict(m::EvoTrees.EvoTypes, x)

config = EvoTreeRegressor()
# m, cache = LearnAPI.init()

# should be possible to have model that specify feature treatment upfront at the Config level?
# Or rather have those passed at the fitted level?
26 changes: 13 additions & 13 deletions experiments/readme_plots_cpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ params1 = EvoTreeRegressor(;
nbins=64,
lambda=0.1,
gamma=0.1,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=0.5,
Expand Down Expand Up @@ -132,7 +132,7 @@ params1 = EvoTreeRegressor(;
nbins=64,
lambda=0.1,
gamma=0.1,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=0.5,
Expand Down Expand Up @@ -288,7 +288,7 @@ params1 = EvoTreeRegressor(;
loss=:tweedie,
nrounds=500,
nbins=64,
lambda=0.5,
lambda=0.1,
gamma=0.1,
eta=0.1,
max_depth=6,
Expand Down Expand Up @@ -359,7 +359,7 @@ params1 = EvoTreeRegressor(;
nbins=64,
lambda=0.1,
gamma=0.0,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=0.5,
Expand Down Expand Up @@ -389,7 +389,7 @@ params1 = EvoTreeRegressor(;
nbins=64,
lambda=0.1,
gamma=0.0,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=0.5,
Expand All @@ -408,7 +408,7 @@ params1 = EvoTreeRegressor(;
nbins=64,
lambda=0.1,
gamma=0.0,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=0.5,
Expand Down Expand Up @@ -466,10 +466,10 @@ params1 = EvoTreeMLE(;
nbins=64,
lambda=0.1,
gamma=0.1,
eta=0.05,
eta=0.1,
max_depth=6,
min_weight=10.0,
rowsample=1.0,
min_weight=10,
rowsample=0.5,
colsample=1.0,
rng=123,
tree_type,
Expand Down Expand Up @@ -549,12 +549,12 @@ params1 = EvoTrees.EvoTreeMLE(;
loss=:logistic,
nrounds=500,
nbins=64,
lambda=1.0,
lambda=0.1,
gamma=0.1,
eta=0.03,
eta=0.1,
max_depth=6,
min_weight=1.0,
rowsample=1.0,
min_weight=10,
rowsample=0.5,
colsample=1.0,
tree_type,
rng=123,
Expand Down
Loading

2 comments on commit 897ffca

@jeremiedb
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/88979

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.15.2 -m "<description of version>" 897ffcadac8dbfd6d68c5aa7d9388771d4df5bb3
git push origin v0.15.2

Please sign in to comment.