Skip to content

Commit

Permalink
Merge pull request #11 from Evovest/MLJ
Browse files Browse the repository at this point in the history
Mlj
  • Loading branch information
jeremiedb authored May 23, 2019
2 parents 114436d + 05ee7a9 commit ca2575b
Show file tree
Hide file tree
Showing 10 changed files with 138 additions and 35 deletions.
4 changes: 2 additions & 2 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -362,9 +362,9 @@ version = "1.0.0"

[[Tables]]
deps = ["IteratorInterfaceExtensions", "LinearAlgebra", "Requires", "TableTraits", "Test"]
git-tree-sha1 = "83b4a0261e5d01274f12b35d4c2212386fb15569"
git-tree-sha1 = "c5d784c61e9d243a5a6a8458d19f535b70bdedeb"
uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
version = "0.2.3"
version = "0.2.4"

[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
Expand Down
9 changes: 7 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.2.0"
authors = ["jeremiedb <[email protected]>"]
version = "0.2.1"

[compat]
julia = "1"

[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Traceur = "37b6cedf-1f77-55f8-9503-c64b63398394"
7 changes: 5 additions & 2 deletions src/EvoTrees.jl
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
module EvoTrees

export grow_tree!, grow_gbtree, Tree, Node, Params, predict, EvoTreeRegressor, EvoTreeRegressorR

using DataFrames
using Statistics
using CSV
using Base.Threads: @threads
using StatsBase: sample

export grow_tree!, grow_gbtree, Tree, Node, Params, predict, EvoTreeRegressor, EvoTreeRegressorR
import MLJ
import MLJBase

include("struct.jl")
include("loss.jl")
include("eval.jl")
include("predict.jl")
include("tree_vector.jl")
include("histogram.jl")
include("MLJ.jl")

end # module
72 changes: 72 additions & 0 deletions src/MLJ.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
    MLJ.clean!(model::EvoTreeRegressor)

Check the hyper-parameters of `model`, reset in place every value that is out
of range, and return a `String` describing each reset.

The returned message is empty when nothing had to be changed, so that callers
can write `isempty(message) || @warn message`.

Checks performed:
- `nrounds ≥ 1`
- `λ ≥ 0`, `γ ≥ 0`
- `η > 0`
- `max_depth ≥ 1`
- `min_weight ≥ 0`
- `0 ≤ rowsample ≤ 1`, `0 ≤ colsample ≤ 1`
- `nbins ≤ 250`
"""
function MLJ.clean!(model::EvoTreeRegressor)
    # start empty: a non-empty return value means "something was reset"
    warning = ""
    if model.nrounds < 1
        warning *= "Need nrounds ≥ 1. Resetting nrounds=1. "
        model.nrounds = 1
    end
    if model.λ < 0
        warning *= "Need λ ≥ 0. Resetting λ=0. "
        model.λ = 0.0
    end
    if model.γ < 0
        warning *= "Need γ ≥ 0. Resetting γ=0. "
        model.γ = 0.0
    end
    if model.η <= 0
        warning *= "Need η > 0. Resetting η=0.001. "
        model.η = 0.001
    end
    if model.max_depth < 1
        # message now matches the check and the value actually assigned
        warning *= "Need max_depth ≥ 1. Resetting max_depth=1. "
        model.max_depth = 1
    end
    if model.min_weight < 0
        warning *= "Need min_weight ≥ 0. Resetting min_weight=0. "
        model.min_weight = 0.0
    end
    if model.rowsample < 0
        warning *= "Need rowsample ≥ 0. Resetting rowsample=0. "
        model.rowsample = 0.0
    end
    if model.rowsample > 1
        warning *= "Need rowsample <= 1. Resetting rowsample=1. "
        model.rowsample = 1.0
    end
    if model.colsample < 0
        warning *= "Need colsample ≥ 0. Resetting colsample=0. "
        model.colsample = 0.0
    end
    if model.colsample > 1
        warning *= "Need colsample <= 1. Resetting colsample=1. "
        model.colsample = 1.0
    end
    if model.nbins > 250
        warning *= "Need nbins <= 250. Resetting nbins=250. "
        model.nbins = 250
    end
    return warning
end

# MLJ training entry point for `EvoTreeRegressor`.
# `X` is converted with `MLJBase.matrix` and passed, together with `y`, the
# model's hyper-parameters and `verbosity`, to `grow_gbtree`.
# Returns the tuple `(fitresult, cache, report)`; no cache and no report are
# produced, so both are `nothing`.
function MLJBase.fit(model::EvoTreeRegressor, verbosity::Integer, X, y)
    fitresult = grow_gbtree(MLJBase.matrix(X), y, model; verbosity=verbosity)
    return fitresult, nothing, nothing
end

# MLJ prediction entry point for `EvoTreeRegressor`.
# `Xnew` is converted with `MLJBase.matrix` and passed with `fitresult` to the
# package's own `predict` (the method defined for `GBTree` in src/predict.jl).
# NOTE(review): `fitresult` is presumably the `GBTree` returned by
# `grow_gbtree` in `MLJBase.fit` — confirm.
function MLJBase.predict(model::EvoTreeRegressor, fitresult, Xnew)
    Xmatrix = MLJBase.matrix(Xnew)
    # `predict` here is EvoTrees' function, not `MLJBase.predict`: MLJBase is
    # brought in with a plain `import MLJBase`, so the unqualified name stays local.
    return predict(fitresult, Xmatrix)
end

# MLJ trait declarations shared by the model types listed in `EvoTypes`
const EvoTypes = Union{EvoTreeRegressor}

# package-level traits
MLJBase.package_name(::Type{<:EvoTypes}) = "EvoTrees"
MLJBase.package_url(::Type{<:EvoTypes}) = "https://github.com/Evovest/EvoTrees.jl"
MLJBase.is_pure_julia(::Type{<:EvoTypes}) = true

# input / target traits (`EvoTypes` currently holds only `EvoTreeRegressor`)
MLJBase.input_is_multivariate(::Type{<:EvoTypes}) = true
MLJBase.input_scitype_union(::Type{<:EvoTypes}) = MLJBase.Continuous
MLJBase.target_scitype_union(::Type{<:EvoTypes}) = MLJBase.Continuous
4 changes: 2 additions & 2 deletions src/predict.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ function predict(model::GBTree, X::AbstractArray{T, 2}) where T<:Real
end

# prediction in Leaf - GradientRegression
function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::Params) where {S<:GradientRegression, T<:AbstractFloat}
function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::EvoTreeRegressor) where {S<:GradientRegression, T<:AbstractFloat}
pred = - params.η * ∑δ / (∑δ² + params.λ * ∑𝑤)
return pred
end

# prediction in Leaf - QuantileRegression
function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::Params) where {S<:QuantileRegression, T<:AbstractFloat}
function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::EvoTreeRegressor) where {S<:QuantileRegression, T<:AbstractFloat}
pred = params.η * ∑δ / (∑𝑤 * (1+params.λ))
return pred
end
17 changes: 10 additions & 7 deletions src/struct.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ end
TreeNode(left::S, right::S, feat::S, cond::T) where {T<:AbstractFloat, S<:Int} = TreeNode{T,S,Bool}(left, right, feat, cond, 0.0, true)
TreeNode(pred::T) where {T<:AbstractFloat} = TreeNode{T,Int,Bool}(0, 0, 0, 0.0, pred, false)

mutable struct Params{T<:AbstractFloat, U<:ModelType, S<:Int}
mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Deterministic
loss::U
nrounds::S
λ::T
Expand All @@ -71,6 +71,7 @@ mutable struct Params{T<:AbstractFloat, U<:ModelType, S<:Int}
colsample::T
nbins::S
α::T
metric::Symbol
end

function EvoTreeRegressor(;
Expand All @@ -84,21 +85,22 @@ function EvoTreeRegressor(;
rowsample=1.0,
colsample=1.0,
nbins=64,
α=0.0)
α=0.0,
metric=:mse)

if loss == :linear model_type = Linear()
elseif loss == :logistic model_type = Logistic()
elseif loss == :poisson model_type = Poisson()
elseif loss == :quantile model_type = Quantile()
end

model = Params(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α)
model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric)
# message = MLJBase.clean!(model)
# isempty(message) || @warn message
return model
end

# For R support
# For R-package
function EvoTreeRegressorR(
loss,
nrounds,
Expand All @@ -110,15 +112,16 @@ function EvoTreeRegressorR(
rowsample,
colsample,
nbins,
α)
α,
metric)

if loss == :linear model_type = Linear()
elseif loss == :logistic model_type = Logistic()
elseif loss == :poisson model_type = Poisson()
elseif loss == :quantile model_type = Quantile()
end

model = Params(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α)
model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric)
# message = MLJBase.clean!(model)
# isempty(message) || @warn message
return model
Expand Down Expand Up @@ -150,6 +153,6 @@ Metric() = Metric([0], [Inf])
# gradient-boosted tree is formed by a vector of trees
struct GBTree{T<:AbstractFloat, S<:Int}
trees::Vector{Tree{T,S}}
params::Params
params::EvoTreeRegressor
metric::Metric
end
10 changes: 5 additions & 5 deletions src/tree_vector.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# initialize train_nodes
function grow_tree(X::AbstractArray{R, 2}, δ::AbstractArray{T, 1}, δ²::AbstractArray{T, 1}, 𝑤::AbstractArray{T, 1}, params::Params, perm_ini::AbstractArray{Int}, train_nodes::Vector{TrainNode{T, I, J, S}}, splits::Vector{SplitInfo{Float64, Int}}, tracks::Vector{SplitTrack{Float64}}, X_edges) where {R<:Real, T<:AbstractFloat, I<:AbstractArray{Int, 1}, J<:AbstractArray{Int, 1}, S<:Int}
function grow_tree(X::AbstractArray{R, 2}, δ::AbstractArray{T, 1}, δ²::AbstractArray{T, 1}, 𝑤::AbstractArray{T, 1}, params::EvoTreeRegressor, perm_ini::AbstractArray{Int}, train_nodes::Vector{TrainNode{T, I, J, S}}, splits::Vector{SplitInfo{Float64, Int}}, tracks::Vector{SplitTrack{Float64}}, X_edges) where {R<:Real, T<:AbstractFloat, I<:AbstractArray{Int, 1}, J<:AbstractArray{Int, 1}, S<:Int}

active_id = ones(Int, 1)
leaf_count = 1::Int
Expand Down Expand Up @@ -81,9 +81,9 @@ function binarize(X, edges)
end

# grow_gbtree
function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::Params;
function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::EvoTreeRegressor;
X_eval::AbstractArray{R, 2} = Array{R, 2}(undef, (0,0)), Y_eval::AbstractArray{T, 1} = Array{Float64, 1}(undef, 0),
metric::Symbol = :none, early_stopping_rounds = Int(1e5), print_every_n = 100) where {R<:Real, T<:AbstractFloat}
metric::Symbol = :none, early_stopping_rounds=Int(1e5), print_every_n=100, verbosity=1) where {R<:Real, T<:AbstractFloat}

X_edges = get_edges(X, params.nbins)
X_bin = binarize(X, X_edges)
Expand Down Expand Up @@ -178,7 +178,7 @@ function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::Par
iter_since_best += 1
end

if mod(i, print_every_n) == 0
if mod(i, print_every_n) == 0 && verbosity > 0
display(string("iter:", i, ", eval: ", metric_track.metric))
end
iter_since_best >= early_stopping_rounds ? break : nothing
Expand All @@ -193,7 +193,7 @@ function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::Par
end

# find best split
function find_split!(x::AbstractArray{T, 1}, δ::AbstractArray{Float64, 1}, δ²::AbstractArray{Float64, 1}, 𝑤::AbstractArray{Float64, 1}, ∑δ, ∑δ², ∑𝑤, params::Params, info::SplitInfo, track::SplitTrack, x_edges) where T<:Real
function find_split!(x::AbstractArray{T, 1}, δ::AbstractArray{Float64, 1}, δ²::AbstractArray{Float64, 1}, 𝑤::AbstractArray{Float64, 1}, ∑δ, ∑δ², ∑𝑤, params::EvoTreeRegressor, info::SplitInfo, track::SplitTrack, x_edges) where T<:Real

info.gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)

Expand Down
19 changes: 19 additions & 0 deletions test/MLJ.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Smoke test of the MLJ interface: build a synthetic regression problem, wrap an
# `EvoTreeRegressor` in an MLJ machine, fit on 70% of the rows and predict on the rest.
using Tables
using MLJ
import EvoTrees: EvoTreeRegressor
using EvoTrees: logit, sigmoid

# One feature drawn uniformly from [-2, 3); the target is a noisy sine curve:
# scaled into [0, 1], passed through `logit`, perturbed with `randn` noise and
# passed back through `sigmoid`.
features = rand(10_000) .* 5 .- 2
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y))
Y = sigmoid(Y)
y = Y
# MLJ expects tabular input, so wrap the matrix as a Tables.jl table
X = Tables.table(X)

# No `@load` here: `EvoTreeRegressor` is already imported directly from EvoTrees above.
tree_model = EvoTreeRegressor(max_depth=5)
tree = machine(tree_model, X, y)
train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
fit!(tree, rows=train)
# `X` is a generic table, not a matrix/DataFrame, so `X[test, :]` is not defined
# for it; select the held-out rows through the table-agnostic `selectrows`.
yhat = predict(tree, MLJ.selectrows(X, test))
8 changes: 4 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ params1 = EvoTreeRegressor(
λ = 0.1, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric=:mae)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric=:mae)
@time pred_train_linear = predict(model, X_train)
sqrt(mean((pred_train_linear .- Y_train) .^ 2))

Expand All @@ -41,7 +41,7 @@ params1 = EvoTreeRegressor(
λ = 0.1, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :logloss)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :logloss)
@time pred_train_logistic = predict(model, X_train)
sqrt(mean((pred_train_logistic .- Y_train) .^ 2))

Expand All @@ -52,7 +52,7 @@ params1 = EvoTreeRegressor(
λ = 0.1, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :logloss)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :logloss)
@time pred_train_poisson = predict(model, X_train)
sqrt(mean((pred_train_poisson .- Y_train) .^ 2))

Expand All @@ -63,6 +63,6 @@ params1 = EvoTreeRegressor(
λ = 0.1, γ=0.0, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :quantile)
@time pred_train_poisson = predict(model, X_train)
sqrt(mean((pred_train_poisson .- Y_train) .^ 2))
23 changes: 12 additions & 11 deletions test/sinus_plot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ using EvoTrees
using EvoTrees: sigmoid, logit

# prepare a dataset
features = rand(10_000) .* 5 .- 2
features = rand(10_000) .* 5 .- 0
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y))
Expand All @@ -36,12 +36,12 @@ max_depth = 5
min_weight = 1.0
rowsample = 0.5
colsample = 1.0
nbins = 64
nbins = 100

# linear
params1 = EvoTreeRegressor(
loss=:linear,
nrounds=200,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.5, η=0.1,
max_depth = 5, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
Expand All @@ -50,12 +50,13 @@ params1 = EvoTreeRegressor(
# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval, print_every_n = 10, metric=:mae)
@time pred_train_linear = predict(model, X_train)
@time pred_eval_linear = predict(model, X_eval)
mean(abs.(pred_train_linear .- Y_train))
sqrt(mean((pred_train_linear .- Y_train) .^ 2))

# logistic / cross-entropy
params1 = EvoTreeRegressor(
loss=:logistic,
nrounds=200,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.5, η=0.1,
max_depth = 5, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
Expand All @@ -67,7 +68,7 @@ sqrt(mean((pred_train_logistic .- Y_train) .^ 2))
# Poisson
params1 = EvoTreeRegressor(
loss=:poisson,
nrounds=200,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.5, η=0.1,
max_depth = 5, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
Expand All @@ -89,8 +90,8 @@ savefig("regression_sinus.png")
# q50
params1 = EvoTreeRegressor(
loss=:quantile, α=0.5,
nrounds=200,
λ = 0.1, γ=0.0, η=0.1,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.1, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)

Expand All @@ -101,8 +102,8 @@ params1 = EvoTreeRegressor(
# q20
params1 = EvoTreeRegressor(
loss=:quantile, α=0.2,
nrounds=200,
λ = 0.1, γ=0.0, η=0.1,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.1, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
Expand All @@ -111,8 +112,8 @@ params1 = EvoTreeRegressor(
# q80
params1 = EvoTreeRegressor(
loss=:quantile, α=0.8,
nrounds=200,
λ = 0.1, γ=0.0, η=0.1,
nrounds=200, nbins = 100,
λ = 0.5, γ=0.1, η=0.1,
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0)
@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
Expand Down

0 comments on commit ca2575b

Please sign in to comment.