diff --git a/Manifest.toml b/Manifest.toml
index 1822952f..4fcb36c2 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -362,9 +362,9 @@ version = "1.0.0"
 
 [[Tables]]
 deps = ["IteratorInterfaceExtensions", "LinearAlgebra", "Requires", "TableTraits", "Test"]
-git-tree-sha1 = "83b4a0261e5d01274f12b35d4c2212386fb15569"
+git-tree-sha1 = "c5d784c61e9d243a5a6a8458d19f535b70bdedeb"
 uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "0.2.3"
+version = "0.2.4"
 
 [[Test]]
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
diff --git a/Project.toml b/Project.toml
index ecb08b08..f33e1c38 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,17 +1,22 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
-authors = ["jeremiedb "]
-version = "0.2.0"
+authors = ["jeremiedb "]
+version = "0.2.1"
+
+[compat]
+julia = "1"
 
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Traceur = "37b6cedf-1f77-55f8-9503-c64b63398394"
diff --git a/src/EvoTrees.jl b/src/EvoTrees.jl
index bfd3aa84..5a3c4768 100644
--- a/src/EvoTrees.jl
+++ b/src/EvoTrees.jl
@@ -1,12 +1,14 @@
 module EvoTrees
 
+export grow_tree!, grow_gbtree, Tree, Node, predict, EvoTreeRegressor, EvoTreeRegressorR
+
 using DataFrames
 using Statistics
 using CSV
 using Base.Threads: @threads
 using StatsBase: sample
-
-export grow_tree!, grow_gbtree, Tree, Node, Params, predict, EvoTreeRegressor, EvoTreeRegressorR
+import MLJ
+import MLJBase
 
 include("struct.jl")
 include("loss.jl")
@@ -14,5 +16,6 @@ include("eval.jl")
 include("predict.jl")
 include("tree_vector.jl")
 include("histogram.jl")
+include("MLJ.jl")
 
 end # module
diff --git a/src/MLJ.jl b/src/MLJ.jl
new file mode 100644
index 00000000..0b3fb554
--- /dev/null
+++ b/src/MLJ.jl
@@ -0,0 +1,72 @@
+function MLJ.clean!(model::EvoTreeRegressor)
+    warning = ""
+    if model.nrounds < 1
+        warning *= "Need nrounds ≥ 1. Resetting nrounds=1. "
+        model.nrounds = 1
+    end
+    if model.λ < 0
+        warning *= "Need λ ≥ 0. Resetting λ=0. "
+        model.λ = 0.0
+    end
+    if model.γ < 0
+        warning *= "Need γ ≥ 0. Resetting γ=0. "
+        model.γ = 0.0
+    end
+    if model.η <= 0
+        warning *= "Need η > 0. Resetting η=0.001. "
+        model.η = 0.001
+    end
+    if model.max_depth < 1
+        warning *= "Need max_depth ≥ 1. Resetting max_depth=1. "
+        model.max_depth = 1
+    end
+    if model.min_weight < 0
+        warning *= "Need min_weight ≥ 0. Resetting min_weight=0. "
+        model.min_weight = 0.0
+    end
+    if model.rowsample < 0
+        warning *= "Need rowsample ≥ 0. Resetting rowsample=0. "
+        model.rowsample = 0.0
+    end
+    if model.rowsample > 1
+        warning *= "Need rowsample ≤ 1. Resetting rowsample=1. "
+        model.rowsample = 1.0
+    end
+    if model.colsample < 0
+        warning *= "Need colsample ≥ 0. Resetting colsample=0. "
+        model.colsample = 0.0
+    end
+    if model.colsample > 1
+        warning *= "Need colsample ≤ 1. Resetting colsample=1. "
+        model.colsample = 1.0
+    end
+    if model.nbins > 250
+        warning *= "Need nbins ≤ 250. Resetting nbins=250. "
+        model.nbins = 250
+    end
+    return warning
+end
+
+function MLJBase.fit(model::EvoTreeRegressor, verbosity::Integer, X, y)
+    Xmatrix = MLJBase.matrix(X)
+    fitresult = grow_gbtree(Xmatrix, y, model, verbosity = verbosity)
+    cache = nothing
+    report = nothing
+    return fitresult, cache, report
+end
+
+function MLJBase.predict(model::EvoTreeRegressor, fitresult, Xnew)
+    Xmatrix = MLJBase.matrix(Xnew)
+    pred = predict(fitresult, Xmatrix)
+    return pred
+end
+
+# shared metadata
+const EvoTypes = Union{EvoTreeRegressor}
+MLJBase.input_is_multivariate(::Type{<:EvoTreeRegressor}) = true
+MLJBase.input_scitype_union(::Type{<:EvoTreeRegressor}) = MLJBase.Continuous
+MLJBase.target_scitype_union(::Type{<:EvoTreeRegressor}) = MLJBase.Continuous
+
+MLJBase.package_name(::Type{<:EvoTypes}) = "EvoTrees"
+MLJBase.package_url(::Type{<:EvoTypes}) = "https://github.com/Evovest/EvoTrees.jl"
+MLJBase.is_pure_julia(::Type{<:EvoTypes}) = true
diff --git a/src/predict.jl b/src/predict.jl
index ec051086..181a895b 100644
--- a/src/predict.jl
+++ b/src/predict.jl
@@ -37,13 +37,13 @@ function predict(model::GBTree, X::AbstractArray{T, 2}) where T<:Real
 end
 
 # prediction in Leaf - GradientRegression
-function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::Params) where {S<:GradientRegression, T<:AbstractFloat}
+function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::EvoTreeRegressor) where {S<:GradientRegression, T<:AbstractFloat}
     pred = - params.η * ∑δ / (∑δ² + params.λ * ∑𝑤)
     return pred
 end
 
 # prediction in Leaf - QuantileRegression
-function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::Params) where {S<:QuantileRegression, T<:AbstractFloat}
+function pred_leaf(loss::S, ∑δ::T, ∑δ²::T, ∑𝑤::T, params::EvoTreeRegressor) where {S<:QuantileRegression, T<:AbstractFloat}
     pred = params.η * ∑δ / (∑𝑤 * (1+params.λ))
     return pred
 end
diff --git a/src/struct.jl b/src/struct.jl
index 53c672ab..f055d22e 100644
--- a/src/struct.jl
+++ b/src/struct.jl
@@ -59,7 +59,7 @@ end
 TreeNode(left::S, right::S, feat::S, cond::T) where {T<:AbstractFloat, S<:Int} = TreeNode{T,S,Bool}(left, right, feat, cond, 0.0, true)
 TreeNode(pred::T) where {T<:AbstractFloat} = TreeNode{T,Int,Bool}(0, 0, 0, 0.0, pred, false)
 
-mutable struct Params{T<:AbstractFloat, U<:ModelType, S<:Int}
+mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int} <: MLJBase.Deterministic
     loss::U
     nrounds::S
     λ::T
@@ -71,6 +71,7 @@ mutable struct EvoTreeRegressor{T<:AbstractFloat, U<:ModelType, S<:Int}
     colsample::T
     nbins::S
     α::T
+    metric::Symbol
 end
 
 function EvoTreeRegressor(;
@@ -84,7 +85,8 @@
     rowsample=1.0,
     colsample=1.0,
     nbins=64,
-    α=0.0)
+    α=0.0,
+    metric=:mse)
 
     if loss == :linear model_type = Linear()
     elseif loss == :logistic model_type = Logistic()
@@ -92,13 +94,13 @@
     elseif loss == :quantile model_type = Quantile()
     end
 
-    model = Params(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α)
+    model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric)
     # message = MLJBase.clean!(model)
     # isempty(message) || @warn message
     return model
 end
 
-# For R support
+# For R-package
 function EvoTreeRegressorR(
     loss,
     nrounds,
@@ -110,7 +112,8 @@
     rowsample,
     colsample,
     nbins,
-    α)
+    α,
+    metric)
 
     if loss == :linear model_type = Linear()
     elseif loss == :logistic model_type = Logistic()
@@ -118,7 +121,7 @@
     elseif loss == :quantile model_type = Quantile()
     end
 
-    model = Params(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α)
+    model = EvoTreeRegressor(model_type, nrounds, λ, γ, η, max_depth, min_weight, rowsample, colsample, nbins, α, metric)
     # message = MLJBase.clean!(model)
     # isempty(message) || @warn message
     return model
 end
@@ -150,6 +153,6 @@ Metric() = Metric([0], [Inf])
 # gradient-boosted tree is formed by a vector of trees
 struct GBTree{T<:AbstractFloat, S<:Int}
     trees::Vector{Tree{T,S}}
-    params::Params
+    params::EvoTreeRegressor
     metric::Metric
 end
diff --git a/src/tree_vector.jl b/src/tree_vector.jl
index b93fed14..e15291a4 100644
--- a/src/tree_vector.jl
+++ b/src/tree_vector.jl
@@ -1,5 +1,5 @@
 # initialize train_nodes
-function grow_tree(X::AbstractArray{R, 2}, δ::AbstractArray{T, 1}, δ²::AbstractArray{T, 1}, 𝑤::AbstractArray{T, 1}, params::Params, perm_ini::AbstractArray{Int}, train_nodes::Vector{TrainNode{T, I, J, S}}, splits::Vector{SplitInfo{Float64, Int}}, tracks::Vector{SplitTrack{Float64}}, X_edges) where {R<:Real, T<:AbstractFloat, I<:AbstractArray{Int, 1}, J<:AbstractArray{Int, 1}, S<:Int}
+function grow_tree(X::AbstractArray{R, 2}, δ::AbstractArray{T, 1}, δ²::AbstractArray{T, 1}, 𝑤::AbstractArray{T, 1}, params::EvoTreeRegressor, perm_ini::AbstractArray{Int}, train_nodes::Vector{TrainNode{T, I, J, S}}, splits::Vector{SplitInfo{Float64, Int}}, tracks::Vector{SplitTrack{Float64}}, X_edges) where {R<:Real, T<:AbstractFloat, I<:AbstractArray{Int, 1}, J<:AbstractArray{Int, 1}, S<:Int}
 
     active_id = ones(Int, 1)
     leaf_count = 1::Int
@@ -81,9 +81,9 @@ function binarize(X, edges)
 end
 
 # grow_gbtree
-function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::Params;
+function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::EvoTreeRegressor;
     X_eval::AbstractArray{R, 2} = Array{R, 2}(undef, (0,0)), Y_eval::AbstractArray{T, 1} = Array{Float64, 1}(undef, 0),
-    metric::Symbol = :none, early_stopping_rounds = Int(1e5), print_every_n = 100) where {R<:Real, T<:AbstractFloat}
+    metric::Symbol = :none, early_stopping_rounds=Int(1e5), print_every_n=100, verbosity=1) where {R<:Real, T<:AbstractFloat}
 
     X_edges = get_edges(X, params.nbins)
     X_bin = binarize(X, X_edges)
@@ -178,7 +178,7 @@
             iter_since_best += 1
         end
 
-        if mod(i, print_every_n) == 0
+        if mod(i, print_every_n) == 0 && verbosity > 0
            display(string("iter:", i, ", eval: ", metric_track.metric))
         end
         iter_since_best >= early_stopping_rounds ? break : nothing
@@ -193,7 +193,7 @@
 end
 
 # find best split
-function find_split!(x::AbstractArray{T, 1}, δ::AbstractArray{Float64, 1}, δ²::AbstractArray{Float64, 1}, 𝑤::AbstractArray{Float64, 1}, ∑δ, ∑δ², ∑𝑤, params::Params, info::SplitInfo, track::SplitTrack, x_edges) where T<:Real
+function find_split!(x::AbstractArray{T, 1}, δ::AbstractArray{Float64, 1}, δ²::AbstractArray{Float64, 1}, 𝑤::AbstractArray{Float64, 1}, ∑δ, ∑δ², ∑𝑤, params::EvoTreeRegressor, info::SplitInfo, track::SplitTrack, x_edges) where T<:Real
 
     info.gain = get_gain(params.loss, ∑δ, ∑δ², ∑𝑤, params.λ)
diff --git a/test/MLJ.jl b/test/MLJ.jl
new file mode 100644
index 00000000..2b5c17bf
--- /dev/null
+++ b/test/MLJ.jl
@@ -0,0 +1,19 @@
+using Tables
+using MLJ
+import EvoTrees: EvoTreeRegressor
+using EvoTrees: logit, sigmoid
+
+features = rand(10_000) .* 5 .- 2
+X = reshape(features, (size(features)[1], 1))
+Y = sin.(features) .* 0.5 .+ 0.5
+Y = logit(Y) + randn(size(Y))
+Y = sigmoid(Y)
+y = Y
+X = Tables.table(X)
+
+@load EvoTreeRegressor
+tree_model = EvoTreeRegressor(max_depth=5)
+tree = machine(tree_model, X, y)
+train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
+fit!(tree, rows=train)
+yhat = predict(tree, selectrows(X, test))
diff --git a/test/runtests.jl b/test/runtests.jl
index eb5a6b93..58bbdd33 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -30,7 +30,7 @@ params1 = EvoTreeRegressor(
     λ = 0.1, γ=0.0, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
-@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric=:mae)
+@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric=:mae)
 @time pred_train_linear = predict(model, X_train)
 sqrt(mean((pred_train_linear .- Y_train) .^ 2))
@@ -41,7 +41,7 @@ params1 = EvoTreeRegressor(
     λ = 0.1, γ=0.0, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
-@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :logloss)
+@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :logloss)
 @time pred_train_logistic = predict(model, X_train)
 sqrt(mean((pred_train_logistic .- Y_train) .^ 2))
@@ -52,7 +52,7 @@ params1 = EvoTreeRegressor(
     λ = 0.1, γ=0.0, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
-@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :logloss)
+@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :logloss)
 @time pred_train_poisson = predict(model, X_train)
 sqrt(mean((pred_train_poisson .- Y_train) .^ 2))
@@ -63,6 +63,6 @@ params1 = EvoTreeRegressor(
     λ = 0.1, γ=0.0, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
-@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
+@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25, metric = :quantile)
 @time pred_train_poisson = predict(model, X_train)
 sqrt(mean((pred_train_poisson .- Y_train) .^ 2))
diff --git a/test/sinus_plot.jl b/test/sinus_plot.jl
index a9339377..3cb3f0c5 100644
--- a/test/sinus_plot.jl
+++ b/test/sinus_plot.jl
@@ -10,7 +10,7 @@ using EvoTrees
 using EvoTrees: sigmoid, logit
 
 # prepare a dataset
-features = rand(10_000) .* 5 .- 2
+features = rand(10_000) .* 5 .- 0
 X = reshape(features, (size(features)[1], 1))
 Y = sin.(features) .* 0.5 .+ 0.5
 Y = logit(Y) + randn(size(Y))
@@ -36,12 +36,12 @@ max_depth = 5
 min_weight = 1.0
 rowsample = 0.5
 colsample = 1.0
-nbins = 64
+nbins = 100
 
 # linear
 params1 = EvoTreeRegressor(
     loss=:linear,
-    nrounds=200,
+    nrounds=200, nbins = 100,
     λ = 0.5, γ=0.5, η=0.1,
     max_depth = 5, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
@@ -50,12 +50,13 @@ params1 = EvoTreeRegressor(
 # @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval, print_every_n = 10, metric=:mae)
 @time pred_train_linear = predict(model, X_train)
 @time pred_eval_linear = predict(model, X_eval)
+mean(abs.(pred_train_linear .- Y_train))
 sqrt(mean((pred_train_linear .- Y_train) .^ 2))
 
 # logistic / cross-entropy
 params1 = EvoTreeRegressor(
     loss=:logistic,
-    nrounds=200,
+    nrounds=200, nbins = 100,
     λ = 0.5, γ=0.5, η=0.1,
     max_depth = 5, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
@@ -67,7 +68,7 @@ sqrt(mean((pred_train_logistic .- Y_train) .^ 2))
 # Poisson
 params1 = EvoTreeRegressor(
     loss=:poisson,
-    nrounds=200,
+    nrounds=200, nbins = 100,
     λ = 0.5, γ=0.5, η=0.1,
     max_depth = 5, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
@@ -89,8 +90,8 @@ savefig("regression_sinus.png")
 # q50
 params1 = EvoTreeRegressor(
     loss=:quantile, α=0.5,
-    nrounds=200,
-    λ = 0.1, γ=0.0, η=0.1,
+    nrounds=200, nbins = 100,
+    λ = 0.5, γ=0.1, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
 
@@ -101,8 +102,8 @@ params1 = EvoTreeRegressor(
 # q20
 params1 = EvoTreeRegressor(
     loss=:quantile, α=0.2,
-    nrounds=200,
-    λ = 0.1, γ=0.0, η=0.1,
+    nrounds=200, nbins = 100,
+    λ = 0.5, γ=0.1, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
 
 @time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
@@ -111,8 +112,8 @@ params1 = EvoTreeRegressor(
 # q80
 params1 = EvoTreeRegressor(
     loss=:quantile, α=0.8,
-    nrounds=200,
-    λ = 0.1, γ=0.0, η=0.1,
+    nrounds=200, nbins = 100,
+    λ = 0.5, γ=0.1, η=0.1,
     max_depth = 6, min_weight = 1.0,
     rowsample=0.5, colsample=1.0)
 
 @time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10, metric = :quantile)
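
A note on the glue code introduced in `src/MLJ.jl`: because `EvoTreeRegressor` now subtypes `MLJBase.Deterministic`, the same mutable struct serves both as the MLJ model and as the hyperparameter container consumed by `grow_gbtree`. The sketch below is a minimal illustration of that round trip, not part of the patch itself; the toy data and the deliberately out-of-range hyperparameter values are invented for the example.

```julia
import MLJ
import MLJBase
using Tables
using EvoTrees: EvoTreeRegressor

# hypothetical toy data: 1_000 rows of 4 continuous features
X = Tables.table(randn(1_000, 4))
y = randn(1_000)

# out-of-range values are coerced by MLJ.clean!, which returns
# the accumulated warning string ("" when nothing was changed)
model = EvoTreeRegressor(nrounds=0, η=-0.1)
msg = MLJ.clean!(model)          # resets nrounds=1, η=0.001
isempty(msg) || @warn msg

# fit/predict through the MLJBase methods defined in src/MLJ.jl:
# MLJBase.fit converts the table to a matrix, forwards verbosity
# to grow_gbtree, and returns the GBTree as the fitresult
fitresult, cache, report = MLJBase.fit(model, 0, X, y)
yhat = MLJBase.predict(model, fitresult, X)
```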