diff --git a/src/find_split.jl b/src/find_split.jl index 9a766c27..e93e8103 100644 --- a/src/find_split.jl +++ b/src/find_split.jl @@ -23,12 +23,21 @@ function binarize(X, edges) X_bin end -function find_bags(x::Vector{T}, edges::Vector{T}) where T<:Real +function find_bags(x_bin::Vector{T}) where T <: Real + 𝑖 = 1:length(x_bin) |> collect + bags = [BitSet() for _ in 1:maximum(x_bin)] + for bag in 1:length(bags) + bags[bag] = BitSet(𝑖[x_bin .== bag]) + end + return bags +end + +function find_bags_1(x::Vector{T}, edges::Vector{T}) where T<:Real idx = BitSet(1:length(x) |> collect) bags = [BitSet() for _ in 1:length(edges)] for i in idx bin = 1 - while x[i] > edges[bin] + while x[i] >= edges[bin] bin +=1 end union!(bags[bin], i) diff --git a/src/predict.jl b/src/predict.jl index c635b6e1..ea44f506 100644 --- a/src/predict.jl +++ b/src/predict.jl @@ -4,7 +4,7 @@ function predict!(pred, tree::Tree, X::AbstractArray{T, 2}) where T<:Real id = 1 x = view(X, i, :) while tree.nodes[id].split - if x[tree.nodes[id].feat] <= tree.nodes[id].cond + if x[tree.nodes[id].feat] < tree.nodes[id].cond id = tree.nodes[id].left else id = tree.nodes[id].right diff --git a/src/tree_vector.jl b/src/tree_vector.jl index 2de5df90..d595ee9a 100644 --- a/src/tree_vector.jl +++ b/src/tree_vector.jl @@ -91,7 +91,7 @@ function grow_gbtree(X::AbstractArray{R, 2}, Y::AbstractArray{T, 1}, params::Evo edges = get_edges(X, params.nbins) bags = Vector{Vector{BitSet}}(undef, size(𝑗_, 1)) @threads for feat in 1:size(𝑗_, 1) - bags[feat] = find_bags(X[:,feat], edges[feat]) + bags[feat] = find_bags(X_bin[:,feat]) end # initialize train nodes @@ -196,7 +196,7 @@ function grow_gbtree!(model::GBTree, X::AbstractArray{R, 2}, Y::AbstractArray{T, edges = get_edges(X, params.nbins) bags = Vector{Vector{BitSet}}(undef, size(𝑗_, 1)) @threads for feat in 1:size(𝑗_, 1) - bags[feat] = find_bags(X[:,feat], edges[feat]) + bags[feat] = find_bags(X_bin[:,feat]) end # initialize train nodes diff --git a/test/datacup.jl b/test/datacup.jl index 5d972db7..c8865122 100644 --- a/test/datacup.jl +++ b/test/datacup.jl @@ -15,7 +15,7 @@ names(data) features = data[1:53] X = convert(Array, features) -X = X + randn(size(X)) * 0.0001 +# X = X + randn(size(X)) * 0.0001 Y = data[54] Y = convert(Array{Float64}, Y) 𝑖 = collect(1:size(X,1)) @@ -32,12 +32,12 @@ Y_train, Y_eval = Y[𝑖_train], Y[𝑖_eval] params1 = EvoTreeRegressor( loss=:logistic, metric=:logloss, - nrounds=10, nbins=16, + nrounds=100, nbins=16, λ = 0.0, γ=0.0, η=0.1, max_depth = 6, min_weight = 1.0, - rowsample=0.5, colsample=0.5) + rowsample=0.5, colsample=0.5, seed = 127) -@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 1) +@time model = grow_gbtree(X_train, Y_train, params1, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 10) @time model = grow_gbtree(X_train, Y_train, params1, print_every_n = 1) @time pred_train_linear = EvoTrees.predict(model, X_train) @@ -70,11 +70,18 @@ end @time edges = get_edges(X, params1.nbins) @time X_bin = binarize(X, edges) +# manual check +x1 = edges[2] +x2 = [0, x1[1], 0.1, x1[2], 0.5, x1[3], 0.95, x1[4]] +x2_bin = searchsortedlast.(Ref(edges[2][1:end-1]), x2) .+ 1 +x2_bag = find_bags(x2_bin) + function prep(X, params) edges = get_edges(X, params.nbins) + X_bin = binarize(X, edges) bags = Vector{Vector{BitSet}}(undef, size(𝑗, 1)) for feat in 1:size(𝑗, 1) - bags[feat] = find_bags(X[:,feat], edges[feat]) + bags[feat] = find_bags(X_bin[:,feat]) end return bags end