Skip to content

Commit

Permalink
Merge pull request #5 from Evovest/type-tests
Browse files Browse the repository at this point in the history
fix logloss
  • Loading branch information
jeremiedb authored Apr 18, 2019
2 parents eb01053 + 93e20e5 commit 798c516
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 127 deletions.
40 changes: 26 additions & 14 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ git-tree-sha1 = "055eb2690182ebc31087859c3dd8598371d3ef9e"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.3"

[[CSTParser]]
deps = ["LibGit2", "Test", "Tokenize"]
git-tree-sha1 = "437c93bc191cd55957b3f8dee7794b6131997c56"
uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
version = "0.5.2"

[[CSV]]
deps = ["CategoricalArrays", "DataFrames", "DataStreams", "Dates", "Mmap", "Parsers", "Profile", "Random", "Tables", "Test", "Unicode", "WeakRefStrings"]
git-tree-sha1 = "b92c6f626a044cc9619156d54994b94084d40abe"
Expand All @@ -17,9 +23,9 @@ version = "0.4.3"

[[Cassette]]
deps = ["InteractiveUtils", "LinearAlgebra", "Test"]
git-tree-sha1 = "1d00c35118babf85c4a9a72c3d3550d498b64a03"
git-tree-sha1 = "5536cf95a81ff784f4d1eca2df39bb2629a7ee59"
uuid = "7057c7e9-c182-5462-911a-8362d720325c"
version = "0.2.2"
version = "0.2.3"

[[CategoricalArrays]]
deps = ["Compat", "Future", "Missings", "Printf", "Reexport", "Requires"]
Expand All @@ -29,9 +35,9 @@ version = "0.5.2"

[[CodeTracking]]
deps = ["InteractiveUtils", "Test", "UUIDs"]
git-tree-sha1 = "7e19dccd5667e0a8d9327ac76966977dbc0df85f"
git-tree-sha1 = "9b21a2dfe51ba71fdc5688039075819196595367"
uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
version = "0.5.0"
version = "0.5.7"

[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
Expand Down Expand Up @@ -113,15 +119,15 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[LoweredCodeUtils]]
deps = ["JuliaInterpreter", "Test"]
git-tree-sha1 = "51e3d512848f96e0017338e1e7e3485ee2730e6e"
git-tree-sha1 = "6ab4d4f1c2ee5db423ada97ca7bec3081641321a"
uuid = "6f1432cf-f94c-5a45-995e-cdbf5db27b0b"
version = "0.3.0"
version = "0.3.1"

[[MacroTools]]
deps = ["Compat"]
git-tree-sha1 = "3fd1a3022952128935b449c33552eb65895380c1"
deps = ["CSTParser", "Compat", "DataStructures", "Test"]
git-tree-sha1 = "daecd9e452f38297c686eba90dba2a6d5da52162"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.4.5"
version = "0.5.0"

[[Markdown]]
deps = ["Base64"]
Expand All @@ -138,9 +144,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"

[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.0.2"
version = "1.1.0"

[[Parsers]]
deps = ["Dates", "Mmap", "Test", "WeakRefStrings"]
Expand Down Expand Up @@ -235,17 +241,23 @@ version = "0.1.18"
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[Tokenize]]
deps = ["Printf", "Test"]
git-tree-sha1 = "3e83f60b74911d3042d3550884ca2776386a02b8"
uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
version = "0.5.3"

[[Traceur]]
deps = ["Cassette", "InteractiveUtils", "Logging", "MacroTools", "Test"]
git-tree-sha1 = "c007db8fbec24668a7cdf4a878202abb648d568f"
uuid = "37b6cedf-1f77-55f8-9503-c64b63398394"
version = "0.3.0"

[[TranscodingStreams]]
deps = ["Pkg", "Random", "Test"]
git-tree-sha1 = "f42956022d8084539f1d7219f632542b0ea686ce"
deps = ["Random", "Test"]
git-tree-sha1 = "a25d8e5a28c3b1b06d3859f30757d43106791919"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.3"
version = "0.9.4"

[[UUIDs]]
deps = ["Random", "SHA"]
Expand Down
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.1.0"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Traceur = "37b6cedf-1f77-55f8-9503-c64b63398394"
8 changes: 3 additions & 5 deletions src/eval.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Mean squared error between predictions `pred` and targets `Y`.
# Selected by dispatch on `Val{:mse}`; both vectors must share the same float element type `T`.
function eval_metric(::Val{:mse}, pred::AbstractArray{T, 1}, Y::AbstractArray{T, 1}) where T <: AbstractFloat
# NOTE(review): this first assignment squares the *mean* residual and is overwritten
# by the next line — it looks like the pre-fix line kept by the diff view.
eval = mean(pred .- Y) .^ 2
# mean of the squared element-wise residuals; this is the value that gets returned
eval = mean((pred .- Y) .^ 2)
return eval
end

Expand All @@ -13,9 +13,7 @@ function eval_metric(::Val{:mae}, pred::AbstractArray{T, 1}, Y::AbstractArray{T,
return eval
end

# Log-loss of `pred` against targets `Y`, selected by dispatch on `Val{:logloss}`.
# NOTE(review): two signatures appear back to back and share a single `end` — this reads
# as the old and the new version of the same method shown together by the diff view.
function eval_metric(::Val{:logloss}, pred::AbstractArray{T, 1}, Y::AbstractArray{T, 1}, tol=1e-15) where T <: AbstractFloat
# variant with `tol`: clamps `pred` in place to [tol, 1 - tol] so both log arguments stay positive
@. pred = max(pred, tol)
@. pred = min(pred, 1-tol)
eval = -mean(Y .* log.(pred) .+ (1 .- Y).*log.(1 .- pred))
function eval_metric(::Val{:logloss}, pred::AbstractArray{T, 1}, Y::AbstractArray{T, 1}) where T <: AbstractFloat
# variant without `tol`: applies `sigmoid` element-wise to `pred` and floors each log argument at 1e-8
eval = -mean(Y .* log.(max.(1e-8, sigmoid.(pred))) .+ (1 .- Y) .* log.(max.(1e-8, 1 .- sigmoid.(pred))))
return eval
end
22 changes: 4 additions & 18 deletions src/loss.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,36 +21,22 @@ end
# @. δ² = (1 - target) / (1 - pred) ^ 2 + target / pred ^ 2
# end


# compute the gradient and hessian given target and predict
# logistic
# NOTE(review): the name suggests the logit transform, but the body only computes the
# odds x / (1 - x) (there is no `log`) and writes the result back into `x`.
function logit(x::AbstractArray{T, 1}) where T <: AbstractFloat
@. x = x / (1 - x)
# NOTE(review): `δ` and `δ²` are not defined inside this function — this line may belong
# to a different (removed) function in the diff view; confirm against the repository.
return δ, δ²
end

# Logistic sigmoid applied to every element of the vector `x`.
# The broadcast assignment writes the result back into `x`, so the argument itself is
# modified and then returned.
function sigmoid(x::AbstractArray{T, 1}) where T <: AbstractFloat
# NOTE(review): two in-place updates are shown one after the other — presumably the old
# and the new diff line. Executed literally, the sigmoid would be applied twice.
# exp(x) / (1 + exp(x)) form
@. x = exp(x) / (1 + exp(x))
# 1 / (1 + exp(-x)) form
@. x = 1 / (1 + exp(-x))
return x
end

# Logistic sigmoid of a single floating-point value.
# NOTE(review): two signatures and two bodies are shown sharing one `end` — this reads as
# the old and the new version of the method displayed together by the diff view.
function sigmoid(x::AbstractFloat)
# exp(x) / (1 + exp(x)) form
x = exp(x) / (1 + exp(x))
function sigmoid(x::T) where T <: AbstractFloat
# 1 / (1 + exp(-x)) form; rebinding the local `x` does not affect the caller
x = 1 / (1 + exp(-x))
return x
end

# # compute the gradient and hessian given target and predict
# function grad_hess(pred::AbstractArray{T}, target::AbstractArray{T}, loss::logistic) where {T<:AbstractFloat}
# δ = 2 * (pred - target)
# δ² = ones(size(pred)) * 2.0
# return δ, δ²
# end

# Store the gain of the left and right side of a candidate split in `info`.
# Each gain is the squared gradient sum divided by the hessian sum plus `λ`, halved.
function update_gains!(info::SplitInfo{T}, ∑δL::T, ∑δ²L::T, ∑δR::T, ∑δ²R::T, λ::T) where T <: AbstractFloat
    # regularised denominators for each side
    left_denominator = ∑δ²L + λ
    right_denominator = ∑δ²R + λ
    info.gainL = (∑δL ^ 2 / left_denominator) / 2.0
    info.gainR = (∑δR ^ 2 / right_denominator) / 2.0
end

# update the performance tracker
function update_track!(track::SplitTrack{T}, λ::T) where T <: AbstractFloat
track.gainL = (track.∑δL ^ 2 / (track.∑δ²L + λ .* track.∑𝑤L)) / 2.0
track.gainR = (track.∑δR ^ 2 / (track.∑δ²R + λ .* track.∑𝑤R)) / 2.0
Expand Down
68 changes: 12 additions & 56 deletions src/predict.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# prediction from single tree - assign each observation to its final leaf
function predict(tree::Tree, X::AbstractArray{T, 2}) where T<:Real
pred = zeros(size(X, 1))
function predict!(pred, tree::Tree, X::AbstractArray{T, 2}) where T<:Real
@threads for i in 1:size(X, 1)
# for i in 1:size(X, 1)
id = Int(1)
id = 1
x = view(X, i, :)
while tree.nodes[id].split
if x[tree.nodes[id].feat] <= tree.nodes[id].cond
Expand All @@ -18,64 +16,22 @@ function predict(tree::Tree, X::AbstractArray{T, 2}) where T<:Real
end

# prediction from single tree - assign each observation to its final leaf
# function predict!(pred, tree::Tree, X)
#
# @threads for i in 1:size(X, 1)
# # for i in 1:size(X, 1)
# node = tree.nodes[1]
# x = view(X, i, :)
# # x = X[i, :]
# while isa(node, SplitNode)
# id = node.feat
# cond = node.cond
# if x[id] <= cond
# node = tree.nodes[node.left]
# else
# node = tree.nodes[node.right]
# end
# end
# pred[i] += node.pred
# end
# return pred
# end

# prediction from single tree - assign each observation to its final leaf
# NOTE(review): the diff view shows a `predict!` body followed directly by the
# `predict(tree, X)` wrapper; the trailing `return pred` / `end` appear to be shared by both.
# For every row of `X`, walks from node 1 down `tree` and adds the reached node's `pred`
# field to `pred[i]` (accumulates into the caller's vector).
function predict!(pred, tree::Tree, X::AbstractArray{T, 2}) where T<:Real
@threads for i in 1:size(X, 1)
# for i in 1:size(X, 1)
# traversal starts at node index 1
id = Int(1)
# row i of X as a view (no copy)
x = view(X, i, :)
# follow the left/right child index while the current node is flagged as a split
while tree.nodes[id].split
if x[tree.nodes[id].feat] <= tree.nodes[id].cond
id = tree.nodes[id].left
else
id = tree.nodes[id].right
end
end
pred[i] += tree.nodes[id].pred
end
# Allocating wrapper: creates a zero vector with one entry per row of `X`, lets
# `predict!` accumulate into it, and returns it.
function predict(tree::Tree, X::AbstractArray{T, 2}) where T<:Real
pred = zeros(size(X, 1))
predict!(pred, tree, X)
return pred
end



# prediction from the boosted model - sum, for each observation, the leaf value it reaches in every tree
# NOTE(review): the diff view shows the row-wise threaded loop and the per-tree
# `predict!` loop together; they read as the old and the new body of the same method.
function predict(model::GBTree, X::AbstractArray{T, 2}) where T<:Real
# accumulator with one entry per row of X
pred = zeros(size(X, 1))
@threads for i in 1:size(X, 1)
# for i in 1:size(X, 1)
x = view(X, i, :)
for tree in model.trees
id = Int(1)
# follow the left/right child index while the current node is flagged as a split
while tree.nodes[id].split
if x[tree.nodes[id].feat] <= tree.nodes[id].cond
id = tree.nodes[id].left
else
id = tree.nodes[id].right
end
end
pred[i] += tree.nodes[id].pred
end
# per-tree variant: each tree adds its contribution to `pred` through `predict!`
for tree in model.trees
predict!(pred, tree, X)
end

# for the :logistic loss the accumulated sum is passed through `sigmoid` element-wise, in place
if model.params.loss == :logistic
pred .= sigmoid.(pred)
end

return pred
end
10 changes: 5 additions & 5 deletions src/struct.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ struct TrainNode{T<:AbstractFloat, I<:AbstractArray{Int, 1}, J<:AbstractArray{In
end

# a single tree is made of a root node that contains nested nodes and leaves
# NOTE(review): both an untyped and a parametric header are shown sharing one `end` —
# this reads as the old and the new definition displayed together by the diff view.
struct Tree
nodes::Vector{TreeNode}
# parametric variant: the node vector's element type is parameterised by T and S
# NOTE(review): `Int` is a concrete type, so the bound `S<:Int` effectively only admits
# `S == Int`; confirm whether `Integer` was intended.
struct Tree{T<:AbstractFloat, S<:Int}
nodes::Vector{TreeNode{T,S}}
end

# gradient-boosted tree is formed by a vector of trees
# NOTE(review): both an untyped and a parametric header are shown sharing one `end` —
# this reads as the old and the new definition displayed together by the diff view.
struct GBTree
trees::Vector{Tree}
params::Params
# parametric variant: trees are stored as Tree{T,S} and the parameters as Params{T}
struct GBTree{T<:AbstractFloat, S<:Int}
trees::Vector{Tree{T,S}}
params::Params{T}
end
26 changes: 10 additions & 16 deletions src/tree_vector.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function grow_tree(X::AbstractArray{T, 2}, δ::AbstractArray{Float64, 1}, δ²::
leaf_count = 1::Int
tree_depth = 1::Int

tree = Tree(Vector{TreeNode}())
tree = Tree(Vector{TreeNode{Float64, Int}}())

# grow while there are remaining active nodes
while size(active_id, 1) > 0 && tree_depth <= params.max_depth
Expand All @@ -20,7 +20,6 @@ function grow_tree(X::AbstractArray{T, 2}, δ::AbstractArray{Float64, 1}, δ²::
push!(tree.nodes, TreeNode(- params.η * node.∑δ / (node.∑δ² + params.λ * node.∑𝑤)))
else
node_size = size(node.𝑖, 1)

@threads for feat in node.𝑗
# for feat in node.𝑗
sortperm!(view(perm_ini, 1:node_size, feat), view(X, node.𝑖, feat), alg = QuickSort, initialized = false)
Expand All @@ -33,17 +32,13 @@ function grow_tree(X::AbstractArray{T, 2}, δ::AbstractArray{Float64, 1}, δ²::

# grow node if the best split improves the gain by more than γ
if best.gain > node.gain + params.γ

# TrainNode: depth, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗
train_nodes[leaf_count + 1] = TrainNode(node.depth + 1, best.∑δL, best.∑δ²L, best.∑𝑤L, best.gainL, node.𝑖[perm_ini[1:best.𝑖, best.feat]], node.𝑗[:])
train_nodes[leaf_count + 2] = TrainNode(node.depth + 1, best.∑δR, best.∑δ²R, best.∑𝑤R, best.gainR, node.𝑖[perm_ini[best.𝑖+1:node_size, best.feat]], node.𝑗[:])

# push split Node
push!(tree.nodes, TreeNode(leaf_count + 1, leaf_count + 2, best.feat, best.cond))

push!(next_active_id, leaf_count + 1)
push!(next_active_id, leaf_count + 2)

leaf_count += 2
else
push!(tree.nodes, TreeNode(- params.η * node.∑δ / (node.∑δ² + params.λ * node.∑𝑤)))
Expand All @@ -57,7 +52,6 @@ function grow_tree(X::AbstractArray{T, 2}, δ::AbstractArray{Float64, 1}, δ²::
return tree
end


# extract the gain value from the vector of best splits and return the split info associated with best split
function get_max_gain(splits::Vector{SplitInfo{Float64}})
gains = (x -> x.gain).(splits)
Expand All @@ -67,11 +61,11 @@ function get_max_gain(splits::Vector{SplitInfo{Float64}})
return best
end


function grow_gbtree(X::AbstractArray{T, 2}, Y::AbstractArray{<:AbstractFloat, 1}, params::Params; X_eval::AbstractArray{T, 2} = Array{T, 2}(undef, (0,0)), Y_eval::AbstractArray{<:AbstractFloat, 1} = Array{Float64, 1}(undef, 0)) where T<:Real
# grow_gbtree
function grow_gbtree(X::AbstractArray{T, 2}, Y::AbstractArray{<:AbstractFloat, 1}, params::Params; X_eval::AbstractArray{T, 2} = Array{T, 2}(undef, (0,0)), Y_eval::AbstractArray{<:AbstractFloat, 1} = Array{Float64, 1}(undef, 0), metric::Symbol = :rmse) where T<:Real
μ = mean(Y)
# pred = ones(size(Y, 1)) .* μ
@fastmath pred = ones(size(Y, 1)) .* μ
pred = ones(size(Y, 1)) .* μ

# initialize gradients and weights
δ, δ² = zeros(Float64, size(Y, 1)), zeros(Float64, size(Y, 1))
Expand All @@ -81,7 +75,7 @@ function grow_gbtree(X::AbstractArray{T, 2}, Y::AbstractArray{<:AbstractFloat, 1
# eval init
if size(Y_eval, 1) > 0
# pred_eval = ones(size(Y_eval, 1)) .* μ
@fastmath pred_eval = ones(size(Y_eval, 1)) .* μ
pred_eval = ones(size(Y_eval, 1)) .* μ
end

bias = Tree([TreeNode(μ)])
Expand Down Expand Up @@ -123,30 +117,30 @@ function grow_gbtree(X::AbstractArray{T, 2}, Y::AbstractArray{<:AbstractFloat, 1
# assign a root and grow tree
train_nodes[1] = TrainNode(1, ∑δ, ∑δ², ∑𝑤, gain, 𝑖, 𝑗)
tree = grow_tree(X, δ, δ², 𝑤, params, perm_ini, train_nodes, splits, tracks)
# update push tree to model
push!(gbtree.trees, tree)

# get update predictions
predict!(pred, tree, X)
# eval predictions
if size(Y_eval, 1) > 0
predict!(pred_eval, tree, X_eval)
end
# update push tree to model
push!(gbtree.trees, tree)

# callback function
if mod(i, 10) == 0
if size(Y_eval, 1) > 0
println("iter:", i, ", train:", eval_metric(Val{:rmse}(), pred, Y), ", eval: ", eval_metric(Val{:rmse}(), pred_eval, Y_eval))
println("iter:", i, ", train:", eval_metric(Val{metric}(), pred, Y), ", eval: ", eval_metric(Val{metric}(), pred_eval, Y_eval))
else
println("iter:", i, ", train:", eval_metric(Val{:rmse}(), pred, Y))
println("iter:", i, ", train:", eval_metric(Val{metric}(), pred, Y))
end
end # end of callback

end #end of nrounds
return gbtree
end


# find best split
function find_split!(x::AbstractArray{T, 1}, δ::AbstractArray{Float64, 1}, δ²::AbstractArray{Float64, 1}, 𝑤::AbstractArray{Float64, 1}, ∑δ, ∑δ², ∑𝑤, λ, info::SplitInfo, track::SplitTrack) where T<:Real

# info.gain = (∑δ ^ 2 / (∑δ² + λ)) / 2.0
Expand Down
Loading

0 comments on commit 798c516

Please sign in to comment.