From 4fd170e0660505d20d412248d8c18a40469d2ab8 Mon Sep 17 00:00:00 2001 From: saolof Date: Sun, 11 Apr 2021 19:45:44 -0400 Subject: [PATCH 01/18] Reduced time & memory footprint for Tarjans algorithm I worked on improving the strongly_connected_components function in this graph. This PR eliminates several of the arrays that were used for auxiliary data using various tricks inspired by https://homepages.ecs.vuw.ac.nz/~djp/files/IPL15-preprint.pdf , which reduces memory usage & allocations and improves cache efficiency. This seems to lead to a speedup in every category of random graphs in benchmarking. I would also subjectively claim that this version of the algorithm is more easily readable than Tarjan's original version. Also put up a gist of a few alternatives I tried out: https://gist.github.com/saolof/7b5d26a41e6a34ff2b3e76d3fc5541da --- src/connectivity.jl | 103 +++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 0c597b489..06ed0d865 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -219,31 +219,31 @@ julia> strongly_connected_components(g) function strongly_connected_components end # see https://github.com/mauro3/SimpleTraits.jl/issues/47#issuecomment-327880153 for syntax -@traitfn function strongly_connected_components(g::AG::IsDirected) where {T<:Integer, AG <: AbstractGraph{T}} +empty_graph_data(type,g::AG) where {T, AG <: AbstractGraph{T}} = Dict{T,type}() +empty_graph_data(type,g::AG) where {T<:Integer, AG <: AbstractGraph{T}} = zeros(type,nv(g)) +is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) +is_unvisited(data::Dict,v) = !haskey(data,v) + +@traitfn function strongly_connected_components_modified(g::AG::IsDirected) where {T, AG <: AbstractGraph{T}} zero_t = zero(T) - one_t = one(T) nvg = nv(g) - count = one_t - - - index = zeros(T, nvg) # first time in which vertex is discovered - stack = Vector{T}() # stores vertices which have been discovered and not yet assigned to any component - onstack = zeros(Bool, nvg) # false if a vertex is waiting in the stack to receive a component assignment - lowlink = zeros(T, nvg) # lowest index vertex that it can reach through back edge (index array not vertex id number) - parents = zeros(T, nvg) # parent of every vertex in dfs + count = 1 # Visitation order for the branch being explored. Backtracks when we pop an scc. + component_count = nvg - 1 # Reversed Index of the current component being discovered. + # Invariant: count is always smaller than component_count. + # This lets us tell if a node belongs to a previously discovered scc without any extra bits. + + component_root = empty_graph_data(Bool,g) + rindex = empty_graph_data(Int,g) # The arrays should not be T-valued for integer T, the rindexes should be the same type as nvg components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) - + stack = Vector{T}() # stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() @inbounds for s in vertices(g) - if index[s] == zero_t - index[s] = count - lowlink[s] = count - onstack[s] = true - parents[s] = s - push!(stack, s) - count = count + one_t + if is_unvisited(rindex,s) + rindex[s] = count + component_root[s] = true + count += 1 # start dfs from 's' push!(dfs_stack, s) @@ -252,16 +252,16 @@ function strongly_connected_components end v = dfs_stack[end] #end is the most recently added item u = zero_t @inbounds for v_neighbor in outneighbors(g, v) - if index[v_neighbor] == zero_t - # unvisited neighbor found + if is_unvisited(rindex,v_neighbor) u = v_neighbor break #GOTO A push u onto DFS stack and continue DFS - elseif onstack[v_neighbor] - # we have already seen n, but can update the lowlink of v - # which has the effect of possibly keeping v on the stack until n is ready to pop. - # update lowest index 'v' can reach through out neighbors - lowlink[v] = min(lowlink[v], index[v_neighbor]) + # TODO: This is accidentally quadratic for tournament graphs or star graphs. + # Breaking the for loop to resume it from the beginning when returning leads to issues for graphs with high node orders like the star graph. + # One option is to save the iteration state in a third stack, but there may be other approaches. + elseif (rindex[v_neighbor] < rindex[v]) + rindex[v] = rindex[v_neighbor] + component_root[v] = false end end if u == zero_t @@ -269,49 +269,46 @@ function strongly_connected_components end # we have fully explored the DFS tree from v. # time to start popping. popped = pop!(dfs_stack) - lowlink[parents[popped]] = min(lowlink[parents[popped]], lowlink[popped]) - - if index[v] == lowlink[v] - # found a cycle in a completed dfs tree. - component = Vector{T}() - - while !isempty(stack) #break when popped == v - # drain stack until we see v. - # everything on the stack until we see v is in the SCC rooted at v. - popped = pop!(stack) - push!(component, popped) - onstack[popped] = false - # popped has been assigned a component, so we will never see it again. - if popped == v - # we have drained the stack of an entire component. - break - end + if component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. + component = T[popped] + count -= 1 # We also backtrack the count to reset it to what it would be if the component were never in the graph. + while !isempty(stack) && (rindex[popped] <= rindex[stack[end]]) # Keep popping its children from the backtracking stack. + newpopped = pop!(stack) + rindex[newpopped] = component_count # Bigger than the value of anything unexplored. + push!(component, newpopped) # popped has been assigned a component, so we will never see it again. + count -=1 + end + rindex[popped] = component_count + component_count -= 1 + push!(components,component) + else # Invariant: the DFS stack can never be empty in this second branch where popped is not a root. + if (rindex[popped] < rindex[dfs_stack[end]]) + rindex[dfs_stack[end]] = rindex[popped] + component_root[dfs_stack[end]] = false end - - reverse!(component) - push!(components, component) + # Because we only push to stack when backtracking, it gets filled up less than in Tarjan's original algorithm. + # For DAG inputs, the stack variable never gets touched at all. + push!(stack,popped) end else #LABEL A # add unvisited neighbor to dfs - index[u] = count - lowlink[u] = count - onstack[u] = true - parents[u] = v - count = count + one_t - - push!(stack, u) push!(dfs_stack, u) + component_root[u] = true + rindex[u] = count + count += 1 # next iteration of while loop will expand the DFS tree from u. end end end end + #Unlike in the original Tajans, rindex are potentially also worth returning here. + # Lowlink values are topologically sorted in the same order as components. + # Scipy's graph library returns only that and lets the user sort by its values. return components end - """ strongly_connected_components_kosaraju(g) From 1e9dd732d906c84db4548b62d37fcbe158ae6e42 Mon Sep 17 00:00:00 2001 From: saolof Date: Sun, 11 Apr 2021 20:43:00 -0400 Subject: [PATCH 02/18] Update connectivity.jl spelled function name correctly --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 06ed0d865..9ce652f92 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -224,7 +224,7 @@ empty_graph_data(type,g::AG) where {T<:Integer, AG <: AbstractGraph{T}} = zeros( is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) is_unvisited(data::Dict,v) = !haskey(data,v) -@traitfn function strongly_connected_components_modified(g::AG::IsDirected) where {T, AG <: AbstractGraph{T}} +@traitfn function strongly_connected_components(g::AG::IsDirected) where {T, AG <: AbstractGraph{T}} zero_t = zero(T) nvg = nv(g) count = 1 # Visitation order for the branch being explored. Backtracks when we pop an scc. From eaf19467a14c45f8655f11c5a9569a4ef0c0d378 Mon Sep 17 00:00:00 2001 From: saolof Date: Mon, 12 Apr 2021 04:37:08 -0400 Subject: [PATCH 03/18] Update connectivity.jl Counting downwards instead of upwards has the advantage that rindex becomes a lookup table for components, if we ever decide to return both. Also makes the algorithm invariant crystal clear. --- src/connectivity.jl | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 9ce652f92..3edba1f65 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -227,23 +227,24 @@ is_unvisited(data::Dict,v) = !haskey(data,v) @traitfn function strongly_connected_components(g::AG::IsDirected) where {T, AG <: AbstractGraph{T}} zero_t = zero(T) nvg = nv(g) - count = 1 # Visitation order for the branch being explored. Backtracks when we pop an scc. - component_count = nvg - 1 # Reversed Index of the current component being discovered. - # Invariant: count is always smaller than component_count. - # This lets us tell if a node belongs to a previously discovered scc without any extra bits. + count = nvg # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. + component_count = 1 # Index of the current component being discovered. + # Invariant 1: count is always smaller than component_count. + # Invariant 2: if rindex[v] < component_count, then v is in components[rindex[v]]. + # This trivially lets us tell if a node belongs to a previously discovered scc without any extra bits, just inequalities. component_root = empty_graph_data(Bool,g) - rindex = empty_graph_data(Int,g) # The arrays should not be T-valued for integer T, the rindexes should be the same type as nvg + rindex = empty_graph_data(Int,g) # The arrays should not be T-valued for integer T, the rindexes should be the same type as nvg. components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) - stack = Vector{T}() # stores vertices which have been discovered and not yet assigned to any component + stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() @inbounds for s in vertices(g) if is_unvisited(rindex,s) rindex[s] = count component_root[s] = true - count += 1 + count -= 1 # start dfs from 's' push!(dfs_stack, s) @@ -256,10 +257,11 @@ is_unvisited(data::Dict,v) = !haskey(data,v) u = v_neighbor break #GOTO A push u onto DFS stack and continue DFS - # TODO: This is accidentally quadratic for tournament graphs or star graphs. - # Breaking the for loop to resume it from the beginning when returning leads to issues for graphs with high node orders like the star graph. + # TODO: This is accidentally(?) quadratic for (very large) tournament graphs or star graphs, + # but the loop turns out to be very tight so this still benchmarks well unless the vertex orders are huge. + # Breaking the for loop to resume it from the beginning when returning leads to this being quadratic as a function of node order. # One option is to save the iteration state in a third stack, but there may be other approaches. - elseif (rindex[v_neighbor] < rindex[v]) + elseif (rindex[v_neighbor] > rindex[v]) rindex[v] = rindex[v_neighbor] component_root[v] = false end @@ -271,24 +273,23 @@ is_unvisited(data::Dict,v) = !haskey(data,v) popped = pop!(dfs_stack) if component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. component = T[popped] - count -= 1 # We also backtrack the count to reset it to what it would be if the component were never in the graph. - while !isempty(stack) && (rindex[popped] <= rindex[stack[end]]) # Keep popping its children from the backtracking stack. + count += 1 # We also backtrack the count to reset it to what it would be if the component were never in the graph. + while !isempty(stack) && (rindex[popped] >= rindex[stack[end]]) # Keep popping its children from the backtracking stack. newpopped = pop!(stack) rindex[newpopped] = component_count # Bigger than the value of anything unexplored. push!(component, newpopped) # popped has been assigned a component, so we will never see it again. - count -=1 + count +=1 end rindex[popped] = component_count - component_count -= 1 + component_count += 1 push!(components,component) else # Invariant: the DFS stack can never be empty in this second branch where popped is not a root. - if (rindex[popped] < rindex[dfs_stack[end]]) + if (rindex[popped] > rindex[dfs_stack[end]]) rindex[dfs_stack[end]] = rindex[popped] component_root[dfs_stack[end]] = false end # Because we only push to stack when backtracking, it gets filled up less than in Tarjan's original algorithm. - # For DAG inputs, the stack variable never gets touched at all. - push!(stack,popped) + push!(stack,popped) # For DAG inputs, the stack variable never gets touched at all. end else #LABEL A @@ -296,19 +297,19 @@ is_unvisited(data::Dict,v) = !haskey(data,v) push!(dfs_stack, u) component_root[u] = true rindex[u] = count - count += 1 + count -= 1 # next iteration of while loop will expand the DFS tree from u. end end end end - #Unlike in the original Tajans, rindex are potentially also worth returning here. - # Lowlink values are topologically sorted in the same order as components. + #Unlike in the original Tarjans, rindex are potentially also worth returning here. + # For any v, v is in components[rindex[v]], s it acts as a lookup table for components. # Scipy's graph library returns only that and lets the user sort by its values. - return components + return components # ,rindex end - + """ strongly_connected_components_kosaraju(g) From 9308b9e500e1a5b9d0a9324d64cb1dfc785a4640 Mon Sep 17 00:00:00 2001 From: saolof Date: Mon, 12 Apr 2021 16:55:18 -0400 Subject: [PATCH 04/18] Update connectivity.jl Fixed O(|E|^2) performance bug that used to be an issue for star graphs. Minimal change in performance for large random graphs, but significant speedup for graphs that have both lots of SCCs and high node orders. --- src/connectivity.jl | 82 +++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 3edba1f65..d26fd80ad 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -214,64 +214,89 @@ julia> strongly_connected_components(g) [1, 2, 3, 4] [10, 11] + +This currently uses a modern variation on Tarjan's algorithm, largely derived from algorithm 3 in David J. Pearce's +preprint: https://homepages.ecs.vuw.ac.nz/~djp/files/IPL15-preprint.pdf , with some changes & tradeoffs when unrolling it to an +imperative algorithm. ``` """ function strongly_connected_components end # see https://github.com/mauro3/SimpleTraits.jl/issues/47#issuecomment-327880153 for syntax -empty_graph_data(type,g::AG) where {T, AG <: AbstractGraph{T}} = Dict{T,type}() -empty_graph_data(type,g::AG) where {T<:Integer, AG <: AbstractGraph{T}} = zeros(type,nv(g)) + + +# Required to prevent quadratic time in star graphs without causing type instability. Returns the type of the state object returned by iterate that may be saved to a stack. +neighbor_iter_statetype(::Type{AG}) where {AG <: AbstractGraph} = Any # Analogous to eltype, but for the state of the iterator rather than the elements. +neighbor_iter_statetype(::Type{AG}) where {AG <: LightGraphs.SimpleGraphs.AbstractSimpleGraph} = Int # Since outneighbours is an array. + +# Threshold below which it isn't worth keeping the DFS iteration state. +is_large_vertex(g,v) = length(outneighbors(g,v)) >= 16 is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) -is_unvisited(data::Dict,v) = !haskey(data,v) -@traitfn function strongly_connected_components(g::AG::IsDirected) where {T, AG <: AbstractGraph{T}} + +# The key idea behind any variation on Tarjan's algorithm is to use DFS and pop off found components. +# Whenever we are forced to backtrack, we are in a bottom cycle of the remaining graph, +# which we accumulate in a stack while backtracking, until we reach a local root. +# A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. +# As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. +@traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} zero_t = zero(T) nvg = nv(g) - count = nvg # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. + count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. component_count = 1 # Index of the current component being discovered. # Invariant 1: count is always smaller than component_count. # Invariant 2: if rindex[v] < component_count, then v is in components[rindex[v]]. - # This trivially lets us tell if a node belongs to a previously discovered scc without any extra bits, just inequalities. + # This trivially lets us tell if a vertex belongs to a previously discovered scc without any extra bits, + # just inequalities that combine naturally with other checks. - component_root = empty_graph_data(Bool,g) - rindex = empty_graph_data(Int,g) # The arrays should not be T-valued for integer T, the rindexes should be the same type as nvg. + is_component_root = Vector{Bool}(undef,nvg) # Fields are set when tracing and read when backtracking, so can be initialized undef. + rindex = zeros(Int,nvg) components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() - + largev_iterstate_stack = Vector{Tuple{T,neighbor_iter_statetype(AG)}}() # For large vertexes we push the iteration state into a stack so we may resume it. + # adding this last stack fixes the O(|E|^2) performance bug that could previously be seen in large star graphs. + @inbounds for s in vertices(g) - if is_unvisited(rindex,s) + if is_unvisited(rindex, s) rindex[s] = count - component_root[s] = true + is_component_root[s] = true count -= 1 # start dfs from 's' - push!(dfs_stack, s) + push!(dfs_stack, s) + if is_large_vertex(g, s) + push!(largev_iterstate_stack, iterate(outneighbors(g, s))) + end - while !isempty(dfs_stack) + @inbounds while !isempty(dfs_stack) v = dfs_stack[end] #end is the most recently added item u = zero_t - @inbounds for v_neighbor in outneighbors(g, v) - if is_unvisited(rindex,v_neighbor) + outn = outneighbors(g, v) + v_is_large = is_large_vertex(g, v) + next = v_is_large ? pop!(largev_iterstate_stack) : iterate(outn) + while next !== nothing + (v_neighbor, state) = next + if is_unvisited(rindex, v_neighbor) u = v_neighbor break #GOTO A push u onto DFS stack and continue DFS - # TODO: This is accidentally(?) quadratic for (very large) tournament graphs or star graphs, - # but the loop turns out to be very tight so this still benchmarks well unless the vertex orders are huge. - # Breaking the for loop to resume it from the beginning when returning leads to this being quadratic as a function of node order. - # One option is to save the iteration state in a third stack, but there may be other approaches. + # Note: This is no longer quadratic for (very large) tournament graphs or star graphs, + # as we save the iteration state in largev_iterstate_stack for large vertices. + # The loop is tight so not saving the state still benchmarks well unless the vertex orders are large enough to make quadratic growth kick in. elseif (rindex[v_neighbor] > rindex[v]) rindex[v] = rindex[v_neighbor] - component_root[v] = false + is_component_root[v] = false end + next = iterate(outn, state) end if u == zero_t # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. popped = pop!(dfs_stack) - if component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. + if is_component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. component = T[popped] count += 1 # We also backtrack the count to reset it to what it would be if the component were never in the graph. while !isempty(stack) && (rindex[popped] >= rindex[stack[end]]) # Keep popping its children from the backtracking stack. @@ -282,20 +307,26 @@ is_unvisited(data::Dict,v) = !haskey(data,v) end rindex[popped] = component_count component_count += 1 - push!(components,component) + push!(components, component) else # Invariant: the DFS stack can never be empty in this second branch where popped is not a root. if (rindex[popped] > rindex[dfs_stack[end]]) rindex[dfs_stack[end]] = rindex[popped] - component_root[dfs_stack[end]] = false + is_component_root[dfs_stack[end]] = false end # Because we only push to stack when backtracking, it gets filled up less than in Tarjan's original algorithm. - push!(stack,popped) # For DAG inputs, the stack variable never gets touched at all. + push!(stack, popped) # For DAG inputs, the stack variable never gets touched at all. end else #LABEL A # add unvisited neighbor to dfs push!(dfs_stack, u) - component_root[u] = true + if v_is_large + push!(largev_iterstate_stack, next) + end + if is_large_vertex(g, u) + push!(largev_iterstate_stack, iterate(outneighbors(g, u))) + end + is_component_root[u] = true rindex[u] = count count -= 1 # next iteration of while loop will expand the DFS tree from u. @@ -309,6 +340,7 @@ is_unvisited(data::Dict,v) = !haskey(data,v) # Scipy's graph library returns only that and lets the user sort by its values. return components # ,rindex end + """ strongly_connected_components_kosaraju(g) From 705b925f67fdca51611eafcb5f4b817d639e2dfd Mon Sep 17 00:00:00 2001 From: saolof Date: Mon, 12 Apr 2021 17:09:30 -0400 Subject: [PATCH 05/18] Update src/connectivity.jl Co-authored-by: Simon Schoelly --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index d26fd80ad..8a1d9ef5e 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -291,7 +291,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) end next = iterate(outn, state) end - if u == zero_t + iszero(u) # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. From 3427221f826bb037074ce83c8dcafa871f78c6e9 Mon Sep 17 00:00:00 2001 From: saolof Date: Mon, 12 Apr 2021 17:23:40 -0400 Subject: [PATCH 06/18] Update connectivity.jl Previous commit removed the if in front of iszero somehow --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 8a1d9ef5e..0fc0a1b40 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -291,7 +291,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) end next = iterate(outn, state) end - iszero(u) + if iszero(u) # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. From 156793a735b1399fb07d3dbd4f1b222602b201e1 Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 03:43:17 -0400 Subject: [PATCH 07/18] Update connectivity.jl Slightly simplified logic and removed the need for zero_t. --- src/connectivity.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 0fc0a1b40..355290a31 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -240,7 +240,6 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. @traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} - zero_t = zero(T) nvg = nv(g) count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. component_count = 1 # Index of the current component being discovered. @@ -272,14 +271,12 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) @inbounds while !isempty(dfs_stack) v = dfs_stack[end] #end is the most recently added item - u = zero_t outn = outneighbors(g, v) v_is_large = is_large_vertex(g, v) next = v_is_large ? pop!(largev_iterstate_stack) : iterate(outn) while next !== nothing (v_neighbor, state) = next if is_unvisited(rindex, v_neighbor) - u = v_neighbor break #GOTO A push u onto DFS stack and continue DFS # Note: This is no longer quadratic for (very large) tournament graphs or star graphs, @@ -291,7 +288,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) end next = iterate(outn, state) end - if iszero(u) + if isnothing(state) # natural loop end # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. @@ -319,6 +316,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) else #LABEL A # add unvisited neighbor to dfs + (u, state) = next push!(dfs_stack, u) if v_is_large push!(largev_iterstate_stack, next) From 234ace41b1a79496f862403a2415147e8a342d6d Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 04:33:40 -0400 Subject: [PATCH 08/18] Update connectivity.jl Trying to figure out what broke. Can elements of outneighbours be equal to nothing? --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 355290a31..24334a4ad 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -316,7 +316,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) else #LABEL A # add unvisited neighbor to dfs - (u, state) = next + u = next[1] push!(dfs_stack, u) if v_is_large push!(largev_iterstate_stack, next) From 9169d80ab579cceb7f9c6bcabb54f59c93f5ce57 Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 04:44:05 -0400 Subject: [PATCH 09/18] Update connectivity.jl testing --- src/connectivity.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 24334a4ad..275b2fabe 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -239,7 +239,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # which we accumulate in a stack while backtracking, until we reach a local root. # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. -@traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} +@traitfn function strongly_connected_components_4(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} nvg = nv(g) count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. component_count = 1 # Index of the current component being discovered. @@ -278,7 +278,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) (v_neighbor, state) = next if is_unvisited(rindex, v_neighbor) break - #GOTO A push u onto DFS stack and continue DFS + #GOTO A push v_neighbor onto DFS stack and continue DFS # Note: This is no longer quadratic for (very large) tournament graphs or star graphs, # as we save the iteration state in largev_iterstate_stack for large vertices. # The loop is tight so not saving the state still benchmarks well unless the vertex orders are large enough to make quadratic growth kick in. @@ -288,7 +288,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) end next = iterate(outn, state) end - if isnothing(state) # natural loop end + if isnothing(next) # Natural loop end. # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. @@ -316,7 +316,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) else #LABEL A # add unvisited neighbor to dfs - u = next[1] + (u, state) = next push!(dfs_stack, u) if v_is_large push!(largev_iterstate_stack, next) @@ -338,7 +338,6 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # Scipy's graph library returns only that and lets the user sort by its values. return components # ,rindex end - """ strongly_connected_components_kosaraju(g) From b441446936435ecdd66b0091036da1904a7449dd Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 04:46:59 -0400 Subject: [PATCH 10/18] Update connectivity.jl Set correct name on method --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 275b2fabe..8e1a7bff6 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -239,7 +239,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # which we accumulate in a stack while backtracking, until we reach a local root. # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. -@traitfn function strongly_connected_components_4(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} +@traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} nvg = nv(g) count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. component_count = 1 # Index of the current component being discovered. From c30cb2de0f8242adb156f51a14061b7014faff94 Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 05:30:28 -0400 Subject: [PATCH 11/18] Update connectivity.jl Added comments. --- src/connectivity.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 8e1a7bff6..86a2ab624 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -256,6 +256,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) dfs_stack = Vector{T}() largev_iterstate_stack = Vector{Tuple{T,neighbor_iter_statetype(AG)}}() # For large vertexes we push the iteration state into a stack so we may resume it. # adding this last stack fixes the O(|E|^2) performance bug that could previously be seen in large star graphs. + # The Tuples come from Julia's iteration protocol, and the code is structured so that we never push a Nothing into thise last stack. @inbounds for s in vertices(g) if is_unvisited(rindex, s) @@ -278,7 +279,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) (v_neighbor, state) = next if is_unvisited(rindex, v_neighbor) break - #GOTO A push v_neighbor onto DFS stack and continue DFS + #GOTO A: push v_neighbor onto DFS stack and continue DFS # Note: This is no longer quadratic for (very large) tournament graphs or star graphs, # as we save the iteration state in largev_iterstate_stack for large vertices. # The loop is tight so not saving the state still benchmarks well unless the vertex orders are large enough to make quadratic growth kick in. @@ -319,10 +320,10 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) (u, state) = next push!(dfs_stack, u) if v_is_large - push!(largev_iterstate_stack, next) + push!(largev_iterstate_stack, next) # Because this is the else branch of isnothing(state), we can push this on the stack. end if is_large_vertex(g, u) - push!(largev_iterstate_stack, iterate(outneighbors(g, u))) + push!(largev_iterstate_stack, iterate(outneighbors(g, u))) # Because u is large, iterate cannot return nothing, so we can push this on stack. end is_component_root[u] = true rindex[u] = count From b0953f1656f5a832a47934368417759cb8bf0a5c Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 13:12:17 -0400 Subject: [PATCH 12/18] Update connectivity.jl Added a dispatch to infer the types. --- src/connectivity.jl | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 86a2ab624..f2fd762c4 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -225,21 +225,34 @@ function strongly_connected_components end # see https://github.com/mauro3/SimpleTraits.jl/issues/47#issuecomment-327880153 for syntax -# Required to prevent quadratic time in star graphs without causing type instability. Returns the type of the state object returned by iterate that may be saved to a stack. -neighbor_iter_statetype(::Type{AG}) where {AG <: AbstractGraph} = Any # Analogous to eltype, but for the state of the iterator rather than the elements. -neighbor_iter_statetype(::Type{AG}) where {AG <: LightGraphs.SimpleGraphs.AbstractSimpleGraph} = Int # Since outneighbours is an array. +@traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} + if iszero(nv(g)) return Vector{Vector{T}}() end + strongly_connected_components_tarjan(g,infer_nb_iterstate_type(g)) +end -# Threshold below which it isn't worth keeping the DFS iteration state. +# In recursive form, Tarjans algorithm has a recursive call inside a for loop. +# To save the loop state of each recursive step in a stack efficiently, +# we need to infer the type of its state (which should almost always be an int). +infer_nb_iterstate_type(g::AbstractSimpleGraph) = Int +function infer_nb_iterstate_type(g::AbstractGraph{T}) where {T} + infer_iterstate(outneighbors(g,one(T))) # If we don't have a static dispatch, peek at the first vertex and check the type at runtime. +end +infer_iterstate(container::AbstractVector) = Int +function infer_iterstate(container) + next = iterate(container) + isnothing(next) ? Any : typeof(next[2]) +end + +# Vertex size threshold below which it isn't worth keeping the DFS iteration state. is_large_vertex(g,v) = length(outneighbors(g,v)) >= 16 is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) - # The key idea behind any variation on Tarjan's algorithm is to use DFS and pop off found components. # Whenever we are forced to backtrack, we are in a bottom cycle of the remaining graph, # which we accumulate in a stack while backtracking, until we reach a local root. # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. -@traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} +@traitfn function strongly_connected_components_tarjan(g::AG::IsDirected, nb_iter_statetype::Type{S}) where {T <: Integer, AG <: AbstractGraph{T}, S} nvg = nv(g) count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. component_count = 1 # Index of the current component being discovered. @@ -254,7 +267,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() - largev_iterstate_stack = Vector{Tuple{T,neighbor_iter_statetype(AG)}}() # For large vertexes we push the iteration state into a stack so we may resume it. + largev_iterstate_stack = Vector{Tuple{T,S}}() # For large vertexes we push the iteration state into a stack so we may resume it. # adding this last stack fixes the O(|E|^2) performance bug that could previously be seen in large star graphs. # The Tuples come from Julia's iteration protocol, and the code is structured so that we never push a Nothing into thise last stack. From 810c91f759c4e03c0c8010db80df60f6418a0d55 Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 14:12:56 -0400 Subject: [PATCH 13/18] Update connectivity.jl --- src/connectivity.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index f2fd762c4..96d32a1ea 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -234,15 +234,15 @@ end # To save the loop state of each recursive step in a stack efficiently, # we need to infer the type of its state (which should almost always be an int). infer_nb_iterstate_type(g::AbstractSimpleGraph) = Int + function infer_nb_iterstate_type(g::AbstractGraph{T}) where {T} - infer_iterstate(outneighbors(g,one(T))) # If we don't have a static dispatch, peek at the first vertex and check the type at runtime. -end -infer_iterstate(container::AbstractVector) = Int -function infer_iterstate(container) - next = iterate(container) - isnothing(next) ? Any : typeof(next[2]) + destructure_type(x) = Any + destructure_type(x::Type{Union{Nothing,Tuple{A,B}}}) where {A,B} = B + # If no specific dispatch is given, we peek at the first vertex and use Base.Iterator magic to try infering the type. + destructure_type(Base.Iterators.approx_iter_type(outneighbors(g,one(T)))) end + # Vertex size threshold below which it isn't worth keeping the DFS iteration state. is_large_vertex(g,v) = length(outneighbors(g,v)) >= 16 is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) From 97edcbd52f918765394bab145d01548542724862 Mon Sep 17 00:00:00 2001 From: saolof Date: Tue, 13 Apr 2021 14:31:30 -0400 Subject: [PATCH 14/18] Update connectivity.jl --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 96d32a1ea..0e9a58a14 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -239,7 +239,7 @@ function infer_nb_iterstate_type(g::AbstractGraph{T}) where {T} destructure_type(x) = Any destructure_type(x::Type{Union{Nothing,Tuple{A,B}}}) where {A,B} = B # If no specific dispatch is given, we peek at the first vertex and use Base.Iterator magic to try infering the type. - destructure_type(Base.Iterators.approx_iter_type(outneighbors(g,one(T)))) + destructure_type(Base.Iterators.approx_iter_type(typeof(outneighbors(g,one(T))))) end From 336107fc78891f0b985aeb9180984488c63dbfb5 Mon Sep 17 00:00:00 2001 From: saolof Date: Wed, 14 Apr 2021 04:00:51 -0400 Subject: [PATCH 15/18] Raising theshold to 1024 based on benchmarking --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 0e9a58a14..8893fdc9e 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -244,7 +244,7 @@ end # Vertex size threshold below which it isn't worth keeping the DFS iteration state. -is_large_vertex(g,v) = length(outneighbors(g,v)) >= 16 +is_large_vertex(g,v) = length(outneighbors(g,v)) >= 1024 is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # The key idea behind any variation on Tarjan's algorithm is to use DFS and pop off found components. From ce1c9ae4a69050617e66471f1b78d9b2e458bc85 Mon Sep 17 00:00:00 2001 From: saolof Date: Wed, 14 Apr 2021 12:01:39 -0400 Subject: [PATCH 16/18] Changed rindex to be T valued Changed everything to be T-valued. Partly to save space for small graphs represented using smaller types, and partly for correctness on machines where Int = Int32 and where the graph is large enough to require Int64s --- src/connectivity.jl | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 8893fdc9e..340646f87 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -252,17 +252,18 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) # which we accumulate in a stack while backtracking, until we reach a local root. # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. -@traitfn function strongly_connected_components_tarjan(g::AG::IsDirected, nb_iter_statetype::Type{S}) where {T <: Integer, AG <: AbstractGraph{T}, S} +function strongly_connected_components_tarjan(g::AG, nb_iter_statetype::Type{S}) where {T <: Integer, AG <: AbstractGraph{T}, S} nvg = nv(g) - count = Int(nvg) # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. - component_count = 1 # Index of the current component being discovered. + one_count = one(T) + count = nvg # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. + component_count = one_count # Index of the current component being discovered. # Invariant 1: count is always smaller than component_count. # Invariant 2: if rindex[v] < component_count, then v is in components[rindex[v]]. # This trivially lets us tell if a vertex belongs to a previously discovered scc without any extra bits, # just inequalities that combine naturally with other checks. is_component_root = Vector{Bool}(undef,nvg) # Fields are set when tracing and read when backtracking, so can be initialized undef. - rindex = zeros(Int,nvg) + rindex = zeros(T,nvg) components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component @@ -275,7 +276,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) if is_unvisited(rindex, s) rindex[s] = count is_component_root[s] = true - count -= 1 + count -= one_count # start dfs from 's' push!(dfs_stack, s) @@ -309,15 +310,15 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) popped = pop!(dfs_stack) if is_component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. component = T[popped] - count += 1 # We also backtrack the count to reset it to what it would be if the component were never in the graph. + count += one_count # We also backtrack the count to reset it to what it would be if the component were never in the graph. while !isempty(stack) && (rindex[popped] >= rindex[stack[end]]) # Keep popping its children from the backtracking stack. newpopped = pop!(stack) rindex[newpopped] = component_count # Bigger than the value of anything unexplored. push!(component, newpopped) # popped has been assigned a component, so we will never see it again. - count +=1 + count += one_count end rindex[popped] = component_count - component_count += 1 + component_count += one_count push!(components, component) else # Invariant: the DFS stack can never be empty in this second branch where popped is not a root. if (rindex[popped] > rindex[dfs_stack[end]]) @@ -340,7 +341,7 @@ is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) end is_component_root[u] = true rindex[u] = count - count -= 1 + count -= one_count # next iteration of while loop will expand the DFS tree from u. end end From c60466d31f7d7f54c02c5404c7ef5ced9c43841f Mon Sep 17 00:00:00 2001 From: saolof Date: Wed, 14 Apr 2021 21:46:01 -0400 Subject: [PATCH 17/18] added spaces after commas --- src/connectivity.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 340646f87..f6a40f488 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -227,7 +227,7 @@ function strongly_connected_components end @traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} if iszero(nv(g)) return Vector{Vector{T}}() end - strongly_connected_components_tarjan(g,infer_nb_iterstate_type(g)) + strongly_connected_components_tarjan(g, infer_nb_iterstate_type(g)) end # In recursive form, Tarjans algorithm has a recursive call inside a for loop. @@ -237,15 +237,15 @@ infer_nb_iterstate_type(g::AbstractSimpleGraph) = Int function infer_nb_iterstate_type(g::AbstractGraph{T}) where {T} destructure_type(x) = Any - destructure_type(x::Type{Union{Nothing,Tuple{A,B}}}) where {A,B} = B + destructure_type(x::Type{Union{Nothing, Tuple{A,B}}}) where {A,B} = B # If no specific dispatch is given, we peek at the first vertex and use Base.Iterator magic to try infering the type. - destructure_type(Base.Iterators.approx_iter_type(typeof(outneighbors(g,one(T))))) + destructure_type(Base.Iterators.approx_iter_type(typeof(outneighbors(g, one(T))))) end # Vertex size threshold below which it isn't worth keeping the DFS iteration state. -is_large_vertex(g,v) = length(outneighbors(g,v)) >= 1024 -is_unvisited(data::AbstractVector,v::Integer) = iszero(data[v]) +is_large_vertex(g, v) = length(outneighbors(g, v)) >= 1024 +is_unvisited(data::AbstractVector, v::Integer) = iszero(data[v]) # The key idea behind any variation on Tarjan's algorithm is to use DFS and pop off found components. # Whenever we are forced to backtrack, we are in a bottom cycle of the remaining graph, @@ -262,13 +262,13 @@ function strongly_connected_components_tarjan(g::AG, nb_iter_statetype::Type{S}) # This trivially lets us tell if a vertex belongs to a previously discovered scc without any extra bits, # just inequalities that combine naturally with other checks. - is_component_root = Vector{Bool}(undef,nvg) # Fields are set when tracing and read when backtracking, so can be initialized undef. - rindex = zeros(T,nvg) + is_component_root = Vector{Bool}(undef, nvg) # Fields are set when tracing and read when backtracking, so can be initialized undef. + rindex = zeros(T, nvg) components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() - largev_iterstate_stack = Vector{Tuple{T,S}}() # For large vertexes we push the iteration state into a stack so we may resume it. + largev_iterstate_stack = Vector{Tuple{T, S}}() # For large vertexes we push the iteration state into a stack so we may resume it. # adding this last stack fixes the O(|E|^2) performance bug that could previously be seen in large star graphs. # The Tuples come from Julia's iteration protocol, and the code is structured so that we never push a Nothing into thise last stack. From 008da10710c1b7baa87b72fec9d74d53a40397ec Mon Sep 17 00:00:00 2001 From: saolof Date: Fri, 16 Apr 2021 04:07:28 -0400 Subject: [PATCH 18/18] Added underscore in front of `_strongly_connected_components_tarjan` --- src/connectivity.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index f6a40f488..feaa059f9 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -227,7 +227,7 @@ function strongly_connected_components end @traitfn function strongly_connected_components(g::AG::IsDirected) where {T <: Integer, AG <: AbstractGraph{T}} if iszero(nv(g)) return Vector{Vector{T}}() end - strongly_connected_components_tarjan(g, infer_nb_iterstate_type(g)) + _strongly_connected_components_tarjan(g, infer_nb_iterstate_type(g)) end # In recursive form, Tarjans algorithm has a recursive call inside a for loop. @@ -252,7 +252,7 @@ is_unvisited(data::AbstractVector, v::Integer) = iszero(data[v]) # which we accumulate in a stack while backtracking, until we reach a local root. # A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. # As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. -function strongly_connected_components_tarjan(g::AG, nb_iter_statetype::Type{S}) where {T <: Integer, AG <: AbstractGraph{T}, S} +function _strongly_connected_components_tarjan(g::AG, nb_iter_statetype::Type{S}) where {T <: Integer, AG <: AbstractGraph{T}, S} nvg = nv(g) one_count = one(T) count = nvg # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc.