From 59270deaa730b6901dea2fdf0677a6a3e65c56e7 Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 12:32:02 +0100 Subject: [PATCH 01/11] implement `count_connected_components(g)` as faster version of `length(connected_components(g))` --- src/Graphs.jl | 1 + src/connectivity.jl | 76 ++++++++++++++++++++++++++++++----- test/spanningtrees/boruvka.jl | 12 +++--- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/src/Graphs.jl b/src/Graphs.jl index 86bc0946a..8b990186a 100644 --- a/src/Graphs.jl +++ b/src/Graphs.jl @@ -210,6 +210,7 @@ export # connectivity connected_components, + count_connected_components, strongly_connected_components, strongly_connected_components_kosaraju, strongly_connected_components_tarjan, diff --git a/src/connectivity.jl b/src/connectivity.jl index 18e0aca94..f7bb537f6 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -1,26 +1,32 @@ # Parts of this code were taken / derived from Graphs.jl. See LICENSE for # licensing details. """ - connected_components!(label, g) + connected_components!(label, g, [search_queue]) Fill `label` with the `id` of the connected component in the undirected graph `g` to which it belongs. Return a vector representing the component assigned to each vertex. The component value is the smallest vertex ID in the component. +A `search_queue`, an empty `Vector{eltype(edgetype(g))}`, can be provided to reduce +allocations if `connected_components!` is intended to be called multiple times sequentially. +If not provided, it is automatically instantiated. + ### Performance This algorithm is linear in the number of edges of the graph. 
""" -function connected_components!(label::AbstractVector, g::AbstractGraph{T}) where {T} +function connected_components!( + label::AbstractVector, g::AbstractGraph{T}, search_queue::Vector{T}=Vector{T}() +) where {T} + isempty(search_queue) || error("provided `search_queue` is not empty") for u in vertices(g) label[u] != zero(T) && continue label[u] = u - Q = Vector{T}() - push!(Q, u) - while !isempty(Q) - src = popfirst!(Q) + push!(search_queue, u) + while !isempty(search_queue) + src = popfirst!(search_queue) for vertex in all_neighbors(g, src) if label[vertex] == zero(T) - push!(Q, vertex) + push!(search_queue, vertex) label[vertex] = u end end @@ -129,9 +135,61 @@ julia> is_connected(g) true ``` """ -function is_connected(g::AbstractGraph) +function is_connected(g::AbstractGraph{T}) where T mult = is_directed(g) ? 2 : 1 - return mult * ne(g) + 1 >= nv(g) && length(connected_components(g)) == 1 + if mult * ne(g) + 1 >= nv(g) + label = zeros(T, nv(g)) + connected_components!(label, g) + return allequal(label) + else + return false + end +end + +""" + count_connected_components( g, [label, search_queue]) + +Return the number of connected components in `g`. + +Equivalent to `length(connected_components(g))` but uses fewer allocations by not +materializing the component vectors explicitly. Additionally, mutated work-arrays `label` +and `search_queue` can be provided to reduce allocations further (see +[`connected_components!`](@ref)). 
+ +``` +julia> using Graphs + +julia> g = Graph(Edge.([1=>2, 2=>3, 3=>1, 4=>5, 5=>6, 6=>4, 7=>8])); + +length> connected_components(g) +3-element Vector{Vector{Int64}}: + [1, 2, 3] + [4, 5, 6] + [7, 8] + +julia> count_connected_components(g) +3 +``` +""" +function count_connected_components( + g::AbstractGraph{T}, + label::AbstractVector=zeros(T, nv(g)), + search_queue::Vector{T}=Vector{T}() +) where T + connected_components!(label, g, search_queue) + return count_unique(label) +end + +function count_unique(label::Vector{T}) where T + seen = Set{T}() + c = 0 + for l in label + if l ∉ seen + push!(seen, l) + c += 1 + end + end + return c end """ diff --git a/test/spanningtrees/boruvka.jl b/test/spanningtrees/boruvka.jl index dfabbaebe..7828fbab0 100644 --- a/test/spanningtrees/boruvka.jl +++ b/test/spanningtrees/boruvka.jl @@ -21,14 +21,14 @@ g1t = GenericGraph(SimpleGraph(edges1)) @test res1.weight == cost_mst # acyclic graphs have n - c edges - @test nv(g1t) - length(connected_components(g1t)) == ne(g1t) + @test nv(g1t) - ne(g1t) == length(connected_components(g1t)) == count_connected_components(g1t) @test nv(g1t) == nv(g) res2 = boruvka_mst(g, distmx; minimize=false) edges2 = [Edge(src(e), dst(e)) for e in res2.mst] g2t = GenericGraph(SimpleGraph(edges2)) @test res2.weight == cost_max_vec_mst - @test nv(g2t) - length(connected_components(g2t)) == ne(g2t) + @test nv(g2t) - ne(g2t) == length(connected_components(g2t)) == count_connected_components(g2t) @test nv(g2t) == nv(g) end # second test @@ -60,14 +60,14 @@ edges3 = [Edge(src(e), dst(e)) for e in res3.mst] g3t = GenericGraph(SimpleGraph(edges3)) @test res3.weight == weight_vec2 - @test nv(g3t) - length(connected_components(g3t)) == ne(g3t) + @test nv(g3t) - ne(g3t) == length(connected_components(g3t)) == count_connected_components(g3t) @test nv(g3t) == nv(gx) res4 = boruvka_mst(g, distmx_sec; minimize=false) edges4 = [Edge(src(e), dst(e)) for e in res4.mst] g4t = GenericGraph(SimpleGraph(edges4)) @test 
res4.weight == weight_max_vec2 - @test nv(g4t) - length(connected_components(g4t)) == ne(g4t) + @test nv(g4t) - ne(g4t) == length(connected_components(g4t)) == count_connected_components(g4t) @test nv(g4t) == nv(gx) end @@ -123,14 +123,14 @@ edges5 = [Edge(src(e), dst(e)) for e in res5.mst] g5t = GenericGraph(SimpleGraph(edges5)) @test res5.weight == weight_vec3 - @test nv(g5t) - length(connected_components(g5t)) == ne(g5t) + @test nv(g5t) - ne(g5t) == length(connected_components(g5t)) == count_connected_components(g5t) @test nv(g5t) == nv(gd) res6 = boruvka_mst(g, distmx_third; minimize=false) edges6 = [Edge(src(e), dst(e)) for e in res6.mst] g6t = GenericGraph(SimpleGraph(edges6)) @test res6.weight == weight_max_vec3 - @test nv(g6t) - length(connected_components(g6t)) == ne(g6t) + @test nv(g6t) - ne(g6t) == length(connected_components(g6t)) == count_connected_components(g6t) @test nv(g6t) == nv(gd) end end From a8f8d19051476ac4607c2a019ba40af128c39bbd Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 12:32:14 +0100 Subject: [PATCH 02/11] export `connected_components!` --- src/Graphs.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Graphs.jl b/src/Graphs.jl index 8b990186a..8f20ec0d8 100644 --- a/src/Graphs.jl +++ b/src/Graphs.jl @@ -210,6 +210,7 @@ export # connectivity connected_components, + connected_components!, count_connected_components, strongly_connected_components, strongly_connected_components_kosaraju, From 2316b0fc644a3e41ee1f9a2948018ce1b1ebcca3 Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 12:55:00 +0100 Subject: [PATCH 03/11] allow resetting `label` inside `count_connected_components` --- src/connectivity.jl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index f7bb537f6..a992ff6b8 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -147,15 +147,20 @@ function is_connected(g::AbstractGraph{T}) where T 
end """ - count_connected_components( g, [label, search_queue]) + count_connected_components( g, [label, search_queue]; reset_label::Bool=false) Return the number of connected components in `g`. Equivalent to `length(connected_components(g))` but uses fewer allocations by not materializing the component vectors explicitly. Additionally, mutated work-arrays `label` and `search_queue` can be provided to reduce allocations further (see -[`connected_components!`](@ref)). +[`_connected_components!`](@ref)). +## Keyword arguments +- `reset_label :: Bool` (default, `false`): if `true`, `label` is reset to zero before + returning. + +## Example ``` julia> using Graphs @@ -174,10 +179,13 @@ julia> count_connected_components(g) function count_connected_components( g::AbstractGraph{T}, label::AbstractVector=zeros(T, nv(g)), - search_queue::Vector{T}=Vector{T}() + search_queue::Vector{T}=Vector{T}(); + reset_label::Bool=false ) where T - connected_components!(label, g, search_queue) - return count_unique(label) + _connected_components!(label, g, search_queue) + c = count_unique(label) + reset_label && fill!(label, zero(eltype(label))) + return c end function count_unique(label::Vector{T}) where T From 05e3b7e57b01b5ae021f15a94d7f83836eed9c6b Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 12:55:32 +0100 Subject: [PATCH 04/11] fix unsaved test --- test/operators.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/operators.jl b/test/operators.jl index bf4931ebf..f849b5bcc 100644 --- a/test/operators.jl +++ b/test/operators.jl @@ -268,6 +268,7 @@ for i in 3:4 @testset "Tensor Product: $g" for g in testgraphs(path_graph(i)) @test length(connected_components(tensor_product(g, g))) == 2 + @test count_connected_components(tensor_product(g, g)) == 2 end end From 0153724cc79d420cdb7012280ae1535e8cfae913 Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 13:15:23 +0100 Subject: [PATCH 05/11] fix copy bug and nits --- 
src/connectivity.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index a992ff6b8..9791fa851 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -15,7 +15,7 @@ If not provided, it is automatically instantiated. This algorithm is linear in the number of edges of the graph. """ function connected_components!( - label::AbstractVector, g::AbstractGraph{T}, search_queue::Vector{T}=Vector{T}() + label::AbstractVector{T}, g::AbstractGraph{T}, search_queue::Vector{T}=Vector{T}() ) where {T} isempty(search_queue) || error("provided `search_queue` is not empty") for u in vertices(g) @@ -178,11 +178,11 @@ julia> count_connected_components(g) """ function count_connected_components( g::AbstractGraph{T}, - label::AbstractVector=zeros(T, nv(g)), + label::AbstractVector{T}=zeros(T, nv(g)), search_queue::Vector{T}=Vector{T}(); reset_label::Bool=false ) where T - _connected_components!(label, g, search_queue) + connected_components!(label, g, search_queue) c = count_unique(label) reset_label && fill!(label, zero(eltype(label))) return c From ce66be55edec4d63c85259d794a37fc79d5d0de6 Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 13:17:46 +0100 Subject: [PATCH 06/11] JuliaFormatter --- src/connectivity.jl | 8 ++++---- test/spanningtrees/boruvka.jl | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 9791fa851..6ef1d7050 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -135,7 +135,7 @@ julia> is_connected(g) true ``` """ -function is_connected(g::AbstractGraph{T}) where T +function is_connected(g::AbstractGraph{T}) where {T} mult = is_directed(g) ? 
2 : 1 if mult * ne(g) + 1 >= nv(g) label = zeros(T, nv(g)) @@ -180,15 +180,15 @@ function count_connected_components( g::AbstractGraph{T}, label::AbstractVector{T}=zeros(T, nv(g)), search_queue::Vector{T}=Vector{T}(); - reset_label::Bool=false -) where T + reset_label::Bool=false, +) where {T} connected_components!(label, g, search_queue) c = count_unique(label) reset_label && fill!(label, zero(eltype(label))) return c end -function count_unique(label::Vector{T}) where T +function count_unique(label::Vector{T}) where {T} seen = Set{T}() c = 0 for l in label diff --git a/test/spanningtrees/boruvka.jl b/test/spanningtrees/boruvka.jl index 7828fbab0..552b77bea 100644 --- a/test/spanningtrees/boruvka.jl +++ b/test/spanningtrees/boruvka.jl @@ -21,14 +21,18 @@ g1t = GenericGraph(SimpleGraph(edges1)) @test res1.weight == cost_mst # acyclic graphs have n - c edges - @test nv(g1t) - ne(g1t) == length(connected_components(g1t)) == count_connected_components(g1t) + @test nv(g1t) - ne(g1t) == + length(connected_components(g1t)) == + count_connected_components(g1t) @test nv(g1t) == nv(g) res2 = boruvka_mst(g, distmx; minimize=false) edges2 = [Edge(src(e), dst(e)) for e in res2.mst] g2t = GenericGraph(SimpleGraph(edges2)) @test res2.weight == cost_max_vec_mst - @test nv(g2t) - ne(g2t) == length(connected_components(g2t)) == count_connected_components(g2t) + @test nv(g2t) - ne(g2t) == + length(connected_components(g2t)) == + count_connected_components(g2t) @test nv(g2t) == nv(g) end # second test @@ -60,14 +64,18 @@ edges3 = [Edge(src(e), dst(e)) for e in res3.mst] g3t = GenericGraph(SimpleGraph(edges3)) @test res3.weight == weight_vec2 - @test nv(g3t) - ne(g3t) == length(connected_components(g3t)) == count_connected_components(g3t) + @test nv(g3t) - ne(g3t) == + length(connected_components(g3t)) == + count_connected_components(g3t) @test nv(g3t) == nv(gx) res4 = boruvka_mst(g, distmx_sec; minimize=false) edges4 = [Edge(src(e), dst(e)) for e in res4.mst] g4t = 
GenericGraph(SimpleGraph(edges4)) @test res4.weight == weight_max_vec2 - @test nv(g4t) - ne(g4t) == length(connected_components(g4t)) == count_connected_components(g4t) + @test nv(g4t) - ne(g4t) == + length(connected_components(g4t)) == + count_connected_components(g4t) @test nv(g4t) == nv(gx) end @@ -123,14 +131,18 @@ edges5 = [Edge(src(e), dst(e)) for e in res5.mst] g5t = GenericGraph(SimpleGraph(edges5)) @test res5.weight == weight_vec3 - @test nv(g5t) - ne(g5t) == length(connected_components(g5t)) == count_connected_components(g5t) + @test nv(g5t) - ne(g5t) == + length(connected_components(g5t)) == + count_connected_components(g5t) @test nv(g5t) == nv(gd) res6 = boruvka_mst(g, distmx_third; minimize=false) edges6 = [Edge(src(e), dst(e)) for e in res6.mst] g6t = GenericGraph(SimpleGraph(edges6)) @test res6.weight == weight_max_vec3 - @test nv(g6t) - ne(g6t) == length(connected_components(g6t)) == count_connected_components(g6t) + @test nv(g6t) - ne(g6t) == + length(connected_components(g6t)) == + count_connected_components(g6t) @test nv(g6t) == nv(gd) end end From f440525849be691758e52f80307422f135cf7a6f Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 14 Nov 2024 14:18:04 +0100 Subject: [PATCH 07/11] lingering copy typo --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 6ef1d7050..86ce77554 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -154,7 +154,7 @@ Return the number of connected components in `g`. Equivalent to `length(connected_components(g))` but uses fewer allocations by not materializing the component vectors explicitly. Additionally, mutated work-arrays `label` and `search_queue` can be provided to reduce allocations further (see -[`_connected_components!`](@ref)). +[`connected_components!`](@ref)). 
## Keyword arguments - `reset_label :: Bool` (default, `false`): if `true`, `label` is reset to zero before From da1d31ed918234e766e98ffba1dc8d5b100793f6 Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 21 Nov 2024 09:30:33 +0100 Subject: [PATCH 08/11] updates from code-review --- src/connectivity.jl | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 86ce77554..a6c187eb8 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -7,17 +7,18 @@ Fill `label` with the `id` of the connected component in the undirected graph `g` to which it belongs. Return a vector representing the component assigned to each vertex. The component value is the smallest vertex ID in the component. -A `search_queue`, an empty `Vector{eltype(edgetype(g))}`, can be provided to reduce -allocations if `connected_components!` is intended to be called multiple times sequentially. -If not provided, it is automatically instantiated. +## Optional arguments +- `search_queue`, an empty `Vector{eltype(edgetype(g))}`, can be provided to avoid + reallocating this work array repeatedly on repeated calls of `connected_components!`. + If not provided, it is automatically instantiated. -### Performance +## Performance This algorithm is linear in the number of edges of the graph. """ function connected_components!( label::AbstractVector{T}, g::AbstractGraph{T}, search_queue::Vector{T}=Vector{T}() ) where {T} - isempty(search_queue) || error("provided `search_queue` is not empty") + empty!(search_queue) for u in vertices(g) label[u] != zero(T) && continue label[u] = u @@ -152,13 +153,17 @@ end Return the number of connected components in `g`. Equivalent to `length(connected_components(g))` but uses fewer allocations by not -materializing the component vectors explicitly. 
Additionally, mutated work-arrays `label` -and `search_queue` can be provided to reduce allocations further (see -[`connected_components!`](@ref)). +materializing the component vectors explicitly. + +## Optional arguments +Mutated work arrays, `label` and `search_queue` can be provided to avoid allocating these +arrays repeatedly on repeated calls of `count_connected_components`. +For `g :: AbstractGraph{T}`, `label` must be a zero-initialized `Vector{T}` of length +`nv(g)` and `search_queue` a `Vector{T}`. See also [`connected_components!`](@ref). ## Keyword arguments -- `reset_label :: Bool` (default, `false`): if `true`, `label` is reset to zero before - returning. +- `reset_label :: Bool` (default, `false`): if `true`, `label` is reset to a zero-vector + before returning. ## Example ``` From 21b185411c85460038bfb6ff46a5d687d857e71d Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 21 Nov 2024 09:58:13 +0100 Subject: [PATCH 09/11] simplify `count_unique` --- src/connectivity.jl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index a6c187eb8..890cf8d87 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -194,15 +194,13 @@ function count_connected_components( end function count_unique(label::Vector{T}) where {T} + # effectively does `length(Set(label))` but slightly faster since `Set(label)` + # sizehints too aggressively for the use case of having relatively few unique elements seen = Set{T}() - c = 0 for l in label - if l ∉ seen - push!(seen, l) - c += 1 - end + l ∉ seen && push!(seen, l) # currently faster than direct `push!(seen, l)` end - return c + return length(seen) end """ From ead687e26c328990a9adcd261c1e31bb011fb11e Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Thu, 21 Nov 2024 10:23:12 +0100 Subject: [PATCH 10/11] improve comments for `count_unique` --- src/connectivity.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/src/connectivity.jl b/src/connectivity.jl index 890cf8d87..83a5ff3c9 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -194,11 +194,14 @@ function count_connected_components( end function count_unique(label::Vector{T}) where {T} - # effectively does `length(Set(label))` but slightly faster since `Set(label)` - # sizehints too aggressively for the use case of having relatively few unique elements + # effectively does `length(Set(label))` but faster, since `Set(label)` sizehints + # aggressively and assumes that most elements of `label` will be unique, which very + # rarely will be the case for caller `count_connected_components` seen = Set{T}() for l in label - l ∉ seen && push!(seen, l) # currently faster than direct `push!(seen, l)` + # faster than direct `push!(seen, l)` when `label` has few unique elements relative + # to `length(label)` + l ∉ seen && push!(seen, l) end return length(seen) end From d5e0e37da4aeb97ec2e499dc41d09ba6bf92efbd Mon Sep 17 00:00:00 2001 From: Thomas Christensen Date: Tue, 14 Jan 2025 09:26:29 +0100 Subject: [PATCH 11/11] use `BitSet` in `count_unique` --- src/connectivity.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/connectivity.jl b/src/connectivity.jl index 83a5ff3c9..188b31e7e 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -197,7 +197,7 @@ function count_unique(label::Vector{T}) where {T} # effectively does `length(Set(label))` but faster, since `Set(label)` sizehints # aggressively and assumes that most elements of `label` will be unique, which very # rarely will be the case for caller `count_connected_components` - seen = Set{T}() + seen = T === Int ? BitSet() : Set{T}() # if `T=Int`, we can use faster BitSet for l in label # faster than direct `push!(seen, l)` when `label` has few unique elements relative # to `length(label)`