From 2198f009a863c46034c146fd03b85fc3ef26da4d Mon Sep 17 00:00:00 2001 From: Emilio Jesus Gallego Arias Date: Tue, 4 Feb 2020 16:10:18 +0100 Subject: [PATCH 1/7] [performance] Avoid linear walk of graph children. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling Dune in scenarios where targets have a large number of deps, Dune will take a long time to start. A common case is when depending on `(package coq)`, which brings into the DAG a few thousand files. `perf` data show this is due to the linear walk in `Dag.is_child`; indeed, doing a naive replacement of the list for a more efficient access structure solves the problem: ``` with this PR: real 0m1,684s user 0m1,552s sys 0m0,128s with master: real 0m11,450s user 0m10,587s sys 0m0,264s ``` We fix this by adding an efficient representation of `deps` that allows checking if an edge is already in the graph `log n` time, so the complexity of `is_child` goes from O(n²) to O(n log(n)). Note that `raw_add_edge` has also changed complexity from O(1) to O(log n) due to extra map insertion. Signed-off-by: Emilio Jesus Gallego Arias --- CHANGES.md | 7 +++++++ src/dag/dag.ml | 12 ++++++++---- src/memo/memo.ml | 8 ++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 36405e65cac..fa34d5cedce 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,10 @@ +2.3.0 (unreleased) +------------------ + +- Avoid linear walk to detect children, this should greatly improve + performance when a target has a large number of dependencies (#2959, + @ejgallego, @aalekseyev, @Armael) + 2.2.0 (06/02/2020) ------------------ diff --git a/src/dag/dag.ml b/src/dag/dag.ml index c1a25f5de14..3ab6d300f55 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -12,12 +12,16 @@ module Make (Value : Value) : S with type value := Value.t = struct type graph = t + module Node_map = Map.Make(Int) + type node_info = { id : int ; (* only used for printing *) mutable mark : mark ; mutable level : int ; mutable deps : node list + ; (* see #2959, we need to implement is_child efficiently *) + mutable deps_set : unit Node_map.t ; mutable rev_deps : node list ; mutable parent : node option } @@ -60,7 +64,8 @@ module Make (Value : Value) : S with type value := Value.t = struct let set_parent _ v p = v.info.parent <- Some p let raw_add_edge _ v w = - v.info.deps <- w :: v.info.deps + v.info.deps <- w :: v.info.deps; + v.info.deps_set <- Node_map.add_exn v.info.deps_set w.info.id () let raw_add_vertex _ _ = () end @@ -75,7 +80,7 @@ module Make (Value : Value) : S with type value := Value.t = struct let create_node_info g = let id = g.fresh_id in g.fresh_id <- g.fresh_id + 1; - { id; mark = -1; level = 1; deps = []; rev_deps = []; parent = None } + { id; mark = -1; level = 1; deps = []; deps_set = Node_map.empty; rev_deps = []; parent = None } let add g v w = match IC.add_edge_or_detect_cycle g v w with @@ -102,6 +107,5 @@ module Make (Value : Value) : S with type value := Value.t = struct let pp_node pp_value fmt n = pp_depth 0 pp_value fmt n - let is_child v w = - v.info.deps |> List.exists ~f:(fun c -> c.info.id = w.info.id) + let is_child v w = Node_map.mem v.info.deps_set w.info.id end diff --git a/src/memo/memo.ml b/src/memo/memo.ml index ab6c2e0bbed..29bf5988f72 100644 --- a/src/memo/memo.ml +++ b/src/memo/memo.ml @@ -448,7 +448,11 @@ let add_rev_dep (type i o f) ~called_from_peek (dep_node : (i, o, f) Dep_node.t) let dag_node = dep_node.dag_node in let rev_dep = rev_dep.dag_node in try - (* if the caller doesn't already contain this as a dependent *) + (* if the caller doesn't already contain this as a dependent, we + add it to the graph; note that the complexity guarantees for + `Dag.add` don't hold if the edge is already in the graph, + hence the check , see #2959 for more details and the + README of the vendored library *) if Dag.is_child rev_dep dag_node |> not then Dag.add global_dep_dag rev_dep dag_node with Dag.Cycle cycle -> @@ -459,7 +463,7 @@ let add_rev_dep (type i o f) ~called_from_peek (dep_node : (i, o, f) Dep_node.t) }) ) (* CR-soon amokhov: The order of dependencies in the resulting list seems to be - wrong: [Dag.children] returns children in the reverse order compared to the + wrong: [Dag.children] returns children in the reverse order instead of the order in which they were added. See the comment for [deps : Last_dep.t list]. *) let get_deps_from_graph_exn (dep_node : _ Dep_node.t) = Dag.children dep_node.dag_node From 8a2812ab90d5f348a03ee329d9a90ddd2d1784d9 Mon Sep 17 00:00:00 2001 From: Emilio Jesus Gallego Arias Date: Thu, 6 Feb 2020 20:56:21 +0100 Subject: [PATCH 2/7] [doc] Document invariants and process for vendor/incremental_cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Emilio Jesus Gallego Arias Co-authored-by: Arseniy Alekseyev Co-authored-by: Armaël Guéneau --- src/dag/dag.ml | 4 +++ src/dag/dag.mli | 4 +++ vendor/incremental-cycles/README.md | 49 +++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 vendor/incremental-cycles/README.md diff --git a/src/dag/dag.ml b/src/dag/dag.ml index 3ab6d300f55..9bf71c38241 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -2,6 +2,10 @@ open! Stdune include Dag_intf module Make (Value : Value) : S with type value := Value.t = struct + + (* Raw_graph here should have the same complexity than the assumed + interface on the incremental_cycles proofs, in particular + [get_outgoing] should run in constant time. *) module Raw_graph = struct type mark = int diff --git a/src/dag/dag.mli b/src/dag/dag.mli index 1ee95b9241e..ed1faaeadf4 100644 --- a/src/dag/dag.mli +++ b/src/dag/dag.mli @@ -9,6 +9,10 @@ open! Stdune ACM Trans. Algorithms 12, 2, Article 14 (December 2015), 22 pages. DOI: https://doi.org/10.1145/2756553 *) +(** Note that this file uses [vendor/incremental-cycles] and has to + meet some invariants, for more information see incremental-cycles' + README *) + module type Value = Dag_intf.Value module type S = Dag_intf.S diff --git a/vendor/incremental-cycles/README.md b/vendor/incremental-cycles/README.md new file mode 100644 index 00000000000..5394d4b61fc --- /dev/null +++ b/vendor/incremental-cycles/README.md @@ -0,0 +1,49 @@ +# README for incremental_cycles library + +This library is vendored from +https://gitlab.inria.fr/agueneau/incremental-cycles + +## Details on the vendoring process + +The vendoring process is a bit involved due to the way the library is +specified upstream. In particular, it assumes a graph interface +`Raw_graph` that we have to copy by hand in Dune [see +`src/dag/dag.ml`], and in particular we have to be careful about not +altering the complexity guarantees. + + +## Complexity guarantees + +The complexity and correctness of the implementation of +`incremental_cycles` has been mechanically-verified using the Coq +theorem prover. Note however, that for the main theorem to hold there +are a few requirements that cannot be capture by ML-level interfaces; +more concretely: + +- the current specification for the algorithm requires the + `get_outgoing` function provided by the client to return a list of + all successors, and do so in constant time. This is quite demanding, + as basically requires the client to already have the list at hand. + +- the main theorem for `Dag.add` does require that the vertex is not + already in the graph; otherwise the theorem doesn't apply. Thus, + clients must ensure that no duplicate edge is added to the graph. + +## Dune-specific modifications + +Dune uses incremental_cycles in a way that the no-duplicate-egdes +requirement is not satisfied by construction; thus, before a call to +`Dag.add` edge membership on the graph must be checked. + +This is a common operation and thus should be done efficiently, thus +Dune performs the following modifications to `dag.ml`: + +- we add a set of children nodes in addition to the current list +- we modify `raw_add_edge` so it updates this set, and `is_child` so + it uses the efficient membership set + +The rationale for add a duplicate children field is to actually +preserve the order the edges were added, this could be important in +other parts of the algo, see comment on `is_child` use at `memo.ml`. + +For more details see discussion at https://github.com/ocaml/dune/pull/2959 From d03be37189b40e59f10be291da12bd7d537dd122 Mon Sep 17 00:00:00 2001 From: Arseniy Alekseyev Date: Thu, 6 Feb 2020 22:48:20 +0000 Subject: [PATCH 3/7] typos Signed-off-by: Arseniy Alekseyev --- vendor/incremental-cycles/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/incremental-cycles/README.md b/vendor/incremental-cycles/README.md index 5394d4b61fc..7d0b7b1d373 100644 --- a/vendor/incremental-cycles/README.md +++ b/vendor/incremental-cycles/README.md @@ -17,7 +17,7 @@ altering the complexity guarantees. The complexity and correctness of the implementation of `incremental_cycles` has been mechanically-verified using the Coq theorem prover. Note however, that for the main theorem to hold there -are a few requirements that cannot be capture by ML-level interfaces; +are a few requirements that cannot be captured by ML-level interfaces; more concretely: - the current specification for the algorithm requires the @@ -42,7 +42,7 @@ Dune performs the following modifications to `dag.ml`: - we modify `raw_add_edge` so it updates this set, and `is_child` so it uses the efficient membership set -The rationale for add a duplicate children field is to actually +The rationale for adding a duplicate children field is to actually preserve the order the edges were added, this could be important in other parts of the algo, see comment on `is_child` use at `memo.ml`. From 12f5bb5d57a7b381247445037b5e2155dcbd9fc5 Mon Sep 17 00:00:00 2001 From: Arseniy Alekseyev Date: Thu, 6 Feb 2020 22:56:43 +0000 Subject: [PATCH 4/7] add add_idempotent Signed-off-by: Arseniy Alekseyev --- src/dag/dag.ml | 8 ++++++++ src/dag/dag_intf.ml | 7 ++++++- src/memo/memo.ml | 8 +------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/dag/dag.ml b/src/dag/dag.ml index 9bf71c38241..0dfa7e0808e 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -112,4 +112,12 @@ module Make (Value : Value) : S with type value := Value.t = struct let pp_node pp_value fmt n = pp_depth 0 pp_value fmt n let is_child v w = Node_map.mem v.info.deps_set w.info.id + + let add_idempotent g v w = + (* if the edge doesn't already exist, we + add it to the graph; note that the complexity guarantees for + `Dag.add` don't hold if the edge is already in the graph, + hence the check , see #2959 for more details and the + README of the vendored library *) + if is_child v w then () else add g v w end diff --git a/src/dag/dag_intf.ml b/src/dag/dag_intf.ml index fceea848402..8b7e1287d2b 100644 --- a/src/dag/dag_intf.ml +++ b/src/dag/dag_intf.ml @@ -31,9 +31,14 @@ module type S = sig val create_node_info : t -> node_info (** [add dag v w] creates an arc going from [v] to [w]. @raise Cycle if - creating the arc would create a cycle. *) + creating the arc would create a cycle. + This assumes that the arc does not already exist. *) val add : t -> node -> node -> unit + (** [add_idempotent dag v w] creates an arc going from [v] to [w] unless + it already exists. @raise Cycle if creating the arc would create a cycle. *) + val add_idempotent : t -> node -> node -> unit + (** [children v] returns all nodes [w] for which an arc going from [v] to [w] exists. *) val children : node -> node list diff --git a/src/memo/memo.ml b/src/memo/memo.ml index 29bf5988f72..9dc9c5a74b6 100644 --- a/src/memo/memo.ml +++ b/src/memo/memo.ml @@ -448,13 +448,7 @@ let add_rev_dep (type i o f) ~called_from_peek (dep_node : (i, o, f) Dep_node.t) let dag_node = dep_node.dag_node in let rev_dep = rev_dep.dag_node in try - (* if the caller doesn't already contain this as a dependent, we - add it to the graph; note that the complexity guarantees for - `Dag.add` don't hold if the edge is already in the graph, - hence the check , see #2959 for more details and the - README of the vendored library *) - if Dag.is_child rev_dep dag_node |> not then - Dag.add global_dep_dag rev_dep dag_node + Dag.add_idempotent global_dep_dag rev_dep dag_node with Dag.Cycle cycle -> raise (Cycle_error.E From 7790607f45efe8415027fe1816a58e51e988d8ed Mon Sep 17 00:00:00 2001 From: Arseniy Alekseyev Date: Thu, 6 Feb 2020 23:01:41 +0000 Subject: [PATCH 5/7] ocamlformat Signed-off-by: Arseniy Alekseyev --- src/dag/dag.ml | 32 ++++++++++++++++++++------------ src/dag/dag.mli | 5 ++--- src/dag/dag_intf.ml | 8 ++++---- src/memo/memo.ml | 3 +-- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/dag/dag.ml b/src/dag/dag.ml index 0dfa7e0808e..41f03480eea 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -2,10 +2,9 @@ open! Stdune include Dag_intf module Make (Value : Value) : S with type value := Value.t = struct - - (* Raw_graph here should have the same complexity than the assumed - interface on the incremental_cycles proofs, in particular - [get_outgoing] should run in constant time. *) + (* Raw_graph here should have the same complexity than the assumed interface + on the incremental_cycles proofs, in particular [get_outgoing] should run + in constant time. *) module Raw_graph = struct type mark = int @@ -16,7 +15,7 @@ module Make (Value : Value) : S with type value := Value.t = struct type graph = t - module Node_map = Map.Make(Int) + module Node_map = Map.Make (Int) type node_info = { id : int @@ -84,7 +83,14 @@ module Make (Value : Value) : S with type value := Value.t = struct let create_node_info g = let id = g.fresh_id in g.fresh_id <- g.fresh_id + 1; - { id; mark = -1; level = 1; deps = []; deps_set = Node_map.empty; rev_deps = []; parent = None } + { id + ; mark = -1 + ; level = 1 + ; deps = [] + ; deps_set = Node_map.empty + ; rev_deps = [] + ; parent = None + } let add g v w = match IC.add_edge_or_detect_cycle g v w with @@ -114,10 +120,12 @@ module Make (Value : Value) : S with type value := Value.t = struct let is_child v w = Node_map.mem v.info.deps_set w.info.id let add_idempotent g v w = - (* if the edge doesn't already exist, we - add it to the graph; note that the complexity guarantees for - `Dag.add` don't hold if the edge is already in the graph, - hence the check , see #2959 for more details and the - README of the vendored library *) - if is_child v w then () else add g v w + (* if the edge doesn't already exist, we add it to the graph; note that the + complexity guarantees for `Dag.add` don't hold if the edge is already in + the graph, hence the check , see #2959 for more details and the README of + the vendored library *) + if is_child v w then + () + else + add g v w end diff --git a/src/dag/dag.mli b/src/dag/dag.mli index ed1faaeadf4..f86fa237e8e 100644 --- a/src/dag/dag.mli +++ b/src/dag/dag.mli @@ -9,9 +9,8 @@ open! Stdune ACM Trans. Algorithms 12, 2, Article 14 (December 2015), 22 pages. DOI: https://doi.org/10.1145/2756553 *) -(** Note that this file uses [vendor/incremental-cycles] and has to - meet some invariants, for more information see incremental-cycles' - README *) +(** Note that this file uses [vendor/incremental-cycles] and has to meet some + invariants, for more information see incremental-cycles' README *) module type Value = Dag_intf.Value diff --git a/src/dag/dag_intf.ml b/src/dag/dag_intf.ml index 8b7e1287d2b..6ed7d5fafb0 100644 --- a/src/dag/dag_intf.ml +++ b/src/dag/dag_intf.ml @@ -31,12 +31,12 @@ module type S = sig val create_node_info : t -> node_info (** [add dag v w] creates an arc going from [v] to [w]. @raise Cycle if - creating the arc would create a cycle. - This assumes that the arc does not already exist. *) + creating the arc would create a cycle. This assumes that the arc does not + already exist. *) val add : t -> node -> node -> unit - (** [add_idempotent dag v w] creates an arc going from [v] to [w] unless - it already exists. @raise Cycle if creating the arc would create a cycle. *) + (** [add_idempotent dag v w] creates an arc going from [v] to [w] unless it + already exists. @raise Cycle if creating the arc would create a cycle. *) val add_idempotent : t -> node -> node -> unit (** [children v] returns all nodes [w] for which an arc going from [v] to [w] diff --git a/src/memo/memo.ml b/src/memo/memo.ml index 9dc9c5a74b6..f6756ceed6f 100644 --- a/src/memo/memo.ml +++ b/src/memo/memo.ml @@ -447,8 +447,7 @@ let add_rev_dep (type i o f) ~called_from_peek (dep_node : (i, o, f) Dep_node.t) in let dag_node = dep_node.dag_node in let rev_dep = rev_dep.dag_node in - try - Dag.add_idempotent global_dep_dag rev_dep dag_node + try Dag.add_idempotent global_dep_dag rev_dep dag_node with Dag.Cycle cycle -> raise (Cycle_error.E From 6709de1dc938b5acedbe6f027934fb46a095cb1c Mon Sep 17 00:00:00 2001 From: Arseniy Alekseyev Date: Thu, 6 Feb 2020 23:07:12 +0000 Subject: [PATCH 6/7] remove [add] because it's error-prone Signed-off-by: Arseniy Alekseyev --- src/dag/dag.ml | 7 +++++-- src/dag/dag_intf.ml | 5 ----- test/expect-tests/dag/dag_tests.ml | 14 +++++++------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/dag/dag.ml b/src/dag/dag.ml index 41f03480eea..9d89f363608 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -92,7 +92,10 @@ module Make (Value : Value) : S with type value := Value.t = struct ; parent = None } - let add g v w = + (* [add_assuming_missing dag v w] creates an arc going from [v] to [w]. @raise Cycle if + creating the arc would create a cycle. This assumes that the arc does not + already exist. *) + let add_assuming_missing g v w = match IC.add_edge_or_detect_cycle g v w with | IC.EdgeAdded -> () | IC.EdgeCreatesCycle compute_cycle -> @@ -127,5 +130,5 @@ module Make (Value : Value) : S with type value := Value.t = struct if is_child v w then () else - add g v w + add_assuming_missing g v w end diff --git a/src/dag/dag_intf.ml b/src/dag/dag_intf.ml index 6ed7d5fafb0..797cd53f504 100644 --- a/src/dag/dag_intf.ml +++ b/src/dag/dag_intf.ml @@ -30,11 +30,6 @@ module type S = sig (** [create_node_info dag v] creates new node info that belongs to [dag]. *) val create_node_info : t -> node_info - (** [add dag v w] creates an arc going from [v] to [w]. @raise Cycle if - creating the arc would create a cycle. This assumes that the arc does not - already exist. *) - val add : t -> node -> node -> unit - (** [add_idempotent dag v w] creates an arc going from [v] to [w] unless it already exists. @raise Cycle if creating the arc would create a cycle. *) val add_idempotent : t -> node -> node -> unit diff --git a/test/expect-tests/dag/dag_tests.ml b/test/expect-tests/dag/dag_tests.ml index 2325553e7fd..a8b233f004a 100644 --- a/test/expect-tests/dag/dag_tests.ml +++ b/test/expect-tests/dag/dag_tests.ml @@ -30,10 +30,10 @@ let node21 = Dag.node dag { name = "child 2 1" } let node31 = Dag.node dag { name = "child 3 1" } let () = - Dag.add dag node node11; - Dag.add dag node node12; - Dag.add dag node12 node21; - Dag.add dag node21 node31 + Dag.add_idempotent dag node node11; + Dag.add_idempotent dag node node12; + Dag.add_idempotent dag node12 node21; + Dag.add_idempotent dag node21 node31 let pp_mynode fmt n = Format.fprintf fmt "%s" n.name @@ -42,11 +42,11 @@ let dag_pp_mynode = Dag.pp_node pp_mynode let%expect_test _ = Format.printf "%a@." dag_pp_mynode node; let node41 = Dag.node dag { name = "child 4 1" } in - Dag.add dag node31 node41; + Dag.add_idempotent dag node31 node41; Format.printf "%a@." dag_pp_mynode node; let name node = node.data.name in try - Dag.add dag node41 node; + Dag.add_idempotent dag node41 node; print_endline "no cycle" with Dag.Cycle cycle -> let cycle = List.map cycle ~f:name in @@ -80,7 +80,7 @@ let cycle_test variant = let edges = ref [] in let add d n1 n2 = edges := (n1.data, n2.data) :: !edges; - add d n1 n2 + add_idempotent d n1 n2 in let d = Dag.create () in let _n1 = node d 1 in From a15205e72a968a1ba32eba8ed3f304334dbbc6a6 Mon Sep 17 00:00:00 2001 From: Arseniy Alekseyev Date: Thu, 6 Feb 2020 23:08:15 +0000 Subject: [PATCH 7/7] ocamlformat Signed-off-by: Arseniy Alekseyev --- src/dag/dag.ml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dag/dag.ml b/src/dag/dag.ml index 9d89f363608..aa1107976f7 100644 --- a/src/dag/dag.ml +++ b/src/dag/dag.ml @@ -92,9 +92,9 @@ module Make (Value : Value) : S with type value := Value.t = struct ; parent = None } - (* [add_assuming_missing dag v w] creates an arc going from [v] to [w]. @raise Cycle if - creating the arc would create a cycle. This assumes that the arc does not - already exist. *) + (* [add_assuming_missing dag v w] creates an arc going from [v] to [w]. @raise + Cycle if creating the arc would create a cycle. This assumes that the arc + does not already exist. *) let add_assuming_missing g v w = match IC.add_edge_or_detect_cycle g v w with | IC.EdgeAdded -> ()