From f8ee2edb1290016a5ff806ba5146aacefbf64e08 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 13 Nov 2023 15:32:39 -0500 Subject: [PATCH] Minor: Update JoinHashMap example to make it clearer --- .../physical-plan/src/joins/hash_join.rs | 6 +- .../src/joins/hash_join_utils.rs | 55 ++++++++++--------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 546a929bf939..22a1ae6f1d93 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -609,6 +609,8 @@ impl ExecutionPlan for HashJoinExec { } } +/// Reads the left (build) side of the input, buffering it in memory, to build a +/// hash table (`LeftJoinData`) async fn collect_left_input( partition: Option, random_state: RandomState, @@ -801,7 +803,7 @@ impl RecordBatchStream for HashJoinStream { /// # Example /// /// For `LEFT.b1 = RIGHT.b2`: -/// LEFT Table: +/// LEFT (build) Table: /// ```text /// a1 b1 c1 /// 1 1 10 @@ -813,7 +815,7 @@ impl RecordBatchStream for HashJoinStream { /// 13 10 130 /// ``` /// -/// RIGHT Table: +/// RIGHT (probe) Table: /// ```text /// a2 b2 c2 /// 2 2 20 diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs index 5ebf370b6d71..b162506e4ed9 100644 --- a/datafusion/physical-plan/src/joins/hash_join_utils.rs +++ b/datafusion/physical-plan/src/joins/hash_join_utils.rs @@ -61,44 +61,45 @@ use hashbrown::HashSet; /// /// ``` text /// See the example below: -/// Insert (1,1) +/// +/// Insert (10,1) <-- insert hash value 10 with row index 1 /// map: -/// --------- -/// | 1 | 2 | -/// --------- +/// ---------- +/// | 10 | 2 | +/// ---------- /// next: /// --------------------- /// | 0 | 0 | 0 | 0 | 0 | /// --------------------- -/// Insert (2,2) +/// Insert (20,2) /// map: -/// --------- -/// | 1 | 2 | -/// | 2 | 3 | -/// --------- +/// ---------- +/// | 10 | 2 | +/// | 20 | 3 | +/// ---------- /// next: /// --------------------- /// | 0 | 0 | 0 | 0 | 0 | /// --------------------- -/// Insert (1,3) +/// Insert (10,3) <-- collision! row index 3 has a hash value of 10 as well /// map: -/// --------- -/// | 1 | 4 | -/// | 2 | 3 | -/// --------- +/// ---------- +/// | 10 | 4 | +/// | 20 | 3 | +/// ---------- /// next: /// --------------------- -/// | 0 | 0 | 0 | 2 | 0 | <--- hash value 1 maps to 4,2 (which means indices values 3,1) +/// | 0 | 0 | 0 | 2 | 0 | <--- hash value 10 maps to 4,2 (which means indices values 3,1) /// --------------------- -/// Insert (1,4) +/// Insert (10,4) <-- another collision! row index 4 ALSO has a hash value of 10 /// map: /// --------- -/// | 1 | 5 | -/// | 2 | 3 | +/// | 10 | 5 | +/// | 20 | 3 | /// --------- /// next: /// --------------------- -/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1) +/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1) /// --------------------- /// ``` pub struct JoinHashMap { @@ -119,7 +120,7 @@ impl JoinHashMap { /// Trait defining methods that must be implemented by a hash map type to be used for joins. pub trait JoinHashMapType { - /// The type of list used to store the hash values. + /// The type of list used to store the next list type NextType: IndexMut; /// Extend with zero fn extend_zero(&mut self, len: usize); @@ -196,15 +197,15 @@ impl fmt::Debug for JoinHashMap { /// Let's continue the example of `JoinHashMap` and then show how `PruningJoinHashMap` would /// handle the pruning scenario. /// -/// Insert the pair (1,4) into the `PruningJoinHashMap`: +/// Insert the pair (10,4) into the `PruningJoinHashMap`: /// map: -/// --------- -/// | 1 | 5 | -/// | 2 | 3 | -/// --------- +/// ---------- +/// | 10 | 5 | +/// | 20 | 3 | +/// ---------- /// list: /// --------------------- -/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1) +/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1) /// --------------------- /// /// Now, let's prune 3 rows from `PruningJoinHashMap`: @@ -214,7 +215,7 @@ impl fmt::Debug for JoinHashMap { /// --------- /// list: /// --------- -/// | 2 | 4 | <--- hash value 1 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0) +/// | 2 | 4 | <--- hash value 10 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0) /// --------- /// /// After pruning, the | 2 | 3 | entry is deleted from `PruningJoinHashMap` since