From 0e1bdb45da71feb9470b487d627d35252267761d Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Tue, 27 Apr 2021 17:06:53 +0200 Subject: [PATCH] Add extra documentation to hash join hashmap structure --- datafusion/src/physical_plan/hash_join.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 5e26329af008..2edd0c7ee5e3 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -63,8 +63,17 @@ use crate::physical_plan::coalesce_batches::concat_batches; use log::debug; // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. +// +// Note that the `u64` keys are not stored in the hashmap (hence the `()` as key), but are only used +// to put the indices in a certain bucket. +// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the left side, +// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value. // E.g. 1 -> [3, 6, 8] indicates that the column values map to rows 3, 6 and 8 for hash value 1 // As the key is a hash value, we need to check possible hash collisions in the probe stage +// During this stage it might be the case that a row is contained in the same hashmap value, +// but the values don't match. Those are checked in the [equal_rows] macro +// TODO: speed up collision check and move away from using a hashbrown HashMap +// https://github.com/apache/arrow-datafusion/issues/50 type JoinHashMap = HashMap<(), SmallVec<[u64; 1]>, IdHashBuilder>; type JoinLeftData = Arc<(JoinHashMap, RecordBatch)>;