From 1c0d6d0e40d805a49afbc0216d4109a699e52d9c Mon Sep 17 00:00:00 2001
From: jeff washington
Date: Tue, 21 Mar 2023 11:39:29 -0500
Subject: [PATCH] better duplicate key stats during index generation

---
 runtime/src/accounts_db.rs           | 59 +++++++++++++++++++---------
 runtime/src/in_mem_accounts_index.rs | 11 +++++-
 2 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/runtime/src/accounts_db.rs b/runtime/src/accounts_db.rs
index 8453d9acb972b6..4b8ab84bf169ab 100644
--- a/runtime/src/accounts_db.rs
+++ b/runtime/src/accounts_db.rs
@@ -620,8 +620,10 @@ struct GenerateIndexTimings {
     pub index_flush_us: u64,
     pub rent_paying: AtomicUsize,
     pub amount_to_top_off_rent: AtomicU64,
-    pub total_duplicates: u64,
+    pub total_including_duplicates: u64,
     pub accounts_data_len_dedup_time_us: u64,
+    pub total_duplicate_slot_keys: u64,
+    pub populate_duplicate_keys_us: u64,
 }
 
 #[derive(Default, Debug, PartialEq, Eq)]
@@ -668,8 +670,8 @@ impl GenerateIndexTimings {
                 i64
             ),
             (
-                "total_items_with_duplicates",
-                self.total_duplicates as i64,
+                "total_items_including_duplicates",
+                self.total_including_duplicates as i64,
                 i64
             ),
             ("total_items", self.total_items as i64, i64),
@@ -678,6 +680,16 @@ impl GenerateIndexTimings {
                 self.accounts_data_len_dedup_time_us as i64,
                 i64
             ),
+            (
+                "total_duplicate_slot_keys",
+                self.total_duplicate_slot_keys as i64,
+                i64
+            ),
+            (
+                "populate_duplicate_keys_us",
+                self.populate_duplicate_keys_us as i64,
+                i64
+            ),
         );
     }
 }
@@ -8982,7 +8994,7 @@ impl AccountsDb {
         let insertion_time_us = AtomicU64::new(0);
        let rent_paying = AtomicUsize::new(0);
         let amount_to_top_off_rent = AtomicU64::new(0);
-        let total_duplicates = AtomicU64::new(0);
+        let total_including_duplicates = AtomicU64::new(0);
         let storage_info_timings = Mutex::new(GenerateIndexTimings::default());
         let scan_time: u64 = slots
             .par_chunks(chunk_size)
@@ -9025,7 +9037,8 @@ impl AccountsDb {
                     rent_paying.fetch_add(rent_paying_this_slot, Ordering::Relaxed);
                     amount_to_top_off_rent
                         .fetch_add(amount_to_top_off_rent_this_slot, Ordering::Relaxed);
-                    total_duplicates.fetch_add(total_this_slot, Ordering::Relaxed);
+                    total_including_duplicates
+                        .fetch_add(total_this_slot, Ordering::Relaxed);
                     accounts_data_len
                         .fetch_add(accounts_data_len_this_slot, Ordering::Relaxed);
                     let mut rent_paying_accounts_by_partition =
@@ -9088,6 +9101,8 @@ impl AccountsDb {
             .sum();
 
         let mut index_flush_us = 0;
+        let mut total_duplicate_slot_keys = 0;
+        let mut populate_duplicate_keys_us = 0;
         if pass == 0 {
             // tell accounts index we are done adding the initial accounts at startup
             let mut m = Measure::start("accounts_index_idle_us");
@@ -9095,21 +9110,25 @@ impl AccountsDb {
             m.stop();
             index_flush_us = m.as_us();
 
-            // this has to happen before visit_duplicate_pubkeys_during_startup below
-            // get duplicate keys from acct idx. We have to wait until we've finished flushing.
-            for (slot, key) in self
-                .accounts_index
-                .retrieve_duplicate_keys_from_startup()
-                .into_iter()
-                .flatten()
-            {
-                match self.uncleaned_pubkeys.entry(slot) {
-                    Occupied(mut occupied) => occupied.get_mut().push(key),
-                    Vacant(vacant) => {
-                        vacant.insert(vec![key]);
+            populate_duplicate_keys_us = measure_us!({
+                // this has to happen before visit_duplicate_pubkeys_during_startup below
+                // get duplicate keys from acct idx. We have to wait until we've finished flushing.
+                for (slot, key) in self
+                    .accounts_index
+                    .retrieve_duplicate_keys_from_startup()
+                    .into_iter()
+                    .flatten()
+                {
+                    total_duplicate_slot_keys += 1;
+                    match self.uncleaned_pubkeys.entry(slot) {
+                        Occupied(mut occupied) => occupied.get_mut().push(key),
+                        Vacant(vacant) => {
+                            vacant.insert(vec![key]);
+                        }
                     }
                 }
-            }
+            })
+            .1;
         }
 
         let storage_info_timings = storage_info_timings.into_inner().unwrap();
@@ -9123,7 +9142,9 @@ impl AccountsDb {
             total_items,
             rent_paying,
             amount_to_top_off_rent,
-            total_duplicates: total_duplicates.load(Ordering::Relaxed),
+            total_duplicate_slot_keys,
+            populate_duplicate_keys_us,
+            total_including_duplicates: total_including_duplicates.load(Ordering::Relaxed),
             storage_size_accounts_map_us: storage_info_timings.storage_size_accounts_map_us,
             storage_size_accounts_map_flatten_us: storage_info_timings
                 .storage_size_accounts_map_flatten_us,
diff --git a/runtime/src/in_mem_accounts_index.rs b/runtime/src/in_mem_accounts_index.rs
index 904ed25d6e338e..1641d278f20168 100644
--- a/runtime/src/in_mem_accounts_index.rs
+++ b/runtime/src/in_mem_accounts_index.rs
@@ -1051,12 +1051,19 @@ impl<T: IndexValue, U: DiskIndexValue + From<T> + Into<T>> InMemAccountsIndex<T, U> {
             disk.update(&k, |current| {
                 match current {
                     Some((current_slot_list, mut ref_count)) => {
-                        // merge this in, mark as conflict
+                        // merge this in, mark as duplicate
+                        duplicates.push((slot, k));
+                        if current_slot_list.len() == 1 {
+                            // accurately account for there being a duplicate for the first entry that was previously added to the disk index.
+                            // That entry could not have known yet that it was a duplicate.
+                            // It is important to capture each slot with a duplicate because of slot limits applied to clean.
+                            let first_entry_slot = current_slot_list[0].0;
+                            duplicates.push((first_entry_slot, k));
+                        }
                         let mut slot_list = Vec::with_capacity(current_slot_list.len() + 1);
                         slot_list.extend_from_slice(current_slot_list);
                         slot_list.push((entry.0, entry.1.into())); // will never be from the same slot that already exists in the list
                         ref_count += new_ref_count;
-                        duplicates.push((slot, k));
                         Some((slot_list, ref_count))
                     }
                     None => {
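
Note on the generate_index timing change: measure_us! wraps the duplicate-key
population loop so its wall time lands in the new populate_duplicate_keys_us
stat, and total_duplicate_slot_keys counts every (slot, pubkey) pair returned
by retrieve_duplicate_keys_from_startup(). A minimal standalone sketch of the
pattern follows; the local measure_us! macro is a stand-in for the real
solana_measure::measure_us!, and the duplicate_keys data is hypothetical.

    use std::time::Instant;

    // Local stand-in for solana_measure::measure_us!: evaluate an expression,
    // return (result, elapsed_microseconds). The patch keeps only the time via `.1`.
    macro_rules! measure_us {
        ($val:expr) => {{
            let start = Instant::now();
            let result = $val;
            (result, start.elapsed().as_micros() as u64)
        }};
    }

    fn main() {
        let mut total_duplicate_slot_keys = 0u64;
        // hypothetical (slot, pubkey) pairs standing in for
        // accounts_index.retrieve_duplicate_keys_from_startup()
        let duplicate_keys = vec![(1u64, "key_a"), (1, "key_b"), (7, "key_a")];

        // same shape as the patch: run the block, keep only the elapsed time
        let populate_duplicate_keys_us = measure_us!({
            for (_slot, _key) in &duplicate_keys {
                total_duplicate_slot_keys += 1;
            }
        })
        .1;

        println!("{total_duplicate_slot_keys} duplicate slot keys, populated in {populate_duplicate_keys_us}us");
    }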
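Note on the in_mem_accounts_index.rs change: when a key's slot list grows from
one entry to two, the first entry is retroactively recorded as a duplicate,
since it could not have known it was a duplicate at insert time; every slot
holding the key must be captured because clean applies per-slot limits. Below
is a minimal sketch of that accounting under assumed simplified types (a plain
HashMap with &str keys and u64 values instead of the real Pubkey/index types;
the merge function is hypothetical, not the actual InMemAccountsIndex code).

    use std::collections::HashMap;

    type Slot = u64;
    type Key = &'static str; // stand-in for a Pubkey

    /// Hypothetical merge: mirrors the patch's duplicate accounting when an
    /// entry for `key` already exists in the (simplified) index.
    fn merge(
        index: &mut HashMap<Key, Vec<(Slot, u64)>>,
        duplicates: &mut Vec<(Slot, Key)>,
        slot: Slot,
        key: Key,
        value: u64,
    ) {
        let slot_list = index.entry(key).or_default();
        if !slot_list.is_empty() {
            // merge this in, mark as duplicate
            duplicates.push((slot, key));
            if slot_list.len() == 1 {
                // the first entry could not have known it was a duplicate when
                // it was inserted; record its slot now so every slot is captured
                duplicates.push((slot_list[0].0, key));
            }
        }
        slot_list.push((slot, value));
    }

    fn main() {
        let mut index = HashMap::new();
        let mut duplicates = Vec::new();
        merge(&mut index, &mut duplicates, 3, "pubkey_x", 10);
        merge(&mut index, &mut duplicates, 5, "pubkey_x", 11);
        merge(&mut index, &mut duplicates, 9, "pubkey_x", 12);
        // prints [(5, "pubkey_x"), (3, "pubkey_x"), (9, "pubkey_x")]:
        // each slot holding a duplicate of pubkey_x appears exactly once
        println!("{duplicates:?}");
    }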