Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shares accounts hash cache data between full and incremental #33164

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 22 additions & 16 deletions accounts-db/src/accounts_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1487,8 +1487,7 @@ pub struct AccountsDb {
#[allow(dead_code)]
base_working_temp_dir: Option<TempDir>,

full_accounts_hash_cache_path: PathBuf,
incremental_accounts_hash_cache_path: PathBuf,
accounts_hash_cache_path: PathBuf,
transient_accounts_hash_cache_path: PathBuf,

pub shrink_paths: RwLock<Option<Vec<PathBuf>>>,
Expand Down Expand Up @@ -2452,8 +2451,15 @@ impl AccountsDb {
(base_working_path, Some(base_working_temp_dir))
};

let accounts_hash_cache_path = accounts_hash_cache_path
.unwrap_or_else(|| base_working_path.join(Self::DEFAULT_ACCOUNTS_HASH_CACHE_DIR));
let accounts_hash_cache_path = accounts_hash_cache_path.unwrap_or_else(|| {
let accounts_hash_cache_path =
base_working_path.join(Self::DEFAULT_ACCOUNTS_HASH_CACHE_DIR);
if !accounts_hash_cache_path.exists() {
fs_err::create_dir(&accounts_hash_cache_path)
.expect("create accounts hash cache dir");
}
accounts_hash_cache_path
});

let mut bank_hash_stats = HashMap::new();
bank_hash_stats.insert(0, BankHashStats::default());
Expand Down Expand Up @@ -2487,9 +2493,8 @@ impl AccountsDb {
paths: vec![],
base_working_path,
base_working_temp_dir,
full_accounts_hash_cache_path: accounts_hash_cache_path.join("full"),
incremental_accounts_hash_cache_path: accounts_hash_cache_path.join("incremental"),
transient_accounts_hash_cache_path: accounts_hash_cache_path.join("transient"),
accounts_hash_cache_path,
shrink_paths: RwLock::new(None),
temp_paths: None,
file_size: DEFAULT_FILE_SIZE,
Expand Down Expand Up @@ -7626,18 +7631,20 @@ impl AccountsDb {
fn get_cache_hash_data(
accounts_hash_cache_path: PathBuf,
config: &CalcAccountsHashConfig<'_>,
kind: CalcAccountsHashKind,
slot: Slot,
) -> CacheHashData {
if !config.store_detailed_debug_info_on_failure {
CacheHashData::new(accounts_hash_cache_path)
let accounts_hash_cache_path = if !config.store_detailed_debug_info_on_failure {
accounts_hash_cache_path
} else {
// this path executes when we are failing with a hash mismatch
let failed_dir = accounts_hash_cache_path
.join("failed_calculate_accounts_hash_cache")
.join(slot.to_string());
let _ = std::fs::remove_dir_all(&failed_dir);
CacheHashData::new(failed_dir)
}
_ = std::fs::remove_dir_all(&failed_dir);
failed_dir
};
CacheHashData::new(accounts_hash_cache_path, kind == CalcAccountsHashKind::Full)
}

// modeled after calculate_accounts_delta_hash
Expand All @@ -7653,7 +7660,6 @@ impl AccountsDb {
storages,
stats,
CalcAccountsHashKind::Full,
self.full_accounts_hash_cache_path.clone(),
)?;
let AccountsHashKind::Full(accounts_hash) = accounts_hash else {
panic!("calculate_accounts_hash_from_storages must return a FullAccountsHash");
Expand Down Expand Up @@ -7681,7 +7687,6 @@ impl AccountsDb {
storages,
stats,
CalcAccountsHashKind::Incremental,
self.incremental_accounts_hash_cache_path.clone(),
)?;
let AccountsHashKind::Incremental(incremental_accounts_hash) = accounts_hash else {
panic!("calculate_incremental_accounts_hash must return an IncrementalAccountsHash");
Expand All @@ -7695,7 +7700,6 @@ impl AccountsDb {
storages: &SortedStorages<'_>,
mut stats: HashStats,
kind: CalcAccountsHashKind,
accounts_hash_cache_path: PathBuf,
) -> Result<(AccountsHashKind, u64), AccountsHashVerificationError> {
let total_time = Measure::start("");
let _guard = self.active_stats.activate(ActiveStatItem::Hash);
Expand All @@ -7705,10 +7709,12 @@ impl AccountsDb {

let slot = storages.max_slot_inclusive();
let use_bg_thread_pool = config.use_bg_thread_pool;
let accounts_hash_cache_path = self.accounts_hash_cache_path.clone();
let scan_and_hash = || {
let (cache_hash_data, cache_hash_data_us) = measure_us!(Self::get_cache_hash_data(
accounts_hash_cache_path,
config,
kind,
slot
));
stats.cache_hash_data_us += cache_hash_data_us;
Expand Down Expand Up @@ -9971,7 +9977,7 @@ pub mod tests {
let temp_dir = TempDir::new().unwrap();
let accounts_hash_cache_path = temp_dir.path().to_path_buf();
self.scan_snapshot_stores_with_cache(
&CacheHashData::new(accounts_hash_cache_path),
&CacheHashData::new(accounts_hash_cache_path, true),
storage,
stats,
bins,
Expand Down Expand Up @@ -11011,7 +11017,7 @@ pub mod tests {
};

let result = accounts_db.scan_account_storage_no_bank(
&CacheHashData::new(accounts_hash_cache_path),
&CacheHashData::new(accounts_hash_cache_path, true),
&CalcAccountsHashConfig::default(),
&get_storage_refs(&[storage]),
test_scan,
Expand Down
17 changes: 10 additions & 7 deletions accounts-db/src/cache_hash_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,29 +196,32 @@ impl CacheHashDataFile {
}
}

pub type PreExistingCacheFiles = HashSet<PathBuf>;
pub struct CacheHashData {
cache_dir: PathBuf,
pre_existing_cache_files: Arc<Mutex<PreExistingCacheFiles>>,
pre_existing_cache_files: Arc<Mutex<HashSet<PathBuf>>>,
should_delete_old_cache_files_on_drop: bool,
pub stats: Arc<CacheHashDataStats>,
}

impl Drop for CacheHashData {
fn drop(&mut self) {
self.delete_old_cache_files();
if self.should_delete_old_cache_files_on_drop {
self.delete_old_cache_files();
}
Comment on lines +208 to +210
Copy link
Contributor Author

@brooksprumo brooksprumo Sep 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This impl will be correct, but it currently also won't clean up the non-full ranges that exist in the newest slots.

For example, here's what the accounts hash cache dir could look like:

1. -rw-r--r-- 1 sol users 45080 Sep  8 20:14 0.2500.0.65536.7195195454841191751
2. -rw-r--r-- 1 sol users   872 Sep  8 20:31 2500.4001.0.65536.16192472734143462858
3. -rw-r--r-- 1 sol users   584 Sep  8 20:32 4064.4101.0.65536.7196231288448241015
4. -rw-r--r-- 1 sol users  1016 Sep  8 20:34 4064.4201.0.65536.13132790460150755029
5. -rw-r--r-- 1 sol users  1016 Sep  8 20:36 4064.4301.0.65536.7276174198129441015
6. -rw-r--r-- 1 sol users  1016 Sep  8 20:38 4064.4401.0.65536.14404559913584167901
7. -rw-r--r-- 1 sol users  1016 Sep  8 20:39 4064.4501.0.65536.4542188866695766599
8. -rw-r--r-- 1 sol users  1016 Sep  8 20:41 4064.4601.0.65536.11919391694395303358
9. -rw-r--r-- 1 sol users  1016 Sep  8 20:43 4064.4701.0.65536.12791177168748020520

Lines 1 and 2 are full ranges, so these ranges will not change.

Lines 3-9 all have the same starting slot, but their ending slot continues to grow, since the range is not full yet. These are created as a result of incremental snapshots. Ideally lines 3-8 should have been deleted. Without this PR, they are indeed deleted.

So we do want to delete the files with non-full ranges. (At least the ones for the newer slots)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the current impl is not perfect, it is at least better than the status quo. Luckily, the incremental accounts hash feature gate is not enabled, so no real clusters have nodes with double the cache files.

Without this PR, IAH will duplicate all the files it needs, and requires the full accounts hash calculation to redo all the work that IAH did (to recreate the cache files, but for 'full' instead).

On pop-net, where the snapshot intervals are quite long, this ends up being a lot of duplicated work.

With this PR, we will have old files that won't get cleaned up until the next full accounts hash calculation runs. But we will save on redoing the work of creating the cache files.

I think this is a net win, as perf should be better, and the overall number of files is lower. Future PRs can clean up these unused files sooner, if needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for writing up the explanation.

Yeah, keeping them until the next full snapshot works fine for me.

self.stats.report();
}
}

impl CacheHashData {
pub fn new(cache_dir: PathBuf) -> CacheHashData {
pub fn new(cache_dir: PathBuf, should_delete_old_cache_files_on_drop: bool) -> CacheHashData {
std::fs::create_dir_all(&cache_dir).unwrap_or_else(|err| {
panic!("error creating cache dir {}: {err}", cache_dir.display())
});

let result = CacheHashData {
cache_dir,
pre_existing_cache_files: Arc::new(Mutex::new(PreExistingCacheFiles::default())),
pre_existing_cache_files: Arc::new(Mutex::new(HashSet::default())),
should_delete_old_cache_files_on_drop,
stats: Arc::default(),
};

Expand Down Expand Up @@ -281,7 +284,7 @@ impl CacheHashData {
})
}

pub(crate) fn pre_existing_cache_file_will_be_used(&self, file_name: impl AsRef<Path>) {
fn pre_existing_cache_file_will_be_used(&self, file_name: impl AsRef<Path>) {
self.pre_existing_cache_files
.lock()
.unwrap()
Expand Down Expand Up @@ -424,7 +427,7 @@ mod tests {
data_this_pass.push(this_bin_data);
}
}
let cache = CacheHashData::new(cache_dir.clone());
let cache = CacheHashData::new(cache_dir.clone(), true);
let file_name = PathBuf::from("test");
cache.save(&file_name, &data_this_pass).unwrap();
cache.get_cache_files();
Expand Down