From 18acdd98cb5b4bb729f95d2f96073f6ed749daea Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang <93241502+yhchiang-sol@users.noreply.github.com> Date: Wed, 8 Nov 2023 21:58:44 -0800 Subject: [PATCH] [TieredStorage] Make HotStorageReader use AccountOffset type (#33964) [TieredStorage] Improve param naming of IndexBlockFormat (#34033) [TieredStorage] HotStorageReader::get_account_offset (#34031) [TieredStorage] Rename owners_offset to owners_block_offset (#34047) [TieredStorage] HotStorageReader::get_account_address (#34032) [TieredStorage] OwnersBlock (#34052) [TieredStorage] HotStorageReader::get_owner_address (#34053) [TieredStorage] Define OwnerOffset as u32 (#34105) [TieredStorage] Use OwnerOffset type in TieredAccountMeta (#34106) Refactors TieredStorageFile read/write methods (#34147) [TieredStorage] Make IndexBlock persist u32 offsets (#34133) [TieredStorage] Make IndexOffset use u32 (#34152) Move MatchAccountOwnerError from append_vec to accounts_file (#34187) [TieredStorage] Make AccountOffset use u32 (#34151) [TieredStorage] Allow HotStorage to handle more account data (#34155) [TieredStorage] Make AccountOffset a trait, introduce HotAccountOffset (#34335) [TieredStorage] Improve comments for HOT_ACCOUNT_ALIGNMENT (#34404) [TieredStorage] Unit-tests for checking invalid HotAccountOffset (#34376) [TieredStorage] Boundary check for accessing hot account meta (#34349) [TieredStorage] boundary check for get_account_address() (#34529) Sanitizes tiered storage footer after reading from disk (#34200) Adds read/write/get_pod() fns to tiered storage (#34415) Uses consistent error types in tiered storage (#34110) [TieredStorage] Boundary check for get_account_offset() (#34531) [TieredStorage] HotStorageReader::account_matches_owners (#34350) [TieredStorage] Fix typos in index.rs (#34546) [TieredStorage] HotAccountsReader::get_account (#34499) [TieredStorage] Rename AddressAndBlockOffsetOnly to AddressesThenOffsets (#34658) [TieredStorage] HotStorageWriter::new() 
(#34659) [TieredStorage] Include executable field into AccountMetaFlags (#34724) [TieredStorage] Code refactoring for OwnersBlock (#34854) [TieredStorage] In-memory struct for writing OwnersBlock (#34853) [TieredStorage] writing hot account blocks and index blocks (#34828) [TieredStorage] Use RENT_EXEMPT_RENT_EPOCH in HotStorageWriter (#34950) [TieredStorage] Write owners block for HotAccountStorage (#34927) [TieredStorage] Avoid AccountHash copy in AccountMetaOptionalFields (#34969) [TieredStorage] Correct the HotStorage API for account_matches_owners (#34967) [TS] Add get_account() and account_matches_owner() to TieredStorageReader (#34968) [TieredStorage] Have HotStorageWriter::write_account() return Vec (#34929) [TieredStorage] Use IndexOffset in TieredStorageMeta and get_account() (#35046) [TieredStorage] TieredStorageReader:: and HotStorageReader:: accounts() (#35031) [TieredStorage] Enable hot-storage in TieredStorage::write_accounts() (#35049) [TieredStorage] Put commonly used test functions into test_utils.rs (#35065) [TieredStorage] Make TieredStorage::write_accounts() thread-safe (#35143) [TieredStorage] rent_epoch() returns 0 for zero-lamport accounts (#35344) [TieredStorage] Deprecate the use of account-hash in HotStorage (#93) --- Cargo.lock | 1 + accounts-db/Cargo.toml | 1 + accounts-db/src/account_storage/meta.rs | 5 +- accounts-db/src/accounts_db.rs | 5 +- accounts-db/src/accounts_file.rs | 10 +- accounts-db/src/append_vec.rs | 10 +- accounts-db/src/tiered_storage.rs | 172 ++- accounts-db/src/tiered_storage/byte_block.rs | 102 +- accounts-db/src/tiered_storage/error.rs | 19 +- accounts-db/src/tiered_storage/file.rs | 85 +- accounts-db/src/tiered_storage/footer.rs | 272 ++++- accounts-db/src/tiered_storage/hot.rs | 1035 ++++++++++++++++-- accounts-db/src/tiered_storage/index.rs | 289 ++++- accounts-db/src/tiered_storage/meta.rs | 132 +-- accounts-db/src/tiered_storage/mmap_utils.rs | 35 +- accounts-db/src/tiered_storage/owners.rs | 198 ++++ 
accounts-db/src/tiered_storage/readable.rs | 86 +- accounts-db/src/tiered_storage/test_utils.rs | 63 ++ programs/sbf/Cargo.lock | 1 + 19 files changed, 2051 insertions(+), 470 deletions(-) create mode 100644 accounts-db/src/tiered_storage/owners.rs create mode 100644 accounts-db/src/tiered_storage/test_utils.rs diff --git a/Cargo.lock b/Cargo.lock index d027457241e2c6..d092352a56825e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5291,6 +5291,7 @@ dependencies = [ "fs-err", "im", "index_list", + "indexmap 2.1.0", "itertools", "lazy_static", "libsecp256k1", diff --git a/accounts-db/Cargo.toml b/accounts-db/Cargo.toml index 6ce4d2f087e72d..c2bfeb75697e99 100644 --- a/accounts-db/Cargo.toml +++ b/accounts-db/Cargo.toml @@ -24,6 +24,7 @@ fnv = { workspace = true } fs-err = { workspace = true } im = { workspace = true, features = ["rayon", "serde"] } index_list = { workspace = true } +indexmap = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } log = { workspace = true } diff --git a/accounts-db/src/account_storage/meta.rs b/accounts-db/src/account_storage/meta.rs index 1442b4845bf604..b6c8d72042097a 100644 --- a/accounts-db/src/account_storage/meta.rs +++ b/accounts-db/src/account_storage/meta.rs @@ -128,7 +128,8 @@ impl<'storage> StoredAccountMeta<'storage> { pub fn hash(&self) -> &'storage AccountHash { match self { Self::AppendVec(av) => av.hash(), - Self::Hot(hot) => hot.hash().unwrap_or(&DEFAULT_ACCOUNT_HASH), + // tiered-storage has deprecated the use of AccountHash + Self::Hot(_) => &DEFAULT_ACCOUNT_HASH, } } @@ -142,7 +143,7 @@ impl<'storage> StoredAccountMeta<'storage> { pub fn offset(&self) -> usize { match self { Self::AppendVec(av) => av.offset(), - Self::Hot(hot) => hot.index(), + Self::Hot(hot) => hot.index().0 as usize, } } diff --git a/accounts-db/src/accounts_db.rs b/accounts-db/src/accounts_db.rs index 65c6a9a52cb23e..7a6ac011cc901c 100644 --- a/accounts-db/src/accounts_db.rs +++ b/accounts-db/src/accounts_db.rs @@ 
-31,7 +31,7 @@ use { AccountStorage, AccountStorageStatus, ShrinkInProgress, }, accounts_cache::{AccountsCache, CachedAccount, SlotCache}, - accounts_file::{AccountsFile, AccountsFileError}, + accounts_file::{AccountsFile, AccountsFileError, MatchAccountOwnerError}, accounts_hash::{ AccountHash, AccountsDeltaHash, AccountsHash, AccountsHashKind, AccountsHasher, CalcAccountsHashConfig, CalculateHashIntermediate, HashStats, IncrementalAccountsHash, @@ -54,8 +54,7 @@ use { get_ancient_append_vec_capacity, is_ancient, AccountsToStore, StorageSelector, }, append_vec::{ - aligned_stored_size, AppendVec, MatchAccountOwnerError, APPEND_VEC_MMAPPED_FILES_OPEN, - STORE_META_OVERHEAD, + aligned_stored_size, AppendVec, APPEND_VEC_MMAPPED_FILES_OPEN, STORE_META_OVERHEAD, }, cache_hash_data::{CacheHashData, CacheHashDataFileReference}, contains::Contains, diff --git a/accounts-db/src/accounts_file.rs b/accounts-db/src/accounts_file.rs index 77f1717a9ca259..97c761616e7ce3 100644 --- a/accounts-db/src/accounts_file.rs +++ b/accounts-db/src/accounts_file.rs @@ -4,7 +4,7 @@ use { StorableAccountsWithHashesAndWriteVersions, StoredAccountInfo, StoredAccountMeta, }, accounts_hash::AccountHash, - append_vec::{AppendVec, AppendVecError, MatchAccountOwnerError}, + append_vec::{AppendVec, AppendVecError}, storable_accounts::StorableAccounts, tiered_storage::error::TieredStorageError, }, @@ -40,6 +40,14 @@ pub enum AccountsFileError { TieredStorageError(#[from] TieredStorageError), } +#[derive(Error, Debug, PartialEq, Eq)] +pub enum MatchAccountOwnerError { + #[error("The account owner does not match with the provided list")] + NoMatch, + #[error("Unable to load the account")] + UnableToLoad, +} + pub type Result = std::result::Result; #[derive(Debug)] diff --git a/accounts-db/src/append_vec.rs b/accounts-db/src/append_vec.rs index bd789aa3092ad9..2dc32343696287 100644 --- a/accounts-db/src/append_vec.rs +++ b/accounts-db/src/append_vec.rs @@ -10,7 +10,7 @@ use { AccountMeta, 
StorableAccountsWithHashesAndWriteVersions, StoredAccountInfo, StoredAccountMeta, StoredMeta, StoredMetaWriteVersion, }, - accounts_file::{AccountsFileError, Result, ALIGN_BOUNDARY_OFFSET}, + accounts_file::{AccountsFileError, MatchAccountOwnerError, Result, ALIGN_BOUNDARY_OFFSET}, accounts_hash::AccountHash, storable_accounts::StorableAccounts, u64_align, @@ -96,14 +96,6 @@ impl<'append_vec> Iterator for AppendVecAccountsIter<'append_vec> { } } -#[derive(Error, Debug, PartialEq, Eq)] -pub enum MatchAccountOwnerError { - #[error("The account owner does not match with the provided list")] - NoMatch, - #[error("Unable to load the account")] - UnableToLoad, -} - /// References to account data stored elsewhere. Getting an `Account` requires cloning /// (see `StoredAccountMeta::clone_account()`). #[derive(PartialEq, Eq, Debug)] diff --git a/accounts-db/src/tiered_storage.rs b/accounts-db/src/tiered_storage.rs index 829b0cb033b4f5..2f8ebac65e3b57 100644 --- a/accounts-db/src/tiered_storage.rs +++ b/accounts-db/src/tiered_storage.rs @@ -8,7 +8,9 @@ pub mod hot; pub mod index; pub mod meta; pub mod mmap_utils; +pub mod owners; pub mod readable; +mod test_utils; pub mod writer; use { @@ -18,24 +20,28 @@ use { storable_accounts::StorableAccounts, }, error::TieredStorageError, - footer::{AccountBlockFormat, AccountMetaFormat, OwnersBlockFormat}, + footer::{AccountBlockFormat, AccountMetaFormat}, + hot::{HotStorageWriter, HOT_FORMAT}, index::IndexBlockFormat, + owners::OwnersBlockFormat, readable::TieredStorageReader, solana_sdk::account::ReadableAccount, std::{ borrow::Borrow, - fs::OpenOptions, + fs::{self, OpenOptions}, path::{Path, PathBuf}, - sync::OnceLock, + sync::{ + atomic::{AtomicBool, Ordering}, + OnceLock, + }, }, - writer::TieredStorageWriter, }; pub type TieredStorageResult = Result; /// The struct that defines the formats of all building blocks of a /// TieredStorage. 
-#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct TieredStorageFormat { pub meta_entry_size: usize, pub account_meta_format: AccountMetaFormat, @@ -44,16 +50,24 @@ pub struct TieredStorageFormat { pub account_block_format: AccountBlockFormat, } +/// The implementation of AccountsFile for tiered-storage. #[derive(Debug)] pub struct TieredStorage { + /// The internal reader instance for its accounts file. reader: OnceLock, + /// A status flag indicating whether its file has been already written. + already_written: AtomicBool, + /// The path to the file that stores accounts. path: PathBuf, } impl Drop for TieredStorage { fn drop(&mut self) { - if let Err(err) = fs_err::remove_file(&self.path) { - panic!("TieredStorage failed to remove backing storage file: {err}"); + if let Err(err) = fs::remove_file(&self.path) { + panic!( + "TieredStorage failed to remove backing storage file '{}': {err}", + self.path.display(), + ); } } } @@ -67,6 +81,7 @@ impl TieredStorage { pub fn new_writable(path: impl Into) -> Self { Self { reader: OnceLock::::new(), + already_written: false.into(), path: path.into(), } } @@ -77,6 +92,7 @@ impl TieredStorage { let path = path.into(); Ok(Self { reader: TieredStorageReader::new_from_path(&path).map(OnceLock::from)?, + already_written: true.into(), path, }) } @@ -89,9 +105,7 @@ impl TieredStorage { /// Writes the specified accounts into this TieredStorage. /// /// Note that this function can only be called once per a TieredStorage - /// instance. TieredStorageError::AttemptToUpdateReadOnly will be returned - /// if this function is invoked more than once on the same TieredStorage - /// instance. + /// instance. Otherwise, it will trigger panic. 
pub fn write_accounts< 'a, 'b, @@ -104,25 +118,30 @@ impl TieredStorage { skip: usize, format: &TieredStorageFormat, ) -> TieredStorageResult> { - if self.is_read_only() { - return Err(TieredStorageError::AttemptToUpdateReadOnly( - self.path.to_path_buf(), - )); - } + let was_written = self.already_written.swap(true, Ordering::AcqRel); - let result = { - let writer = TieredStorageWriter::new(&self.path, format)?; - writer.write_accounts(accounts, skip) - }; - - // panic here if self.reader.get() is not None as self.reader can only be - // None since we have passed `is_read_only()` check previously, indicating - // self.reader is not yet set. - self.reader - .set(TieredStorageReader::new_from_path(&self.path)?) - .unwrap(); + if was_written { + panic!("cannot write same tiered storage file more than once"); + } - result + if format == &HOT_FORMAT { + let result = { + let writer = HotStorageWriter::new(&self.path)?; + writer.write_accounts(accounts, skip) + }; + + // panic here if self.reader.get() is not None as self.reader can only be + // None since a false-value `was_written` indicates the accounts file has + // not been written previously, implying is_read_only() was also false. + debug_assert!(!self.is_read_only()); + self.reader + .set(TieredStorageReader::new_from_path(&self.path)?) + .unwrap(); + + result + } else { + Err(TieredStorageError::UnknownFormat(self.path.to_path_buf())) + } } /// Returns the underlying reader of the TieredStorage. 
None will be @@ -151,19 +170,20 @@ impl TieredStorage { mod tests { use { super::*, - crate::account_storage::meta::{StoredMeta, StoredMetaWriteVersion}, + crate::account_storage::meta::StoredMetaWriteVersion, footer::{TieredStorageFooter, TieredStorageMagicNumber}, hot::HOT_FORMAT, - solana_accounts_db::rent_collector::RENT_EXEMPT_RENT_EPOCH, + index::IndexOffset, solana_sdk::{ - account::{Account, AccountSharedData}, - clock::Slot, - hash::Hash, - pubkey::Pubkey, + account::AccountSharedData, clock::Slot, hash::Hash, pubkey::Pubkey, system_instruction::MAX_PERMITTED_DATA_LENGTH, }, - std::mem::ManuallyDrop, + std::{ + collections::{HashMap, HashSet}, + mem::ManuallyDrop, + }, tempfile::tempdir, + test_utils::{create_test_account, verify_test_account}, }; impl TieredStorage { @@ -196,6 +216,7 @@ mod tests { Err(TieredStorageError::AttemptToUpdateReadOnly(_)), ) => {} (Err(TieredStorageError::Unsupported()), Err(TieredStorageError::Unsupported())) => {} + (Ok(_), Ok(_)) => {} // we don't expect error type mis-match or other error types here _ => { panic!("actual: {result:?}, expected: {expected_result:?}"); @@ -224,10 +245,7 @@ mod tests { assert_eq!(tiered_storage.path(), tiered_storage_path); assert_eq!(tiered_storage.file_size().unwrap(), 0); - // Expect the result to be TieredStorageError::Unsupported as the feature - // is not yet fully supported, but we can still check its partial results - // in the test. - write_zero_accounts(&tiered_storage, Err(TieredStorageError::Unsupported())); + write_zero_accounts(&tiered_storage, Ok(vec![])); } let tiered_storage_readonly = TieredStorage::new_readonly(&tiered_storage_path).unwrap(); @@ -246,16 +264,14 @@ mod tests { } #[test] + #[should_panic(expected = "cannot write same tiered storage file more than once")] fn test_write_accounts_twice() { // Generate a new temp path that is guaranteed to NOT already have a file. 
let temp_dir = tempdir().unwrap(); let tiered_storage_path = temp_dir.path().join("test_write_accounts_twice"); let tiered_storage = TieredStorage::new_writable(&tiered_storage_path); - // Expect the result to be TieredStorageError::Unsupported as the feature - // is not yet fully supported, but we can still check its partial results - // in the test. - write_zero_accounts(&tiered_storage, Err(TieredStorageError::Unsupported())); + write_zero_accounts(&tiered_storage, Ok(vec![])); // Expect AttemptToUpdateReadOnly error as write_accounts can only // be invoked once. write_zero_accounts( @@ -273,7 +289,7 @@ mod tests { let tiered_storage_path = temp_dir.path().join("test_remove_on_drop"); { let tiered_storage = TieredStorage::new_writable(&tiered_storage_path); - write_zero_accounts(&tiered_storage, Err(TieredStorageError::Unsupported())); + write_zero_accounts(&tiered_storage, Ok(vec![])); } // expect the file does not exists as it has been removed on drop assert!(!tiered_storage_path.try_exists().unwrap()); @@ -281,7 +297,7 @@ mod tests { { let tiered_storage = ManuallyDrop::new(TieredStorage::new_writable(&tiered_storage_path)); - write_zero_accounts(&tiered_storage, Err(TieredStorageError::Unsupported())); + write_zero_accounts(&tiered_storage, Ok(vec![])); } // expect the file exists as we have ManuallyDrop this time. assert!(tiered_storage_path.try_exists().unwrap()); @@ -301,29 +317,6 @@ mod tests { assert!(!tiered_storage_path.try_exists().unwrap()); } - /// Create a test account based on the specified seed. 
- fn create_account(seed: u64) -> (StoredMeta, AccountSharedData) { - let data_byte = seed as u8; - let account = Account { - lamports: seed, - data: std::iter::repeat(data_byte).take(seed as usize).collect(), - owner: Pubkey::new_unique(), - executable: seed % 2 > 0, - rent_epoch: if seed % 3 > 0 { - seed - } else { - RENT_EXEMPT_RENT_EPOCH - }, - }; - - let stored_meta = StoredMeta { - write_version_obsolete: StoredMetaWriteVersion::default(), - pubkey: Pubkey::new_unique(), - data_len: seed, - }; - (stored_meta, AccountSharedData::from(account)) - } - /// The helper function for all write_accounts tests. /// Currently only supports hot accounts. fn do_test_write_accounts( @@ -333,7 +326,7 @@ mod tests { ) { let accounts: Vec<_> = account_data_sizes .iter() - .map(|size| create_account(*size)) + .map(|size| create_test_account(*size)) .collect(); let account_refs: Vec<_> = accounts @@ -363,34 +356,27 @@ mod tests { let tiered_storage = TieredStorage::new_writable(tiered_storage_path); _ = tiered_storage.write_accounts(&storable_accounts, 0, &format); - verify_hot_storage(&tiered_storage, &accounts, format); - } - - /// Verify the generated tiered storage in the test. 
- fn verify_hot_storage( - tiered_storage: &TieredStorage, - expected_accounts: &[(StoredMeta, AccountSharedData)], - expected_format: TieredStorageFormat, - ) { let reader = tiered_storage.reader().unwrap(); - assert_eq!(reader.num_accounts(), expected_accounts.len()); - - let footer = reader.footer(); - let expected_footer = TieredStorageFooter { - account_meta_format: expected_format.account_meta_format, - owners_block_format: expected_format.owners_block_format, - index_block_format: expected_format.index_block_format, - account_block_format: expected_format.account_block_format, - account_entry_count: expected_accounts.len() as u32, - // Hash is not yet implemented, so we bypass the check - hash: footer.hash, - ..TieredStorageFooter::default() - }; + let num_accounts = storable_accounts.len(); + assert_eq!(reader.num_accounts(), num_accounts); - // TODO(yhchiang): verify account meta and data once the reader side - // is implemented in a separate PR. + let mut expected_accounts_map = HashMap::new(); + for i in 0..num_accounts { + let (account, address, _account_hash, _write_version) = storable_accounts.get(i); + expected_accounts_map.insert(address, account); + } - assert_eq!(*footer, expected_footer); + let mut index_offset = IndexOffset(0); + let mut verified_accounts = HashSet::new(); + while let Some((stored_meta, next)) = reader.get_account(index_offset).unwrap() { + if let Some(account) = expected_accounts_map.get(stored_meta.pubkey()) { + verify_test_account(&stored_meta, *account, stored_meta.pubkey()); + verified_accounts.insert(stored_meta.pubkey()); + } + index_offset = next; + } + assert!(!verified_accounts.is_empty()); + assert_eq!(verified_accounts.len(), expected_accounts_map.len()) } #[test] diff --git a/accounts-db/src/tiered_storage/byte_block.rs b/accounts-db/src/tiered_storage/byte_block.rs index e0fa8b4b136b3b..6fc7dec611e9a9 100644 --- a/accounts-db/src/tiered_storage/byte_block.rs +++ b/accounts-db/src/tiered_storage/byte_block.rs @@ 
-4,7 +4,7 @@ use { crate::tiered_storage::{footer::AccountBlockFormat, meta::AccountMetaOptionalFields}, std::{ - io::{Cursor, Read, Write}, + io::{Cursor, Read, Result as IoResult, Write}, mem, }, }; @@ -53,11 +53,31 @@ impl ByteBlockWriter { self.len } + /// Write plain ol' data to the internal buffer of the ByteBlockWriter instance + /// + /// Prefer this over `write_type()`, as it prevents some undefined behavior. + pub fn write_pod(&mut self, value: &T) -> IoResult { + // SAFETY: Since T is NoUninit, it does not contain any uninitialized bytes. + unsafe { self.write_type(value) } + } + /// Write the specified typed instance to the internal buffer of /// the ByteBlockWriter instance. - pub fn write_type(&mut self, value: &T) -> std::io::Result { + /// + /// Prefer `write_pod()` when possible, because `write_type()` may cause + /// undefined behavior if `value` contains uninitialized bytes. + /// + /// # Safety + /// + /// Caller must ensure casting T to bytes is safe. + /// Refer to the Safety sections in std::slice::from_raw_parts() + /// and bytemuck's Pod and NoUninit for more information. + pub unsafe fn write_type(&mut self, value: &T) -> IoResult { let size = mem::size_of::(); let ptr = value as *const _ as *const u8; + // SAFETY: The caller ensures that `value` contains no uninitialized bytes, + // we ensure the size is safe by querying T directly, + // and Rust ensures all values are at least byte-aligned. let slice = unsafe { std::slice::from_raw_parts(ptr, size) }; self.write(slice)?; Ok(size) @@ -65,18 +85,15 @@ impl ByteBlockWriter { /// Write all the Some fields of the specified AccountMetaOptionalFields. /// - /// Note that the existance of each optional field is stored separately in + /// Note that the existence of each optional field is stored separately in /// AccountMetaFlags. 
pub fn write_optional_fields( &mut self, opt_fields: &AccountMetaOptionalFields, - ) -> std::io::Result { + ) -> IoResult { let mut size = 0; if let Some(rent_epoch) = opt_fields.rent_epoch { - size += self.write_type(&rent_epoch)?; - } - if let Some(hash) = opt_fields.account_hash { - size += self.write_type(&hash)?; + size += self.write_pod(&rent_epoch)?; } debug_assert_eq!(size, opt_fields.size()); @@ -86,7 +103,7 @@ impl ByteBlockWriter { /// Write the specified typed bytes to the internal buffer of the /// ByteBlockWriter instance. - pub fn write(&mut self, buf: &[u8]) -> std::io::Result<()> { + pub fn write(&mut self, buf: &[u8]) -> IoResult<()> { match &mut self.encoder { ByteBlockEncoder::Raw(cursor) => cursor.write_all(buf)?, ByteBlockEncoder::Lz4(lz4_encoder) => lz4_encoder.write_all(buf)?, @@ -97,7 +114,7 @@ impl ByteBlockWriter { /// Flush the internal byte buffer that collects all the previous writes /// into an encoded byte array. - pub fn finish(self) -> std::io::Result> { + pub fn finish(self) -> IoResult> { match self.encoder { ByteBlockEncoder::Raw(cursor) => Ok(cursor.into_inner()), ByteBlockEncoder::Lz4(lz4_encoder) => { @@ -112,18 +129,40 @@ impl ByteBlockWriter { /// The util struct for reading byte blocks. pub struct ByteBlockReader; +/// Reads the raw part of the input byte_block, at the specified offset, as type T. +/// +/// Returns None if `offset` + size_of::() exceeds the size of the input byte_block. +/// +/// Type T must be plain ol' data to ensure no undefined behavior. +pub fn read_pod(byte_block: &[u8], offset: usize) -> Option<&T> { + // SAFETY: Since T is AnyBitPattern, it is safe to cast bytes to T. + unsafe { read_type(byte_block, offset) } +} + /// Reads the raw part of the input byte_block at the specified offset /// as type T. /// /// If `offset` + size_of::() exceeds the size of the input byte_block, /// then None will be returned. 
-pub fn read_type(byte_block: &[u8], offset: usize) -> Option<&T> { +/// +/// Prefer `read_pod()` when possible, because `read_type()` may cause +/// undefined behavior. +/// +/// # Safety +/// +/// Caller must ensure casting bytes to T is safe. +/// Refer to the Safety sections in std::slice::from_raw_parts() +/// and bytemuck's Pod and AnyBitPattern for more information. +pub unsafe fn read_type(byte_block: &[u8], offset: usize) -> Option<&T> { let (next, overflow) = offset.overflowing_add(std::mem::size_of::()); if overflow || next > byte_block.len() { return None; } let ptr = byte_block[offset..].as_ptr() as *const T; debug_assert!(ptr as usize % std::mem::align_of::() == 0); + // SAFETY: The caller ensures it is safe to cast bytes to T, + // we ensure the size is safe by querying T directly, + // and we just checked above to ensure the ptr is aligned for T. Some(unsafe { &*ptr }) } @@ -134,7 +173,7 @@ impl ByteBlockReader { /// /// Note that calling this function with AccountBlockFormat::AlignedRaw encoding /// will result in panic as the input is already decoded. 
- pub fn decode(encoding: AccountBlockFormat, input: &[u8]) -> std::io::Result> { + pub fn decode(encoding: AccountBlockFormat, input: &[u8]) -> IoResult> { match encoding { AccountBlockFormat::Lz4 => { let mut decoder = lz4::Decoder::new(input).unwrap(); @@ -149,11 +188,7 @@ impl ByteBlockReader { #[cfg(test)] mod tests { - use { - super::*, - crate::accounts_hash::AccountHash, - solana_sdk::{hash::Hash, stake_history::Epoch}, - }; + use {super::*, solana_sdk::stake_history::Epoch}; fn read_type_unaligned(buffer: &[u8], offset: usize) -> (T, usize) { let size = std::mem::size_of::(); @@ -169,7 +204,7 @@ mod tests { let mut writer = ByteBlockWriter::new(format); let value: u32 = 42; - writer.write_type(&value).unwrap(); + writer.write_pod(&value).unwrap(); assert_eq!(writer.raw_len(), mem::size_of::()); let buffer = writer.finish().unwrap(); @@ -231,12 +266,14 @@ mod tests { let test_data3 = [33u8; 300]; // Write the above meta and data in an interleaving way. - writer.write_type(&test_metas[0]).unwrap(); - writer.write_type(&test_data1).unwrap(); - writer.write_type(&test_metas[1]).unwrap(); - writer.write_type(&test_data2).unwrap(); - writer.write_type(&test_metas[2]).unwrap(); - writer.write_type(&test_data3).unwrap(); + unsafe { + writer.write_type(&test_metas[0]).unwrap(); + writer.write_type(&test_data1).unwrap(); + writer.write_type(&test_metas[1]).unwrap(); + writer.write_type(&test_data2).unwrap(); + writer.write_type(&test_metas[2]).unwrap(); + writer.write_type(&test_data3).unwrap(); + } assert_eq!( writer.raw_len(), mem::size_of::() * 3 @@ -312,14 +349,9 @@ mod tests { // prepare a vector of optional fields that contains all combinations // of Some and None. 
for rent_epoch in [None, Some(test_epoch)] { - for account_hash in [None, Some(AccountHash(Hash::new_unique()))] { - some_count += rent_epoch.iter().count() + account_hash.iter().count(); + some_count += rent_epoch.iter().count(); - opt_fields_vec.push(AccountMetaOptionalFields { - rent_epoch, - account_hash, - }); - } + opt_fields_vec.push(AccountMetaOptionalFields { rent_epoch }); test_epoch += 1; } @@ -346,17 +378,11 @@ mod tests { let mut offset = 0; for opt_fields in &opt_fields_vec { if let Some(expected_rent_epoch) = opt_fields.rent_epoch { - let rent_epoch = read_type::(&decoded_buffer, offset).unwrap(); + let rent_epoch = read_pod::(&decoded_buffer, offset).unwrap(); assert_eq!(*rent_epoch, expected_rent_epoch); verified_count += 1; offset += std::mem::size_of::(); } - if let Some(expected_hash) = opt_fields.account_hash { - let hash = read_type::(&decoded_buffer, offset).unwrap(); - assert_eq!(hash, &expected_hash); - verified_count += 1; - offset += std::mem::size_of::(); - } } // make sure the number of Some fields matches the number of fields we diff --git a/accounts-db/src/tiered_storage/error.rs b/accounts-db/src/tiered_storage/error.rs index 822b8bcde4810b..145334574b4ea3 100644 --- a/accounts-db/src/tiered_storage/error.rs +++ b/accounts-db/src/tiered_storage/error.rs @@ -1,4 +1,4 @@ -use {std::path::PathBuf, thiserror::Error}; +use {super::footer::SanitizeFooterError, std::path::PathBuf, thiserror::Error}; #[derive(Error, Debug)] pub enum TieredStorageError { @@ -11,9 +11,24 @@ pub enum TieredStorageError { #[error("AttemptToUpdateReadOnly: attempted to update read-only file {0}")] AttemptToUpdateReadOnly(PathBuf), - #[error("UnknownFormat: the tiered storage format is unavailable for file {0}")] + #[error("UnknownFormat: the tiered storage format is unknown for file {0}")] UnknownFormat(PathBuf), #[error("Unsupported: the feature is not yet supported")] Unsupported(), + + #[error("invalid footer size: {0}, expected: {1}")] + 
InvalidFooterSize(u64, u64), + + #[error("invalid footer version: {0}")] + InvalidFooterVersion(u64), + + #[error("footer is unsanitary: {0}")] + SanitizeFooter(#[from] SanitizeFooterError), + + #[error("OffsetOutOfBounds: offset {0} is larger than the supported size {1}")] + OffsetOutOfBounds(usize, usize), + + #[error("OffsetAlignmentError: offset {0} must be multiple of {1}")] + OffsetAlignmentError(usize, usize), } diff --git a/accounts-db/src/tiered_storage/file.rs b/accounts-db/src/tiered_storage/file.rs index 0799c1eec8610a..51801c6133e1f7 100644 --- a/accounts-db/src/tiered_storage/file.rs +++ b/accounts-db/src/tiered_storage/file.rs @@ -1,8 +1,11 @@ -use std::{ - fs::{File, OpenOptions}, - io::{Read, Seek, SeekFrom, Write}, - mem, - path::Path, +use { + bytemuck::{AnyBitPattern, NoUninit}, + std::{ + fs::{File, OpenOptions}, + io::{Read, Result as IoResult, Seek, SeekFrom, Write}, + mem, + path::Path, + }, }; #[derive(Debug)] @@ -15,17 +18,16 @@ impl TieredStorageFile { .read(true) .create(false) .open(&file_path) - .unwrap_or_else(|e| { + .unwrap_or_else(|err| { panic!( - "[TieredStorageError] Unable to open {:?} as read-only: {:?}", + "[TieredStorageError] Unable to open {} as read-only: {err}", file_path.as_ref().display(), - e ); }), ) } - pub fn new_writable(file_path: impl AsRef) -> Result { + pub fn new_writable(file_path: impl AsRef) -> IoResult { Ok(Self( OpenOptions::new() .create_new(true) @@ -34,39 +36,72 @@ impl TieredStorageFile { )) } - pub fn write_type(&self, value: &T) -> Result { + /// Writes `value` to the file. + /// + /// `value` must be plain ol' data. + pub fn write_pod(&self, value: &T) -> IoResult { + // SAFETY: Since T is NoUninit, it does not contain any uninitialized bytes. + unsafe { self.write_type(value) } + } + + /// Writes `value` to the file. + /// + /// Prefer `write_pod` when possible, because `write_value` may cause + /// undefined behavior if `value` contains uninitialized bytes. 
+ /// + /// # Safety + /// + /// Caller must ensure casting T to bytes is safe. + /// Refer to the Safety sections in std::slice::from_raw_parts() + /// and bytemuck's Pod and NoUninit for more information. + pub unsafe fn write_type(&self, value: &T) -> IoResult { let ptr = value as *const _ as *const u8; - let slice = unsafe { std::slice::from_raw_parts(ptr, mem::size_of::()) }; - (&self.0).write_all(slice)?; + let bytes = unsafe { std::slice::from_raw_parts(ptr, mem::size_of::()) }; + self.write_bytes(bytes) + } - Ok(std::mem::size_of::()) + /// Reads a value of type `T` from the file. + /// + /// Type T must be plain ol' data. + pub fn read_pod(&self, value: &mut T) -> IoResult<()> { + // SAFETY: Since T is AnyBitPattern, it is safe to cast bytes to T. + unsafe { self.read_type(value) } } - pub fn read_type(&self, value: &mut T) -> Result<(), std::io::Error> { + /// Reads a value of type `T` from the file. + /// + /// Prefer `read_pod()` when possible, because `read_type()` may cause + /// undefined behavior. + /// + /// # Safety + /// + /// Caller must ensure casting bytes to T is safe. + /// Refer to the Safety sections in std::slice::from_raw_parts() + /// and bytemuck's Pod and AnyBitPattern for more information. + pub unsafe fn read_type(&self, value: &mut T) -> IoResult<()> { let ptr = value as *mut _ as *mut u8; - let slice = unsafe { std::slice::from_raw_parts_mut(ptr, mem::size_of::()) }; - (&self.0).read_exact(slice)?; - - Ok(()) + // SAFETY: The caller ensures it is safe to cast bytes to T, + // we ensure the size is safe by querying T directly, + // and Rust ensures ptr is aligned. 
+ let bytes = unsafe { std::slice::from_raw_parts_mut(ptr, mem::size_of::()) }; + self.read_bytes(bytes) } - pub fn seek(&self, offset: u64) -> Result { + pub fn seek(&self, offset: u64) -> IoResult { (&self.0).seek(SeekFrom::Start(offset)) } - pub fn seek_from_end(&self, offset: i64) -> Result { + pub fn seek_from_end(&self, offset: i64) -> IoResult { (&self.0).seek(SeekFrom::End(offset)) } - pub fn write_bytes(&self, bytes: &[u8]) -> Result { + pub fn write_bytes(&self, bytes: &[u8]) -> IoResult { (&self.0).write_all(bytes)?; Ok(bytes.len()) } - pub fn read_bytes(&self, buffer: &mut [u8]) -> Result<(), std::io::Error> { - (&self.0).read_exact(buffer)?; - - Ok(()) + pub fn read_bytes(&self, buffer: &mut [u8]) -> IoResult<()> { + (&self.0).read_exact(buffer) } } diff --git a/accounts-db/src/tiered_storage/footer.rs b/accounts-db/src/tiered_storage/footer.rs index 7763d8d5622a0a..1eb4fbdb3ff2ec 100644 --- a/accounts-db/src/tiered_storage/footer.rs +++ b/accounts-db/src/tiered_storage/footer.rs @@ -1,11 +1,18 @@ use { crate::tiered_storage::{ - error::TieredStorageError, file::TieredStorageFile, index::IndexBlockFormat, - mmap_utils::get_type, TieredStorageResult as TsResult, + error::TieredStorageError, + file::TieredStorageFile, + index::IndexBlockFormat, + mmap_utils::{get_pod, get_type}, + owners::OwnersBlockFormat, + TieredStorageResult, }, + bytemuck::{Pod, Zeroable}, memmap2::Mmap, + num_enum::TryFromPrimitiveError, solana_sdk::{hash::Hash, pubkey::Pubkey}, std::{mem, path::Path}, + thiserror::Error, }; pub const FOOTER_FORMAT_VERSION: u64 = 1; @@ -22,10 +29,13 @@ pub const FOOTER_TAIL_SIZE: usize = 24; /// The ending 8 bytes of a valid tiered account storage file. 
pub const FOOTER_MAGIC_NUMBER: u64 = 0x502A2AB5; // SOLALABS -> SOLANA LABS -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy, Pod, Zeroable)] #[repr(C)] pub struct TieredStorageMagicNumber(pub u64); +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 8); + impl Default for TieredStorageMagicNumber { fn default() -> Self { Self(FOOTER_MAGIC_NUMBER) @@ -69,24 +79,7 @@ pub enum AccountBlockFormat { Lz4 = 1, } -#[repr(u16)] -#[derive( - Clone, - Copy, - Debug, - Default, - Eq, - Hash, - PartialEq, - num_enum::IntoPrimitive, - num_enum::TryFromPrimitive, -)] -pub enum OwnersBlockFormat { - #[default] - LocalIndex = 0, -} - -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] #[repr(C)] pub struct TieredStorageFooter { // formats @@ -122,7 +115,7 @@ pub struct TieredStorageFooter { /// The offset pointing to the first byte of the account index block. pub index_block_offset: u64, /// The offset pointing to the first byte of the owners block. - pub owners_offset: u64, + pub owners_block_offset: u64, // account range /// The smallest account address in this file. @@ -133,17 +126,41 @@ pub struct TieredStorageFooter { /// A hash that represents a tiered accounts file for consistency check. pub hash: Hash, + /// The format version of the tiered accounts file. + pub format_version: u64, // The below fields belong to footer tail. // The sum of their sizes should match FOOTER_TAIL_SIZE. /// The size of the footer including the magic number. pub footer_size: u64, - /// The format version of the tiered accounts file. - pub format_version: u64, // This field is persisted in the storage but not in this struct. // The number should match FOOTER_MAGIC_NUMBER. // pub magic_number: u64, } +// It is undefined behavior to read/write uninitialized bytes. +// The `Pod` marker trait indicates there are no uninitialized bytes. 
+// In order to safely guarantee a type is POD, it cannot have any padding. +const _: () = assert!( + std::mem::size_of::() + == std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() // account_entry_count + + std::mem::size_of::() // account_meta_entry_size + + std::mem::size_of::() // account_block_size + + std::mem::size_of::() // owner_count + + std::mem::size_of::() // owner_entry_size + + std::mem::size_of::() // index_block_offset + + std::mem::size_of::() // owners_block_offset + + std::mem::size_of::() // min_account_address + + std::mem::size_of::() // max_account_address + + std::mem::size_of::() // hash + + std::mem::size_of::() // format_version + + std::mem::size_of::(), // footer_size + "TieredStorageFooter cannot have any padding" +); + impl Default for TieredStorageFooter { fn default() -> Self { Self { @@ -157,39 +174,50 @@ impl Default for TieredStorageFooter { owner_count: 0, owner_entry_size: 0, index_block_offset: 0, - owners_offset: 0, + owners_block_offset: 0, hash: Hash::new_unique(), min_account_address: Pubkey::default(), max_account_address: Pubkey::default(), - footer_size: FOOTER_SIZE as u64, format_version: FOOTER_FORMAT_VERSION, + footer_size: FOOTER_SIZE as u64, } } } impl TieredStorageFooter { - pub fn new_from_path(path: impl AsRef) -> TsResult { + pub fn new_from_path(path: impl AsRef) -> TieredStorageResult { let file = TieredStorageFile::new_readonly(path); Self::new_from_footer_block(&file) } - pub fn write_footer_block(&self, file: &TieredStorageFile) -> TsResult<()> { - file.write_type(self)?; - file.write_type(&TieredStorageMagicNumber::default())?; + pub fn write_footer_block(&self, file: &TieredStorageFile) -> TieredStorageResult<()> { + // SAFETY: The footer does not contain any uninitialized bytes. + unsafe { file.write_type(self)? 
}; + file.write_pod(&TieredStorageMagicNumber::default())?; Ok(()) } - pub fn new_from_footer_block(file: &TieredStorageFile) -> TsResult { - let mut footer_size: u64 = 0; + pub fn new_from_footer_block(file: &TieredStorageFile) -> TieredStorageResult { + file.seek_from_end(-(FOOTER_TAIL_SIZE as i64))?; + let mut footer_version: u64 = 0; - let mut magic_number = TieredStorageMagicNumber(0); + file.read_pod(&mut footer_version)?; + if footer_version != FOOTER_FORMAT_VERSION { + return Err(TieredStorageError::InvalidFooterVersion(footer_version)); + } - file.seek_from_end(-(FOOTER_TAIL_SIZE as i64))?; - file.read_type(&mut footer_size)?; - file.read_type(&mut footer_version)?; - file.read_type(&mut magic_number)?; + let mut footer_size: u64 = 0; + file.read_pod(&mut footer_size)?; + if footer_size != FOOTER_SIZE as u64 { + return Err(TieredStorageError::InvalidFooterSize( + footer_size, + FOOTER_SIZE as u64, + )); + } + let mut magic_number = TieredStorageMagicNumber::zeroed(); + file.read_pod(&mut magic_number)?; if magic_number != TieredStorageMagicNumber::default() { return Err(TieredStorageError::MagicNumberMismatch( TieredStorageMagicNumber::default().0, @@ -199,17 +227,31 @@ impl TieredStorageFooter { let mut footer = Self::default(); file.seek_from_end(-(footer_size as i64))?; - file.read_type(&mut footer)?; + // SAFETY: We sanitize the footer to ensure all the bytes are + // actually safe to interpret as a TieredStorageFooter. + unsafe { file.read_type(&mut footer)? 
}; + Self::sanitize(&footer)?; Ok(footer) } - pub fn new_from_mmap(map: &Mmap) -> TsResult<&TieredStorageFooter> { - let offset = map.len().saturating_sub(FOOTER_TAIL_SIZE); - let (footer_size, offset) = get_type::(map, offset)?; - let (_footer_version, offset) = get_type::(map, offset)?; - let (magic_number, _offset) = get_type::(map, offset)?; + pub fn new_from_mmap(mmap: &Mmap) -> TieredStorageResult<&TieredStorageFooter> { + let offset = mmap.len().saturating_sub(FOOTER_TAIL_SIZE); + + let (footer_version, offset) = get_pod::(mmap, offset)?; + if *footer_version != FOOTER_FORMAT_VERSION { + return Err(TieredStorageError::InvalidFooterVersion(*footer_version)); + } + + let (&footer_size, offset) = get_pod::(mmap, offset)?; + if footer_size != FOOTER_SIZE as u64 { + return Err(TieredStorageError::InvalidFooterSize( + footer_size, + FOOTER_SIZE as u64, + )); + } + let (magic_number, _offset) = get_pod::(mmap, offset)?; if *magic_number != TieredStorageMagicNumber::default() { return Err(TieredStorageError::MagicNumberMismatch( TieredStorageMagicNumber::default().0, @@ -217,11 +259,66 @@ impl TieredStorageFooter { )); } - let (footer, _offset) = - get_type::(map, map.len().saturating_sub(*footer_size as usize))?; + let footer_offset = mmap.len().saturating_sub(footer_size as usize); + // SAFETY: We sanitize the footer to ensure all the bytes are + // actually safe to interpret as a TieredStorageFooter. + let (footer, _offset) = unsafe { get_type::(mmap, footer_offset)? }; + Self::sanitize(footer)?; Ok(footer) } + + /// Sanitizes the footer + /// + /// Since the various formats only have specific valid values, they must be sanitized + /// prior to use. This ensures the formats are valid to interpret as (rust) enums. 
+ fn sanitize(footer: &Self) -> Result<(), SanitizeFooterError> { + let account_meta_format_u16 = + unsafe { &*(&footer.account_meta_format as *const _ as *const u16) }; + let owners_block_format_u16 = + unsafe { &*(&footer.owners_block_format as *const _ as *const u16) }; + let index_block_format_u16 = + unsafe { &*(&footer.index_block_format as *const _ as *const u16) }; + let account_block_format_u16 = + unsafe { &*(&footer.account_block_format as *const _ as *const u16) }; + + _ = AccountMetaFormat::try_from(*account_meta_format_u16) + .map_err(SanitizeFooterError::InvalidAccountMetaFormat)?; + _ = OwnersBlockFormat::try_from(*owners_block_format_u16) + .map_err(SanitizeFooterError::InvalidOwnersBlockFormat)?; + _ = IndexBlockFormat::try_from(*index_block_format_u16) + .map_err(SanitizeFooterError::InvalidIndexBlockFormat)?; + _ = AccountBlockFormat::try_from(*account_block_format_u16) + .map_err(SanitizeFooterError::InvalidAccountBlockFormat)?; + + // Since we just sanitized the formats within the footer, + // it is now safe to read them as (rust) enums. 
+ // + // from https://doc.rust-lang.org/reference/items/enumerations.html#casting: + // > If an enumeration is unit-only (with no tuple and struct variants), + // > then its discriminant can be directly accessed with a numeric cast; + // + // from https://doc.rust-lang.org/reference/items/enumerations.html#pointer-casting: + // > If the enumeration specifies a primitive representation, + // > then the discriminant may be reliably accessed via unsafe pointer casting + Ok(()) + } +} + +/// Errors that can happen while sanitizing the footer +#[derive(Error, Debug)] +pub enum SanitizeFooterError { + #[error("invalid account meta format: {0}")] + InvalidAccountMetaFormat(#[from] TryFromPrimitiveError), + + #[error("invalid owners block format: {0}")] + InvalidOwnersBlockFormat(#[from] TryFromPrimitiveError), + + #[error("invalid index block format: {0}")] + InvalidIndexBlockFormat(#[from] TryFromPrimitiveError), + + #[error("invalid account block format: {0}")] + InvalidAccountBlockFormat(#[from] TryFromPrimitiveError), } #[cfg(test)] @@ -240,8 +337,8 @@ mod tests { let path = get_append_vec_path("test_file_footer"); let expected_footer = TieredStorageFooter { account_meta_format: AccountMetaFormat::Hot, - owners_block_format: OwnersBlockFormat::LocalIndex, - index_block_format: IndexBlockFormat::AddressAndOffset, + owners_block_format: OwnersBlockFormat::AddressesOnly, + index_block_format: IndexBlockFormat::AddressesThenOffsets, account_block_format: AccountBlockFormat::AlignedRaw, account_entry_count: 300, account_meta_entry_size: 24, @@ -249,12 +346,12 @@ mod tests { owner_count: 250, owner_entry_size: 32, index_block_offset: 1069600, - owners_offset: 1081200, + owners_block_offset: 1081200, hash: Hash::new_unique(), min_account_address: Pubkey::default(), max_account_address: Pubkey::new_unique(), - footer_size: FOOTER_SIZE as u64, format_version: FOOTER_FORMAT_VERSION, + footer_size: FOOTER_SIZE as u64, }; // Persist the expected footer. 
@@ -286,11 +383,82 @@ mod tests { assert_eq!(offset_of!(TieredStorageFooter, owner_count), 0x18); assert_eq!(offset_of!(TieredStorageFooter, owner_entry_size), 0x1C); assert_eq!(offset_of!(TieredStorageFooter, index_block_offset), 0x20); - assert_eq!(offset_of!(TieredStorageFooter, owners_offset), 0x28); + assert_eq!(offset_of!(TieredStorageFooter, owners_block_offset), 0x28); assert_eq!(offset_of!(TieredStorageFooter, min_account_address), 0x30); assert_eq!(offset_of!(TieredStorageFooter, max_account_address), 0x50); assert_eq!(offset_of!(TieredStorageFooter, hash), 0x70); - assert_eq!(offset_of!(TieredStorageFooter, footer_size), 0x90); - assert_eq!(offset_of!(TieredStorageFooter, format_version), 0x98); + assert_eq!(offset_of!(TieredStorageFooter, format_version), 0x90); + assert_eq!(offset_of!(TieredStorageFooter, footer_size), 0x98); + } + + #[test] + fn test_sanitize() { + // test: all good + { + let footer = TieredStorageFooter::default(); + let result = TieredStorageFooter::sanitize(&footer); + assert!(result.is_ok()); + } + + // test: bad account meta format + { + let mut footer = TieredStorageFooter::default(); + unsafe { + std::ptr::write( + &mut footer.account_meta_format as *mut _ as *mut u16, + 0xBAD0, + ); + } + let result = TieredStorageFooter::sanitize(&footer); + assert!(matches!( + result, + Err(SanitizeFooterError::InvalidAccountMetaFormat(_)) + )); + } + + // test: bad owners block format + { + let mut footer = TieredStorageFooter::default(); + unsafe { + std::ptr::write( + &mut footer.owners_block_format as *mut _ as *mut u16, + 0xBAD0, + ); + } + let result = TieredStorageFooter::sanitize(&footer); + assert!(matches!( + result, + Err(SanitizeFooterError::InvalidOwnersBlockFormat(_)) + )); + } + + // test: bad index block format + { + let mut footer = TieredStorageFooter::default(); + unsafe { + std::ptr::write(&mut footer.index_block_format as *mut _ as *mut u16, 0xBAD0); + } + let result = TieredStorageFooter::sanitize(&footer); + 
assert!(matches!( + result, + Err(SanitizeFooterError::InvalidIndexBlockFormat(_)) + )); + } + + // test: bad account block format + { + let mut footer = TieredStorageFooter::default(); + unsafe { + std::ptr::write( + &mut footer.account_block_format as *mut _ as *mut u16, + 0xBAD0, + ); + } + let result = TieredStorageFooter::sanitize(&footer); + assert!(matches!( + result, + Err(SanitizeFooterError::InvalidAccountBlockFormat(_)) + )); + } } } diff --git a/accounts-db/src/tiered_storage/hot.rs b/accounts-db/src/tiered_storage/hot.rs index 9e987f886de101..7972fe85eb006b 100644 --- a/accounts-db/src/tiered_storage/hot.rs +++ b/accounts-db/src/tiered_storage/hot.rs @@ -1,43 +1,87 @@ -#![allow(dead_code)] //! The account meta and related structs for hot accounts. use { crate::{ + account_storage::meta::{StoredAccountInfo, StoredAccountMeta}, + accounts_file::MatchAccountOwnerError, accounts_hash::AccountHash, + rent_collector::RENT_EXEMPT_RENT_EPOCH, tiered_storage::{ byte_block, - footer::{ - AccountBlockFormat, AccountMetaFormat, OwnersBlockFormat, TieredStorageFooter, - }, - index::IndexBlockFormat, + file::TieredStorageFile, + footer::{AccountBlockFormat, AccountMetaFormat, TieredStorageFooter}, + index::{AccountIndexWriterEntry, AccountOffset, IndexBlockFormat, IndexOffset}, meta::{AccountMetaFlags, AccountMetaOptionalFields, TieredAccountMeta}, - mmap_utils::get_type, + mmap_utils::{get_pod, get_slice}, + owners::{OwnerOffset, OwnersBlockFormat, OwnersTable, OWNER_NO_OWNER}, + readable::TieredReadableAccount, + StorableAccounts, StorableAccountsWithHashesAndWriteVersions, TieredStorageError, TieredStorageFormat, TieredStorageResult, }, }, + bytemuck::{Pod, Zeroable}, memmap2::{Mmap, MmapOptions}, modular_bitfield::prelude::*, - solana_sdk::stake_history::Epoch, - std::{fs::OpenOptions, option::Option, path::Path}, + solana_sdk::{ + account::ReadableAccount, pubkey::Pubkey, stake_history::Epoch, + }, + std::{borrow::Borrow, fs::OpenOptions, option::Option, 
path::Path}, }; pub const HOT_FORMAT: TieredStorageFormat = TieredStorageFormat { meta_entry_size: std::mem::size_of::(), account_meta_format: AccountMetaFormat::Hot, - owners_block_format: OwnersBlockFormat::LocalIndex, - index_block_format: IndexBlockFormat::AddressAndOffset, + owners_block_format: OwnersBlockFormat::AddressesOnly, + index_block_format: IndexBlockFormat::AddressesThenOffsets, account_block_format: AccountBlockFormat::AlignedRaw, }; +/// A helper function that creates a new default footer for hot +/// accounts storage. +fn new_hot_footer() -> TieredStorageFooter { + TieredStorageFooter { + account_meta_format: HOT_FORMAT.account_meta_format, + account_meta_entry_size: HOT_FORMAT.meta_entry_size as u32, + account_block_format: HOT_FORMAT.account_block_format, + index_block_format: HOT_FORMAT.index_block_format, + owners_block_format: HOT_FORMAT.owners_block_format, + ..TieredStorageFooter::default() + } +} + +/// The maximum allowed value for the owner index of a hot account. +const MAX_HOT_OWNER_OFFSET: OwnerOffset = OwnerOffset((1 << 29) - 1); + +/// The byte alignment for hot accounts. This alignment serves dual purposes. +/// First, it allows hot accounts to be directly accessed when the underlying +/// file is mmapped. In addition, as all hot accounts are aligned, it allows +/// each hot accounts file to handle more accounts with the same number of +/// bytes in HotAccountOffset. +pub(crate) const HOT_ACCOUNT_ALIGNMENT: usize = 8; + +/// The alignment for the blocks inside a hot accounts file. A hot accounts +/// file consists of accounts block, index block, owners block, and footer. +/// This requirement allows the offset of each block properly aligned so +/// that they can be readable under mmap. +pub(crate) const HOT_BLOCK_ALIGNMENT: usize = 8; + +/// The maximum supported offset for hot accounts storage. 
+const MAX_HOT_ACCOUNT_OFFSET: usize = u32::MAX as usize * HOT_ACCOUNT_ALIGNMENT; + +// returns the required number of padding +fn padding_bytes(data_len: usize) -> u8 { + ((HOT_ACCOUNT_ALIGNMENT - (data_len % HOT_ACCOUNT_ALIGNMENT)) % HOT_ACCOUNT_ALIGNMENT) as u8 +} + /// The maximum number of padding bytes used in a hot account entry. const MAX_HOT_PADDING: u8 = 7; -/// The maximum allowed value for the owner index of a hot account. -const MAX_HOT_OWNER_INDEX: u32 = (1 << 29) - 1; +/// The buffer that is used for padding. +const PADDING_BUFFER: [u8; 8] = [0u8; HOT_ACCOUNT_ALIGNMENT]; #[bitfield(bits = 32)] #[repr(C)] -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Pod, Zeroable)] struct HotMetaPackedFields { /// A hot account entry consists of the following elements: /// @@ -50,12 +94,52 @@ struct HotMetaPackedFields { /// in its hot account entry. padding: B3, /// The index to the owner of a hot account inside an AccountsFile. - owner_index: B29, + owner_offset: B29, +} + +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 4); + +/// The offset to access a hot account. +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Pod, Zeroable)] +pub struct HotAccountOffset(u32); + +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 4); + +impl AccountOffset for HotAccountOffset {} + +impl HotAccountOffset { + /// Creates a new AccountOffset instance + pub fn new(offset: usize) -> TieredStorageResult { + if offset > MAX_HOT_ACCOUNT_OFFSET { + return Err(TieredStorageError::OffsetOutOfBounds( + offset, + MAX_HOT_ACCOUNT_OFFSET, + )); + } + + // Hot accounts are aligned based on HOT_ACCOUNT_ALIGNMENT. 
+ if offset % HOT_ACCOUNT_ALIGNMENT != 0 { + return Err(TieredStorageError::OffsetAlignmentError( + offset, + HOT_ACCOUNT_ALIGNMENT, + )); + } + + Ok(HotAccountOffset((offset / HOT_ACCOUNT_ALIGNMENT) as u32)) + } + + /// Returns the offset to the account. + fn offset(&self) -> usize { + self.0 as usize * HOT_ACCOUNT_ALIGNMENT + } } /// The storage and in-memory representation of the metadata entry for a /// hot account. -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Pod, Zeroable)] #[repr(C)] pub struct HotAccountMeta { /// The balance of this account. @@ -66,6 +150,9 @@ pub struct HotAccountMeta { flags: AccountMetaFlags, } +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 8 + 4 + 4); + impl TieredAccountMeta for HotAccountMeta { /// Construct a HotAccountMeta instance. fn new() -> Self { @@ -93,11 +180,11 @@ impl TieredAccountMeta for HotAccountMeta { } /// A builder function that initializes the owner's index. - fn with_owner_index(mut self, owner_index: u32) -> Self { - if owner_index > MAX_HOT_OWNER_INDEX { - panic!("owner_index exceeds MAX_HOT_OWNER_INDEX"); + fn with_owner_offset(mut self, owner_offset: OwnerOffset) -> Self { + if owner_offset > MAX_HOT_OWNER_OFFSET { + panic!("owner_offset exceeds MAX_HOT_OWNER_OFFSET"); } - self.packed_fields.set_owner_index(owner_index); + self.packed_fields.set_owner_offset(owner_offset.0); self } @@ -126,8 +213,8 @@ impl TieredAccountMeta for HotAccountMeta { } /// Returns the index to the accounts' owner in the current AccountsFile. - fn owner_index(&self) -> u32 { - self.packed_fields.owner_index() + fn owner_offset(&self) -> OwnerOffset { + OwnerOffset(self.packed_fields.owner_offset()) } /// Returns the AccountMetaFlags of the current meta. 
@@ -150,20 +237,7 @@ impl TieredAccountMeta for HotAccountMeta { .then(|| { let offset = self.optional_fields_offset(account_block) + AccountMetaOptionalFields::rent_epoch_offset(self.flags()); - byte_block::read_type::(account_block, offset).copied() - }) - .flatten() - } - - /// Returns the account hash by parsing the specified account block. None - /// will be returned if this account does not persist this optional field. - fn account_hash<'a>(&self, account_block: &'a [u8]) -> Option<&'a AccountHash> { - self.flags() - .has_account_hash() - .then(|| { - let offset = self.optional_fields_offset(account_block) - + AccountMetaOptionalFields::account_hash_offset(self.flags()); - byte_block::read_type::(account_block, offset) + byte_block::read_pod::(account_block, offset).copied() }) .flatten() } @@ -202,11 +276,11 @@ impl HotStorageReader { pub fn new_from_path(path: impl AsRef) -> TieredStorageResult { let file = OpenOptions::new().read(true).open(path)?; let mmap = unsafe { MmapOptions::new().map(&file)? }; - // Here we are cloning the footer as accessing any data in a + // Here we are copying the footer, as accessing any data in a // TieredStorage instance requires accessing its Footer. // This can help improve cache locality and reduce the overhead // of indirection associated with memory-mapped accesses. - let footer = TieredStorageFooter::new_from_mmap(&mmap)?.clone(); + let footer = *TieredStorageFooter::new_from_mmap(&mmap)?; Ok(Self { mmap, footer }) } @@ -223,10 +297,333 @@ impl HotStorageReader { } /// Returns the account meta located at the specified offset. 
- fn get_account_meta_from_offset(&self, offset: usize) -> TieredStorageResult<&HotAccountMeta> { - let (meta, _) = get_type::(&self.mmap, offset)?; + fn get_account_meta_from_offset( + &self, + account_offset: HotAccountOffset, + ) -> TieredStorageResult<&HotAccountMeta> { + let offset = account_offset.offset(); + + assert!( + offset.saturating_add(std::mem::size_of::()) + <= self.footer.index_block_offset as usize, + "reading HotAccountOffset ({}) would exceed accounts blocks offset boundary ({}).", + offset, + self.footer.index_block_offset, + ); + let (meta, _) = get_pod::(&self.mmap, offset)?; Ok(meta) } + + /// Returns the offset to the account given the specified index. + pub(super) fn get_account_offset( + &self, + index_offset: IndexOffset, + ) -> TieredStorageResult { + self.footer + .index_block_format + .get_account_offset::(&self.mmap, &self.footer, index_offset) + } + + /// Returns the address of the account associated with the specified index. + fn get_account_address(&self, index: IndexOffset) -> TieredStorageResult<&Pubkey> { + self.footer + .index_block_format + .get_account_address(&self.mmap, &self.footer, index) + } + + /// Returns the address of the account owner given the specified + /// owner_offset. + fn get_owner_address(&self, owner_offset: OwnerOffset) -> TieredStorageResult<&Pubkey> { + self.footer + .owners_block_format + .get_owner_address(&self.mmap, &self.footer, owner_offset) + } + + /// Returns Ok(index_of_matching_owner) if the account owner at + /// `account_offset` is one of the pubkeys in `owners`. + /// + /// Returns Err(MatchAccountOwnerError::NoMatch) if the account has 0 + /// lamports or the owner is not one of the pubkeys in `owners`. + /// + /// Returns Err(MatchAccountOwnerError::UnableToLoad) if there is any internal + /// error that causes the data unable to load, including `account_offset` + /// causes a data overrun. 
+ pub fn account_matches_owners( + &self, + account_offset: HotAccountOffset, + owners: &[Pubkey], + ) -> Result { + let account_meta = self + .get_account_meta_from_offset(account_offset) + .map_err(|_| MatchAccountOwnerError::UnableToLoad)?; + + if account_meta.lamports() == 0 { + Err(MatchAccountOwnerError::NoMatch) + } else { + let account_owner = self + .get_owner_address(account_meta.owner_offset()) + .map_err(|_| MatchAccountOwnerError::UnableToLoad)?; + + owners + .iter() + .position(|candidate| account_owner == candidate) + .ok_or(MatchAccountOwnerError::NoMatch) + } + } + + /// Returns the size of the account block based on its account offset + /// and index offset. + /// + /// The account block size information is omitted in the hot accounts file + /// as it can be derived by comparing the offset of the next hot account + /// meta in the index block. + fn get_account_block_size( + &self, + account_offset: HotAccountOffset, + index_offset: IndexOffset, + ) -> TieredStorageResult { + // the offset that points to the hot account meta. + let account_meta_offset = account_offset.offset(); + + // Obtain the ending offset of the account block. If the current + // account is the last account, then the ending offset is the + // index_block_offset. + let account_block_ending_offset = + if index_offset.0.saturating_add(1) == self.footer.account_entry_count { + self.footer.index_block_offset as usize + } else { + self.get_account_offset(IndexOffset(index_offset.0.saturating_add(1)))? + .offset() + }; + + // With the ending offset, minus the starting offset (i.e., + // the account meta offset) and the HotAccountMeta size, the remainder + // is the account block size (account data + optional fields). + Ok(account_block_ending_offset + .saturating_sub(account_meta_offset) + .saturating_sub(std::mem::size_of::())) + } + + /// Returns the account block that contains the account associated with + /// the specified index given the offset to the account meta and its index. 
+ fn get_account_block( + &self, + account_offset: HotAccountOffset, + index_offset: IndexOffset, + ) -> TieredStorageResult<&[u8]> { + let (data, _) = get_slice( + &self.mmap, + account_offset.offset() + std::mem::size_of::(), + self.get_account_block_size(account_offset, index_offset)?, + )?; + + Ok(data) + } + + /// Returns the account located at the specified index offset. + pub fn get_account( + &self, + index_offset: IndexOffset, + ) -> TieredStorageResult, IndexOffset)>> { + if index_offset.0 >= self.footer.account_entry_count { + return Ok(None); + } + + let account_offset = self.get_account_offset(index_offset)?; + + let meta = self.get_account_meta_from_offset(account_offset)?; + let address = self.get_account_address(index_offset)?; + let owner = self.get_owner_address(meta.owner_offset())?; + let account_block = self.get_account_block(account_offset, index_offset)?; + + Ok(Some(( + StoredAccountMeta::Hot(TieredReadableAccount { + meta, + address, + owner, + index: index_offset, + account_block, + }), + IndexOffset(index_offset.0.saturating_add(1)), + ))) + } + + /// Return a vector of account metadata for each account, starting from + /// `index_offset` + pub fn accounts( + &self, + mut index_offset: IndexOffset, + ) -> TieredStorageResult> { + let mut accounts = Vec::with_capacity( + self.footer + .account_entry_count + .saturating_sub(index_offset.0) as usize, + ); + while let Some((account, next)) = self.get_account(index_offset)? { + accounts.push(account); + index_offset = next; + } + Ok(accounts) + } +} + +fn write_optional_fields( + file: &TieredStorageFile, + opt_fields: &AccountMetaOptionalFields, +) -> TieredStorageResult { + let mut size = 0; + if let Some(rent_epoch) = opt_fields.rent_epoch { + size += file.write_pod(&rent_epoch)?; + } + + debug_assert_eq!(size, opt_fields.size()); + + Ok(size) +} + +/// The writer that creates a hot accounts file. 
+#[derive(Debug)] +pub struct HotStorageWriter { + storage: TieredStorageFile, +} + +impl HotStorageWriter { + /// Create a new HotStorageWriter with the specified path. + pub fn new(file_path: impl AsRef) -> TieredStorageResult { + Ok(Self { + storage: TieredStorageFile::new_writable(file_path)?, + }) + } + + /// Persists an account with the specified information and returns + /// the stored size of the account. + fn write_account( + &self, + lamports: u64, + owner_offset: OwnerOffset, + account_data: &[u8], + executable: bool, + rent_epoch: Option, + ) -> TieredStorageResult { + let optional_fields = AccountMetaOptionalFields { rent_epoch }; + + let mut flags = AccountMetaFlags::new_from(&optional_fields); + flags.set_executable(executable); + + let padding_len = padding_bytes(account_data.len()); + let meta = HotAccountMeta::new() + .with_lamports(lamports) + .with_owner_offset(owner_offset) + .with_account_data_size(account_data.len() as u64) + .with_account_data_padding(padding_len) + .with_flags(&flags); + + let mut stored_size = 0; + + stored_size += self.storage.write_pod(&meta)?; + stored_size += self.storage.write_bytes(account_data)?; + stored_size += self + .storage + .write_bytes(&PADDING_BUFFER[0..(padding_len as usize)])?; + stored_size += write_optional_fields(&self.storage, &optional_fields)?; + + Ok(stored_size) + } + + /// Persists `accounts` into the underlying hot accounts file associated + /// with this HotStorageWriter. The first `skip` number of accounts are + /// *not* persisted. 
+ pub fn write_accounts< + 'a, + 'b, + T: ReadableAccount + Sync, + U: StorableAccounts<'a, T>, + V: Borrow, + >( + &self, + accounts: &StorableAccountsWithHashesAndWriteVersions<'a, 'b, T, U, V>, + skip: usize, + ) -> TieredStorageResult> { + let mut footer = new_hot_footer(); + let mut index = vec![]; + let mut owners_table = OwnersTable::default(); + let mut cursor = 0; + + // writing accounts blocks + let len = accounts.accounts.len(); + let total_input_accounts = len - skip; + let mut stored_infos = Vec::with_capacity(total_input_accounts); + for i in skip..len { + let (account, address, _account_hash, _write_version) = accounts.get(i); + let index_entry = AccountIndexWriterEntry { + address, + offset: HotAccountOffset::new(cursor)?, + }; + + // Obtain necessary fields from the account, or default fields + // for a zero-lamport account in the None case. + let (lamports, owner, data, executable, rent_epoch) = account + .map(|acc| { + ( + acc.lamports(), + acc.owner(), + acc.data(), + acc.executable(), + // only persist rent_epoch for those rent-paying accounts + (acc.rent_epoch() != RENT_EXEMPT_RENT_EPOCH).then_some(acc.rent_epoch()), + ) + }) + .unwrap_or((0, &OWNER_NO_OWNER, &[], false, None)); + let owner_offset = owners_table.insert(owner); + let stored_size = + self.write_account(lamports, owner_offset, data, executable, rent_epoch)?; + cursor += stored_size; + + stored_infos.push(StoredAccountInfo { + // Here we pass the IndexOffset as the get_account() API + // takes IndexOffset. Given the account address is also + // maintained outside the TieredStorage, a potential optimization + // is to store AccountOffset instead, which can further save + // one jump from the index block to the accounts block. + offset: index.len(), + // Here we only include the stored size that the account directly + // contributes (i.e., account entry + index entry that includes the + // account meta, data, optional fields, its address, and AccountOffset). 
+ // Storage size from those shared blocks like footer and owners block + // is not included. + size: stored_size + footer.index_block_format.entry_size::(), + }); + index.push(index_entry); + } + footer.account_entry_count = total_input_accounts as u32; + + // writing index block + // expect the offset of each block aligned. + assert!(cursor % HOT_BLOCK_ALIGNMENT == 0); + footer.index_block_offset = cursor as u64; + cursor += footer + .index_block_format + .write_index_block(&self.storage, &index)?; + if cursor % HOT_BLOCK_ALIGNMENT != 0 { + // In case it is not yet aligned, it is due to the fact that + // the index block has an odd number of entries. In such case, + // we expect the amount off is equal to 4. + assert_eq!(cursor % HOT_BLOCK_ALIGNMENT, 4); + cursor += self.storage.write_pod(&0u32)?; + } + + // writing owners block + assert!(cursor % HOT_BLOCK_ALIGNMENT == 0); + footer.owners_block_offset = cursor as u64; + footer.owner_count = owners_table.len() as u32; + footer + .owners_block_format + .write_owners_block(&self.storage, &owners_table)?; + + footer.write_footer_block(&self.storage)?; + + Ok(stored_infos) + } } #[cfg(test)] @@ -236,17 +633,20 @@ pub mod tests { crate::tiered_storage::{ byte_block::ByteBlockWriter, file::TieredStorageFile, - footer::{ - AccountBlockFormat, AccountMetaFormat, OwnersBlockFormat, TieredStorageFooter, - FOOTER_SIZE, - }, + footer::{AccountBlockFormat, AccountMetaFormat, TieredStorageFooter, FOOTER_SIZE}, hot::{HotAccountMeta, HotStorageReader}, - index::IndexBlockFormat, + index::{AccountIndexWriterEntry, IndexBlockFormat, IndexOffset}, meta::{AccountMetaFlags, AccountMetaOptionalFields, TieredAccountMeta}, + owners::{OwnersBlockFormat, OwnersTable}, + test_utils::{create_test_account, verify_test_account}, }, + assert_matches::assert_matches, memoffset::offset_of, - rand::Rng, - solana_sdk::{hash::Hash, pubkey::Pubkey, stake_history::Epoch}, + rand::{seq::SliceRandom, Rng}, + solana_sdk::{ + account::ReadableAccount, 
hash::Hash, pubkey::Pubkey, slot_history::Slot, + stake_history::Epoch, + }, tempfile::TempDir, }; @@ -261,31 +661,53 @@ pub mod tests { #[test] fn test_packed_fields() { const TEST_PADDING: u8 = 7; - const TEST_OWNER_INDEX: u32 = 0x1fff_ef98; + const TEST_OWNER_OFFSET: u32 = 0x1fff_ef98; let mut packed_fields = HotMetaPackedFields::default(); packed_fields.set_padding(TEST_PADDING); - packed_fields.set_owner_index(TEST_OWNER_INDEX); + packed_fields.set_owner_offset(TEST_OWNER_OFFSET); assert_eq!(packed_fields.padding(), TEST_PADDING); - assert_eq!(packed_fields.owner_index(), TEST_OWNER_INDEX); + assert_eq!(packed_fields.owner_offset(), TEST_OWNER_OFFSET); } #[test] fn test_packed_fields_max_values() { let mut packed_fields = HotMetaPackedFields::default(); packed_fields.set_padding(MAX_HOT_PADDING); - packed_fields.set_owner_index(MAX_HOT_OWNER_INDEX); + packed_fields.set_owner_offset(MAX_HOT_OWNER_OFFSET.0); assert_eq!(packed_fields.padding(), MAX_HOT_PADDING); - assert_eq!(packed_fields.owner_index(), MAX_HOT_OWNER_INDEX); + assert_eq!(packed_fields.owner_offset(), MAX_HOT_OWNER_OFFSET.0); } #[test] fn test_hot_meta_max_values() { let meta = HotAccountMeta::new() .with_account_data_padding(MAX_HOT_PADDING) - .with_owner_index(MAX_HOT_OWNER_INDEX); + .with_owner_offset(MAX_HOT_OWNER_OFFSET); assert_eq!(meta.account_data_padding(), MAX_HOT_PADDING); - assert_eq!(meta.owner_index(), MAX_HOT_OWNER_INDEX); + assert_eq!(meta.owner_offset(), MAX_HOT_OWNER_OFFSET); + } + + #[test] + fn test_max_hot_account_offset() { + assert_matches!(HotAccountOffset::new(0), Ok(_)); + assert_matches!(HotAccountOffset::new(MAX_HOT_ACCOUNT_OFFSET), Ok(_)); + } + + #[test] + fn test_max_hot_account_offset_out_of_bounds() { + assert_matches!( + HotAccountOffset::new(MAX_HOT_ACCOUNT_OFFSET + HOT_ACCOUNT_ALIGNMENT), + Err(TieredStorageError::OffsetOutOfBounds(_, _)) + ); + } + + #[test] + fn test_max_hot_account_offset_alignment_error() { + assert_matches!( + 
HotAccountOffset::new(HOT_ACCOUNT_ALIGNMENT - 1), + Err(TieredStorageError::OffsetAlignmentError(_, _)) + ); } #[test] @@ -295,33 +717,32 @@ pub mod tests { } #[test] - #[should_panic(expected = "owner_index exceeds MAX_HOT_OWNER_INDEX")] - fn test_hot_meta_owner_index_exceeds_limit() { - HotAccountMeta::new().with_owner_index(MAX_HOT_OWNER_INDEX + 1); + #[should_panic(expected = "owner_offset exceeds MAX_HOT_OWNER_OFFSET")] + fn test_hot_meta_owner_offset_exceeds_limit() { + HotAccountMeta::new().with_owner_offset(OwnerOffset(MAX_HOT_OWNER_OFFSET.0 + 1)); } #[test] fn test_hot_account_meta() { const TEST_LAMPORTS: u64 = 2314232137; const TEST_PADDING: u8 = 5; - const TEST_OWNER_INDEX: u32 = 0x1fef_1234; + const TEST_OWNER_OFFSET: OwnerOffset = OwnerOffset(0x1fef_1234); const TEST_RENT_EPOCH: Epoch = 7; let optional_fields = AccountMetaOptionalFields { rent_epoch: Some(TEST_RENT_EPOCH), - account_hash: Some(AccountHash(Hash::new_unique())), }; let flags = AccountMetaFlags::new_from(&optional_fields); let meta = HotAccountMeta::new() .with_lamports(TEST_LAMPORTS) .with_account_data_padding(TEST_PADDING) - .with_owner_index(TEST_OWNER_INDEX) + .with_owner_offset(TEST_OWNER_OFFSET) .with_flags(&flags); assert_eq!(meta.lamports(), TEST_LAMPORTS); assert_eq!(meta.account_data_padding(), TEST_PADDING); - assert_eq!(meta.owner_index(), TEST_OWNER_INDEX); + assert_eq!(meta.owner_offset(), TEST_OWNER_OFFSET); assert_eq!(*meta.flags(), flags); } @@ -331,32 +752,33 @@ pub mod tests { let padding = [0u8; 5]; const TEST_LAMPORT: u64 = 2314232137; - const OWNER_INDEX: u32 = 0x1fef_1234; + const OWNER_OFFSET: u32 = 0x1fef_1234; const TEST_RENT_EPOCH: Epoch = 7; let optional_fields = AccountMetaOptionalFields { rent_epoch: Some(TEST_RENT_EPOCH), - account_hash: Some(AccountHash(Hash::new_unique())), }; let flags = AccountMetaFlags::new_from(&optional_fields); let expected_meta = HotAccountMeta::new() .with_lamports(TEST_LAMPORT) 
.with_account_data_padding(padding.len().try_into().unwrap()) - .with_owner_index(OWNER_INDEX) + .with_owner_offset(OwnerOffset(OWNER_OFFSET)) .with_flags(&flags); let mut writer = ByteBlockWriter::new(AccountBlockFormat::AlignedRaw); - writer.write_type(&expected_meta).unwrap(); - writer.write_type(&account_data).unwrap(); - writer.write_type(&padding).unwrap(); + writer.write_pod(&expected_meta).unwrap(); + // SAFETY: These values are POD, so they are safe to write. + unsafe { + writer.write_type(&account_data).unwrap(); + writer.write_type(&padding).unwrap(); + } writer.write_optional_fields(&optional_fields).unwrap(); let buffer = writer.finish().unwrap(); - let meta = byte_block::read_type::(&buffer, 0).unwrap(); + let meta = byte_block::read_pod::(&buffer, 0).unwrap(); assert_eq!(expected_meta, *meta); assert!(meta.flags().has_rent_epoch()); - assert!(meta.flags().has_account_hash()); assert_eq!(meta.account_data_padding() as usize, padding.len()); let account_block = &buffer[std::mem::size_of::()..]; @@ -369,10 +791,6 @@ pub mod tests { assert_eq!(account_data.len(), meta.account_data_size(account_block)); assert_eq!(account_data, meta.account_data(account_block)); assert_eq!(meta.rent_epoch(account_block), optional_fields.rent_epoch); - assert_eq!( - *(meta.account_hash(account_block).unwrap()), - optional_fields.account_hash.unwrap() - ); } #[test] @@ -382,8 +800,8 @@ pub mod tests { let path = temp_dir.path().join("test_hot_storage_footer"); let expected_footer = TieredStorageFooter { account_meta_format: AccountMetaFormat::Hot, - owners_block_format: OwnersBlockFormat::LocalIndex, - index_block_format: IndexBlockFormat::AddressAndOffset, + owners_block_format: OwnersBlockFormat::AddressesOnly, + index_block_format: IndexBlockFormat::AddressesThenOffsets, account_block_format: AccountBlockFormat::AlignedRaw, account_entry_count: 300, account_meta_entry_size: 16, @@ -391,7 +809,7 @@ pub mod tests { owner_count: 250, owner_entry_size: 32, 
index_block_offset: 1069600, - owners_offset: 1081200, + owners_block_offset: 1081200, hash: Hash::new_unique(), min_account_address: Pubkey::default(), max_account_address: Pubkey::new_unique(), @@ -425,12 +843,12 @@ pub mod tests { .map(|_| { HotAccountMeta::new() .with_lamports(rng.gen_range(0..u64::MAX)) - .with_owner_index(rng.gen_range(0..NUM_ACCOUNTS)) + .with_owner_offset(OwnerOffset(rng.gen_range(0..NUM_ACCOUNTS))) }) .collect(); let account_offsets: Vec<_>; - let footer = TieredStorageFooter { + let mut footer = TieredStorageFooter { account_meta_format: AccountMetaFormat::Hot, account_entry_count: NUM_ACCOUNTS, ..TieredStorageFooter::default() @@ -443,12 +861,13 @@ pub mod tests { .iter() .map(|meta| { let prev_offset = current_offset; - current_offset += file.write_type(meta).unwrap(); - prev_offset + current_offset += file.write_pod(meta).unwrap(); + HotAccountOffset::new(prev_offset).unwrap() }) .collect(); // while the test only focuses on account metas, writing a footer // here is necessary to make it a valid tiered-storage file. + footer.index_block_offset = current_offset as u64; footer.write_footer_block(&file).unwrap(); } @@ -458,6 +877,464 @@ pub mod tests { let meta = hot_storage.get_account_meta_from_offset(*offset).unwrap(); assert_eq!(meta, expected_meta); } + assert_eq!(&footer, hot_storage.footer()); } + + #[test] + #[should_panic(expected = "would exceed accounts blocks offset boundary")] + fn test_get_acount_meta_from_offset_out_of_bounds() { + // Generate a new temp path that is guaranteed to NOT already have a file. 
+ let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_get_acount_meta_from_offset_out_of_bounds"); + + let footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + index_block_offset: 160, + ..TieredStorageFooter::default() + }; + + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + footer.write_footer_block(&file).unwrap(); + } + + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + let offset = HotAccountOffset::new(footer.index_block_offset as usize).unwrap(); + // Read from index_block_offset, which offset doesn't belong to + // account blocks. Expect assert failure here + hot_storage.get_account_meta_from_offset(offset).unwrap(); + } + + #[test] + fn test_hot_storage_get_account_offset_and_address() { + // Generate a new temp path that is guaranteed to NOT already have a file. + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_hot_storage_get_account_offset_and_address"); + const NUM_ACCOUNTS: u32 = 10; + let mut rng = rand::thread_rng(); + + let addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_ACCOUNTS as usize) + .collect(); + + let index_writer_entries: Vec<_> = addresses + .iter() + .map(|address| AccountIndexWriterEntry { + address, + offset: HotAccountOffset::new( + rng.gen_range(0..u32::MAX) as usize * HOT_ACCOUNT_ALIGNMENT, + ) + .unwrap(), + }) + .collect(); + + let mut footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + account_entry_count: NUM_ACCOUNTS, + // Set index_block_offset to 0 as we didn't write any account + // meta/data in this test + index_block_offset: 0, + ..TieredStorageFooter::default() + }; + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + + let cursor = footer + .index_block_format + .write_index_block(&file, &index_writer_entries) + .unwrap(); + footer.owners_block_offset = cursor as u64; + footer.write_footer_block(&file).unwrap(); + } 
+ + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + for (i, index_writer_entry) in index_writer_entries.iter().enumerate() { + let account_offset = hot_storage + .get_account_offset(IndexOffset(i as u32)) + .unwrap(); + assert_eq!(account_offset, index_writer_entry.offset); + + let account_address = hot_storage + .get_account_address(IndexOffset(i as u32)) + .unwrap(); + assert_eq!(account_address, index_writer_entry.address); + } + } + + #[test] + fn test_hot_storage_get_owner_address() { + // Generate a new temp path that is guaranteed to NOT already have a file. + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("test_hot_storage_get_owner_address"); + const NUM_OWNERS: usize = 10; + + let addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_OWNERS) + .collect(); + + let footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + // meta/data nor index block in this test + owners_block_offset: 0, + ..TieredStorageFooter::default() + }; + + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + + let mut owners_table = OwnersTable::default(); + addresses.iter().for_each(|owner_address| { + owners_table.insert(owner_address); + }); + footer + .owners_block_format + .write_owners_block(&file, &owners_table) + .unwrap(); + + // while the test only focuses on account metas, writing a footer + // here is necessary to make it a valid tiered-storage file. + footer.write_footer_block(&file).unwrap(); + } + + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + for (i, address) in addresses.iter().enumerate() { + assert_eq!( + hot_storage + .get_owner_address(OwnerOffset(i as u32)) + .unwrap(), + address, + ); + } + } + + #[test] + fn test_account_matches_owners() { + // Generate a new temp path that is guaranteed to NOT already have a file. 
+ let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("test_hot_storage_get_owner_address"); + const NUM_OWNERS: u32 = 10; + + let owner_addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_OWNERS as usize) + .collect(); + + const NUM_ACCOUNTS: u32 = 30; + let mut rng = rand::thread_rng(); + + let hot_account_metas: Vec<_> = std::iter::repeat_with({ + || { + HotAccountMeta::new() + .with_lamports(rng.gen_range(1..u64::MAX)) + .with_owner_offset(OwnerOffset(rng.gen_range(0..NUM_OWNERS))) + } + }) + .take(NUM_ACCOUNTS as usize) + .collect(); + let mut footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + account_entry_count: NUM_ACCOUNTS, + owner_count: NUM_OWNERS, + ..TieredStorageFooter::default() + }; + let account_offsets: Vec<_>; + + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + let mut current_offset = 0; + + account_offsets = hot_account_metas + .iter() + .map(|meta| { + let prev_offset = current_offset; + current_offset += file.write_pod(meta).unwrap(); + HotAccountOffset::new(prev_offset).unwrap() + }) + .collect(); + footer.index_block_offset = current_offset as u64; + // Typically, the owners block is stored after index block, but + // since we don't write index block in this test, so we have + // the owners_block_offset set to the end of the accounts blocks. + footer.owners_block_offset = footer.index_block_offset; + + let mut owners_table = OwnersTable::default(); + owner_addresses.iter().for_each(|owner_address| { + owners_table.insert(owner_address); + }); + footer + .owners_block_format + .write_owners_block(&file, &owners_table) + .unwrap(); + + // while the test only focuses on account metas, writing a footer + // here is necessary to make it a valid tiered-storage file. + footer.write_footer_block(&file).unwrap(); + } + + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + + // First, verify whether we can find the expected owners. 
+ let mut owner_candidates = owner_addresses.clone(); + owner_candidates.shuffle(&mut rng); + + for (account_offset, account_meta) in account_offsets.iter().zip(hot_account_metas.iter()) { + let index = hot_storage + .account_matches_owners(*account_offset, &owner_candidates) + .unwrap(); + assert_eq!( + owner_candidates[index], + owner_addresses[account_meta.owner_offset().0 as usize] + ); + } + + // Second, verify the MatchAccountOwnerError::NoMatch case + const NUM_UNMATCHED_OWNERS: usize = 20; + let unmatched_candidates: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_UNMATCHED_OWNERS) + .collect(); + + for account_offset in account_offsets.iter() { + assert_eq!( + hot_storage.account_matches_owners(*account_offset, &unmatched_candidates), + Err(MatchAccountOwnerError::NoMatch) + ); + } + + // Thirdly, we mixed two candidates and make sure we still find the + // matched owner. + owner_candidates.extend(unmatched_candidates); + owner_candidates.shuffle(&mut rng); + + for (account_offset, account_meta) in account_offsets.iter().zip(hot_account_metas.iter()) { + let index = hot_storage + .account_matches_owners(*account_offset, &owner_candidates) + .unwrap(); + assert_eq!( + owner_candidates[index], + owner_addresses[account_meta.owner_offset().0 as usize] + ); + } + } + + #[test] + fn test_hot_storage_get_account() { + // Generate a new temp path that is guaranteed to NOT already have a file. 
+ let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("test_hot_storage_get_account"); + + let mut rng = rand::thread_rng(); + + // create owners + const NUM_OWNERS: usize = 10; + let owners: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_OWNERS) + .collect(); + + // create account data + const NUM_ACCOUNTS: usize = 20; + let account_datas: Vec<_> = (0..NUM_ACCOUNTS) + .map(|i| vec![i as u8; rng.gen_range(0..4096)]) + .collect(); + + // create account metas that link to its data and owner + let account_metas: Vec<_> = (0..NUM_ACCOUNTS) + .map(|i| { + HotAccountMeta::new() + .with_lamports(rng.gen_range(0..u64::MAX)) + .with_owner_offset(OwnerOffset(rng.gen_range(0..NUM_OWNERS) as u32)) + .with_account_data_padding(padding_bytes(account_datas[i].len())) + }) + .collect(); + + // create account addresses + let addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_ACCOUNTS) + .collect(); + + let mut footer = TieredStorageFooter { + account_meta_format: AccountMetaFormat::Hot, + account_entry_count: NUM_ACCOUNTS as u32, + owner_count: NUM_OWNERS as u32, + ..TieredStorageFooter::default() + }; + + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + let mut current_offset = 0; + + // write accounts blocks + let padding_buffer = [0u8; HOT_ACCOUNT_ALIGNMENT]; + let index_writer_entries: Vec<_> = account_metas + .iter() + .zip(account_datas.iter()) + .zip(addresses.iter()) + .map(|((meta, data), address)| { + let prev_offset = current_offset; + current_offset += file.write_pod(meta).unwrap(); + current_offset += file.write_bytes(data).unwrap(); + current_offset += file + .write_bytes(&padding_buffer[0..padding_bytes(data.len()) as usize]) + .unwrap(); + AccountIndexWriterEntry { + address, + offset: HotAccountOffset::new(prev_offset).unwrap(), + } + }) + .collect(); + + // write index blocks + footer.index_block_offset = current_offset as u64; + current_offset += footer + .index_block_format + 
.write_index_block(&file, &index_writer_entries) + .unwrap(); + + // write owners block + footer.owners_block_offset = current_offset as u64; + let mut owners_table = OwnersTable::default(); + owners.iter().for_each(|owner_address| { + owners_table.insert(owner_address); + }); + footer + .owners_block_format + .write_owners_block(&file, &owners_table) + .unwrap(); + + footer.write_footer_block(&file).unwrap(); + } + + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + + for i in 0..NUM_ACCOUNTS { + let (stored_meta, next) = hot_storage + .get_account(IndexOffset(i as u32)) + .unwrap() + .unwrap(); + assert_eq!(stored_meta.lamports(), account_metas[i].lamports()); + assert_eq!(stored_meta.data().len(), account_datas[i].len()); + assert_eq!(stored_meta.data(), account_datas[i]); + assert_eq!( + *stored_meta.owner(), + owners[account_metas[i].owner_offset().0 as usize] + ); + assert_eq!(*stored_meta.pubkey(), addresses[i]); + + assert_eq!(i + 1, next.0 as usize); + } + // Make sure it returns None on NUM_ACCOUNTS to allow termination on + // while loop in actual accounts-db read case. + assert_matches!( + hot_storage.get_account(IndexOffset(NUM_ACCOUNTS as u32)), + Ok(None) + ); + } + + #[test] + fn test_hot_storage_writer_twice_on_same_path() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_hot_storage_writer_twice_on_same_path"); + + // Expect the first returns Ok + assert_matches!(HotStorageWriter::new(&path), Ok(_)); + // Expect the second call on the same path returns Err, as the + // HotStorageWriter only writes once. 
+ assert_matches!(HotStorageWriter::new(&path), Err(_)); + } + + #[test] + fn test_write_account_and_index_blocks() { + let account_data_sizes = &[ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000, 2000, 3000, 4000, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + ]; + + let accounts: Vec<_> = account_data_sizes + .iter() + .map(|size| create_test_account(*size)) + .collect(); + + let account_refs: Vec<_> = accounts + .iter() + .map(|account| (&account.0.pubkey, &account.1)) + .collect(); + + // Slot information is not used here + let account_data = (Slot::MAX, &account_refs[..]); + let hashes: Vec<_> = std::iter::repeat_with(|| AccountHash(Hash::new_unique())) + .take(account_data_sizes.len()) + .collect(); + + let write_versions: Vec<_> = accounts + .iter() + .map(|account| account.0.write_version_obsolete) + .collect(); + + let storable_accounts = + StorableAccountsWithHashesAndWriteVersions::new_with_hashes_and_write_versions( + &account_data, + hashes.clone(), + write_versions.clone(), + ); + + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("test_write_account_and_index_blocks"); + let stored_infos = { + let writer = HotStorageWriter::new(&path).unwrap(); + writer.write_accounts(&storable_accounts, 0).unwrap() + }; + + let hot_storage = HotStorageReader::new_from_path(&path).unwrap(); + + let num_accounts = account_data_sizes.len(); + + for i in 0..num_accounts { + let (stored_meta, next) = hot_storage + .get_account(IndexOffset(i as u32)) + .unwrap() + .unwrap(); + + let (account, address, _account_hash, _write_version) = storable_accounts.get(i); + verify_test_account(&stored_meta, account, address); + + assert_eq!(i + 1, next.0 as usize); + } + // Make sure it returns None on NUM_ACCOUNTS to allow termination on + // while loop in actual accounts-db read case. 
+ assert_matches!( + hot_storage.get_account(IndexOffset(num_accounts as u32)), + Ok(None) + ); + + for stored_info in stored_infos { + let (stored_meta, _) = hot_storage + .get_account(IndexOffset(stored_info.offset as u32)) + .unwrap() + .unwrap(); + + let (account, address, _account_hash, _write_version) = + storable_accounts.get(stored_info.offset); + verify_test_account(&stored_meta, account, address); + } + + // verify get_accounts + let accounts = hot_storage.accounts(IndexOffset(0)).unwrap(); + + // first, we verify everything + for (i, stored_meta) in accounts.iter().enumerate() { + let (account, address, _account_hash, _write_version) = storable_accounts.get(i); + verify_test_account(stored_meta, account, address); + } + + // second, we verify various initial position + let total_stored_accounts = accounts.len(); + for i in 0..total_stored_accounts { + let partial_accounts = hot_storage.accounts(IndexOffset(i as u32)).unwrap(); + assert_eq!(&partial_accounts, &accounts[i..]); + } + } } diff --git a/accounts-db/src/tiered_storage/index.rs b/accounts-db/src/tiered_storage/index.rs index cd8b2a33c82529..c82e65ce6d275a 100644 --- a/accounts-db/src/tiered_storage/index.rs +++ b/accounts-db/src/tiered_storage/index.rs @@ -1,36 +1,34 @@ use { crate::tiered_storage::{ - file::TieredStorageFile, footer::TieredStorageFooter, mmap_utils::get_type, + file::TieredStorageFile, footer::TieredStorageFooter, mmap_utils::get_pod, TieredStorageResult, }, + bytemuck::{Pod, Zeroable}, memmap2::Mmap, solana_sdk::pubkey::Pubkey, }; /// The in-memory struct for the writing index block. -/// The actual storage format of a tiered account index entry might be different -/// from this. #[derive(Debug)] -pub struct AccountIndexWriterEntry<'a> { +pub struct AccountIndexWriterEntry<'a, Offset: AccountOffset> { + /// The account address. pub address: &'a Pubkey, - pub block_offset: u64, - pub intra_block_offset: u64, + /// The offset to the account. 
+ pub offset: Offset, } -/// The offset to an account stored inside its accounts block. -/// This struct is used to access the meta and data of an account by looking through -/// its accounts block. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct AccountOffset { - /// The offset to the accounts block that contains the account meta/data. - pub block: usize, -} +/// The offset to an account. +pub trait AccountOffset: Clone + Copy + Pod + Zeroable {} /// The offset to an account/address entry in the accounts index block. /// This can be used to obtain the AccountOffset and address by looking through /// the accounts index block. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct IndexOffset(usize); +#[repr(C)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, Pod, Zeroable)] +pub struct IndexOffset(pub u32); + +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 4); /// The index format of a tiered accounts file. #[repr(u16)] @@ -47,28 +45,31 @@ pub struct IndexOffset(usize); )] pub enum IndexBlockFormat { /// This format optimizes the storage size by storing only account addresses - /// and offsets. It skips storing the size of account data by storing account - /// block entries and index block entries in the same order. + /// and block offsets. It skips storing the size of account data by storing + /// account block entries and index block entries in the same order. #[default] - AddressAndOffset = 0, + AddressesThenOffsets = 0, } +// Ensure there are no implicit padding bytes +const _: () = assert!(std::mem::size_of::() == 2); + impl IndexBlockFormat { /// Persists the specified index_entries to the specified file and returns /// the total number of bytes written. 
pub fn write_index_block( &self, file: &TieredStorageFile, - index_entries: &[AccountIndexWriterEntry], + index_entries: &[AccountIndexWriterEntry], ) -> TieredStorageResult { match self { - Self::AddressAndOffset => { + Self::AddressesThenOffsets => { let mut bytes_written = 0; for index_entry in index_entries { - bytes_written += file.write_type(index_entry.address)?; + bytes_written += file.write_pod(index_entry.address)?; } for index_entry in index_entries { - bytes_written += file.write_type(&index_entry.block_offset)?; + bytes_written += file.write_pod(&index_entry.offset)?; } Ok(bytes_written) } @@ -78,43 +79,65 @@ impl IndexBlockFormat { /// Returns the address of the account given the specified index. pub fn get_account_address<'a>( &self, - map: &'a Mmap, + mmap: &'a Mmap, footer: &TieredStorageFooter, - offset: IndexOffset, + index_offset: IndexOffset, ) -> TieredStorageResult<&'a Pubkey> { let offset = match self { - Self::AddressAndOffset => { - footer.index_block_offset as usize + std::mem::size_of::() * offset.0 + Self::AddressesThenOffsets => { + debug_assert!(index_offset.0 < footer.account_entry_count); + footer.index_block_offset as usize + + std::mem::size_of::() * (index_offset.0 as usize) } }; - let (address, _) = get_type::(map, offset)?; + + debug_assert!( + offset.saturating_add(std::mem::size_of::()) + <= footer.owners_block_offset as usize, + "reading IndexOffset ({}) would exceed index block boundary ({}).", + offset, + footer.owners_block_offset, + ); + + let (address, _) = get_pod::(mmap, offset)?; Ok(address) } /// Returns the offset to the account given the specified index. 
- pub fn get_account_offset( + pub fn get_account_offset( &self, - map: &Mmap, + mmap: &Mmap, footer: &TieredStorageFooter, - offset: IndexOffset, - ) -> TieredStorageResult { - match self { - Self::AddressAndOffset => { - let offset = footer.index_block_offset as usize + index_offset: IndexOffset, + ) -> TieredStorageResult { + let offset = match self { + Self::AddressesThenOffsets => { + debug_assert!(index_offset.0 < footer.account_entry_count); + footer.index_block_offset as usize + std::mem::size_of::() * footer.account_entry_count as usize - + offset.0 * std::mem::size_of::(); - let (account_block_offset, _) = get_type(map, offset)?; - Ok(AccountOffset { - block: *account_block_offset, - }) + + std::mem::size_of::() * index_offset.0 as usize } - } + }; + + debug_assert!( + offset.saturating_add(std::mem::size_of::()) + <= footer.owners_block_offset as usize, + "reading IndexOffset ({}) would exceed index block boundary ({}).", + offset, + footer.owners_block_offset, + ); + + let (account_offset, _) = get_pod::(mmap, offset)?; + + Ok(*account_offset) } /// Returns the size of one index entry. 
- pub fn entry_size(&self) -> usize { + pub fn entry_size(&self) -> usize { match self { - Self::AddressAndOffset => std::mem::size_of::() + std::mem::size_of::(), + Self::AddressesThenOffsets => { + std::mem::size_of::() + std::mem::size_of::() + } } } } @@ -122,14 +145,21 @@ impl IndexBlockFormat { #[cfg(test)] mod tests { use { - super::*, crate::tiered_storage::file::TieredStorageFile, memmap2::MmapOptions, rand::Rng, - std::fs::OpenOptions, tempfile::TempDir, + super::*, + crate::tiered_storage::{ + file::TieredStorageFile, + hot::{HotAccountOffset, HOT_ACCOUNT_ALIGNMENT}, + }, + memmap2::MmapOptions, + rand::Rng, + std::fs::OpenOptions, + tempfile::TempDir, }; #[test] fn test_address_and_offset_indexer() { const ENTRY_COUNT: usize = 100; - let footer = TieredStorageFooter { + let mut footer = TieredStorageFooter { account_entry_count: ENTRY_COUNT as u32, ..TieredStorageFooter::default() }; @@ -143,33 +173,182 @@ mod tests { .iter() .map(|address| AccountIndexWriterEntry { address, - block_offset: rng.gen_range(128..2048), - intra_block_offset: 0, + offset: HotAccountOffset::new( + rng.gen_range(0..u32::MAX) as usize * HOT_ACCOUNT_ALIGNMENT, + ) + .unwrap(), }) .collect(); { let file = TieredStorageFile::new_writable(&path).unwrap(); - let indexer = IndexBlockFormat::AddressAndOffset; - indexer.write_index_block(&file, &index_entries).unwrap(); + let indexer = IndexBlockFormat::AddressesThenOffsets; + let cursor = indexer.write_index_block(&file, &index_entries).unwrap(); + footer.owners_block_offset = cursor as u64; } - let indexer = IndexBlockFormat::AddressAndOffset; + let indexer = IndexBlockFormat::AddressesThenOffsets; let file = OpenOptions::new() .read(true) .create(false) .open(&path) .unwrap(); - let map = unsafe { MmapOptions::new().map(&file).unwrap() }; + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; for (i, index_entry) in index_entries.iter().enumerate() { let account_offset = indexer - .get_account_offset(&map, &footer, 
IndexOffset(i)) + .get_account_offset::(&mmap, &footer, IndexOffset(i as u32)) .unwrap(); - assert_eq!(index_entry.block_offset, account_offset.block as u64); + assert_eq!(index_entry.offset, account_offset); let address = indexer - .get_account_address(&map, &footer, IndexOffset(i)) + .get_account_address(&mmap, &footer, IndexOffset(i as u32)) .unwrap(); assert_eq!(index_entry.address, address); } } + + #[test] + #[should_panic(expected = "index_offset.0 < footer.account_entry_count")] + fn test_get_account_address_out_of_bounds() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_get_account_address_out_of_bounds"); + + let footer = TieredStorageFooter { + account_entry_count: 100, + index_block_format: IndexBlockFormat::AddressesThenOffsets, + ..TieredStorageFooter::default() + }; + + { + // we only write a footer here as the test should hit an assert + // failure before it actually reads the file. + let file = TieredStorageFile::new_writable(&path).unwrap(); + footer.write_footer_block(&file).unwrap(); + } + + let file = OpenOptions::new() + .read(true) + .create(false) + .open(&path) + .unwrap(); + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; + footer + .index_block_format + .get_account_address(&mmap, &footer, IndexOffset(footer.account_entry_count)) + .unwrap(); + } + + #[test] + #[should_panic(expected = "would exceed index block boundary")] + fn test_get_account_address_exceeds_index_block_boundary() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_get_account_address_exceeds_index_block_boundary"); + + let footer = TieredStorageFooter { + account_entry_count: 100, + index_block_format: IndexBlockFormat::AddressesThenOffsets, + index_block_offset: 1024, + // only holds one index entry + owners_block_offset: 1024 + std::mem::size_of::() as u64, + ..TieredStorageFooter::default() + }; + + { + // we only write a footer here as the test should hit an assert + // 
failure before it actually reads the file. + let file = TieredStorageFile::new_writable(&path).unwrap(); + footer.write_footer_block(&file).unwrap(); + } + + let file = OpenOptions::new() + .read(true) + .create(false) + .open(&path) + .unwrap(); + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; + // IndexOffset does not exceed the account_entry_count but exceeds + // the index block boundary. + footer + .index_block_format + .get_account_address(&mmap, &footer, IndexOffset(2)) + .unwrap(); + } + + #[test] + #[should_panic(expected = "index_offset.0 < footer.account_entry_count")] + fn test_get_account_offset_out_of_bounds() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_get_account_offset_out_of_bounds"); + + let footer = TieredStorageFooter { + account_entry_count: 100, + index_block_format: IndexBlockFormat::AddressesThenOffsets, + ..TieredStorageFooter::default() + }; + + { + // we only write a footer here as the test should hit an assert + // failure before we actually read the file. 
+ let file = TieredStorageFile::new_writable(&path).unwrap(); + footer.write_footer_block(&file).unwrap(); + } + + let file = OpenOptions::new() + .read(true) + .create(false) + .open(&path) + .unwrap(); + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; + footer + .index_block_format + .get_account_offset::( + &mmap, + &footer, + IndexOffset(footer.account_entry_count), + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "would exceed index block boundary")] + fn test_get_account_offset_exceeds_index_block_boundary() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir + .path() + .join("test_get_account_offset_exceeds_index_block_boundary"); + + let footer = TieredStorageFooter { + account_entry_count: 100, + index_block_format: IndexBlockFormat::AddressesThenOffsets, + index_block_offset: 1024, + // only holds one index entry + owners_block_offset: 1024 + std::mem::size_of::() as u64, + ..TieredStorageFooter::default() + }; + + { + // we only write a footer here as the test should hit an assert + // failure before we actually read the file. + let file = TieredStorageFile::new_writable(&path).unwrap(); + footer.write_footer_block(&file).unwrap(); + } + + let file = OpenOptions::new() + .read(true) + .create(false) + .open(&path) + .unwrap(); + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; + // IndexOffset does not exceed the account_entry_count but exceeds + // the index block boundary. + footer + .index_block_format + .get_account_offset::(&mmap, &footer, IndexOffset(2)) + .unwrap(); + } } diff --git a/accounts-db/src/tiered_storage/meta.rs b/accounts-db/src/tiered_storage/meta.rs index 668c6ab93d8310..1cba3188b4c401 100644 --- a/accounts-db/src/tiered_storage/meta.rs +++ b/accounts-db/src/tiered_storage/meta.rs @@ -1,19 +1,21 @@ #![allow(dead_code)] //! The account meta and related structs for the tiered storage. 
use { - crate::accounts_hash::AccountHash, modular_bitfield::prelude::*, + crate::tiered_storage::owners::OwnerOffset, + bytemuck::{Pod, Zeroable}, + modular_bitfield::prelude::*, solana_sdk::stake_history::Epoch, }; /// The struct that handles the account meta flags. #[bitfield(bits = 32)] #[repr(C)] -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Pod, Zeroable)] pub struct AccountMetaFlags { /// whether the account meta has rent epoch pub has_rent_epoch: bool, - /// whether the account meta has account hash - pub has_account_hash: bool, + /// whether the account is executable + pub executable: bool, /// the reserved bits. reserved: B30, } @@ -31,8 +33,8 @@ pub trait TieredAccountMeta: Sized { /// for the account data associated with the current meta. fn with_account_data_padding(self, padding: u8) -> Self; - /// A builder function that initializes the owner's index. - fn with_owner_index(self, index: u32) -> Self; + /// A builder function that initializes the owner offset. + fn with_owner_offset(self, owner_offset: OwnerOffset) -> Self; /// A builder function that initializes the account data size. /// The size here represents the logical data size without compression. @@ -48,8 +50,8 @@ pub trait TieredAccountMeta: Sized { /// Returns the number of padding bytes for the associated account data fn account_data_padding(&self) -> u8; - /// Returns the index to the accounts' owner in the current AccountsFile. - fn owner_index(&self) -> u32; + /// Returns the offset to the accounts' owner in the current AccountsFile. + fn owner_offset(&self) -> OwnerOffset; /// Returns the AccountMetaFlags of the current meta. fn flags(&self) -> &AccountMetaFlags; @@ -63,10 +65,6 @@ pub trait TieredAccountMeta: Sized { /// does not persist this optional field. fn rent_epoch(&self, _account_block: &[u8]) -> Option; - /// Returns the account hash by parsing the specified account block. 
None - /// will be returned if this account does not persist this optional field. - fn account_hash<'a>(&self, _account_block: &'a [u8]) -> Option<&'a AccountHash>; - /// Returns the offset of the optional fields based on the specified account /// block. fn optional_fields_offset(&self, _account_block: &[u8]) -> usize; @@ -84,7 +82,7 @@ impl AccountMetaFlags { pub fn new_from(optional_fields: &AccountMetaOptionalFields) -> Self { let mut flags = AccountMetaFlags::default(); flags.set_has_rent_epoch(optional_fields.rent_epoch.is_some()); - flags.set_has_account_hash(optional_fields.account_hash.is_some()); + flags.set_executable(false); flags } } @@ -97,17 +95,12 @@ impl AccountMetaFlags { pub struct AccountMetaOptionalFields { /// the epoch at which its associated account will next owe rent pub rent_epoch: Option, - /// the hash of its associated account - pub account_hash: Option, } impl AccountMetaOptionalFields { /// The size of the optional fields in bytes (excluding the boolean flags). pub fn size(&self) -> usize { self.rent_epoch.map_or(0, |_| std::mem::size_of::()) - + self - .account_hash - .map_or(0, |_| std::mem::size_of::()) } /// Given the specified AccountMetaFlags, returns the size of its @@ -117,9 +110,6 @@ impl AccountMetaOptionalFields { if flags.has_rent_epoch() { fields_size += std::mem::size_of::(); } - if flags.has_account_hash() { - fields_size += std::mem::size_of::(); - } fields_size } @@ -129,29 +119,17 @@ impl AccountMetaOptionalFields { pub fn rent_epoch_offset(_flags: &AccountMetaFlags) -> usize { 0 } - - /// Given the specified AccountMetaFlags, returns the relative offset - /// of its account_hash field to the offset of its optional fields entry. 
- pub fn account_hash_offset(flags: &AccountMetaFlags) -> usize { - let mut offset = Self::rent_epoch_offset(flags); - // rent_epoch is the previous field to account hash - if flags.has_rent_epoch() { - offset += std::mem::size_of::(); - } - offset - } } #[cfg(test)] pub mod tests { - use {super::*, solana_sdk::hash::Hash}; + use super::*; #[test] fn test_account_meta_flags_new() { let flags = AccountMetaFlags::new(); assert!(!flags.has_rent_epoch()); - assert!(!flags.has_account_hash()); assert_eq!(flags.reserved(), 0u32); assert_eq!( @@ -171,13 +149,12 @@ pub mod tests { flags.set_has_rent_epoch(true); assert!(flags.has_rent_epoch()); - assert!(!flags.has_account_hash()); + assert!(!flags.executable()); verify_flags_serialization(&flags); - flags.set_has_account_hash(true); - + flags.set_executable(true); assert!(flags.has_rent_epoch()); - assert!(flags.has_account_hash()); + assert!(flags.executable()); verify_flags_serialization(&flags); // make sure the reserved bits are untouched. 
@@ -187,7 +164,6 @@ pub mod tests { fn update_and_verify_flags(opt_fields: &AccountMetaOptionalFields) { let flags: AccountMetaFlags = AccountMetaFlags::new_from(opt_fields); assert_eq!(flags.has_rent_epoch(), opt_fields.rent_epoch.is_some()); - assert_eq!(flags.has_account_hash(), opt_fields.account_hash.is_some()); assert_eq!(flags.reserved(), 0u32); } @@ -196,12 +172,7 @@ pub mod tests { let test_epoch = 5432312; for rent_epoch in [None, Some(test_epoch)] { - for account_hash in [None, Some(AccountHash(Hash::new_unique()))] { - update_and_verify_flags(&AccountMetaOptionalFields { - rent_epoch, - account_hash, - }); - } + update_and_verify_flags(&AccountMetaOptionalFields { rent_epoch }); } } @@ -210,23 +181,17 @@ pub mod tests { let test_epoch = 5432312; for rent_epoch in [None, Some(test_epoch)] { - for account_hash in [None, Some(AccountHash(Hash::new_unique()))] { - let opt_fields = AccountMetaOptionalFields { - rent_epoch, - account_hash, - }; - assert_eq!( - opt_fields.size(), - rent_epoch.map_or(0, |_| std::mem::size_of::()) - + account_hash.map_or(0, |_| std::mem::size_of::()) - ); - assert_eq!( - opt_fields.size(), - AccountMetaOptionalFields::size_from_flags(&AccountMetaFlags::new_from( - &opt_fields - )) - ); - } + let opt_fields = AccountMetaOptionalFields { rent_epoch }; + assert_eq!( + opt_fields.size(), + rent_epoch.map_or(0, |_| std::mem::size_of::()), + ); + assert_eq!( + opt_fields.size(), + AccountMetaOptionalFields::size_from_flags(&AccountMetaFlags::new_from( + &opt_fields + )) + ); } } @@ -235,33 +200,22 @@ pub mod tests { let test_epoch = 5432312; for rent_epoch in [None, Some(test_epoch)] { - for account_hash in [None, Some(AccountHash(Hash::new_unique()))] { - let rent_epoch_offset = 0; - let account_hash_offset = - rent_epoch_offset + rent_epoch.as_ref().map(std::mem::size_of_val).unwrap_or(0); - let derived_size = account_hash_offset - + account_hash - .as_ref() - .map(std::mem::size_of_val) - .unwrap_or(0); - let opt_fields = 
AccountMetaOptionalFields { - rent_epoch, - account_hash, - }; - let flags = AccountMetaFlags::new_from(&opt_fields); - assert_eq!( - AccountMetaOptionalFields::rent_epoch_offset(&flags), - rent_epoch_offset - ); - assert_eq!( - AccountMetaOptionalFields::account_hash_offset(&flags), - account_hash_offset - ); - assert_eq!( - AccountMetaOptionalFields::size_from_flags(&flags), - derived_size - ); - } + let rent_epoch_offset = 0; + let derived_size = if rent_epoch.is_some() { + std::mem::size_of::() + } else { + 0 + }; + let opt_fields = AccountMetaOptionalFields { rent_epoch }; + let flags = AccountMetaFlags::new_from(&opt_fields); + assert_eq!( + AccountMetaOptionalFields::rent_epoch_offset(&flags), + rent_epoch_offset + ); + assert_eq!( + AccountMetaOptionalFields::size_from_flags(&flags), + derived_size + ); } } } diff --git a/accounts-db/src/tiered_storage/mmap_utils.rs b/accounts-db/src/tiered_storage/mmap_utils.rs index a1e70a1e617949..610384efd271c4 100644 --- a/accounts-db/src/tiered_storage/mmap_utils.rs +++ b/accounts-db/src/tiered_storage/mmap_utils.rs @@ -2,12 +2,33 @@ use { crate::{accounts_file::ALIGN_BOUNDARY_OFFSET, u64_align}, log::*, memmap2::Mmap, + std::io::Result as IoResult, }; -pub fn get_type(map: &Mmap, offset: usize) -> std::io::Result<(&T, usize)> { - let (data, next) = get_slice(map, offset, std::mem::size_of::())?; +/// Borrows a value of type `T` from `mmap` +/// +/// Type T must be plain ol' data to ensure no undefined behavior. +pub fn get_pod(mmap: &Mmap, offset: usize) -> IoResult<(&T, usize)> { + // SAFETY: Since T is AnyBitPattern, it is safe to cast bytes to T. + unsafe { get_type::(mmap, offset) } +} + +/// Borrows a value of type `T` from `mmap` +/// +/// Prefer `get_pod()` when possible, because `get_type()` may cause undefined behavior. +/// +/// # Safety +/// +/// Caller must ensure casting bytes to T is safe. 
+/// Refer to the Safety sections in std::slice::from_raw_parts() +/// and bytemuck's Pod and AnyBitPattern for more information. +pub unsafe fn get_type(mmap: &Mmap, offset: usize) -> IoResult<(&T, usize)> { + let (data, next) = get_slice(mmap, offset, std::mem::size_of::())?; let ptr = data.as_ptr() as *const T; debug_assert!(ptr as usize % std::mem::align_of::() == 0); + // SAFETY: The caller ensures it is safe to cast bytes to T, + // we ensure the size is safe by querying T directly, + // and we just checked above to ensure the ptr is aligned for T. Ok((unsafe { &*ptr }, next)) } @@ -15,23 +36,25 @@ pub fn get_type(map: &Mmap, offset: usize) -> std::io::Result<(&T, usize)> { /// doesn't overrun the internal buffer. Otherwise return an Error. /// Also return the offset of the first byte after the requested data that /// falls on a 64-byte boundary. -pub fn get_slice(map: &Mmap, offset: usize, size: usize) -> std::io::Result<(&[u8], usize)> { +pub fn get_slice(mmap: &Mmap, offset: usize, size: usize) -> IoResult<(&[u8], usize)> { let (next, overflow) = offset.overflowing_add(size); - if overflow || next > map.len() { + if overflow || next > mmap.len() { error!( "Requested offset {} and size {} while mmap only has length {}", offset, size, - map.len() + mmap.len() ); return Err(std::io::Error::new( std::io::ErrorKind::AddrNotAvailable, "Requested offset and data length exceeds the mmap slice", )); } - let data = &map[offset..next]; + let data = &mmap[offset..next]; let next = u64_align!(next); let ptr = data.as_ptr(); + // SAFETY: The Mmap ensures the bytes are safe to read, and we just checked + // to ensure we don't read past the end of the internal buffer.
Ok((unsafe { std::slice::from_raw_parts(ptr, size) }, next)) } diff --git a/accounts-db/src/tiered_storage/owners.rs b/accounts-db/src/tiered_storage/owners.rs new file mode 100644 index 00000000000000..ebe60cc6f8ed0f --- /dev/null +++ b/accounts-db/src/tiered_storage/owners.rs @@ -0,0 +1,198 @@ +use { + crate::tiered_storage::{ + file::TieredStorageFile, footer::TieredStorageFooter, mmap_utils::get_pod, + TieredStorageResult, + }, + indexmap::set::IndexSet, + memmap2::Mmap, + solana_sdk::pubkey::Pubkey, +}; + +/// The offset to an owner entry in the owners block. +/// This is used to obtain the address of the account owner. +/// +/// Note that as its internal type is u32, it means the maximum number of +/// unique owners in one TieredStorageFile is 2^32. +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd)] +pub struct OwnerOffset(pub u32); + +lazy_static! { + pub static ref OWNER_NO_OWNER: Pubkey = Pubkey::default(); +} + +/// Owner block holds a set of unique addresses of account owners, +/// and an account meta has an owner_offset field for accessing +/// its owner address. +#[repr(u16)] +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + Hash, + PartialEq, + num_enum::IntoPrimitive, + num_enum::TryFromPrimitive, +)] +pub enum OwnersBlockFormat { + /// This format persists the OwnersBlock as consecutive bytes of pubkeys + /// without any meta-data. For each account meta, it has an owner_offset + /// field to access its owner's address in the OwnersBlock. + #[default] + AddressesOnly = 0, +} + +impl OwnersBlockFormat { + /// Persists the provided owners' addresses into the specified file.
+ pub fn write_owners_block( + &self, + file: &TieredStorageFile, + owners_table: &OwnersTable, + ) -> TieredStorageResult { + match self { + Self::AddressesOnly => { + let mut bytes_written = 0; + for address in &owners_table.owners_set { + bytes_written += file.write_pod(*address)?; + } + + Ok(bytes_written) + } + } + } + + /// Returns the owner address associated with the specified owner_offset + /// and footer inside the input mmap. + pub fn get_owner_address<'a>( + &self, + mmap: &'a Mmap, + footer: &TieredStorageFooter, + owner_offset: OwnerOffset, + ) -> TieredStorageResult<&'a Pubkey> { + match self { + Self::AddressesOnly => { + let offset = footer.owners_block_offset as usize + + (std::mem::size_of::() * owner_offset.0 as usize); + let (pubkey, _) = get_pod::(mmap, offset)?; + + Ok(pubkey) + } + } + } +} + +/// The in-memory representation of owners block for write. +/// It manages a set of unique addresses of account owners. +#[derive(Debug, Default)] +pub struct OwnersTable<'a> { + owners_set: IndexSet<&'a Pubkey>, +} + +/// OwnersBlock is persisted as consecutive bytes of pubkeys without any +/// meta-data. For each account meta, it has an owner_offset field to +/// access its owner's address in the OwnersBlock. +impl<'a> OwnersTable<'a> { + /// Add the specified pubkey as the owner into the OwnersTable + /// if the specified pubkey does not exist in the OwnersTable + /// yet. In any case, the function returns its OwnerOffset. + pub fn insert(&mut self, pubkey: &'a Pubkey) -> OwnerOffset { + let (offset, _existed) = self.owners_set.insert_full(pubkey); + + OwnerOffset(offset as u32) + } + + /// Returns the number of unique owner addresses in the table.
+ pub fn len(&self) -> usize { + self.owners_set.len() + } + + /// Returns true if the OwnersTable is empty + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +#[cfg(test)] +mod tests { + use { + super::*, crate::tiered_storage::file::TieredStorageFile, memmap2::MmapOptions, + std::fs::OpenOptions, tempfile::TempDir, + }; + + #[test] + fn test_owners_block() { + // Generate a new temp path that is guaranteed to NOT already have a file. + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("test_owners_block"); + const NUM_OWNERS: u32 = 10; + + let addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_OWNERS as usize) + .collect(); + + let footer = TieredStorageFooter { + // Set owners_block_offset to 0 as we didn't write any account + // meta/data nor index block. + owners_block_offset: 0, + ..TieredStorageFooter::default() + }; + + { + let file = TieredStorageFile::new_writable(&path).unwrap(); + + let mut owners_table = OwnersTable::default(); + addresses.iter().for_each(|owner_address| { + owners_table.insert(owner_address); + }); + footer + .owners_block_format + .write_owners_block(&file, &owners_table) + .unwrap(); + + // while the test only focuses on account metas, writing a footer + // here is necessary to make it a valid tiered-storage file. 
+ footer.write_footer_block(&file).unwrap(); + } + + let file = OpenOptions::new().read(true).open(path).unwrap(); + let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; + + for (i, address) in addresses.iter().enumerate() { + assert_eq!( + footer + .owners_block_format + .get_owner_address(&mmap, &footer, OwnerOffset(i as u32)) + .unwrap(), + address + ); + } + } + + #[test] + fn test_owners_table() { + let mut owners_table = OwnersTable::default(); + const NUM_OWNERS: usize = 99; + + let addresses: Vec<_> = std::iter::repeat_with(Pubkey::new_unique) + .take(NUM_OWNERS) + .collect(); + + // as we insert sequentially, we expect each entry has same OwnerOffset + // as its index inside the Vector. + for (i, address) in addresses.iter().enumerate() { + assert_eq!(owners_table.insert(address), OwnerOffset(i as u32)); + } + + let cloned_addresses = addresses.clone(); + + // insert again and expect the same OwnerOffset + for (i, address) in cloned_addresses.iter().enumerate() { + assert_eq!(owners_table.insert(address), OwnerOffset(i as u32)); + } + + // make sure the size of the resulting owner table is the same + // as the input + assert_eq!(owners_table.owners_set.len(), addresses.len()); + } +} diff --git a/accounts-db/src/tiered_storage/readable.rs b/accounts-db/src/tiered_storage/readable.rs index 629f08fa1d3fe6..74c2ced0f6cd7c 100644 --- a/accounts-db/src/tiered_storage/readable.rs +++ b/accounts-db/src/tiered_storage/readable.rs @@ -1,14 +1,20 @@ use { crate::{ - accounts_hash::AccountHash, + account_storage::meta::StoredAccountMeta, + accounts_file::MatchAccountOwnerError, + rent_collector::RENT_EXEMPT_RENT_EPOCH, tiered_storage::{ footer::{AccountMetaFormat, TieredStorageFooter}, hot::HotStorageReader, + index::IndexOffset, meta::TieredAccountMeta, TieredStorageResult, }, }, - solana_sdk::{account::ReadableAccount, pubkey::Pubkey, stake_history::Epoch}, + solana_sdk::{ + account::ReadableAccount, pubkey::Pubkey, + stake_history::Epoch, + }, 
std::path::Path, }; @@ -22,7 +28,7 @@ pub struct TieredReadableAccount<'accounts_file, M: TieredAccountMeta> { /// The address of the account owner pub owner: &'accounts_file Pubkey, /// The index for accessing the account inside its belonging AccountsFile - pub index: usize, + pub index: IndexOffset, /// The account block that contains this account. Note that this account /// block may be shared with other accounts. pub account_block: &'accounts_file [u8], @@ -34,13 +40,8 @@ impl<'accounts_file, M: TieredAccountMeta> TieredReadableAccount<'accounts_file, self.address } - /// Returns the hash of this account. - pub fn hash(&self) -> Option<&'accounts_file AccountHash> { - self.meta.account_hash(self.account_block) - } - /// Returns the index to this account in its AccountsFile. - pub fn index(&self) -> usize { + pub fn index(&self) -> IndexOffset { self.index } @@ -64,20 +65,28 @@ impl<'accounts_file, M: TieredAccountMeta> ReadableAccount } /// Returns true if the data associated to this account is executable. - /// - /// Temporarily unimplemented!() as program runtime v2 will use - /// a different API for executable. fn executable(&self) -> bool { - unimplemented!(); + self.meta.flags().executable() } /// Returns the epoch that this account will next owe rent by parsing - /// the specified account block. Epoch::MAX will be returned if the account - /// is rent-exempt. + /// the specified account block. RENT_EXEMPT_RENT_EPOCH will be returned + /// if the account is rent-exempt. + /// + /// For a zero-lamport account, Epoch::default() will be returned to + /// match the default state of an AccountSharedData. fn rent_epoch(&self) -> Epoch { self.meta .rent_epoch(self.account_block) - .unwrap_or(Epoch::MAX) + .unwrap_or(if self.lamports() != 0 { + RENT_EXEMPT_RENT_EPOCH + } else { + // While there are no valid values for any field of a zero + // lamport account, here we return Epoch::default() to + // match the default state of AccountSharedData.
Otherwise, + // a hash mismatch will occur. + Epoch::default() + }) } /// Returns the data associated to this account. @@ -114,4 +123,49 @@ impl TieredStorageReader { Self::Hot(hot) => hot.num_accounts(), } } + + /// Returns the account located at the specified index offset. + pub fn get_account( + &self, + index_offset: IndexOffset, + ) -> TieredStorageResult, IndexOffset)>> { + match self { + Self::Hot(hot) => hot.get_account(index_offset), + } + } + + /// Returns Ok(index_of_matching_owner) if the owner of the account at + /// `index_offset` is one of the pubkeys in `owners`. + /// + /// Returns Err(MatchAccountOwnerError::NoMatch) if the account has 0 + /// lamports or the owner is not one of the pubkeys in `owners`. + /// + /// Returns Err(MatchAccountOwnerError::UnableToLoad) if there is any internal + /// error that prevents the data from being loaded, including when `index_offset` + /// causes a data overrun. + pub fn account_matches_owners( + &self, + index_offset: IndexOffset, + owners: &[Pubkey], + ) -> Result { + match self { + Self::Hot(hot) => { + let account_offset = hot + .get_account_offset(index_offset) + .map_err(|_| MatchAccountOwnerError::UnableToLoad)?; + hot.account_matches_owners(account_offset, owners) + } + } + } + + /// Return a vector of account metadata for each account, starting from + /// `index_offset` + pub fn accounts( + &self, + index_offset: IndexOffset, + ) -> TieredStorageResult> { + match self { + Self::Hot(hot) => hot.accounts(index_offset), + } + } } diff --git a/accounts-db/src/tiered_storage/test_utils.rs b/accounts-db/src/tiered_storage/test_utils.rs new file mode 100644 index 00000000000000..eeeed647eb20a2 --- /dev/null +++ b/accounts-db/src/tiered_storage/test_utils.rs @@ -0,0 +1,63 @@ +#![cfg(test)] +//!
Helper functions for TieredStorage tests +use { + crate::{ + account_storage::meta::{StoredAccountMeta, StoredMeta}, + accounts_hash::AccountHash, + rent_collector::RENT_EXEMPT_RENT_EPOCH, + tiered_storage::owners::OWNER_NO_OWNER, + }, + solana_sdk::{ + account::{Account, AccountSharedData, ReadableAccount}, + hash::Hash, + pubkey::Pubkey, + }, +}; + +/// Create a test account based on the specified seed. +/// The created test account might have default rent_epoch +/// and write_version. +/// +/// When the seed is zero, then a zero-lamport test account will be +/// created. +pub(super) fn create_test_account(seed: u64) -> (StoredMeta, AccountSharedData) { + let data_byte = seed as u8; + let owner_byte = u8::MAX - data_byte; + let account = Account { + lamports: seed, + data: std::iter::repeat(data_byte).take(seed as usize).collect(), + // this will allow some test account sharing the same owner. + owner: [owner_byte; 32].into(), + executable: seed % 2 > 0, + rent_epoch: if seed % 3 > 0 { + seed + } else { + RENT_EXEMPT_RENT_EPOCH + }, + }; + + let stored_meta = StoredMeta { + write_version_obsolete: u64::MAX, + pubkey: Pubkey::new_unique(), + data_len: seed, + }; + (stored_meta, AccountSharedData::from(account)) +} + +pub(super) fn verify_test_account( + stored_meta: &StoredAccountMeta<'_>, + account: Option<&impl ReadableAccount>, + address: &Pubkey, +) { + let (lamports, owner, data, executable) = account + .map(|acc| (acc.lamports(), acc.owner(), acc.data(), acc.executable())) + .unwrap_or((0, &OWNER_NO_OWNER, &[], false)); + + assert_eq!(stored_meta.lamports(), lamports); + assert_eq!(stored_meta.data().len(), data.len()); + assert_eq!(stored_meta.data(), data); + assert_eq!(stored_meta.executable(), executable); + assert_eq!(stored_meta.owner(), owner); + assert_eq!(stored_meta.pubkey(), address); + assert_eq!(*stored_meta.hash(), AccountHash(Hash::default())); +} diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 
69de98de6b55d3..e7b343172ed886 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -4649,6 +4649,7 @@ dependencies = [ "fs-err", "im", "index_list", + "indexmap 2.1.0", "itertools", "lazy_static", "log",