diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs
index 2bdfe4a25..7a9f4009a 100644
--- a/mountpoint-s3/src/fs.rs
+++ b/mountpoint-s3/src/fs.rs
@@ -194,6 +194,13 @@ impl UploadState {
 
 #[derive(Debug, Clone)]
 pub struct CacheConfig {
+    /// Should strong consistency be maintained in the file system?
+    ///
+    /// Some operations such as `lookup` are allowed to be cached with short TTLs despite
+    /// strong consistency being enabled, since Linux filesystems behave badly when the TTL is zero.
+    /// For example, results from `readdir` will expire immediately, and so the kernel will
+    /// immediately re-lookup every entry returned from `readdir`.
+    pub strong_consistency: bool,
     /// How long the kernel will cache metadata for files
     pub file_ttl: Duration,
     /// How long the kernel will cache metadata for directories
@@ -202,8 +209,9 @@ pub struct CacheConfig {
 
 impl Default for CacheConfig {
     fn default() -> Self {
-        // We want to do as little caching as possible, but Linux filesystems behave badly when the
-        // TTL is exactly zero. For example, results from `readdir` will expire immediately, and so
+        // We want to do as little caching as possible by default,
+        // but Linux filesystems behave badly when the TTL is exactly zero.
+        // For example, results from `readdir` will expire immediately, and so
         // the kernel will immediately re-lookup every entry returned from `readdir`. So we apply
         // small non-zero TTLs. The goal is to be small enough that the impact on consistency is
         // minimal, but large enough that a single cache miss doesn't cause a cascading effect where
@@ -213,7 +221,11 @@ impl Default for CacheConfig {
         let file_ttl = Duration::from_millis(100);
         let dir_ttl = Duration::from_millis(1000);
 
-        Self { file_ttl, dir_ttl }
+        Self {
+            strong_consistency: true,
+            file_ttl,
+            dir_ttl,
+        }
     }
 }
 
@@ -461,7 +473,8 @@ where
     pub async fn open(&self, ino: InodeNo, flags: i32) -> Result<Opened, Error> {
         trace!("fs:open with ino {:?} flags {:?}", ino, flags);
 
-        let lookup = self.superblock.getattr(&self.client, ino, true).await?;
+        let force_revalidate = self.config.cache_config.strong_consistency;
+        let lookup = self.superblock.getattr(&self.client, ino, force_revalidate).await?;
 
         match lookup.inode.kind() {
             InodeKind::Directory => return Err(InodeError::IsDirectory(lookup.inode.err()).into()),
diff --git a/mountpoint-s3/src/inode.rs b/mountpoint-s3/src/inode.rs
index 06d00201b..398397f8c 100644
--- a/mountpoint-s3/src/inode.rs
+++ b/mountpoint-s3/src/inode.rs
@@ -160,6 +160,13 @@ impl Superblock {
                             }
                         }
                         writing_children.remove(&ino);
+
+                        if let Ok(state) = inode.get_inode_state() {
+                            metrics::counter!(
+                                "metadata_cache.inode_forgotten_before_expiry",
+                                state.stat.is_valid().into(),
+                            );
+                        };
                     }
                 }
             }
@@ -518,14 +525,50 @@ impl SuperblockInner {
             return Err(InodeError::InvalidFileName(name.into()));
         }
 
-        // TODO use caches. if we already know about this name, we just need to revalidate the stat
-        // cache and then read it.
-        let remote = self.remote_lookup(client, parent_ino, name).await?;
-        let lookup = self.update_from_remote(parent_ino, name, remote)?;
+        let lookup = if self.cache_config.strong_consistency {
+            None
+        } else {
+            let lookup = self.cache_lookup(parent_ino, name);
+            trace!("lookup returned from cache: {:?}", lookup);
+            metrics::counter!("metadata_cache.cache_hit", lookup.is_some().into());
+            lookup
+        };
+
+        let lookup = match lookup {
+            Some(lookup) => lookup,
+            None => {
+                let remote = self.remote_lookup(client, parent_ino, name).await?;
+                self.update_from_remote(parent_ino, name, remote)?
+            }
+        };
 
+        lookup.inode.verify_child(parent_ino, name.as_ref())?;
         Ok(lookup)
     }
 
+    /// Lookup an [Inode] against known directory entries in the parent,
+    /// verifying any returned entry has not expired.
+    fn cache_lookup(&self, parent_ino: InodeNo, name: &str) -> Option<LookedUp> {
+        let parent = self.get(parent_ino).ok()?;
+
+        match &parent.get_inode_state().ok()?.kind_data {
+            InodeKindData::File { .. } => unreachable!("parent should be a directory!"),
+            InodeKindData::Directory { children, .. } => {
+                let inode = children.get(name)?;
+                let inode_stat = &inode.get_inode_state().ok()?.stat;
+                if inode_stat.is_valid() {
+                    let lookup = LookedUp {
+                        inode: inode.clone(),
+                        stat: inode_stat.clone(),
+                    };
+                    return Some(lookup);
+                }
+            }
+        };
+
+        None
+    }
+
     /// Lookup an inode in the parent directory with the given name
     /// on the remote client.
     async fn remote_lookup(
@@ -1121,6 +1164,7 @@ impl Inode {
         Self { inner: inner.into() }
     }
 
+    /// Verify [Inode] has the expected inode number and the inode content is valid for its checksum.
    fn verify_inode(&self, expected_ino: InodeNo) -> Result<(), InodeError> {
         let computed = Self::compute_checksum(self.ino(), self.full_key());
         if computed == self.inner.checksum && self.ino() == expected_ino {
@@ -1130,6 +1174,8 @@ impl Inode {
         }
     }
 
+    /// Verify [Inode] has the expected inode number, expected parent inode number,
+    /// and the inode's content is valid for its checksum.
     fn verify_child(&self, expected_parent: InodeNo, expected_name: &str) -> Result<(), InodeError> {
         let computed = Self::compute_checksum(self.ino(), self.full_key());
         if computed == self.inner.checksum && self.parent() == expected_parent && self.name() == expected_name {
diff --git a/mountpoint-s3/tests/fuse_tests/lookup_test.rs b/mountpoint-s3/tests/fuse_tests/lookup_test.rs
index 34c42c35a..3b15ece28 100644
--- a/mountpoint-s3/tests/fuse_tests/lookup_test.rs
+++ b/mountpoint-s3/tests/fuse_tests/lookup_test.rs
@@ -111,6 +111,7 @@ where
 {
     let filesystem_config = S3FilesystemConfig {
         cache_config: CacheConfig {
+            strong_consistency: true,
            file_ttl: Duration::ZERO,
             dir_ttl: Duration::ZERO,
         },
diff --git a/mountpoint-s3/tests/reftests/harness.rs b/mountpoint-s3/tests/reftests/harness.rs
index 80d063f3b..b19aba906 100644
--- a/mountpoint-s3/tests/reftests/harness.rs
+++ b/mountpoint-s3/tests/reftests/harness.rs
@@ -2,11 +2,12 @@ use std::collections::{BTreeMap, HashSet};
 use std::fmt::Debug;
 use std::path::{Component, Path, PathBuf};
 use std::sync::Arc;
+use std::time::Duration;
 
 use fuser::FileType;
 use futures::executor::ThreadPool;
 use futures::future::{BoxFuture, FutureExt};
-use mountpoint_s3::fs::{self, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE};
+use mountpoint_s3::fs::{self, CacheConfig, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE};
 use mountpoint_s3::prefix::Prefix;
 use mountpoint_s3::{S3Filesystem, S3FilesystemConfig};
 use mountpoint_s3_client::mock_client::{MockClient, MockObject};
@@ -885,6 +886,12 @@ mod mutations {
         let config = S3FilesystemConfig {
             readdir_size: 5,
             allow_delete: true,
+            cache_config: CacheConfig {
+                // We are only interested in strong consistency for the reference tests. FUSE isn't even in the loop.
+                strong_consistency: true,
+                dir_ttl: Duration::ZERO,
+                file_ttl: Duration::ZERO,
+            },
             ..Default::default()
         };
         let (client, fs) = make_test_filesystem(BUCKET_NAME, &test_prefix, config);
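
A minimal sketch (not part of the change above) of how a consumer of this change might opt out of strong consistency to exercise the new cached-lookup path. The `CacheConfig` and `S3FilesystemConfig` types and fields come from the diff; the helper function name and the TTL values are illustrative assumptions only.

use std::time::Duration;

use mountpoint_s3::fs::CacheConfig;
use mountpoint_s3::S3FilesystemConfig;

/// Hypothetical helper: build a filesystem config that disables strong consistency,
/// so `lookup` may be served from the superblock's cached directory entries
/// (the `cache_lookup` path above) until the corresponding TTL expires.
fn relaxed_consistency_config() -> S3FilesystemConfig {
    S3FilesystemConfig {
        cache_config: CacheConfig {
            strong_consistency: false,
            // Illustrative TTLs; the defaults remain 100ms for files and 1s for directories.
            file_ttl: Duration::from_secs(1),
            dir_ttl: Duration::from_secs(5),
        },
        ..Default::default()
    }
}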