Skip to content

Commit

Permalink
Implement strong consistency toggle in mountpoint-s3::fs::CacheConfig
Browse files Browse the repository at this point in the history
This plumbs in checks for whether the filesystem should maintain strong consistency for operations such as `open`.
There is no way to configure mountpoint-s3 itself to relax the consistency model - this change only impacts internals.

Signed-off-by: Daniel Carl Jones <[email protected]>
  • Loading branch information
dannycjones committed Oct 9, 2023
1 parent 16efe55 commit 951d541
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 9 deletions.
21 changes: 17 additions & 4 deletions mountpoint-s3/src/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ impl<Client: ObjectClient> UploadState<Client> {

#[derive(Debug, Clone)]
pub struct CacheConfig {
/// Should strong consistency be maintained in the file system?
///
/// Some operations such as `lookup` are allowed to be cached w/ short TTLs despite
/// strong consistency being enabled since Linux filesystems behave badly when the TTL is zero.
/// For example, results from `readdir` will expire immediately, and so the kernel will
/// immediately re-lookup every entry returned from `readdir`.
pub strong_consistency: bool,
/// How long the kernel will cache metadata for files
pub file_ttl: Duration,
/// How long the kernel will cache metadata for directories
Expand All @@ -202,8 +209,9 @@ pub struct CacheConfig {

impl Default for CacheConfig {
fn default() -> Self {
// We want to do as little caching as possible, but Linux filesystems behave badly when the
// TTL is exactly zero. For example, results from `readdir` will expire immediately, and so
// We want to do as little caching as possible by default,
// but Linux filesystems behave badly when the TTL is exactly zero.
// For example, results from `readdir` will expire immediately, and so
// the kernel will immediately re-lookup every entry returned from `readdir`. So we apply
// small non-zero TTLs. The goal is to be small enough that the impact on consistency is
// minimal, but large enough that a single cache miss doesn't cause a cascading effect where
Expand All @@ -213,7 +221,11 @@ impl Default for CacheConfig {
let file_ttl = Duration::from_millis(100);
let dir_ttl = Duration::from_millis(1000);

Self { file_ttl, dir_ttl }
Self {
strong_consistency: true,
file_ttl,
dir_ttl,
}
}
}

Expand Down Expand Up @@ -461,7 +473,8 @@ where
pub async fn open(&self, ino: InodeNo, flags: i32) -> Result<Opened, Error> {
trace!("fs:open with ino {:?} flags {:?}", ino, flags);

let lookup = self.superblock.getattr(&self.client, ino, true).await?;
let force_revalidate = self.config.cache_config.strong_consistency;
let lookup = self.superblock.getattr(&self.client, ino, force_revalidate).await?;

match lookup.inode.kind() {
InodeKind::Directory => return Err(InodeError::IsDirectory(lookup.inode.err()).into()),
Expand Down
54 changes: 50 additions & 4 deletions mountpoint-s3/src/inode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ impl Superblock {
}
}
writing_children.remove(&ino);

if let Ok(state) = inode.get_inode_state() {
metrics::counter!(
"metadata_cache.inode_forgotten_before_expiry",
state.stat.is_valid().into(),
);
};
}
}
}
Expand Down Expand Up @@ -518,14 +525,50 @@ impl SuperblockInner {
return Err(InodeError::InvalidFileName(name.into()));
}

// TODO use caches. if we already know about this name, we just need to revalidate the stat
// cache and then read it.
let remote = self.remote_lookup(client, parent_ino, name).await?;
let lookup = self.update_from_remote(parent_ino, name, remote)?;
let lookup = if self.cache_config.strong_consistency {
None
} else {
let lookup = self.cache_lookup(parent_ino, name);
trace!("lookup returned from cache: {:?}", lookup);
metrics::counter!("metadata_cache.cache_hit", lookup.is_some().into());
lookup
};

let lookup = match lookup {
Some(lookup) => lookup,
None => {
let remote = self.remote_lookup(client, parent_ino, name).await?;
self.update_from_remote(parent_ino, name, remote)?
}
};

lookup.inode.verify_child(parent_ino, name.as_ref())?;
Ok(lookup)
}

/// Attempt to serve a lookup for `name` under `parent_ino` from the parent's
/// cached directory entries.
///
/// Returns `None` unless the parent is known, the child entry exists, and the
/// child's cached stat has not yet expired.
fn cache_lookup(&self, parent_ino: InodeNo, name: &str) -> Option<LookedUp> {
    let parent = self.get(parent_ino).ok()?;
    let parent_state = parent.get_inode_state().ok()?;

    // A lookup is only meaningful against a directory's children.
    let children = match &parent_state.kind_data {
        InodeKindData::Directory { children, .. } => children,
        InodeKindData::File { .. } => unreachable!("parent should be a directory!"),
    };

    let child = children.get(name)?;
    let child_state = child.get_inode_state().ok()?;

    // Expired entries must be revalidated against the remote, so don't serve them.
    if !child_state.stat.is_valid() {
        return None;
    }

    Some(LookedUp {
        inode: child.clone(),
        stat: child_state.stat.clone(),
    })
}

/// Lookup an inode in the parent directory with the given name
/// on the remote client.
async fn remote_lookup<OC: ObjectClient>(
Expand Down Expand Up @@ -1121,6 +1164,7 @@ impl Inode {
Self { inner: inner.into() }
}

/// Verify [Inode] has the expected inode number and the inode content is valid for its checksum.
fn verify_inode(&self, expected_ino: InodeNo) -> Result<(), InodeError> {
let computed = Self::compute_checksum(self.ino(), self.full_key());
if computed == self.inner.checksum && self.ino() == expected_ino {
Expand All @@ -1130,6 +1174,8 @@ impl Inode {
}
}

/// Verify [Inode] has the expected inode number, expected parent inode number,
/// and the inode's content is valid for its checksum.
fn verify_child(&self, expected_parent: InodeNo, expected_name: &str) -> Result<(), InodeError> {
let computed = Self::compute_checksum(self.ino(), self.full_key());
if computed == self.inner.checksum && self.parent() == expected_parent && self.name() == expected_name {
Expand Down
1 change: 1 addition & 0 deletions mountpoint-s3/tests/fuse_tests/lookup_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ where
{
let filesystem_config = S3FilesystemConfig {
cache_config: CacheConfig {
strong_consistency: true,
file_ttl: Duration::ZERO,
dir_ttl: Duration::ZERO,
},
Expand Down
9 changes: 8 additions & 1 deletion mountpoint-s3/tests/reftests/harness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ use std::collections::{BTreeMap, HashSet};
use std::fmt::Debug;
use std::path::{Component, Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;

use fuser::FileType;
use futures::executor::ThreadPool;
use futures::future::{BoxFuture, FutureExt};
use mountpoint_s3::fs::{self, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE};
use mountpoint_s3::fs::{self, CacheConfig, InodeNo, ReadReplier, ToErrno, FUSE_ROOT_INODE};
use mountpoint_s3::prefix::Prefix;
use mountpoint_s3::{S3Filesystem, S3FilesystemConfig};
use mountpoint_s3_client::mock_client::{MockClient, MockObject};
Expand Down Expand Up @@ -885,6 +886,12 @@ mod mutations {
let config = S3FilesystemConfig {
readdir_size: 5,
allow_delete: true,
cache_config: CacheConfig {
// We are only interested in strong consistency for the reference tests. FUSE isn't even in the loop.
strong_consistency: true,
dir_ttl: Duration::ZERO,
file_ttl: Duration::ZERO,
},
..Default::default()
};
let (client, fs) = make_test_filesystem(BUCKET_NAME, &test_prefix, config);
Expand Down

0 comments on commit 951d541

Please sign in to comment.