From a8b374ccdedb584d42435a6ddc57aa174b25f1cc Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 15:37:13 +0300 Subject: [PATCH 1/6] feat(fs-index): add benchmarks for index update_all() Signed-off-by: Tarek --- README.md | 6 +- fs-index/Cargo.toml | 5 +- fs-index/benches/index_build_benchmark.rs | 43 ------ fs-index/benches/resource_index_benchmark.rs | 146 +++++++++++++++++++ 4 files changed, 152 insertions(+), 48 deletions(-) delete mode 100644 fs-index/benches/index_build_benchmark.rs create mode 100644 fs-index/benches/resource_index_benchmark.rs diff --git a/README.md b/README.md index 6cf0436d..8a472b50 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ cargo bench This command runs all benchmarks and generates a report in HTML format located at `target/criterion/report`. If you wish to run a specific benchmark, you can specify its name as an argument as in: ```bash -cargo bench index_build +cargo bench resource_index ``` ### Benchmarking Local Files @@ -97,10 +97,10 @@ To install `flamegraph`, run: cargo install flamegraph ``` -To generate a flame graph for `index_build_benchmark`, use the following command: +To generate a flame graph for `resource_index_benchmark`, use the following command: ```bash -cargo flamegraph --bench index_build_benchmark -o index_build_benchmark.svg -- --bench +cargo flamegraph --bench resource_index_benchmark -o resource_index_benchmark.svg -- --bench ``` > [!NOTE] diff --git a/fs-index/Cargo.toml b/fs-index/Cargo.toml index 49d0e763..0ae00b3f 100644 --- a/fs-index/Cargo.toml +++ b/fs-index/Cargo.toml @@ -26,11 +26,12 @@ data-resource = { path = "../data-resource" } uuid = { version = "1.6.1", features = ["v4"] } # benchmarking criterion = { version = "0.5", features = ["html_reports"] } +tempfile = "3.10" # Depending on `dev-hash` for testing dev-hash = { path = "../dev-hash" } fs-atomic-versions = { path = "../fs-atomic-versions" } [[bench]] -name = "index_build_benchmark" +name = "resource_index_benchmark" harness = false -path = "benches/index_build_benchmark.rs" +path = "benches/resource_index_benchmark.rs" diff --git a/fs-index/benches/index_build_benchmark.rs b/fs-index/benches/index_build_benchmark.rs deleted file mode 100644 index 1de0ef3f..00000000 --- a/fs-index/benches/index_build_benchmark.rs +++ /dev/null @@ -1,43 +0,0 @@ -use criterion::{ - black_box, criterion_group, criterion_main, BenchmarkId, Criterion, -}; -use dev_hash::Crc32; -use fs_index::index::ResourceIndex; - -const DIR_PATH: &str = "../test-assets/"; // Set the path to the directory containing the resources here - -fn index_build_benchmark(c: &mut Criterion) { - // assert the path exists and is a directory - assert!( - std::path::Path::new(DIR_PATH).is_dir(), - "The path: {} does not exist or is not a directory", - DIR_PATH - ); - - let mut group = c.benchmark_group("index_build"); - group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here - - let mut collisions_size = 0; - - group.bench_with_input( - BenchmarkId::new("index_build", DIR_PATH), - &DIR_PATH, - |b, path| { - b.iter(|| { - let index: ResourceIndex = - ResourceIndex::build(black_box(path.to_string())); - collisions_size = index.collisions.len(); - }); - }, - ); - group.finish(); - - println!("Collisions: {}", collisions_size); -} - -criterion_group! { - name = benches; - config = Criterion::default(); - targets = index_build_benchmark -} -criterion_main!(benches); diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs new file mode 100644 index 00000000..c762eebd --- /dev/null +++ b/fs-index/benches/resource_index_benchmark.rs @@ -0,0 +1,146 @@ +use std::path::PathBuf; + +use criterion::{ + black_box, criterion_group, criterion_main, BenchmarkId, Criterion, +}; +use tempfile::TempDir; + +use dev_hash::Crc32; +use fs_index::index::ResourceIndex; + +// The path to the test assets directory +const DIR_PATH: &str = "../test-assets/"; + +fn resource_index_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("resource_index"); + group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here + + let benchmarks_dir = setup_temp_dir(); + let benchmarks_dir = benchmarks_dir.path(); + let benchmarks_dir_str = benchmarks_dir.to_str().unwrap(); + + // Benchmark `ResourceIndex::build()` + + let mut collisions_size = 0; + group.bench_with_input( + BenchmarkId::new("index_build", benchmarks_dir_str), + &benchmarks_dir, + |b, path| { + b.iter(|| { + let index: ResourceIndex = + ResourceIndex::build(black_box(path)); + collisions_size = index.collisions.len(); + }); + }, + ); + println!("Collisions: {}", collisions_size); + + // TODO: Benchmark `ResourceIndex::get_resource_by_id()` + + // TODO: Benchmark `ResourceIndex::get_resource_by_path()` + + // TODO: Benchmark `ResourceIndex::track_addition()` + + // TODO: Benchmark `ResourceIndex::track_deletion()` + + // TODO: Benchmark `ResourceIndex::track_update()` + + // Benchmark `ResourceIndex::update_all()` + + // First, create a new temp directory specifically for the update_all benchmark + // since we will be creating new files, removing files, and modifying files + let update_all_benchmarks_dir = + TempDir::with_prefix("ark-fs-index-benchmarks-update-all").unwrap(); + let update_all_benchmarks_dir = update_all_benchmarks_dir.path(); + + group.bench_function("index_update_all", |b| { + b.iter(|| { + // Clear the directory + std::fs::remove_dir_all(&update_all_benchmarks_dir).unwrap(); + std::fs::create_dir(&update_all_benchmarks_dir).unwrap(); + + // Create 50 new files + for i in 0..50 { + let new_file = + update_all_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + std::fs::write(&new_file, format!("Hello, World! {}", i)) + .unwrap(); + } + let mut index: ResourceIndex = + ResourceIndex::build(black_box(&update_all_benchmarks_dir)); + + update_all_files(&update_all_benchmarks_dir.to_path_buf()); + let _update_result = index.update_all().unwrap(); + }); + }); + + group.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default(); + targets = resource_index_benchmark +} +criterion_main!(benches); + +/// A helper function to setup a temp directory for the benchmarks using the test assets directory +fn setup_temp_dir() -> TempDir { + // assert the path exists and is a directory + assert!( + std::path::Path::new(DIR_PATH).is_dir(), + "The path: {} does not exist or is not a directory", + DIR_PATH + ); + + // Create a temp directory + let temp_dir = TempDir::with_prefix("ark-fs-index-benchmarks").unwrap(); + let benchmarks_dir = temp_dir.path(); + let benchmarks_dir_str = benchmarks_dir.to_str().unwrap(); + log::info!("Temp directory for benchmarks: {}", benchmarks_dir_str); + + // Copy the test assets to the temp directory + let source = std::path::Path::new(DIR_PATH); + // Can't use fs::copy because the source is a directory + let output = std::process::Command::new("cp") + .arg("-r") + .arg(source) + .arg(benchmarks_dir_str) + .output() + .expect("Failed to copy test assets to temp directory"); + if !output.status.success() { + panic!( + "Failed to copy test assets to temp directory: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + temp_dir +} + +/// A helper function that takes a directory and creates 50 new files, removes 30 files, and modifies 10 files +/// +/// Note: The function assumes that the directory already contains 50 files named `file_0.txt` to `file_49.txt` +fn update_all_files(dir: &PathBuf) { + // Create 50 new files + for i in 51..101 { + let new_file = dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + // We add the index `i` to the file content to make sure the content is unique + // This is to avoid collisions in the index + std::fs::write(&new_file, format!("Hello, World! {}", i)).unwrap(); + } + + // Remove 30 files + for i in 0..30 { + let removed_file = dir.join(format!("file_{}.txt", i)); + std::fs::remove_file(&removed_file).unwrap(); + } + + // Modify 10 files + for i in 40..50 { + let modified_file = dir.join(format!("file_{}.txt", i)); + std::fs::write(&modified_file, "Hello, World!").unwrap(); + } +} From ebb51324a0d111b4bea79fb93ec5f0dfbbf214b7 Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 15:41:46 +0300 Subject: [PATCH 2/6] feat(fs-index): move tests to a separate module Signed-off-by: Tarek --- fs-index/src/index.rs | 446 +----------------------------------------- fs-index/src/lib.rs | 3 + fs-index/src/tests.rs | 426 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 431 insertions(+), 444 deletions(-) create mode 100644 fs-index/src/tests.rs diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs index a60287ea..9bbb863d 100644 --- a/fs-index/src/index.rs +++ b/fs-index/src/index.rs @@ -27,7 +27,7 @@ pub struct ResourceIndex { pub path2id: HashMap>, pub collisions: HashMap, - root: PathBuf, + pub root: PathBuf, } #[derive(PartialEq, Debug)] @@ -578,7 +578,7 @@ impl ResourceIndex { } } -fn discover_paths>( +pub(crate) fn discover_paths>( root_path: P, ) -> HashMap { log::debug!( @@ -676,445 +676,3 @@ fn is_hidden(entry: &DirEntry) -> bool { .map(|s| s.starts_with('.')) .unwrap_or(false) } - -#[cfg(test)] -mod tests { - use crate::index::{discover_paths, IndexEntry}; - use crate::ResourceIndex; - use canonical_path::CanonicalPathBuf; - use dev_hash::Crc32; - use fs_atomic_versions::initialize; - use std::fs::File; - #[cfg(target_family = "unix")] - use std::fs::Permissions; - #[cfg(target_family = "unix")] - use std::os::unix::fs::PermissionsExt; - - use std::path::PathBuf; - use std::time::SystemTime; - use uuid::Uuid; - - const FILE_SIZE_1: u64 = 10; - const FILE_SIZE_2: u64 = 11; - - const FILE_NAME_1: &str = "test1.txt"; - const FILE_NAME_2: &str = "test2.txt"; - const FILE_NAME_3: &str = "test3.txt"; - - const CRC32_1: Crc32 = Crc32(3817498742); - const CRC32_2: Crc32 = Crc32(1804055020); - - fn get_temp_dir() -> PathBuf { - create_dir_at(std::env::temp_dir()) - } - - fn create_dir_at(path: PathBuf) -> PathBuf { - let mut dir_path = path.clone(); - dir_path.push(Uuid::new_v4().to_string()); - std::fs::create_dir(&dir_path).expect("Could not create temp dir"); - dir_path - } - - fn create_file_at( - path: PathBuf, - size: Option, - name: Option<&str>, - ) -> (File, PathBuf) { - let mut file_path = path.clone(); - if let Some(file_name) = name { - file_path.push(file_name); - } else { - file_path.push(Uuid::new_v4().to_string()); - } - let file = File::create(file_path.clone()) - .expect("Could not create temp file"); - file.set_len(size.unwrap_or(0)) - .expect("Could not set file size"); - (file, file_path) - } - - fn run_test_and_clean_up( - test: impl FnOnce(PathBuf) + std::panic::UnwindSafe, - ) { - initialize(); - - let path = get_temp_dir(); - let result = std::panic::catch_unwind(|| test(path.clone())); - std::fs::remove_dir_all(path.clone()) - .expect("Could not clean up after test"); - if result.is_err() { - panic!("{}", result.err().map(|_| "Test panicked").unwrap()) - } - assert!(result.is_ok()); - } - - // resource index build - - #[test] - fn index_build_should_process_1_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 1); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 1); - }) - } - - #[test] - fn index_build_should_process_colliding_files_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 1); - assert_eq!(actual.size(), 2); - }) - } - - // resource index update - - #[test] - fn update_all_should_handle_renamed_file_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - - // rename test2.txt to test3.txt - let mut name_from = path.clone(); - name_from.push(FILE_NAME_2); - let mut name_to = path.clone(); - name_to.push(FILE_NAME_3); - std::fs::rename(name_from, name_to) - .expect("Should rename file successfully"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 1); - }) - } - - #[test] - fn update_all_should_index_new_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, expected_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 2); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert!(actual.id2path.contains_key(&CRC32_2)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = - CanonicalPathBuf::canonicalize(expected_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) - } - - #[test] - fn index_new_should_index_new_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index - .index_new(&new_path) - .expect("Should update index correctly"); - - assert_eq!(index.root, path.clone()); - assert_eq!(index.path2id.len(), 2); - assert_eq!(index.id2path.len(), 2); - assert!(index.id2path.contains_key(&CRC32_1)); - assert!(index.id2path.contains_key(&CRC32_2)); - assert_eq!(index.collisions.len(), 0); - assert_eq!(index.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = CanonicalPathBuf::canonicalize(new_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) - } - - #[test] - fn update_one_should_error_on_new_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index = ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index.update_one(&new_path, CRC32_2); - - assert!(update.is_err()) - }) - } - - #[test] - fn update_one_should_index_delete_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - - let mut actual = ResourceIndex::build(path.clone()); - - let mut file_path = path.clone(); - file_path.push(FILE_NAME_1); - std::fs::remove_file(file_path.clone()) - .expect("Should remove file successfully"); - - let update = actual - .update_one(&file_path.clone(), CRC32_1) - .expect("Should update index successfully"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 0); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 0); - - assert!(update.deleted.contains(&CRC32_1)) - }) - } - - #[test] - fn update_all_should_error_on_files_without_permissions() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - let (file, _) = create_file_at( - path.clone(), - Some(FILE_SIZE_2), - Some(FILE_NAME_2), - ); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - #[cfg(target_family = "unix")] - file.set_permissions(Permissions::from_mode(0o222)) - .expect("Should be fine"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 0); - }) - } - - // error cases - - #[test] - fn update_one_should_not_update_absent_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) - } - - #[test] - fn update_one_should_index_new_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) - } - - #[test] - fn should_not_index_empty_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(0), None); - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn should_not_index_hidden_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(".hidden")); - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn should_not_index_1_empty_directory() { - run_test_and_clean_up(|path| { - create_dir_at(path.clone()); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn discover_paths_should_not_walk_on_invalid_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let actual = discover_paths(missing_path); - assert_eq!(actual.len(), 0); - }) - } - - #[test] - fn index_entry_order() { - let old1 = IndexEntry { - id: Crc32(2), - modified: SystemTime::UNIX_EPOCH, - }; - let old2 = IndexEntry { - id: Crc32(1), - modified: SystemTime::UNIX_EPOCH, - }; - - let new1 = IndexEntry { - id: Crc32(1), - modified: SystemTime::now(), - }; - let new2 = IndexEntry { - id: Crc32(2), - modified: SystemTime::now(), - }; - - assert_eq!(new1, new1); - assert_eq!(new2, new2); - assert_eq!(old1, old1); - assert_eq!(old2, old2); - - assert_ne!(new1, new2); - assert_ne!(new1, old1); - - assert!(new1 > old1); - assert!(new1 > old2); - assert!(new2 > old1); - assert!(new2 > old2); - assert!(new2 > new1); - } - - /// Test the performance of `ResourceIndex::build` on a specific directory. - /// - /// This test evaluates the performance of building a resource - /// index using the `ResourceIndex::build` method on a given directory. - /// It measures the time taken to build the resource index and prints the - /// number of collisions detected. - #[test] - fn test_build_resource_index() { - use std::time::Instant; - - let path = "../test-assets/"; // The path to the directory to index - assert!( - std::path::Path::new(path).is_dir(), - "The provided path is not a directory or does not exist" - ); - - let start_time = Instant::now(); - let index: ResourceIndex = - ResourceIndex::build(path.to_string()); - let elapsed_time = start_time.elapsed(); - - println!("Number of paths: {}", index.id2path.len()); - println!("Number of resources: {}", index.id2path.len()); - println!("Number of collisions: {}", index.collisions.len()); - println!("Time taken: {:?}", elapsed_time); - } -} diff --git a/fs-index/src/lib.rs b/fs-index/src/lib.rs index f475f478..bf280e99 100644 --- a/fs-index/src/lib.rs +++ b/fs-index/src/lib.rs @@ -1,3 +1,6 @@ pub mod index; +#[cfg(test)] +mod tests; + pub use index::ResourceIndex; diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs new file mode 100644 index 00000000..76d13f4d --- /dev/null +++ b/fs-index/src/tests.rs @@ -0,0 +1,426 @@ +use crate::index::{discover_paths, IndexEntry}; +use crate::ResourceIndex; +use canonical_path::CanonicalPathBuf; +use dev_hash::Crc32; +use fs_atomic_versions::initialize; +use std::fs::File; +#[cfg(target_family = "unix")] +use std::fs::Permissions; +#[cfg(target_family = "unix")] +use std::os::unix::fs::PermissionsExt; + +use std::path::PathBuf; +use std::time::SystemTime; +use uuid::Uuid; + +const FILE_SIZE_1: u64 = 10; +const FILE_SIZE_2: u64 = 11; + +const FILE_NAME_1: &str = "test1.txt"; +const FILE_NAME_2: &str = "test2.txt"; +const FILE_NAME_3: &str = "test3.txt"; + +const CRC32_1: Crc32 = Crc32(3817498742); +const CRC32_2: Crc32 = Crc32(1804055020); + +fn get_temp_dir() -> PathBuf { + create_dir_at(std::env::temp_dir()) +} + +fn create_dir_at(path: PathBuf) -> PathBuf { + let mut dir_path = path.clone(); + dir_path.push(Uuid::new_v4().to_string()); + std::fs::create_dir(&dir_path).expect("Could not create temp dir"); + dir_path +} + +fn create_file_at( + path: PathBuf, + size: Option, + name: Option<&str>, +) -> (File, PathBuf) { + let mut file_path = path.clone(); + if let Some(file_name) = name { + file_path.push(file_name); + } else { + file_path.push(Uuid::new_v4().to_string()); + } + let file = + File::create(file_path.clone()).expect("Could not create temp file"); + file.set_len(size.unwrap_or(0)) + .expect("Could not set file size"); + (file, file_path) +} + +fn run_test_and_clean_up(test: impl FnOnce(PathBuf) + std::panic::UnwindSafe) { + initialize(); + + let path = get_temp_dir(); + let result = std::panic::catch_unwind(|| test(path.clone())); + std::fs::remove_dir_all(path.clone()) + .expect("Could not clean up after test"); + if result.is_err() { + panic!("{}", result.err().map(|_| "Test panicked").unwrap()) + } + assert!(result.is_ok()); +} + +// resource index build + +#[test] +fn index_build_should_process_1_file_successfully() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + + let actual: ResourceIndex = ResourceIndex::build(path.clone()); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 1); + assert_eq!(actual.id2path.len(), 1); + assert!(actual.id2path.contains_key(&CRC32_1)); + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 1); + }) +} + +#[test] +fn index_build_should_process_colliding_files_correctly() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + + let actual: ResourceIndex = ResourceIndex::build(path.clone()); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 2); + assert_eq!(actual.id2path.len(), 1); + assert!(actual.id2path.contains_key(&CRC32_1)); + assert_eq!(actual.collisions.len(), 1); + assert_eq!(actual.size(), 2); + }) +} + +// resource index update + +#[test] +fn update_all_should_handle_renamed_file_correctly() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); + create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); + + let mut actual: ResourceIndex = + ResourceIndex::build(path.clone()); + + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 2); + + // rename test2.txt to test3.txt + let mut name_from = path.clone(); + name_from.push(FILE_NAME_2); + let mut name_to = path.clone(); + name_to.push(FILE_NAME_3); + std::fs::rename(name_from, name_to) + .expect("Should rename file successfully"); + + let update = actual + .update_all() + .expect("Should update index correctly"); + + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 2); + assert_eq!(update.deleted.len(), 1); + assert_eq!(update.added.len(), 1); + }) +} + +#[test] +fn update_all_should_index_new_file_successfully() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + + let mut actual: ResourceIndex = + ResourceIndex::build(path.clone()); + + let (_, expected_path) = + create_file_at(path.clone(), Some(FILE_SIZE_2), None); + + let update = actual + .update_all() + .expect("Should update index correctly"); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 2); + assert_eq!(actual.id2path.len(), 2); + assert!(actual.id2path.contains_key(&CRC32_1)); + assert!(actual.id2path.contains_key(&CRC32_2)); + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 2); + assert_eq!(update.deleted.len(), 0); + assert_eq!(update.added.len(), 1); + + let added_key = CanonicalPathBuf::canonicalize(expected_path.clone()) + .expect("CanonicalPathBuf should be fine"); + assert_eq!( + update + .added + .get(&added_key) + .expect("Key exists") + .clone(), + CRC32_2 + ) + }) +} + +#[test] +fn index_new_should_index_new_file_successfully() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + let mut index: ResourceIndex = + ResourceIndex::build(path.clone()); + + let (_, new_path) = + create_file_at(path.clone(), Some(FILE_SIZE_2), None); + + let update = index + .index_new(&new_path) + .expect("Should update index correctly"); + + assert_eq!(index.root, path.clone()); + assert_eq!(index.path2id.len(), 2); + assert_eq!(index.id2path.len(), 2); + assert!(index.id2path.contains_key(&CRC32_1)); + assert!(index.id2path.contains_key(&CRC32_2)); + assert_eq!(index.collisions.len(), 0); + assert_eq!(index.size(), 2); + assert_eq!(update.deleted.len(), 0); + assert_eq!(update.added.len(), 1); + + let added_key = CanonicalPathBuf::canonicalize(new_path.clone()) + .expect("CanonicalPathBuf should be fine"); + assert_eq!( + update + .added + .get(&added_key) + .expect("Key exists") + .clone(), + CRC32_2 + ) + }) +} + +#[test] +fn update_one_should_error_on_new_file() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), None); + let mut index = ResourceIndex::build(path.clone()); + + let (_, new_path) = + create_file_at(path.clone(), Some(FILE_SIZE_2), None); + + let update = index.update_one(&new_path, CRC32_2); + + assert!(update.is_err()) + }) +} + +#[test] +fn update_one_should_index_delete_file_successfully() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); + + let mut actual = ResourceIndex::build(path.clone()); + + let mut file_path = path.clone(); + file_path.push(FILE_NAME_1); + std::fs::remove_file(file_path.clone()) + .expect("Should remove file successfully"); + + let update = actual + .update_one(&file_path.clone(), CRC32_1) + .expect("Should update index successfully"); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 0); + assert_eq!(actual.id2path.len(), 0); + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 0); + assert_eq!(update.deleted.len(), 1); + assert_eq!(update.added.len(), 0); + + assert!(update.deleted.contains(&CRC32_1)) + }) +} + +#[test] +fn update_all_should_error_on_files_without_permissions() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); + let (file, _) = + create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); + + let mut actual: ResourceIndex = + ResourceIndex::build(path.clone()); + + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 2); + #[cfg(target_family = "unix")] + file.set_permissions(Permissions::from_mode(0o222)) + .expect("Should be fine"); + + let update = actual + .update_all() + .expect("Should update index correctly"); + + assert_eq!(actual.collisions.len(), 0); + assert_eq!(actual.size(), 2); + assert_eq!(update.deleted.len(), 0); + assert_eq!(update.added.len(), 0); + }) +} + +// error cases + +#[test] +fn update_one_should_not_update_absent_path() { + run_test_and_clean_up(|path| { + let mut missing_path = path.clone(); + missing_path.push("missing/directory"); + let mut actual = ResourceIndex::build(path.clone()); + let old_id = Crc32(2); + let result = actual + .update_one(&missing_path, old_id.clone()) + .map(|i| i.deleted.clone().take(&old_id)) + .ok() + .flatten(); + + assert_eq!(result, Some(Crc32(2))); + }) +} + +#[test] +fn update_one_should_index_new_path() { + run_test_and_clean_up(|path| { + let mut missing_path = path.clone(); + missing_path.push("missing/directory"); + let mut actual = ResourceIndex::build(path.clone()); + let old_id = Crc32(2); + let result = actual + .update_one(&missing_path, old_id.clone()) + .map(|i| i.deleted.clone().take(&old_id)) + .ok() + .flatten(); + + assert_eq!(result, Some(Crc32(2))); + }) +} + +#[test] +fn should_not_index_empty_file() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(0), None); + let actual: ResourceIndex = ResourceIndex::build(path.clone()); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 0); + assert_eq!(actual.id2path.len(), 0); + assert_eq!(actual.collisions.len(), 0); + }) +} + +#[test] +fn should_not_index_hidden_file() { + run_test_and_clean_up(|path| { + create_file_at(path.clone(), Some(FILE_SIZE_1), Some(".hidden")); + let actual: ResourceIndex = ResourceIndex::build(path.clone()); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 0); + assert_eq!(actual.id2path.len(), 0); + assert_eq!(actual.collisions.len(), 0); + }) +} + +#[test] +fn should_not_index_1_empty_directory() { + run_test_and_clean_up(|path| { + create_dir_at(path.clone()); + + let actual: ResourceIndex = ResourceIndex::build(path.clone()); + + assert_eq!(actual.root, path.clone()); + assert_eq!(actual.path2id.len(), 0); + assert_eq!(actual.id2path.len(), 0); + assert_eq!(actual.collisions.len(), 0); + }) +} + +#[test] +fn discover_paths_should_not_walk_on_invalid_path() { + run_test_and_clean_up(|path| { + let mut missing_path = path.clone(); + missing_path.push("missing/directory"); + let actual = discover_paths(missing_path); + assert_eq!(actual.len(), 0); + }) +} + +#[test] +fn index_entry_order() { + let old1 = IndexEntry { + id: Crc32(2), + modified: SystemTime::UNIX_EPOCH, + }; + let old2 = IndexEntry { + id: Crc32(1), + modified: SystemTime::UNIX_EPOCH, + }; + + let new1 = IndexEntry { + id: Crc32(1), + modified: SystemTime::now(), + }; + let new2 = IndexEntry { + id: Crc32(2), + modified: SystemTime::now(), + }; + + assert_eq!(new1, new1); + assert_eq!(new2, new2); + assert_eq!(old1, old1); + assert_eq!(old2, old2); + + assert_ne!(new1, new2); + assert_ne!(new1, old1); + + assert!(new1 > old1); + assert!(new1 > old2); + assert!(new2 > old1); + assert!(new2 > old2); + assert!(new2 > new1); +} + +/// Test the performance of `ResourceIndex::build` on a specific directory. +/// +/// This test evaluates the performance of building a resource +/// index using the `ResourceIndex::build` method on a given directory. +/// It measures the time taken to build the resource index and prints the +/// number of collisions detected. +#[test] +fn test_build_resource_index() { + use std::time::Instant; + + let path = "../test-assets/"; // The path to the directory to index + assert!( + std::path::Path::new(path).is_dir(), + "The provided path is not a directory or does not exist" + ); + + let start_time = Instant::now(); + let index: ResourceIndex = ResourceIndex::build(path.to_string()); + let elapsed_time = start_time.elapsed(); + + println!("Number of paths: {}", index.id2path.len()); + println!("Number of resources: {}", index.id2path.len()); + println!("Number of collisions: {}", index.collisions.len()); + println!("Time taken: {:?}", elapsed_time); +} From 63a0e22330433e5938f4c49d0449f1490cf70b2f Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 15:56:59 +0300 Subject: [PATCH 3/6] organize rules for rustfmt Signed-off-by: Tarek --- .github/workflows/build.yml | 1 + ark-cli/src/commands/backup.rs | 3 +- ark-cli/src/commands/file/append.rs | 7 ++-- ark-cli/src/commands/file/insert.rs | 7 ++-- ark-cli/src/commands/file/read.rs | 7 ++-- ark-cli/src/commands/file/utils.rs | 7 ++-- ark-cli/src/commands/link/utils.rs | 3 +- ark-cli/src/commands/list.rs | 6 ++-- ark-cli/src/commands/render.rs | 3 +- ark-cli/src/commands/storage/list.rs | 4 +-- ark-cli/src/index_registrar.rs | 8 +++-- ark-cli/src/main.rs | 30 ++++++++-------- ark-cli/src/models/storage.rs | 3 +- ark-cli/src/util.rs | 26 +++++++------- data-json/src/lib.rs | 5 +-- data-link/src/lib.rs | 18 +++++----- data-resource/src/lib.rs | 10 +++--- dev-hash/benches/blake3.rs | 3 +- dev-hash/benches/crc32.rs | 3 +- fs-atomic-light/src/lib.rs | 5 +-- fs-atomic-versions/src/atomic/file.rs | 8 +++-- fs-atomic-versions/src/lib.rs | 7 ++-- fs-index/benches/resource_index_benchmark.rs | 18 ++++++---- fs-index/src/index.rs | 14 ++++---- fs-index/src/tests.rs | 9 ++--- fs-metadata/src/lib.rs | 4 +-- fs-properties/src/lib.rs | 4 +-- fs-storage/examples/cli.rs | 7 ++-- fs-storage/src/base_storage.rs | 20 +++++++---- fs-storage/src/file_storage.rs | 37 +++++++++++--------- fs-storage/src/jni/file_storage.rs | 7 ++-- fs-storage/src/monoid.rs | 10 +++--- fs-storage/src/utils.rs | 3 +- rustfmt.toml | 13 ++++--- 34 files changed, 167 insertions(+), 153 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4d1b1704..9a33f712 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,6 +22,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@stable with: + toolchain: nightly # nightly is required for fmt components: rustfmt, clippy - name: Check diff --git a/ark-cli/src/commands/backup.rs b/ark-cli/src/commands/backup.rs index 398220bd..40d5d46d 100644 --- a/ark-cli/src/commands/backup.rs +++ b/ark-cli/src/commands/backup.rs @@ -1,5 +1,4 @@ -use std::io::Write; -use std::path::PathBuf; +use std::{io::Write, path::PathBuf}; use crate::{ create_dir_all, dir, discover_roots, home_dir, storages_exists, timestamp, diff --git a/ark-cli/src/commands/file/append.rs b/ark-cli/src/commands/file/append.rs index 46d0b132..91969928 100644 --- a/ark-cli/src/commands/file/append.rs +++ b/ark-cli/src/commands/file/append.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, Format, ResourceId, + models::storage::{Storage, StorageType}, + translate_storage, AppError, Format, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/insert.rs b/ark-cli/src/commands/file/insert.rs index ff9b1ac9..b60199e5 100644 --- a/ark-cli/src/commands/file/insert.rs +++ b/ark-cli/src/commands/file/insert.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, Format, ResourceId, + models::storage::{Storage, StorageType}, + translate_storage, AppError, Format, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/read.rs b/ark-cli/src/commands/file/read.rs index 8387d011..7b47d719 100644 --- a/ark-cli/src/commands/file/read.rs +++ b/ark-cli/src/commands/file/read.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, ResourceId, + models::storage::{Storage, StorageType}, + translate_storage, AppError, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/utils.rs b/ark-cli/src/commands/file/utils.rs index 8a3c4048..6cc993dd 100644 --- a/ark-cli/src/commands/file/utils.rs +++ b/ark-cli/src/commands/file/utils.rs @@ -1,6 +1,7 @@ -use crate::error::AppError; -use crate::models::key_value_to_str; -use crate::models::Format; +use crate::{ + error::AppError, + models::{key_value_to_str, Format}, +}; use data_error::Result as ArklibResult; use fs_atomic_versions::atomic::{modify, modify_json, AtomicFile}; diff --git a/ark-cli/src/commands/link/utils.rs b/ark-cli/src/commands/link/utils.rs index 122f7f1c..1b851966 100644 --- a/ark-cli/src/commands/link/utils.rs +++ b/ark-cli/src/commands/link/utils.rs @@ -3,8 +3,7 @@ use data_link::Link; use std::path::PathBuf; use url::Url; -use crate::error::AppError; -use crate::util::provide_index; // Import your custom AppError type +use crate::{error::AppError, util::provide_index}; // Import your custom AppError type pub async fn create_link( root: &PathBuf, diff --git a/ark-cli/src/commands/list.rs b/ark-cli/src/commands/list.rs index bc557b42..99725c65 100644 --- a/ark-cli/src/commands/list.rs +++ b/ark-cli/src/commands/list.rs @@ -1,5 +1,4 @@ -use std::io::Read; -use std::path::PathBuf; +use std::{io::Read, path::PathBuf}; use crate::{ provide_index, provide_root, read_storage_value, AppError, DateTime, @@ -137,7 +136,8 @@ impl List { let mut contents = String::new(); match file.read_to_string(&mut contents) { Ok(_) => { - // Check if the content of the file is a valid url + // Check if the content of the file is a + // valid url let url = contents.trim(); let url = url::Url::parse(url); match url { diff --git a/ark-cli/src/commands/render.rs b/ark-cli/src/commands/render.rs index 7f3fa9e6..82fde115 100644 --- a/ark-cli/src/commands/render.rs +++ b/ark-cli/src/commands/render.rs @@ -26,7 +26,8 @@ impl Render { let dest_path = filepath.with_file_name( filepath .file_stem() - // SAFETY: we know that the file stem is valid UTF-8 because it is a file name + // SAFETY: we know that the file stem is valid UTF-8 because it + // is a file name .unwrap() .to_str() .unwrap() diff --git a/ark-cli/src/commands/storage/list.rs b/ark-cli/src/commands/storage/list.rs index 6b0c6c4e..0fdcef7c 100644 --- a/ark-cli/src/commands/storage/list.rs +++ b/ark-cli/src/commands/storage/list.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, + models::storage::{Storage, StorageType}, + translate_storage, AppError, }; #[derive(Clone, Debug, clap::Args)] diff --git a/ark-cli/src/index_registrar.rs b/ark-cli/src/index_registrar.rs index fc6a2e5b..4d0ea6fd 100644 --- a/ark-cli/src/index_registrar.rs +++ b/ark-cli/src/index_registrar.rs @@ -4,9 +4,11 @@ extern crate canonical_path; use data_error::{ArklibError, Result}; use fs_index::ResourceIndex; -use std::collections::HashMap; -use std::path::Path; -use std::sync::{Arc, RwLock}; +use std::{ + collections::HashMap, + path::Path, + sync::{Arc, RwLock}, +}; use crate::ResourceId; diff --git a/ark-cli/src/main.rs b/ark-cli/src/main.rs index c8c718ca..e8029c18 100644 --- a/ark-cli/src/main.rs +++ b/ark-cli/src/main.rs @@ -1,5 +1,7 @@ -use std::fs::{create_dir_all, File}; -use std::path::PathBuf; +use std::{ + fs::{create_dir_all, File}, + path::PathBuf, +}; use crate::index_registrar::provide_index; use data_pdf::{render_preview_page, PDFQuality}; @@ -15,25 +17,23 @@ use fs_storage::ARK_FOLDER; use anyhow::Result; -use chrono::prelude::DateTime; -use chrono::Utc; +use chrono::{prelude::DateTime, Utc}; -use clap::CommandFactory; -use clap::FromArgMatches; +use clap::{CommandFactory, FromArgMatches}; use fs_extra::dir::{self, CopyOptions}; use home::home_dir; -use crate::cli::Cli; -use crate::commands::file::File::{Append, Insert, Read}; -use crate::commands::link::Link::{Create, Load}; -use crate::commands::Commands::Link; -use crate::commands::Commands::Storage; -use crate::commands::Commands::*; -use crate::models::EntryOutput; -use crate::models::Format; -use crate::models::Sort; +use crate::{ + cli::Cli, + commands::{ + file::File::{Append, Insert, Read}, + link::Link::{Create, Load}, + Commands::{Link, Storage, *}, + }, + models::{EntryOutput, Format, Sort}, +}; use crate::error::AppError; diff --git a/ark-cli/src/models/storage.rs b/ark-cli/src/models/storage.rs index 48fc0464..ef6f7f37 100644 --- a/ark-cli/src/models/storage.rs +++ b/ark-cli/src/models/storage.rs @@ -1,7 +1,6 @@ use crate::ResourceId; use fs_atomic_versions::atomic::AtomicFile; -use std::fmt::Write; -use std::path::PathBuf; +use std::{fmt::Write, path::PathBuf}; use crate::{ commands::{file_append, file_insert, format_file, format_line}, diff --git a/ark-cli/src/util.rs b/ark-cli/src/util.rs index 9a370167..d2b216ac 100644 --- a/ark-cli/src/util.rs +++ b/ark-cli/src/util.rs @@ -6,19 +6,21 @@ use fs_storage::{ ARK_FOLDER, PREVIEWS_STORAGE_FOLDER, SCORE_STORAGE_FILE, STATS_FOLDER, TAG_STORAGE_FILE, THUMBNAILS_STORAGE_FOLDER, }; -use std::env::current_dir; -use std::fs::{canonicalize, metadata}; -use std::io::BufRead; -use std::io::BufReader; -use std::path::Path; -use std::str::FromStr; -use std::thread; -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use std::{fs::File, path::PathBuf}; +use std::{ + env::current_dir, + fs::{canonicalize, metadata, File}, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, + str::FromStr, + thread, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; -use crate::error::AppError; -use crate::models::storage::{Storage, StorageType}; -use crate::ARK_CONFIG; +use crate::{ + error::AppError, + models::storage::{Storage, StorageType}, + ARK_CONFIG, +}; pub fn discover_roots( roots_cfg: &Option, diff --git a/data-json/src/lib.rs b/data-json/src/lib.rs index cc663775..9d047f17 100644 --- a/data-json/src/lib.rs +++ b/data-json/src/lib.rs @@ -1,7 +1,4 @@ -use serde_json::json; -use serde_json::map::Entry; -use serde_json::Map; -use serde_json::Value; +use serde_json::{json, map::Entry, Map, Value}; pub fn merge(origin: Value, new_data: Value) -> Value { match (origin, new_data) { diff --git a/data-link/src/lib.rs b/data-link/src/lib.rs index 7e614a1d..ba925722 100644 --- a/data-link/src/lib.rs +++ b/data-link/src/lib.rs @@ -2,18 +2,20 @@ use data_error::Result; use data_resource::ResourceId; use fs_atomic_versions::atomic::AtomicFile; use fs_metadata::store_metadata; -use fs_properties::load_raw_properties; -use fs_properties::store_properties; -use fs_properties::PROPERTIES_STORAGE_FOLDER; +use fs_properties::{ + load_raw_properties, store_properties, PROPERTIES_STORAGE_FOLDER, +}; use fs_storage::{ARK_FOLDER, PREVIEWS_STORAGE_FOLDER}; use reqwest::header::HeaderValue; use scraper::{Html, Selector}; use serde::{Deserialize, Serialize}; -use std::fmt; -use std::marker::PhantomData; -use std::path::Path; -use std::str::{self, FromStr}; -use std::{io::Write, path::PathBuf}; +use std::{ + fmt, + io::Write, + marker::PhantomData, + path::{Path, PathBuf}, + str::{self, FromStr}, +}; use url::Url; #[derive(Debug, Deserialize, Serialize)] diff --git a/data-resource/src/lib.rs b/data-resource/src/lib.rs index 049b4dbb..a8ac7774 100644 --- a/data-resource/src/lib.rs +++ b/data-resource/src/lib.rs @@ -3,16 +3,16 @@ //! `data-resource` is a crate for managing resource identifiers. use core::{fmt::Display, str::FromStr}; use data_error::Result; -use serde::de::DeserializeOwned; -use serde::Serialize; +use serde::{de::DeserializeOwned, Serialize}; use std::{fmt::Debug, hash::Hash, path::Path}; /// This trait defines a generic type representing a resource identifier. /// -/// Resources are identified by a hash value, which is computed from the resource's data. -/// The hash value is used to uniquely identify the resource. +/// Resources are identified by a hash value, which is computed from the +/// resource's data. The hash value is used to uniquely identify the resource. /// -/// Implementors of this trait must provide a way to compute the hash value from the resource's data. +/// Implementors of this trait must provide a way to compute the hash value from +/// the resource's data. pub trait ResourceId: Debug + Display diff --git a/dev-hash/benches/blake3.rs b/dev-hash/benches/blake3.rs index cf95233b..f434cb36 100644 --- a/dev-hash/benches/blake3.rs +++ b/dev-hash/benches/blake3.rs @@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec { (0..size).map(|_| rng.gen()).collect() } -/// Benchmarks the performance of resource ID creation from file paths and random data. +/// Benchmarks the performance of resource ID creation from file paths and +/// random data. /// /// - Measures the time taken to create a resource ID from file paths. /// - Measures the time taken to create a resource ID from random data. diff --git a/dev-hash/benches/crc32.rs b/dev-hash/benches/crc32.rs index b462035b..c85c4dc7 100644 --- a/dev-hash/benches/crc32.rs +++ b/dev-hash/benches/crc32.rs @@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec { (0..size).map(|_| rng.gen()).collect() } -/// Benchmarks the performance of resource ID creation from file paths and random data. +/// Benchmarks the performance of resource ID creation from file paths and +/// random data. /// /// - Measures the time taken to create a resource ID from file paths. /// - Measures the time taken to create a resource ID from random data. diff --git a/fs-atomic-light/src/lib.rs b/fs-atomic-light/src/lib.rs index 25288f6d..fb4bf028 100644 --- a/fs-atomic-light/src/lib.rs +++ b/fs-atomic-light/src/lib.rs @@ -1,9 +1,6 @@ use data_error::Result; -use std::env; -use std::fs; -use std::path::Path; -use std::str; +use std::{env, fs, path::Path, str}; /// Write data to a tempory file and move that written file to destination /// diff --git a/fs-atomic-versions/src/atomic/file.rs b/fs-atomic-versions/src/atomic/file.rs index bd3f2571..1fe9549d 100644 --- a/fs-atomic-versions/src/atomic/file.rs +++ b/fs-atomic-versions/src/atomic/file.rs @@ -1,8 +1,10 @@ -use std::fs::{self, File}; -use std::io::{Error, ErrorKind, Read, Result}; #[cfg(target_os = "unix")] use std::os::unix::fs::MetadataExt; -use std::path::{Path, PathBuf}; +use std::{ + fs::{self, File}, + io::{Error, ErrorKind, Read, Result}, + path::{Path, PathBuf}, +}; use crate::app_id; diff --git a/fs-atomic-versions/src/lib.rs b/fs-atomic-versions/src/lib.rs index 1ce1f0b2..62935039 100644 --- a/fs-atomic-versions/src/lib.rs +++ b/fs-atomic-versions/src/lib.rs @@ -1,7 +1,8 @@ use lazy_static::lazy_static; -use std::path::PathBuf; -use std::sync::Once; -use std::sync::RwLock; +use std::{ + path::PathBuf, + sync::{Once, RwLock}, +}; pub mod app_id; pub mod atomic; diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs index c762eebd..be113dca 100644 --- a/fs-index/benches/resource_index_benchmark.rs +++ b/fs-index/benches/resource_index_benchmark.rs @@ -47,8 +47,9 @@ fn resource_index_benchmark(c: &mut Criterion) { // Benchmark `ResourceIndex::update_all()` - // First, create a new temp directory specifically for the update_all benchmark - // since we will be creating new files, removing files, and modifying files + // First, create a new temp directory specifically for the update_all + // benchmark since we will be creating new files, removing files, and + // modifying files let update_all_benchmarks_dir = TempDir::with_prefix("ark-fs-index-benchmarks-update-all").unwrap(); let update_all_benchmarks_dir = update_all_benchmarks_dir.path(); @@ -85,7 +86,8 @@ criterion_group! { } criterion_main!(benches); -/// A helper function to setup a temp directory for the benchmarks using the test assets directory +/// A helper function to setup a temp directory for the benchmarks using the +/// test assets directory fn setup_temp_dir() -> TempDir { // assert the path exists and is a directory assert!( @@ -119,16 +121,18 @@ fn setup_temp_dir() -> TempDir { temp_dir } -/// A helper function that takes a directory and creates 50 new files, removes 30 files, and modifies 10 files +/// A helper function that takes a directory and creates 50 new files, removes +/// 30 files, and modifies 10 files /// -/// Note: The function assumes that the directory already contains 50 files named `file_0.txt` to `file_49.txt` +/// Note: The function assumes that the directory already contains 50 files +/// named `file_0.txt` to `file_49.txt` fn update_all_files(dir: &PathBuf) { // Create 50 new files for i in 51..101 { let new_file = dir.join(format!("file_{}.txt", i)); std::fs::File::create(&new_file).unwrap(); - // We add the index `i` to the file content to make sure the content is unique - // This is to avoid collisions in the index + // We add the index `i` to the file content to make sure the content is + // unique This is to avoid collisions in the index std::fs::write(&new_file, format!("Hello, World! {}", i)).unwrap(); } diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs index 9bbb863d..22aa7c43 100644 --- a/fs-index/src/index.rs +++ b/fs-index/src/index.rs @@ -1,12 +1,14 @@ use anyhow::anyhow; use canonical_path::{CanonicalPath, CanonicalPathBuf}; use itertools::Itertools; -use std::collections::{HashMap, HashSet}; -use std::fs::{self, File, Metadata}; -use std::io::{BufRead, BufReader, Write}; -use std::ops::Add; -use std::path::{Path, PathBuf}; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::{ + collections::{HashMap, HashSet}, + fs::{self, File, Metadata}, + io::{BufRead, BufReader, Write}, + ops::Add, + path::{Path, PathBuf}, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; use walkdir::{DirEntry, WalkDir}; use log; diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs index 76d13f4d..9eed0158 100644 --- a/fs-index/src/tests.rs +++ b/fs-index/src/tests.rs @@ -1,5 +1,7 @@ -use crate::index::{discover_paths, IndexEntry}; -use crate::ResourceIndex; +use crate::{ + index::{discover_paths, IndexEntry}, + ResourceIndex, +}; use canonical_path::CanonicalPathBuf; use dev_hash::Crc32; use fs_atomic_versions::initialize; @@ -9,8 +11,7 @@ use std::fs::Permissions; #[cfg(target_family = "unix")] use std::os::unix::fs::PermissionsExt; -use std::path::PathBuf; -use std::time::SystemTime; +use std::{path::PathBuf, time::SystemTime}; use uuid::Uuid; const FILE_SIZE_1: u64 = 10; diff --git a/fs-metadata/src/lib.rs b/fs-metadata/src/lib.rs index df258ae5..881ddeab 100644 --- a/fs-metadata/src/lib.rs +++ b/fs-metadata/src/lib.rs @@ -1,9 +1,7 @@ use data_error::Result; use fs_atomic_versions::atomic::{modify_json, AtomicFile}; use serde::{de::DeserializeOwned, Serialize}; -use std::fmt::Debug; -use std::io::Read; -use std::path::Path; +use std::{fmt::Debug, io::Read, path::Path}; use data_resource::ResourceId; use fs_storage::ARK_FOLDER; diff --git a/fs-properties/src/lib.rs b/fs-properties/src/lib.rs index 81b5b04f..0c28aede 100644 --- a/fs-properties/src/lib.rs +++ b/fs-properties/src/lib.rs @@ -1,8 +1,6 @@ use serde::{de::DeserializeOwned, Serialize}; use serde_json::Value; -use std::fmt::Debug; -use std::io::Read; -use std::path::Path; +use std::{fmt::Debug, io::Read, path::Path}; use data_error::Result; use data_json::merge; diff --git a/fs-storage/examples/cli.rs b/fs-storage/examples/cli.rs index 7b2ab23d..8214f258 100644 --- a/fs-storage/examples/cli.rs +++ b/fs-storage/examples/cli.rs @@ -1,10 +1,7 @@ use anyhow::{Context, Result}; -use fs_storage::base_storage::BaseStorage; -use fs_storage::file_storage::FileStorage; +use fs_storage::{base_storage::BaseStorage, file_storage::FileStorage}; use serde_json::Value; -use std::env; -use std::fs; -use std::path::Path; +use std::{env, fs, path::Path}; fn main() { if let Err(e) = run() { diff --git a/fs-storage/src/base_storage.rs b/fs-storage/src/base_storage.rs index 83981259..540b904f 100644 --- a/fs-storage/src/base_storage.rs +++ b/fs-storage/src/base_storage.rs @@ -30,16 +30,22 @@ impl std::fmt::Display for SyncStatus { } } -/// The `BaseStorage` trait represents a key-value mapping that is written to the file system. +/// The `BaseStorage` trait represents a key-value mapping that is written to +/// the file system. /// -/// This trait provides methods to create or update entries in the internal mapping, remove entries from the internal mapping, -/// determine if the in-memory model or the underlying storage requires syncing, scan and load the mapping from the filesystem, -/// write the mapping to the filesystem, and remove all stored data. +/// This trait provides methods to create or update entries in the internal +/// mapping, remove entries from the internal mapping, determine if the +/// in-memory model or the underlying storage requires syncing, scan and load +/// the mapping from the filesystem, write the mapping to the filesystem, and +/// remove all stored data. /// -/// The trait also includes a method to merge values from another key-value mapping. +/// The trait also includes a method to merge values from another key-value +/// mapping. /// -/// Note: The trait does not write to storage by default. It is up to the implementor to decide when to read or write to storage -/// based on `SyncStatus`. This is to allow for trading off between performance and consistency. +/// Note: The trait does not write to storage by default. It is up to the +/// implementor to decide when to read or write to storage +/// based on `SyncStatus`. This is to allow for trading off between performance +/// and consistency. pub trait BaseStorage: AsRef> { /// Create or update an entry in the internal mapping. fn set(&mut self, id: K, value: V); diff --git a/fs-storage/src/file_storage.rs b/fs-storage/src/file_storage.rs index 57fcdd80..e4dce1d4 100644 --- a/fs-storage/src/file_storage.rs +++ b/fs-storage/src/file_storage.rs @@ -1,15 +1,17 @@ use serde::{Deserialize, Serialize}; -use std::fs::{self, File}; -use std::io::Write; -use std::time::SystemTime; use std::{ collections::BTreeMap, + fs::{self, File}, + io::Write, path::{Path, PathBuf}, + time::SystemTime, }; -use crate::base_storage::{BaseStorage, SyncStatus}; -use crate::monoid::Monoid; -use crate::utils::read_version_2_fs; +use crate::{ + base_storage::{BaseStorage, SyncStatus}, + monoid::Monoid, + utils::read_version_2_fs, +}; use data_error::{ArklibError, Result}; /* @@ -80,8 +82,8 @@ where /// Create a new file storage with a diagnostic label and file path /// The storage will be initialized using the disk data, if the path exists /// - /// Note: if the file storage already exists, the data will be read from the file - /// without overwriting it. + /// Note: if the file storage already exists, the data will be read from the + /// file without overwriting it. pub fn new(label: String, path: &Path) -> Result { let time = SystemTime::now(); let mut storage = Self { @@ -114,7 +116,8 @@ where // First check if the file starts with "version: 2" let file_content = std::fs::read_to_string(&self.path)?; if file_content.starts_with("version: 2") { - // Attempt to parse the file using the legacy version 2 storage format of FileStorage. + // Attempt to parse the file using the legacy version 2 storage + // format of FileStorage. match read_version_2_fs(&self.path) { Ok(data) => { log::info!( @@ -193,14 +196,14 @@ where // Determine the synchronization status based on the modification times // Conditions: - // 1. If both the in-memory storage and the storage on disk have been modified - // since the last write, then the storage is diverged. - // 2. If only the in-memory storage has been modified since the last write, - // then the storage on disk is stale. - // 3. If only the storage on disk has been modified since the last write, - // then the in-memory storage is stale. - // 4. If neither the in-memory storage nor the storage on disk has been modified - // since the last write, then the storage is in sync. + // 1. If both the in-memory storage and the storage on disk have been + // modified since the last write, then the storage is diverged. + // 2. If only the in-memory storage has been modified since the last + // write, then the storage on disk is stale. + // 3. If only the storage on disk has been modified since the last + // write, then the in-memory storage is stale. + // 4. If neither the in-memory storage nor the storage on disk has been + // modified since the last write, then the storage is in sync. let status = match ( self.modified > self.written_to_disk, file_updated > self.written_to_disk, diff --git a/fs-storage/src/jni/file_storage.rs b/fs-storage/src/jni/file_storage.rs index 6f1e8c1c..3cb0ae2b 100644 --- a/fs-storage/src/jni/file_storage.rs +++ b/fs-storage/src/jni/file_storage.rs @@ -1,7 +1,6 @@ use crate::base_storage::SyncStatus; use jni::signature::ReturnType; -use std::collections::BTreeMap; -use std::path::Path; +use std::{collections::BTreeMap, path::Path}; // This is the interface to the JVM that we'll call the majority of our // methods on. use jni::JNIEnv; @@ -82,8 +81,8 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_remove<'local>( }); } -// A JNI function called from Java that creates a `MyData` Rust type, converts it to a Java -// type and returns it. +// A JNI function called from Java that creates a `MyData` Rust type, converts +// it to a Java type and returns it. #[no_mangle] #[allow(non_snake_case)] pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_syncStatus< diff --git a/fs-storage/src/monoid.rs b/fs-storage/src/monoid.rs index b2acf546..1c483201 100644 --- a/fs-storage/src/monoid.rs +++ b/fs-storage/src/monoid.rs @@ -1,10 +1,11 @@ -// Currently, we have three structures: Tags (HashSet), Properties (HashSet), Score (int). -// In fact, HashSet already implements a union function, +// Currently, we have three structures: Tags (HashSet), Properties (HashSet), +// Score (int). In fact, HashSet already implements a union function, // so only a special function for integers is needed. // CRDTs can be considered later when we need to add structures that require // more powerful combine semantics. -// Trait defining a Monoid, which represents a mathematical structure with an identity element and an associative binary operation. +// Trait defining a Monoid, which represents a mathematical structure with an +// identity element and an associative binary operation. pub trait Monoid { // Returns the neutral element of the monoid. fn neutral() -> V; @@ -13,7 +14,8 @@ pub trait Monoid { fn combine(a: &V, b: &V) -> V; // Combines multiple elements of the monoid into a single element. - // Default implementation uses `neutral()` as the initial accumulator and `combine()` for folding. + // Default implementation uses `neutral()` as the initial accumulator and + // `combine()` for folding. fn combine_all>(values: I) -> V { values .into_iter() diff --git a/fs-storage/src/utils.rs b/fs-storage/src/utils.rs index b5b6830a..d1e818bd 100644 --- a/fs-storage/src/utils.rs +++ b/fs-storage/src/utils.rs @@ -1,6 +1,5 @@ use data_error::Result; -use std::collections::BTreeMap; -use std::path::Path; +use std::{collections::BTreeMap, path::Path}; /// Parses version 2 `FileStorage` format and returns the data as a BTreeMap /// diff --git a/rustfmt.toml b/rustfmt.toml index 0a869692..5b415646 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,15 +1,18 @@ +# General settings +imports_granularity = 'Crate' verbose = "Verbose" tab_spaces = 4 - max_width = 80 +newline_style = "Unix" + +# Code style settings chain_width = 50 single_line_if_else_max_width = 30 - force_explicit_abi = true - +# Import settings reorder_imports = true +# Comment settings wrap_comments = true - -newline_style = "Unix" +comment_width = 80 From 3a368afe2d03d739c0bf059d9b35480e12b1aa65 Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 18:12:42 +0300 Subject: [PATCH 4/6] fix: change target_os to target_family Signed-off-by: Tarek --- fs-atomic-versions/src/atomic/file.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs-atomic-versions/src/atomic/file.rs b/fs-atomic-versions/src/atomic/file.rs index 1fe9549d..a2b708ab 100644 --- a/fs-atomic-versions/src/atomic/file.rs +++ b/fs-atomic-versions/src/atomic/file.rs @@ -1,4 +1,4 @@ -#[cfg(target_os = "unix")] +#[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; use std::{ fs::{self, File}, @@ -231,7 +231,7 @@ impl AtomicFile { // May return `EEXIST`. let res = std::fs::hard_link(&new.path, new_path); if let Err(err) = res { - #[cfg(target_os = "unix")] + #[cfg(target_family = "unix")] // From open(2) manual page: // // "[...] create a unique file on the same filesystem (e.g., @@ -243,7 +243,7 @@ impl AtomicFile { if new.path.metadata()?.nlink() != 2 { Err(err)?; } - #[cfg(not(target_os = "unix"))] + #[cfg(not(target_family = "unix"))] Err(err)?; } From 031551cecd6511f8afb2a1adcb3b166d7941596c Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 19:51:00 +0300 Subject: [PATCH 5/6] refactor(fs-index): refactor ResourceIndex and store relative paths Signed-off-by: Tarek --- fs-index/Cargo.toml | 6 +- fs-index/src/index.rs | 970 ++++++++++++++++-------------------------- fs-index/src/lib.rs | 4 + fs-index/src/serde.rs | 123 ++++++ fs-index/src/tests.rs | 918 +++++++++++++++++++++++---------------- fs-index/src/utils.rs | 69 +++ 6 files changed, 1105 insertions(+), 985 deletions(-) create mode 100644 fs-index/src/serde.rs create mode 100644 fs-index/src/utils.rs diff --git a/fs-index/Cargo.toml b/fs-index/Cargo.toml index 0ae00b3f..0ded09dd 100644 --- a/fs-index/Cargo.toml +++ b/fs-index/Cargo.toml @@ -12,9 +12,8 @@ bench = false log = { version = "0.4.17", features = ["release_max_level_off"] } walkdir = "2.3.2" anyhow = "1.0.58" -canonical-path = "2.0.2" -pathdiff = "0.2.1" -itertools = "0.10.5" +serde_json = "1.0" +serde = { version = "1.0", features = ["derive"] } fs-storage = { path = "../fs-storage" } @@ -29,7 +28,6 @@ criterion = { version = "0.5", features = ["html_reports"] } tempfile = "3.10" # Depending on `dev-hash` for testing dev-hash = { path = "../dev-hash" } -fs-atomic-versions = { path = "../fs-atomic-versions" } [[bench]] name = "resource_index_benchmark" diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs index 22aa7c43..1026e557 100644 --- a/fs-index/src/index.rs +++ b/fs-index/src/index.rs @@ -1,680 +1,430 @@ -use anyhow::anyhow; -use canonical_path::{CanonicalPath, CanonicalPathBuf}; -use itertools::Itertools; use std::{ - collections::{HashMap, HashSet}, - fs::{self, File, Metadata}, - io::{BufRead, BufReader, Write}, - ops::Add, + collections::HashMap, + fs, + hash::Hash, path::{Path, PathBuf}, - time::{Duration, SystemTime, UNIX_EPOCH}, + time::SystemTime, }; -use walkdir::{DirEntry, WalkDir}; +use anyhow::anyhow; use log; +use serde::{Deserialize, Serialize}; +use walkdir::WalkDir; use data_error::{ArklibError, Result}; use data_resource::ResourceId; use fs_storage::{ARK_FOLDER, INDEX_PATH}; -#[derive(Eq, Ord, PartialEq, PartialOrd, Hash, Clone, Debug)] -pub struct IndexEntry { - pub modified: SystemTime, +use crate::utils::should_index; + +/// Represents a resource in the index +#[derive( + PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Debug, Serialize, Deserialize, +)] +pub struct IndexedResource { + /// The unique identifier of the resource pub id: Id, + /// The path of the resource, relative to the root path + pub path: PathBuf, + /// The last modified time of the resource (from the file system metadata) + pub last_modified: SystemTime, } -#[derive(PartialEq, Clone, Debug)] -pub struct ResourceIndex { - pub id2path: HashMap, - pub path2id: HashMap>, - - pub collisions: HashMap, - pub root: PathBuf, +/// Represents the index of resources in a directory. +/// +/// [`ResourceIndex`] provides functionality for managing a directory index, +/// including tracking changes, and querying resources. +/// +/// #### Reactive API +/// - [`ResourceIndex::update_all`]: Method to update the index by rescanning +/// files and returning changes (additions/deletions/updates). +/// +/// #### Snapshot API +/// - [`ResourceIndex::get_resources_by_id`]: Query resources from the index by +/// ID. +/// - [`ResourceIndex::get_resource_by_path`]: Query a resource from the index +/// by its path. +/// +/// #### Track API +/// Allows for fine-grained control over tracking changes in the index +/// - [`ResourceIndex::track_addition`]: Track a newly added file (checks if the +/// file exists in the file system). +/// - [`ResourceIndex::track_removal`]: Track the deletion of a file (checks if +/// the file was actually deleted). +/// - [`ResourceIndex::track_modification`]: Track an update on a single file. +/// +/// ## Examples +/// ```no_run +/// use std::path::Path; +/// use fs_index::{ResourceIndex, load_or_build_index}; +/// use dev_hash::Crc32; +/// +/// // Define the root path +/// let root_path = Path::new("animals"); +/// +/// // Build the index +/// let index: ResourceIndex = ResourceIndex::build(root_path).expect("Failed to build index"); +/// // Store the index +/// index.store().expect("Failed to store index"); +/// +/// // Load the stored index +/// let mut loaded_index: ResourceIndex = load_or_build_index(root_path, false).expect("Failed to load index"); +/// +/// // Update the index +/// loaded_index.update_all().expect("Failed to update index"); +/// +/// // Get a resource by path +/// let _resource = loaded_index +/// .get_resource_by_path("cat.txt") +/// .expect("Resource not found"); +/// +/// // Track the removal of a file +/// loaded_index +/// .track_removal(Path::new("cat.txt")) +/// .expect("Failed to track removal"); +/// +/// // Track the addition of a new file +/// loaded_index +/// .track_addition(Path::new("dog.txt")) +/// .expect("Failed to track addition"); +/// +/// // Track the modification of a file +/// loaded_index +/// .track_modification(Path::new("dog.txt")) +/// .expect("Failed to track modification"); +/// ``` +#[derive(Clone, Debug)] +pub struct ResourceIndex +where + Id: Eq + Hash, +{ + /// The root path of the index (canonicalized) + pub(crate) root: PathBuf, + /// A map from resource IDs to resources + /// + /// Multiple resources can have the same ID (e.g., due to hash collisions + /// or files with the same content) + pub(crate) id_to_resources: HashMap>>, + /// A map from resource paths to resources + pub(crate) path_to_resource: HashMap>, } +/// Represents the result of an update operation on the ResourceIndex #[derive(PartialEq, Debug)] pub struct IndexUpdate { - pub deleted: HashSet, - pub added: HashMap, + /// Resources that were added during the update + pub added: Vec>, + /// Resources that were modified during the update + pub modified: Vec>, + /// Resources that were removed during the update + pub removed: Vec>, } -pub const RESOURCE_UPDATED_THRESHOLD: Duration = Duration::from_millis(1); - -pub type Paths = HashSet; - impl ResourceIndex { - pub fn size(&self) -> usize { - //the actual size is lower in presence of collisions - self.path2id.len() + /// Return the number of resources in the index + pub fn len(&self) -> usize { + self.path_to_resource.len() } - pub fn build>(root_path: P) -> Self { - log::info!("Building the index from scratch"); - let root_path: PathBuf = root_path.as_ref().to_owned(); - - let entries = discover_paths(&root_path); - let entries = scan_entries(entries); - - let mut index = ResourceIndex { - id2path: HashMap::new(), - path2id: HashMap::new(), - collisions: HashMap::new(), - root: root_path, - }; - - for (path, entry) in entries { - index.insert_entry(path, entry); - } - - log::info!("Index built"); - index + /// Return true if the index is empty + pub fn is_empty(&self) -> bool { + self.path_to_resource.is_empty() } - pub fn load>(root_path: P) -> Result { - let root_path: PathBuf = root_path.as_ref().to_owned(); - - let index_path: PathBuf = root_path.join(ARK_FOLDER).join(INDEX_PATH); - log::info!("Loading the index from file {}", index_path.display()); - let file = File::open(&index_path)?; - let mut index = ResourceIndex { - id2path: HashMap::new(), - path2id: HashMap::new(), - collisions: HashMap::new(), - root: root_path.clone(), - }; - - // We should not return early in case of missing files - let lines = BufReader::new(file).lines(); - for line in lines { - let line = line?; - - let mut parts = line.split(' '); - - let modified = { - let str = parts.next().ok_or(ArklibError::Parse)?; - UNIX_EPOCH.add(Duration::from_millis( - str.parse().map_err(|_| ArklibError::Parse)?, - )) - }; - - let id = { - let str = parts.next().ok_or(ArklibError::Parse)?; - Id::from_str(str).map_err(|_| ArklibError::Parse)? - }; + /// Return the root path of the index + pub fn root(&self) -> &Path { + &self.root + } - let path: String = - itertools::Itertools::intersperse(parts, " ").collect(); - let path: PathBuf = root_path.join(Path::new(&path)); - match CanonicalPathBuf::canonicalize(&path) { - Ok(path) => { - log::trace!("[load] {} -> {}", id, path.display()); - index.insert_entry(path, IndexEntry { modified, id }); - } - Err(_) => { - log::warn!("File {} not found", path.display()); - continue; - } - } - } + /// Return the resources in the index + pub fn resources(&self) -> Vec> { + // Using path_to_resource so to avoid not collecting duplicates + self.path_to_resource.values().cloned().collect() + } - Ok(index) + /// Return the ID collisions + pub fn collisions(&self) -> HashMap>> { + // Filter out IDs with only one resource + self.id_to_resources + .iter() + .filter(|(_, resources)| resources.len() > 1) + .map(|(id, resources)| (id.clone(), resources.clone())) + .collect() } + /// Save the index to the file system (as a JSON file in + /// /ARK_FOLDER/INDEX_PATH) pub fn store(&self) -> Result<()> { - log::info!("Storing the index to file"); - - let start = SystemTime::now(); + let ark_folder = self.root.join(ARK_FOLDER); + let index_path = ark_folder.join(INDEX_PATH); + log::debug!("Storing index at: {:?}", index_path); - let index_path = self - .root - .to_owned() - .join(ARK_FOLDER) - .join(INDEX_PATH); + fs::create_dir_all(&ark_folder)?; + let index_file = fs::File::create(index_path)?; + serde_json::to_writer_pretty(index_file, self)?; - let ark_dir = index_path.parent().unwrap(); - fs::create_dir_all(ark_dir)?; - - let mut file = File::create(index_path)?; + Ok(()) + } - let mut path2id: Vec<(&CanonicalPathBuf, &IndexEntry)> = - self.path2id.iter().collect(); - path2id.sort_by_key(|(_, entry)| *entry); + /// Get resources by their ID + /// + /// Returns None if there is no resource with the given ID + /// + /// **Note**: This can return multiple resources with the same ID in case of + /// hash collisions or files with the same content + pub fn get_resources_by_id( + &self, + id: Id, + ) -> Option<&Vec>> { + self.id_to_resources.get(&id) + } - for (path, entry) in path2id.iter() { - log::trace!("[store] {} by path {}", entry.id, path.display()); + /// Get a resource by its path + /// + /// Returns None if the resource does not exist + /// + /// **Note**: The path should be relative to the root path + pub fn get_resource_by_path>( + &self, + path: P, + ) -> Option<&IndexedResource> { + self.path_to_resource.get(path.as_ref()) + } - let timestamp = entry - .modified - .duration_since(UNIX_EPOCH) - .map_err(|_| { - ArklibError::Other(anyhow!("Error using duration since")) - })? - .as_millis(); + /// Build a new index from the given root path + pub fn build>(root_path: P) -> Result { + log::debug!("Building index at root path: {:?}", root_path.as_ref()); - let path = - pathdiff::diff_paths(path.to_str().unwrap(), self.root.clone()) - .ok_or(ArklibError::Path( - "Couldn't calculate path diff".into(), - ))?; + // Canonicalize the root path + let root = fs::canonicalize(&root_path)?; + let mut id_to_resources = HashMap::new(); + let mut path_to_resource = HashMap::new(); - writeln!(file, "{} {} {}", timestamp, entry.id, path.display())?; + // Loop through the root path and add resources to the index + let walker = WalkDir::new(&root) + .min_depth(1) // Skip the root directory + .into_iter() + .filter_entry(should_index); // Skip hidden files + for entry in walker { + let entry = entry.map_err(|e| { + ArklibError::Path(format!("Error walking directory: {}", e)) + })?; + // Ignore directories + if !entry.file_type().is_file() { + continue; + } + let path = entry.path(); + let metadata = fs::metadata(path)?; + let last_modified = metadata.modified()?; + let id = Id::from_path(path)?; + // Path is relative to the root + let path = path.strip_prefix(&root).map_err(|_| { + ArklibError::Path("Error stripping prefix".to_string()) + })?; + + // Create the resource and add it to the index + let resource = IndexedResource { + id: id.clone(), + path: path.to_path_buf(), + last_modified, + }; + path_to_resource.insert(resource.path.clone(), resource.clone()); + id_to_resources + .entry(id) + .or_insert_with(Vec::new) + .push(resource); } - log::trace!( - "Storing the index took {:?}", - start - .elapsed() - .map_err(|_| ArklibError::Other(anyhow!("SystemTime error"))) - ); - Ok(()) + Ok(ResourceIndex { + root, + id_to_resources, + path_to_resource, + }) } - pub fn provide>(root_path: P) -> Result { - match Self::load(&root_path) { - Ok(mut index) => { - log::debug!("Index loaded: {} entries", index.path2id.len()); - - match index.update_all() { - Ok(update) => { - log::debug!( - "Index updated: {} added, {} deleted", - update.added.len(), - update.deleted.len() - ); - } - Err(e) => { - log::error!( - "Failed to update index: {}", - e.to_string() - ); - } - } - - if let Err(e) = index.store() { - log::error!("{}", e.to_string()); + /// Update the index with the latest information from the file system + pub fn update_all(&mut self) -> Result> { + log::debug!("Updating index at root path: {:?}", self.root); + + let mut added = Vec::new(); + let mut modified = Vec::new(); + let mut removed = Vec::new(); + + let new_index = ResourceIndex::build(&self.root)?; + + // Compare the new index with the old index + let current_resources = self.resources(); + let new_resources = new_index.resources(); + for resource in new_resources.clone() { + // If the resource is in the old index, check if it has been + // modified + if let Some(current_resource) = + self.get_resource_by_path(&resource.path) + { + if current_resource != &resource { + modified.push(resource.clone()); } - Ok(index) } - Err(e) => { - log::warn!("{}", e.to_string()); - Ok(Self::build(root_path)) + // If the resource is not in the old index, it has been added + else { + added.push(resource.clone()); } } - } - - pub fn update_all(&mut self) -> Result> { - log::debug!("Updating the index"); - log::trace!("[update] known paths: {:?}", self.path2id.keys()); - - let curr_entries = discover_paths(self.root.clone()); - - //assuming that collections manipulation is - // quicker than asking `path.exists()` for every path - let curr_paths: Paths = curr_entries.keys().cloned().collect(); - let prev_paths: Paths = self.path2id.keys().cloned().collect(); - let preserved_paths: Paths = curr_paths - .intersection(&prev_paths) - .cloned() - .collect(); - - let created_paths: HashMap = curr_entries - .iter() - .filter_map(|(path, entry)| { - if !preserved_paths.contains(path.as_canonical_path()) { - Some((path.clone(), entry.clone())) - } else { - None - } - }) - .collect(); - - log::debug!("Checking updated paths"); - let updated_paths: HashMap = curr_entries - .into_iter() - .filter(|(path, dir_entry)| { - if !preserved_paths.contains(path.as_canonical_path()) { - false - } else { - let our_entry = &self.path2id[path]; - let prev_modified = our_entry.modified; - - let result = dir_entry.metadata(); - match result { - Err(msg) => { - log::error!( - "Couldn't retrieve metadata for {}: {}", - &path.display(), - msg - ); - false - } - Ok(metadata) => match metadata.modified() { - Err(msg) => { - log::error!( - "Couldn't retrieve timestamp for {}: {}", - &path.display(), - msg - ); - false - } - Ok(curr_modified) => { - let elapsed = curr_modified - .duration_since(prev_modified) - .unwrap(); - - let was_updated = - elapsed >= RESOURCE_UPDATED_THRESHOLD; - if was_updated { - log::trace!( - "[update] modified {} by path {} - \twas {:?} - \tnow {:?} - \telapsed {:?}", - our_entry.id, - path.display(), - prev_modified, - curr_modified, - elapsed - ); - } - - was_updated - } - }, - } - } - }) - .collect(); - - let mut deleted: HashSet = HashSet::new(); - - // treating both deleted and updated paths as deletions - prev_paths - .difference(&preserved_paths) - .cloned() - .chain(updated_paths.keys().cloned()) - .for_each(|path| { - if let Some(entry) = - self.path2id.remove(path.as_canonical_path()) - { - let k = self.collisions.remove(&entry.id).unwrap_or(1); - if k > 1 { - self.collisions.insert(entry.id, k - 1); - } else { - log::trace!( - "[delete] {} by path {}", - entry.id, - path.display() - ); - self.id2path.remove(&entry.id); - deleted.insert(entry.id); - } - } else { - log::warn!("Path {} was not known", path.display()); - } - }); - - let added: HashMap> = - scan_entries(updated_paths) - .into_iter() - .chain({ - log::debug!("Checking added paths"); - scan_entries(created_paths).into_iter() - }) - .filter(|(_, entry)| !self.id2path.contains_key(&entry.id)) - .collect(); - - for (path, entry) in added.iter() { - if deleted.contains(&entry.id) { - // emitting the resource as both deleted and added - // (renaming a duplicate might remain undetected) - log::trace!( - "[update] moved {} to path {}", - entry.id, - path.display() - ); + for resource in current_resources { + // If the resource is not in the new index, it has been removed + if !new_resources.contains(&resource) { + removed.push(resource.clone()); } - - self.insert_entry(path.clone(), entry.clone()); } - let added: HashMap = added - .into_iter() - .map(|(path, entry)| (path, entry.id)) - .collect(); - - Ok(IndexUpdate { deleted, added }) + // Update the index with the new index and return the result + *self = new_index; + Ok(IndexUpdate { + added, + modified, + removed, + }) } - // the caller must ensure that: - // * the index is up-to-date except this single path - // * the path hasn't been indexed before - pub fn index_new( + /// Track the addition of a newly added file to the resource index. + /// + /// This method checks if the file exists in the file system. + /// + /// # Arguments + /// * `relative_path` - The path of the file to be added (relative to the + /// root path of the index). + /// + /// # Returns + /// Returns `Ok(resource)` if the file was successfully added to the index. + /// + /// # Errors + /// - If the file does not exist in the file system. + /// - If there was an error calculating the checksum of the file. + pub fn track_addition>( &mut self, - path: &dyn AsRef, - ) -> Result> { - log::debug!("Indexing a new path"); - - if !path.as_ref().exists() { - return Err(ArklibError::Path( - "Absent paths cannot be indexed".into(), - )); + relative_path: P, + ) -> Result> { + log::debug!("Tracking addition of file: {:?}", relative_path.as_ref()); + + let path = relative_path.as_ref(); + let full_path = self.root.join(path); + if !full_path.exists() { + return Err(ArklibError::Path(format!( + "File does not exist: {:?}", + full_path + )) + .into()); } - - let path_buf = CanonicalPathBuf::canonicalize(path)?; - let path = path_buf.as_canonical_path(); - - return match fs::metadata(path) { - Err(_) => { - return Err(ArklibError::Path( - "Couldn't to retrieve file metadata".into(), - )); - } - Ok(metadata) => match scan_entry(path, metadata) { - Err(_) => { - return Err(ArklibError::Path( - "The path points to a directory or empty file".into(), - )); - } - Ok(new_entry) => { - let id = new_entry.clone().id; - - if let Some(nonempty) = self.collisions.get_mut(&id) { - *nonempty += 1; - } - - let mut added = HashMap::new(); - added.insert(path_buf.clone(), id.clone()); - - self.id2path.insert(id, path_buf.clone()); - self.path2id.insert(path_buf, new_entry); - - Ok(IndexUpdate { - added, - deleted: HashSet::new(), - }) - } - }, + let metadata = fs::metadata(&full_path)?; + let last_modified = metadata.modified()?; + let id = Id::from_path(&full_path)?; + + let resource = IndexedResource { + id: id.clone(), + path: path.to_path_buf(), + last_modified, }; + self.path_to_resource + .insert(resource.path.clone(), resource.clone()); + self.id_to_resources + .entry(id) + .or_default() + .push(resource.clone()); + + Ok(resource) } - // the caller must ensure that: - // * the index is up-to-date except this single path - // * the path has been indexed before - // * the path maps into `old_id` - // * the content by the path has been modified - pub fn update_one( + /// Track the removal of a file from the resource index. + /// + /// This method checks if the file exists in the file system + /// + /// # Arguments + /// * `relative_path` - The path of the file to be removed (relative to the + /// root path of the index). + /// + /// # Returns + /// Returns `Ok(resource)` if the resource was successfully removed from the + /// index. + /// + /// # Errors + /// - If the file still exists in the file system. + /// - If the resource does not exist in the index. + pub fn track_removal>( &mut self, - path: &dyn AsRef, - old_id: Id, - ) -> Result> { - log::debug!("Updating a single entry in the index"); - - if !path.as_ref().exists() { - return self.forget_id(old_id); + relative_path: P, + ) -> Result> { + log::debug!("Tracking removal of file: {:?}", relative_path.as_ref()); + + let path = relative_path.as_ref(); + let full_path = self.root.join(path); + if full_path.exists() { + return Err(ArklibError::Path(format!( + "File still exists: {:?}", + full_path + )) + .into()); } - let path_buf = CanonicalPathBuf::canonicalize(path)?; - let path = path_buf.as_canonical_path(); - - log::trace!( - "[update] paths {:?} has id {:?}", - path, - self.path2id[path] - ); - - return match fs::metadata(path) { - Err(_) => { - // updating the index after resource removal - // is a correct scenario - self.forget_path(path, old_id) - } - Ok(metadata) => { - match scan_entry(path, metadata) { - Err(_) => { - // a directory or empty file exists by the path - self.forget_path(path, old_id) - } - Ok(new_entry) => { - // valid resource exists by the path - - let curr_entry = &self.path2id.get(path); - if curr_entry.is_none() { - // if the path is not indexed, then we can't have - // `old_id` if you want - // to index new path, use `index_new` method - return Err(ArklibError::Path( - "Couldn't find the path in the index".into(), - )); - } - let curr_entry = curr_entry.unwrap(); - - if curr_entry.id == new_entry.id { - // in rare cases we are here due to hash collision - if curr_entry.modified == new_entry.modified { - log::warn!("path {:?} was not modified", &path); - } else { - log::warn!("path {:?} was modified but not its content", &path); - } - - // the caller must have ensured that the path was - // indeed update - return Err(ArklibError::Collision( - "New content has the same id".into(), - )); - } - - // new resource exists by the path - self.forget_path(path, old_id).map(|mut update| { - update - .added - .insert(path_buf.clone(), new_entry.clone().id); - self.insert_entry(path_buf, new_entry); - - update - }) - } - } + // Remove the resource from the index + let resource = self + .path_to_resource + .remove(path) + .ok_or_else(|| anyhow!("Resource not found: {}", path.display()))?; + + // Remove the resource from the id_to_resources map + if let Some(resources) = self.id_to_resources.get_mut(&resource.id) { + resources.retain(|r| r.path != resource.path); + if resources.is_empty() { + self.id_to_resources.remove(&resource.id); } - }; - } - - pub fn forget_id(&mut self, old_id: Id) -> Result> { - let old_path = self - .path2id - .drain() - .filter_map(|(k, v)| { - if v.id == old_id { - Some(k) - } else { - None - } - }) - .collect_vec(); - for p in old_path { - self.path2id.remove(&p); - } - self.id2path.remove(&old_id); - let mut deleted = HashSet::new(); - deleted.insert(old_id); - - Ok(IndexUpdate { - added: HashMap::new(), - deleted, - }) - } - - fn insert_entry(&mut self, path: CanonicalPathBuf, entry: IndexEntry) { - log::trace!("[add] {} by path {}", entry.id, path.display()); - let id = entry.clone().id; - - if let std::collections::hash_map::Entry::Vacant(e) = - self.id2path.entry(id.clone()) - { - e.insert(path.clone()); - } else if let Some(nonempty) = self.collisions.get_mut(&id) { - *nonempty += 1; - } else { - self.collisions.insert(id, 2); } - self.path2id.insert(path, entry); + Ok(resource) } - fn forget_path( + /// Track the modification of a file in the resource index. + /// + /// This method checks if the file exists in the file system and removes the + /// old resource from the index before adding the new resource to the + /// index. + /// + /// # Arguments + /// * `relative_path` - The relative path of the file to be modified. + /// + /// # Returns + /// Returns `Ok(new_resource)` if the resource was successfully modified in + /// the index. + /// + /// # Errors + /// - If there was a problem removing the old resource from the index. + /// - If there was a problem adding the new resource to the index. + pub fn track_modification>( &mut self, - path: &CanonicalPath, - old_id: Id, - ) -> Result> { - self.path2id.remove(path); - - if let Some(collisions) = self.collisions.get_mut(&old_id) { - debug_assert!( - *collisions > 1, - "Any collision must involve at least 2 resources" - ); - *collisions -= 1; - - if *collisions == 1 { - self.collisions.remove(&old_id); - } + relative_path: P, + ) -> Result> { + log::debug!( + "Tracking modification of file: {:?}", + relative_path.as_ref() + ); - // minor performance issue: - // we must find path of one of the collided - // resources and use it as new value - let maybe_collided_path = - self.path2id.iter().find_map(|(path, entry)| { - if entry.id == old_id { - Some(path) - } else { - None - } - }); - - if let Some(collided_path) = maybe_collided_path { - let old_path = self - .id2path - .insert(old_id.clone(), collided_path.clone()); - - debug_assert_eq!( - old_path.unwrap().as_canonical_path(), - path, - "Must forget the requested path" - ); - } else { - return Err(ArklibError::Collision( - "Illegal state of collision tracker".into(), - )); + let path = relative_path.as_ref(); + // Remove the resource from the index + let resource = self + .path_to_resource + .remove(path) + .ok_or_else(|| anyhow!("Resource not found: {}", path.display()))?; + + // Remove the resource from the id_to_resources map + if let Some(resources) = self.id_to_resources.get_mut(&resource.id) { + resources.retain(|r| r.path != resource.path); + if resources.is_empty() { + self.id_to_resources.remove(&resource.id); } - } else { - self.id2path.remove(&old_id.clone()); } - let mut deleted = HashSet::new(); - deleted.insert(old_id); - - Ok(IndexUpdate { - added: HashMap::new(), - deleted, - }) - } -} - -pub(crate) fn discover_paths>( - root_path: P, -) -> HashMap { - log::debug!( - "Discovering all files under path {}", - root_path.as_ref().display() - ); - - WalkDir::new(root_path) - .into_iter() - .filter_entry(|entry| !is_hidden(entry)) - .filter_map(|result| match result { - Ok(entry) => { - let path = entry.path(); - if !entry.file_type().is_dir() { - match CanonicalPathBuf::canonicalize(path) { - Ok(canonical_path) => Some((canonical_path, entry)), - Err(msg) => { - log::warn!( - "Couldn't canonicalize {}:\n{}", - path.display(), - msg - ); - None - } - } - } else { - None - } - } - Err(msg) => { - log::error!("Error during walking: {}", msg); - None - } - }) - .collect() -} - -fn scan_entry( - path: &CanonicalPath, - metadata: Metadata, -) -> Result> -where - Id: ResourceId, -{ - if metadata.is_dir() { - return Err(ArklibError::Path("Path is expected to be a file".into())); - } + // Add the new resource to the index + let new_resource = self.track_addition(path)?; - let size = metadata.len(); - if size == 0 { - Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Empty resource", - ))?; + Ok(new_resource) } - - let id = Id::from_path(path)?; - let modified = metadata.modified()?; - - Ok(IndexEntry { modified, id }) -} - -fn scan_entries( - entries: HashMap, -) -> HashMap> -where - Id: ResourceId, -{ - entries - .into_iter() - .filter_map(|(path_buf, entry)| { - let metadata = entry.metadata().ok()?; - - let path = path_buf.as_canonical_path(); - let result = scan_entry(path, metadata); - match result { - Err(msg) => { - log::error!( - "Couldn't retrieve metadata for {}:\n{}", - path.display(), - msg - ); - None - } - Ok(entry) => Some((path_buf, entry)), - } - }) - .collect() -} - -fn is_hidden(entry: &DirEntry) -> bool { - entry - .file_name() - .to_str() - .map(|s| s.starts_with('.')) - .unwrap_or(false) } diff --git a/fs-index/src/lib.rs b/fs-index/src/lib.rs index bf280e99..be68c43e 100644 --- a/fs-index/src/lib.rs +++ b/fs-index/src/lib.rs @@ -1,4 +1,8 @@ pub mod index; +mod serde; +mod utils; + +pub use utils::load_or_build_index; #[cfg(test)] mod tests; diff --git a/fs-index/src/serde.rs b/fs-index/src/serde.rs new file mode 100644 index 00000000..9d16e8f0 --- /dev/null +++ b/fs-index/src/serde.rs @@ -0,0 +1,123 @@ +use std::{collections::HashMap, path::PathBuf, time::SystemTime}; + +use anyhow::Result; +use serde::{ + ser::{SerializeStruct, Serializer}, + Deserialize, Serialize, +}; + +use data_resource::ResourceId; + +use crate::{index::IndexedResource, ResourceIndex}; + +/// Data structure for serializing and deserializing the index +#[derive(Serialize, Deserialize)] +struct ResourceIndexData { + root: PathBuf, + resources: HashMap>, +} + +#[derive(Serialize, Deserialize)] +struct IndexedResourceData { + id: Id, + last_modified: u64, +} + +/// Custom implementation of [`Serialize`] for [`ResourceIndex`] +/// +/// To avoid writing a large repetitive index file with double maps, +/// we are only serializing the root path, and path_to_resource. +/// +/// Other fields can be reconstructed from the path_to_resource map. +impl Serialize for ResourceIndex +where + Id: ResourceId, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_struct("ResourceIndex", 2)?; + state.serialize_field("root", &self.root)?; + + let mut resources = HashMap::new(); + for (path, resource) in &self.path_to_resource { + let id = resource.id.clone(); + let last_modified = resource + .last_modified + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| { + serde::ser::Error::custom(format!( + "Failed to serialize last_modified: {}", + e + )) + })? + .as_nanos() as u64; + + let resource_data = IndexedResourceData { id, last_modified }; + resources.insert(path.clone(), resource_data); + } + + state.serialize_field("resources", &resources)?; + state.end() + } +} + +/// Custom implementation of [`Deserialize`] for [`ResourceIndex`] +/// +/// Deserializes the index from the root path and path_to_resource map. +/// Other fields are reconstructed from the path_to_resource map. +impl<'de, Id> Deserialize<'de> for ResourceIndex +where + Id: ResourceId, +{ + fn deserialize(deserializer: D) -> Result, D::Error> + where + D: serde::Deserializer<'de>, + { + let index_data: ResourceIndexData = + ResourceIndexData::deserialize(deserializer)?; + + let mut path_to_resource = HashMap::new(); + let mut id_to_resources = HashMap::new(); + for (path, resource_data) in index_data.resources { + let last_modified = SystemTime::UNIX_EPOCH + + std::time::Duration::from_nanos(resource_data.last_modified); + let resource = IndexedResource { + id: resource_data.id, + path: path.clone(), + last_modified, + }; + path_to_resource.insert(path, resource.clone()); + id_to_resources + .entry(resource.id.clone()) + .or_insert_with(Vec::new) + .push(resource); + } + + Ok(ResourceIndex { + root: index_data.root, + id_to_resources, + path_to_resource, + }) + } +} + +/// Custom implementation of [`PartialEq`] for [`ResourceIndex`] +/// +/// The order of items in hashmaps is not relevant. +/// we just need to compare [`ResourceIndex::resources`] to check if the two +/// indexes are equal. +impl PartialEq for ResourceIndex +where + Id: ResourceId, +{ + fn eq(&self, other: &Self) -> bool { + let mut resources1 = self.resources(); + let mut resources2 = other.resources(); + resources1.sort_by(|a, b| a.path.cmp(&b.path)); + resources2.sort_by(|a, b| a.path.cmp(&b.path)); + + resources1 == resources2 && self.root == other.root + } +} diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs index 9eed0158..ae9c20eb 100644 --- a/fs-index/src/tests.rs +++ b/fs-index/src/tests.rs @@ -1,427 +1,603 @@ -use crate::{ - index::{discover_paths, IndexEntry}, - ResourceIndex, -}; -use canonical_path::CanonicalPathBuf; +use std::{fs, path::Path}; + +use anyhow::{anyhow, Result}; +use tempfile::TempDir; + +use data_resource::ResourceId; use dev_hash::Crc32; -use fs_atomic_versions::initialize; -use std::fs::File; -#[cfg(target_family = "unix")] -use std::fs::Permissions; -#[cfg(target_family = "unix")] -use std::os::unix::fs::PermissionsExt; -use std::{path::PathBuf, time::SystemTime}; -use uuid::Uuid; +use super::*; +use crate::{index::IndexedResource, utils::load_or_build_index}; + +/// A helper function to get [`IndexedResource`] from a file path +fn get_indexed_resource_from_file>( + path: P, + parent_dir: P, +) -> Result> { + let id = Crc32::from_path(&path)?; + + let relative_path = path + .as_ref() + .strip_prefix(parent_dir) + .map_err(|_| anyhow!("Failed to get relative path"))?; + + Ok(IndexedResource { + id: id, + path: relative_path.into(), + last_modified: fs::metadata(&path)?.modified()?, + }) +} -const FILE_SIZE_1: u64 = 10; -const FILE_SIZE_2: u64 = 11; +/// Test storing and loading the resource index. +/// +/// ## Test scenario: +/// - Build a resource index in the temporary directory. +/// - Store the index. +/// - Load the stored index. +/// - Assert that the loaded index matches the original index. +#[test] +fn test_store_and_load_index() { + let temp_dir = TempDir::with_prefix("ark_test_store_and_load_index") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); -const FILE_NAME_1: &str = "test1.txt"; -const FILE_NAME_2: &str = "test2.txt"; -const FILE_NAME_3: &str = "test3.txt"; + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); -const CRC32_1: Crc32 = Crc32(3817498742); -const CRC32_2: Crc32 = Crc32(1804055020); + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1); + index.store().expect("Failed to store index"); -fn get_temp_dir() -> PathBuf { - create_dir_at(std::env::temp_dir()) -} + let loaded_index = + load_or_build_index(root_path, false).expect("Failed to load index"); -fn create_dir_at(path: PathBuf) -> PathBuf { - let mut dir_path = path.clone(); - dir_path.push(Uuid::new_v4().to_string()); - std::fs::create_dir(&dir_path).expect("Could not create temp dir"); - dir_path + assert_eq!(index, loaded_index); } -fn create_file_at( - path: PathBuf, - size: Option, - name: Option<&str>, -) -> (File, PathBuf) { - let mut file_path = path.clone(); - if let Some(file_name) = name { - file_path.push(file_name); - } else { - file_path.push(Uuid::new_v4().to_string()); - } - let file = - File::create(file_path.clone()).expect("Could not create temp file"); - file.set_len(size.unwrap_or(0)) - .expect("Could not set file size"); - (file, file_path) -} +/// Test storing and loading the resource index with collisions. +/// +/// ## Test scenario: +/// - Build a resource index in the temporary directory. +/// - Write duplicate files with the same content. +/// - Store the index. +/// - Load the stored index. +/// - Assert that the loaded index matches the original index. +#[test] +fn test_store_and_load_index_with_collisions() { + let temp_dir = + TempDir::with_prefix("ark_test_store_and_load_index_with_collisions") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); -fn run_test_and_clean_up(test: impl FnOnce(PathBuf) + std::panic::UnwindSafe) { - initialize(); - - let path = get_temp_dir(); - let result = std::panic::catch_unwind(|| test(path.clone())); - std::fs::remove_dir_all(path.clone()) - .expect("Could not clean up after test"); - if result.is_err() { - panic!("{}", result.err().map(|_| "Test panicked").unwrap()) - } - assert!(result.is_ok()); -} + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); -// resource index build + let file_path2 = root_path.join("file2.txt"); + fs::write(&file_path2, "file content").expect("Failed to write to file"); -#[test] -fn index_build_should_process_1_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 1); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 1); - }) -} + let file_path3 = root_path.join("file3.txt"); + fs::write(&file_path3, "file content").expect("Failed to write to file"); -#[test] -fn index_build_should_process_colliding_files_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 1); - assert_eq!(actual.size(), 2); - }) -} + let file_path4 = root_path.join("file4.txt"); + fs::write(&file_path4, "file content").expect("Failed to write to file"); -// resource index update + // Now we have 4 files with the same content (same checksum) -#[test] -fn update_all_should_handle_renamed_file_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - - // rename test2.txt to test3.txt - let mut name_from = path.clone(); - name_from.push(FILE_NAME_2); - let mut name_to = path.clone(); - name_to.push(FILE_NAME_3); - std::fs::rename(name_from, name_to) - .expect("Should rename file successfully"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 1); - }) + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + let checksum = + Crc32::from_path(&file_path).expect("Failed to get checksum"); + assert_eq!(index.len(), 4); + assert_eq!(index.collisions().len(), 1); + assert_eq!(index.collisions()[&checksum].len(), 4); + index.store().expect("Failed to store index"); + + let loaded_index = + load_or_build_index(root_path, false).expect("Failed to load index"); + + assert_eq!(index, loaded_index); } +/// Test building an index with a file. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains one entry. +/// - Assert that the resource retrieved by path matches the expected resource. +/// - Assert that the resource retrieved by ID matches the expected resource. #[test] -fn update_all_should_index_new_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, expected_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 2); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert!(actual.id2path.contains_key(&CRC32_2)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = CanonicalPathBuf::canonicalize(expected_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) +fn test_build_index_with_file() { + let temp_dir = TempDir::with_prefix("ark_test_build_index_with_file") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let expected_resource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1); + + let resource = index + .get_resource_by_path("file.txt") + .expect("Failed to get resource"); + assert_eq!(resource, &expected_resource); } +/// Test building an index with a directory. +/// +/// ## Test scenario: +/// - Create a subdirectory within the temporary directory. +/// - Create a file within the subdirectory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains one entry. +/// - Assert that the resource retrieved by path matches the expected resource. +/// - Assert that the resource retrieved by ID matches the expected resource. #[test] -fn index_new_should_index_new_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index - .index_new(&new_path) - .expect("Should update index correctly"); - - assert_eq!(index.root, path.clone()); - assert_eq!(index.path2id.len(), 2); - assert_eq!(index.id2path.len(), 2); - assert!(index.id2path.contains_key(&CRC32_1)); - assert!(index.id2path.contains_key(&CRC32_2)); - assert_eq!(index.collisions.len(), 0); - assert_eq!(index.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = CanonicalPathBuf::canonicalize(new_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) +fn test_build_index_with_directory() { + let temp_dir = TempDir::with_prefix("ark_test_build_index_with_directory") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let dir_path = root_path.join("dir"); + fs::create_dir(&dir_path).expect("Failed to create dir"); + let file_path = dir_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let expected_resource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1); + + let resource = index + .get_resource_by_path("dir/file.txt") + .expect("Failed to get resource"); + assert_eq!(resource, &expected_resource); } +/// Test building an index with multiple files. +/// +/// ## Test scenario: +/// - Create multiple files within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains two entries. +/// - Assert that the resource retrieved by path for each file matches the +/// expected resource. #[test] -fn update_one_should_error_on_new_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index = ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index.update_one(&new_path, CRC32_2); - - assert!(update.is_err()) - }) +fn test_build_index_with_multiple_files() { + let temp_dir = + TempDir::with_prefix("ark_test_build_index_with_multiple_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file1_path = root_path.join("file1.txt"); + fs::write(&file1_path, "file1 content").expect("Failed to write to file"); + let file2_path = root_path.join("file2.txt"); + fs::write(&file2_path, "file2 content").expect("Failed to write to file"); + + let expected_resource1 = + get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + let expected_resource2 = + get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 2); + + let resource = index + .get_resource_by_path("file1.txt") + .expect("Failed to get resource"); + assert_eq!(resource, &expected_resource1); + + let resource = index + .get_resource_by_path("file2.txt") + .expect("Failed to get resource"); + assert_eq!(resource, &expected_resource2); } +/// Test building an index with multiple directories. +/// +/// ## Test scenario: +/// - Create multiple directories within the temporary directory, each +/// containing a file. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains two entries. +/// - Assert that the resources retrieved by path for each file match the +/// expected resources. #[test] -fn update_one_should_index_delete_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - - let mut actual = ResourceIndex::build(path.clone()); - - let mut file_path = path.clone(); - file_path.push(FILE_NAME_1); - std::fs::remove_file(file_path.clone()) - .expect("Should remove file successfully"); - - let update = actual - .update_one(&file_path.clone(), CRC32_1) - .expect("Should update index successfully"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 0); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 0); - - assert!(update.deleted.contains(&CRC32_1)) - }) +fn test_build_index_with_multiple_directories() { + let temp_dir = + TempDir::with_prefix("ark_test_build_index_with_multiple_directories") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let dir1_path = root_path.join("dir1"); + fs::create_dir(&dir1_path).expect("Failed to create dir"); + let file1_path = dir1_path.join("file1.txt"); + fs::write(&file1_path, "file1 content").expect("Failed to write to file"); + + let dir2_path = root_path.join("dir2"); + fs::create_dir(&dir2_path).expect("Failed to create dir"); + let file2_path = dir2_path.join("file2.txt"); + fs::write(&file2_path, "file2 content").expect("Failed to write to file"); + + let expected_resource1 = + get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + let expected_resource2 = + get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 2); + + let resource = index + .get_resource_by_path("dir1/file1.txt") + .expect("Resource not found"); + assert_eq!(resource, &expected_resource1); + + let resource = index + .get_resource_by_path("dir2/file2.txt") + .expect("Resource not found"); + assert_eq!(resource, &expected_resource2); } +/// Test tracking the removal of a file from the index. +/// +/// ## Test scenario: +/// - Create two files within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains two entries. +/// - Remove one of the files. +/// - Track the removal of the file in the index. +/// - Assert that the index contains only one entry after removal. +/// - Assert that the removed file is no longer present in the index, while the +/// other file remains. #[test] -fn update_all_should_error_on_files_without_permissions() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - let (file, _) = - create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - #[cfg(target_family = "unix")] - file.set_permissions(Permissions::from_mode(0o222)) - .expect("Should be fine"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 0); - }) +fn test_track_removal() { + let temp_dir = TempDir::with_prefix("ark_test_track_removal") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let image_path = root_path.join("image.png"); + fs::write(&image_path, "image content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 2); + + fs::remove_file(&file_path).expect("Failed to remove file"); + + let file_relative_path = file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + index + .track_removal(&file_relative_path) + .expect("Failed to track removal"); + + assert_eq!(index.len(), 1); + assert!(index.get_resource_by_path("file.txt").is_none()); + assert!(index.get_resource_by_path("image.png").is_some()); } -// error cases - +/// Test tracking the removal of a file that doesn't exist. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains only one entry. +/// - Track the removal of a file that doesn't exist in the index. +/// - Assert that the index still contains only one entry. #[test] -fn update_one_should_not_update_absent_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) +fn test_track_removal_non_existent() { + let temp_dir = TempDir::with_prefix("ark_test_track_removal_non_existent") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + let new_file_path = root_path.join("new_file.txt"); + + let new_file_relative_path = new_file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + let removal_result = index.track_removal(&new_file_relative_path); + assert!(removal_result.is_err()); + assert_eq!(index.len(), 1); } +/// Test tracking the addition of a new file to the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains only one entry. +/// - Create a new file in the temporary directory. +/// - Track the addition of the new file in the index. +/// - Assert that the index contains two entries after addition. +/// - Assert that both files are present in the index. #[test] -fn update_one_should_index_new_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) +fn test_track_addition() { + let temp_dir = TempDir::with_prefix("ark_test_track_addition") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + + let new_file_relative_path = new_file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + index + .track_addition(&new_file_relative_path) + .expect("Failed to track addition"); + + assert_eq!(index.len(), 2); + assert!(index.get_resource_by_path("file.txt").is_some()); + assert!(index + .get_resource_by_path("new_file.txt") + .is_some()); } +/// Test for tracking addition of a file that doesn't exist +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains only one entry. +/// - Track the addition of a file that doesn't exist in the index. +/// - Assert that the index still contains only one entry. #[test] -fn should_not_index_empty_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(0), None); - let actual: ResourceIndex = ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) +fn test_track_addition_non_existent() { + let temp_dir = TempDir::with_prefix("ark_test_track_addition_non_existent") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + let new_file_path = root_path.join("new_file.txt"); + + let new_file_relative_path = new_file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + let addition_result = index.track_addition(&new_file_relative_path); + assert!(addition_result.is_err()); + assert_eq!(index.len(), 1); } +/// Test tracking the modification of a file in the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains only one entry. +/// - Update the content of the file. +/// - Track the modification of the file in the index. +/// - Assert that the index still contains only one entry. +/// - Assert that the modification timestamp of the file in the index matches +/// the actual file's modification timestamp. #[test] -fn should_not_index_hidden_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(".hidden")); - let actual: ResourceIndex = ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) +fn test_track_modification() { + let temp_dir = TempDir::with_prefix("ark_test_track_modification") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + + let file_relative_path = file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + index + .track_modification(&file_relative_path) + .expect("Failed to track modification"); + + assert_eq!(index.len(), 1); + let resource = index + .get_resource_by_path("file.txt") + .expect("Resource not found"); + assert_eq!( + resource.last_modified, + fs::metadata(&file_path) + .unwrap() + .modified() + .unwrap() + ); } +/// Test that track modification does not add a new file to the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains only one entry. +/// - Create a new file in the temporary directory. +/// - Track the modification of the new file in the index. +/// - Assert that the index still contains only one entry. #[test] -fn should_not_index_1_empty_directory() { - run_test_and_clean_up(|path| { - create_dir_at(path.clone()); - - let actual: ResourceIndex = ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) +fn test_track_modification_does_not_add() { + let temp_dir = + TempDir::with_prefix("ark_test_track_modification_does_not_add") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + + let new_file_relative_path = new_file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + + let modification_result = index.track_modification(&new_file_relative_path); + assert!(modification_result.is_err()); } +/// Test updating the resource index. +/// +/// ## Test scenario: +/// - Create files within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// - Create a new file, modify an existing file, and remove another file. +/// - Update the resource index. +/// - Assert that the index contains the expected number of entries after the +/// update. +/// - Assert that the entries in the index match the expected state after the +/// update. #[test] -fn discover_paths_should_not_walk_on_invalid_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let actual = discover_paths(missing_path); - assert_eq!(actual.len(), 0); - }) +fn test_resource_index_update() { + let temp_dir = TempDir::with_prefix("ark_test_resource_index_update") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let image_path = root_path.join("image.png"); + fs::write(&image_path, "image content").expect("Failed to write to file"); + + let mut index = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 2); + + // create new file + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + + // modify file + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + + // remove file + fs::remove_file(&image_path).expect("Failed to remove file"); + + index + .update_all() + .expect("Failed to update index"); + // Index now contains 2 resources (file.txt and new_file.txt) + assert_eq!(index.len(), 2); + + let resource = index + .get_resource_by_path("file.txt") + .expect("Resource not found"); + let expected_resource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + assert_eq!(resource, &expected_resource); + + let _resource = index + .get_resource_by_path("new_file.txt") + .expect("Resource not found"); + + assert!(index.get_resource_by_path("image.png").is_none()); } +/// Test adding colliding files to the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// - Create a new file with the same checksum as the existing file. +/// - Track the addition of the new file in the index. +/// - Assert that the index contains the expected number of entries after the +/// addition. +/// - Assert index.collisions contains the expected number of entries. #[test] -fn index_entry_order() { - let old1 = IndexEntry { - id: Crc32(2), - modified: SystemTime::UNIX_EPOCH, - }; - let old2 = IndexEntry { - id: Crc32(1), - modified: SystemTime::UNIX_EPOCH, - }; - - let new1 = IndexEntry { - id: Crc32(1), - modified: SystemTime::now(), - }; - let new2 = IndexEntry { - id: Crc32(2), - modified: SystemTime::now(), - }; - - assert_eq!(new1, new1); - assert_eq!(new2, new2); - assert_eq!(old1, old1); - assert_eq!(old2, old2); - - assert_ne!(new1, new2); - assert_ne!(new1, old1); - - assert!(new1 > old1); - assert!(new1 > old2); - assert!(new2 > old1); - assert!(new2 > old2); - assert!(new2 > new1); +fn test_add_colliding_files() { + let temp_dir = TempDir::with_prefix("ark_test_add_colliding_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "file content").expect("Failed to write to file"); + + let new_file_relative_path = new_file_path + .strip_prefix(root_path) + .expect("Failed to get relative path"); + index + .track_addition(&new_file_relative_path) + .expect("Failed to track addition"); + + assert_eq!(index.len(), 2); + assert_eq!(index.collisions().len(), 1); } -/// Test the performance of `ResourceIndex::build` on a specific directory. +/// Test that we don't index hidden files. /// -/// This test evaluates the performance of building a resource -/// index using the `ResourceIndex::build` method on a given directory. -/// It measures the time taken to build the resource index and prints the -/// number of collisions detected. +/// ## Test scenario: +/// - Create a hidden file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// (0) #[test] -fn test_build_resource_index() { - use std::time::Instant; - - let path = "../test-assets/"; // The path to the directory to index - assert!( - std::path::Path::new(path).is_dir(), - "The provided path is not a directory or does not exist" - ); - - let start_time = Instant::now(); - let index: ResourceIndex = ResourceIndex::build(path.to_string()); - let elapsed_time = start_time.elapsed(); - - println!("Number of paths: {}", index.id2path.len()); - println!("Number of resources: {}", index.id2path.len()); - println!("Number of collisions: {}", index.collisions.len()); - println!("Time taken: {:?}", elapsed_time); +fn test_hidden_files() { + let temp_dir = TempDir::with_prefix("ark_test_hidden_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join(".hidden_file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 0); } diff --git a/fs-index/src/utils.rs b/fs-index/src/utils.rs new file mode 100644 index 00000000..71a16180 --- /dev/null +++ b/fs-index/src/utils.rs @@ -0,0 +1,69 @@ +use std::{fs, io::BufReader, path::Path}; + +use data_error::{ArklibError, Result}; +use data_resource::ResourceId; +use fs_storage::{ARK_FOLDER, INDEX_PATH}; + +use crate::ResourceIndex; + +/// A helper function to check if the entry should be indexed (not hidden) +pub fn should_index(entry: &walkdir::DirEntry) -> bool { + !entry + .file_name() + .to_string_lossy() + .starts_with('.') +} + +/// Load the index from the file system +fn load_index, Id: ResourceId>( + root_path: P, +) -> Result> { + let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH); + let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path); + let index_file = fs::File::open(index_path)?; + let reader = BufReader::new(index_file); + let index = serde_json::from_reader(reader)?; + + Ok(index) +} + +/// Load the index from the file system, or build a new index if it doesn't +/// exist +/// +/// If `update` is true, the index will be updated and stored after loading +/// it. +pub fn load_or_build_index, Id: ResourceId>( + root_path: P, + update: bool, +) -> Result> { + log::debug!( + "Attempting to load or build index at root path: {:?}", + root_path.as_ref() + ); + + let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH); + let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path); + log::trace!("Index path: {:?}", index_path); + + if index_path.exists() { + log::trace!("Index file exists, loading index"); + + let mut index = load_index(root_path)?; + if update { + log::trace!("Updating loaded index"); + + index.update_all()?; + index.store()?; + } + Ok(index) + } else { + log::trace!("Index file does not exist, building index"); + + // Build a new index if it doesn't exist and store it + let index = ResourceIndex::build(root_path.as_ref())?; + index.store().map_err(|e| { + ArklibError::Path(format!("Failed to store index: {}", e)) + })?; + Ok(index) + } +} From 03849c90a39c87bcd488b79797905515732924d1 Mon Sep 17 00:00:00 2001 From: Tarek Date: Sun, 30 Jun 2024 19:51:31 +0300 Subject: [PATCH 6/6] feat(fs-index): implement new benchmark functions for fs-index Signed-off-by: Tarek --- fs-index/benches/resource_index_benchmark.rs | 83 ++++++++++++++++++-- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs index be113dca..fba6d048 100644 --- a/fs-index/benches/resource_index_benchmark.rs +++ b/fs-index/benches/resource_index_benchmark.rs @@ -28,22 +28,88 @@ fn resource_index_benchmark(c: &mut Criterion) { |b, path| { b.iter(|| { let index: ResourceIndex = - ResourceIndex::build(black_box(path)); - collisions_size = index.collisions.len(); + ResourceIndex::build(black_box(path)).unwrap(); + collisions_size = index.collisions().len(); }); }, ); println!("Collisions: {}", collisions_size); - // TODO: Benchmark `ResourceIndex::get_resource_by_id()` + // Benchmark `ResourceIndex::get_resources_by_id()` + let index: ResourceIndex = + ResourceIndex::build(benchmarks_dir).unwrap(); + let resources = index.resources(); + let resource_id = &resources.clone()[0].id; + group.bench_function("index_get_resource_by_id", |b| { + b.iter(|| { + let _resource = + index.get_resources_by_id(black_box(resource_id.clone())); + }); + }); + + // Benchmark `ResourceIndex::get_resource_by_path()` + let resource_path = &resources.clone()[0].path; + group.bench_function("index_get_resource_by_path", |b| { + b.iter(|| { + let _resource = + index.get_resource_by_path(black_box(resource_path.clone())); + }); + }); - // TODO: Benchmark `ResourceIndex::get_resource_by_path()` + // Benchmark `ResourceIndex::track_addition()` + let new_file = benchmarks_dir.join("new_file.txt"); + group.bench_function("index_track_addition", |b| { + b.iter(|| { + std::fs::File::create(&new_file).unwrap(); + std::fs::write(&new_file, "Hello, World!").unwrap(); + let mut index: ResourceIndex = + ResourceIndex::build(black_box(benchmarks_dir)).unwrap(); + let _addition_result = index.track_addition(&new_file).unwrap(); - // TODO: Benchmark `ResourceIndex::track_addition()` + // Cleanup + std::fs::remove_file(&new_file).unwrap(); + }); + }); - // TODO: Benchmark `ResourceIndex::track_deletion()` + // Benchmark `ResourceIndex::track_removal()` + let removed_file = benchmarks_dir.join("new_file.txt"); + group.bench_function("index_track_removal", |b| { + b.iter(|| { + std::fs::File::create(&removed_file).unwrap(); + std::fs::write(&removed_file, "Hello, World!").unwrap(); + let mut index: ResourceIndex = + ResourceIndex::build(black_box(benchmarks_dir)).unwrap(); + std::fs::remove_file(&removed_file).unwrap(); + let relative_path = removed_file + .strip_prefix(benchmarks_dir) + .unwrap() + .to_str() + .unwrap(); + let _removal_result = index.track_removal(&relative_path).unwrap(); + }); + }); - // TODO: Benchmark `ResourceIndex::track_update()` + // Benchmark `ResourceIndex::track_modification()` + let modified_file = benchmarks_dir.join("new_file.txt"); + group.bench_function("index_track_modification", |b| { + b.iter(|| { + std::fs::File::create(&modified_file).unwrap(); + std::fs::write(&modified_file, "Hello, World!").unwrap(); + let mut index: ResourceIndex = + ResourceIndex::build(black_box(benchmarks_dir)).unwrap(); + std::fs::write(&modified_file, "Hello, World! Modified").unwrap(); + let relative_path = modified_file + .strip_prefix(benchmarks_dir) + .unwrap() + .to_str() + .unwrap(); + let _modification_result = + index.track_modification(&relative_path).unwrap(); + + // Cleanup + std::fs::remove_file(&modified_file).unwrap(); + }); + }); // Benchmark `ResourceIndex::update_all()` @@ -69,7 +135,8 @@ fn resource_index_benchmark(c: &mut Criterion) { .unwrap(); } let mut index: ResourceIndex = - ResourceIndex::build(black_box(&update_all_benchmarks_dir)); + ResourceIndex::build(black_box(&update_all_benchmarks_dir)) + .unwrap(); update_all_files(&update_all_benchmarks_dir.to_path_buf()); let _update_result = index.update_all().unwrap();