diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0edf7006..94591f1c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,6 +22,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@stable with: + toolchain: nightly # nightly is required for fmt components: rustfmt, clippy - name: Check @@ -36,7 +37,9 @@ jobs: run: cargo build --verbose - name: Run tests - run: cargo test --verbose + run: | + cargo test --verbose + cargo test --verbose - name: Build Release run: cargo build --verbose --release @@ -68,7 +71,9 @@ jobs: run: cargo build --verbose --release - name: Run tests - run: cargo test --workspace --verbose + run: | + cargo test --workspace --verbose + cargo test --workspace --verbose - name: Install JDK uses: actions/setup-java@v4.2.1 @@ -97,7 +102,9 @@ jobs: run: cargo build --verbose --release - name: Run tests - run: cargo test --workspace --verbose + run: | + cargo test --workspace --verbose + cargo test --workspace --verbose - name: Install JDK uses: actions/setup-java@v4.2.1 diff --git a/.gitignore b/.gitignore index bc757308..00d8cf93 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ target Cargo.lock **/app_id -*.class \ No newline at end of file +*.class +**/.ark diff --git a/README.md b/README.md index 6cf0436d..8a472b50 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ cargo bench This command runs all benchmarks and generates a report in HTML format located at `target/criterion/report`. 
If you wish to run a specific benchmark, you can specify its name as an argument as in: ```bash -cargo bench index_build +cargo bench resource_index ``` ### Benchmarking Local Files @@ -97,10 +97,10 @@ To install `flamegraph`, run: cargo install flamegraph ``` -To generate a flame graph for `index_build_benchmark`, use the following command: +To generate a flame graph for `resource_index_benchmark`, use the following command: ```bash -cargo flamegraph --bench index_build_benchmark -o index_build_benchmark.svg -- --bench +cargo flamegraph --bench resource_index_benchmark -o resource_index_benchmark.svg -- --bench ``` > [!NOTE] diff --git a/ark-cli/src/commands/backup.rs b/ark-cli/src/commands/backup.rs index 398220bd..40d5d46d 100644 --- a/ark-cli/src/commands/backup.rs +++ b/ark-cli/src/commands/backup.rs @@ -1,5 +1,4 @@ -use std::io::Write; -use std::path::PathBuf; +use std::{io::Write, path::PathBuf}; use crate::{ create_dir_all, dir, discover_roots, home_dir, storages_exists, timestamp, diff --git a/ark-cli/src/commands/file/append.rs b/ark-cli/src/commands/file/append.rs index 46d0b132..91969928 100644 --- a/ark-cli/src/commands/file/append.rs +++ b/ark-cli/src/commands/file/append.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, Format, ResourceId, + models::storage::{Storage, StorageType}, + translate_storage, AppError, Format, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/insert.rs b/ark-cli/src/commands/file/insert.rs index ff9b1ac9..b60199e5 100644 --- a/ark-cli/src/commands/file/insert.rs +++ b/ark-cli/src/commands/file/insert.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, Format, ResourceId, + 
models::storage::{Storage, StorageType}, + translate_storage, AppError, Format, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/read.rs b/ark-cli/src/commands/file/read.rs index 8387d011..7b47d719 100644 --- a/ark-cli/src/commands/file/read.rs +++ b/ark-cli/src/commands/file/read.rs @@ -1,9 +1,8 @@ -use std::path::PathBuf; -use std::str::FromStr; +use std::{path::PathBuf, str::FromStr}; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, ResourceId, + models::storage::{Storage, StorageType}, + translate_storage, AppError, ResourceId, }; use data_error::ArklibError; diff --git a/ark-cli/src/commands/file/utils.rs b/ark-cli/src/commands/file/utils.rs index 8a3c4048..6cc993dd 100644 --- a/ark-cli/src/commands/file/utils.rs +++ b/ark-cli/src/commands/file/utils.rs @@ -1,6 +1,7 @@ -use crate::error::AppError; -use crate::models::key_value_to_str; -use crate::models::Format; +use crate::{ + error::AppError, + models::{key_value_to_str, Format}, +}; use data_error::Result as ArklibResult; use fs_atomic_versions::atomic::{modify, modify_json, AtomicFile}; diff --git a/ark-cli/src/commands/link/utils.rs b/ark-cli/src/commands/link/utils.rs index 122f7f1c..2d27ca9a 100644 --- a/ark-cli/src/commands/link/utils.rs +++ b/ark-cli/src/commands/link/utils.rs @@ -3,8 +3,7 @@ use data_link::Link; use std::path::PathBuf; use url::Url; -use crate::error::AppError; -use crate::util::provide_index; // Import your custom AppError type +use crate::{error::AppError, util::provide_index}; // Import your custom AppError type pub async fn create_link( root: &PathBuf, @@ -28,8 +27,17 @@ pub fn load_link( ) -> Result, AppError> { let path_from_index = id.clone().map(|id| { let index = provide_index(root); - index.id2path[&id].as_path().to_path_buf() + index + .get_resources_by_id(&id) + .map(|r| r[0].path().to_owned()) + .ok_or_else(|| { + AppError::IndexError(format!( + "Resource with id {} not found", + id + 
)) + }) }); + let path_from_index = path_from_index.transpose()?; let path_from_user = file_path; let path = match (path_from_user, path_from_index) { @@ -46,7 +54,7 @@ pub fn load_link( } } (Some(path), None) => Ok(path.to_path_buf()), - (None, Some(path)) => Ok(path), + (None, Some(path)) => Ok(path.to_path_buf()), (None, None) => Err(AppError::LinkLoadError( "Provide a path or id for request.".to_owned(), ))?, diff --git a/ark-cli/src/commands/list.rs b/ark-cli/src/commands/list.rs index bc557b42..ee86152b 100644 --- a/ark-cli/src/commands/list.rs +++ b/ark-cli/src/commands/list.rs @@ -1,5 +1,4 @@ -use std::io::Read; -use std::path::PathBuf; +use std::{io::Read, path::PathBuf}; use crate::{ provide_index, provide_root, read_storage_value, AppError, DateTime, @@ -76,15 +75,17 @@ impl List { .map_err(|_| { AppError::IndexError("Could not read index".to_owned()) })? - .path2id + .resources() .iter() - .filter_map(|(path, resource)| { + .filter_map(|indexed_resource| { + let path = indexed_resource.path(); + let id = indexed_resource.id(); let tags = if self.tags { Some( read_storage_value( &root, "tags", - &resource.id.to_string(), + &id.to_string(), &None, ) .map_or(vec![], |s| { @@ -102,7 +103,7 @@ impl List { read_storage_value( &root, "scores", - &resource.id.to_string(), + &id.to_string(), &None, ) .map_or(0, |s| s.parse::().unwrap_or(0)), @@ -114,7 +115,7 @@ impl List { let datetime = if self.modified { let format = "%b %e %H:%M %Y"; Some( - DateTime::::from(resource.modified) + DateTime::::from(indexed_resource.last_modified()) .format(format) .to_string(), ) @@ -123,21 +124,18 @@ impl List { }; let (path, resource, content) = match entry_output { - EntryOutput::Both => ( - Some(path.to_owned().into_path_buf()), - Some(resource.clone().id), - None, - ), - EntryOutput::Path => { - (Some(path.to_owned().into_path_buf()), None, None) + EntryOutput::Both => { + (Some(path.to_owned()), Some(id.to_owned()), None) } - EntryOutput::Id => (None, 
Some(resource.clone().id), None), + EntryOutput::Path => (Some(path.to_owned()), None, None), + EntryOutput::Id => (None, Some(id.to_owned()), None), EntryOutput::Link => match File::open(path) { Ok(mut file) => { let mut contents = String::new(); match file.read_to_string(&mut contents) { Ok(_) => { - // Check if the content of the file is a valid url + // Check if the content + // of the file is a valid url let url = contents.trim(); let url = url::Url::parse(url); match url { diff --git a/ark-cli/src/commands/render.rs b/ark-cli/src/commands/render.rs index 7f3fa9e6..19014831 100644 --- a/ark-cli/src/commands/render.rs +++ b/ark-cli/src/commands/render.rs @@ -26,7 +26,8 @@ impl Render { let dest_path = filepath.with_file_name( filepath .file_stem() - // SAFETY: we know that the file stem is valid UTF-8 because it is a file name + // SAFETY: we know that the file stem is valid UTF-8 + // because it is a file name .unwrap() .to_str() .unwrap() diff --git a/ark-cli/src/commands/storage/list.rs b/ark-cli/src/commands/storage/list.rs index 6b0c6c4e..0fdcef7c 100644 --- a/ark-cli/src/commands/storage/list.rs +++ b/ark-cli/src/commands/storage/list.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; use crate::{ - models::storage::Storage, models::storage::StorageType, translate_storage, - AppError, + models::storage::{Storage, StorageType}, + translate_storage, AppError, }; #[derive(Clone, Debug, clap::Args)] diff --git a/ark-cli/src/index_registrar.rs b/ark-cli/src/index_registrar.rs index fc6a2e5b..a75df080 100644 --- a/ark-cli/src/index_registrar.rs +++ b/ark-cli/src/index_registrar.rs @@ -2,11 +2,13 @@ use lazy_static::lazy_static; extern crate canonical_path; use data_error::{ArklibError, Result}; -use fs_index::ResourceIndex; +use fs_index::{load_or_build_index, ResourceIndex}; -use std::collections::HashMap; -use std::path::Path; -use std::sync::{Arc, RwLock}; +use std::{ + collections::HashMap, + path::Path, + sync::{Arc, RwLock}, +}; use crate::ResourceId; @@ -34,7 
+36,9 @@ pub fn provide_index>( } log::info!("Index has not been registered before"); - match ResourceIndex::provide(&root_path) { + // If the index has not been registered before, + // we need to load it, update it and register it + match load_or_build_index(&root_path, true) { Ok(index) => { let mut registrar = REGISTRAR.write().map_err(|_| { ArklibError::Other(anyhow::anyhow!("Failed to lock registrar")) diff --git a/ark-cli/src/main.rs b/ark-cli/src/main.rs index c8c718ca..e8029c18 100644 --- a/ark-cli/src/main.rs +++ b/ark-cli/src/main.rs @@ -1,5 +1,7 @@ -use std::fs::{create_dir_all, File}; -use std::path::PathBuf; +use std::{ + fs::{create_dir_all, File}, + path::PathBuf, +}; use crate::index_registrar::provide_index; use data_pdf::{render_preview_page, PDFQuality}; @@ -15,25 +17,23 @@ use fs_storage::ARK_FOLDER; use anyhow::Result; -use chrono::prelude::DateTime; -use chrono::Utc; +use chrono::{prelude::DateTime, Utc}; -use clap::CommandFactory; -use clap::FromArgMatches; +use clap::{CommandFactory, FromArgMatches}; use fs_extra::dir::{self, CopyOptions}; use home::home_dir; -use crate::cli::Cli; -use crate::commands::file::File::{Append, Insert, Read}; -use crate::commands::link::Link::{Create, Load}; -use crate::commands::Commands::Link; -use crate::commands::Commands::Storage; -use crate::commands::Commands::*; -use crate::models::EntryOutput; -use crate::models::Format; -use crate::models::Sort; +use crate::{ + cli::Cli, + commands::{ + file::File::{Append, Insert, Read}, + link::Link::{Create, Load}, + Commands::{Link, Storage, *}, + }, + models::{EntryOutput, Format, Sort}, +}; use crate::error::AppError; diff --git a/ark-cli/src/models/storage.rs b/ark-cli/src/models/storage.rs index 48fc0464..ef6f7f37 100644 --- a/ark-cli/src/models/storage.rs +++ b/ark-cli/src/models/storage.rs @@ -1,7 +1,6 @@ use crate::ResourceId; use fs_atomic_versions::atomic::AtomicFile; -use std::fmt::Write; -use std::path::PathBuf; +use std::{fmt::Write, path::PathBuf}; use 
crate::{ commands::{file_append, file_insert, format_file, format_line}, diff --git a/ark-cli/src/util.rs b/ark-cli/src/util.rs index 9a370167..40eff290 100644 --- a/ark-cli/src/util.rs +++ b/ark-cli/src/util.rs @@ -1,24 +1,26 @@ use crate::ResourceId; -use fs_index::index::ResourceIndex; +use fs_index::ResourceIndex; use fs_metadata::METADATA_STORAGE_FOLDER; use fs_properties::PROPERTIES_STORAGE_FOLDER; use fs_storage::{ ARK_FOLDER, PREVIEWS_STORAGE_FOLDER, SCORE_STORAGE_FILE, STATS_FOLDER, TAG_STORAGE_FILE, THUMBNAILS_STORAGE_FOLDER, }; -use std::env::current_dir; -use std::fs::{canonicalize, metadata}; -use std::io::BufRead; -use std::io::BufReader; -use std::path::Path; -use std::str::FromStr; -use std::thread; -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use std::{fs::File, path::PathBuf}; +use std::{ + env::current_dir, + fs::{canonicalize, metadata, File}, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, + str::FromStr, + thread, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; -use crate::error::AppError; -use crate::models::storage::{Storage, StorageType}; -use crate::ARK_CONFIG; +use crate::{ + error::AppError, + models::storage::{Storage, StorageType}, + ARK_CONFIG, +}; pub fn discover_roots( roots_cfg: &Option, @@ -111,11 +113,11 @@ pub fn monitor_index( let duration = start.elapsed(); println!("Updating succeeded in {:?}\n", duration); - if !diff.deleted.is_empty() { - println!("Deleted: {:?}", diff.deleted); + if !diff.removed().is_empty() { + println!("Deleted: {:?}", diff.removed()); } - if !diff.added.is_empty() { - println!("Added: {:?}", diff.added); + if !diff.added().is_empty() { + println!("Added: {:?}", diff.added()); } } } @@ -127,10 +129,14 @@ pub fn monitor_index( ) })?; - println!("Here are {} entries in the index", index.size()); + println!("Here are {} entries in the index", index.len()); - for (key, count) in index.collisions.iter() { - println!("Id {:?} calculated {} times", key, count); + for (key, 
resources) in index.collisions().iter() { + println!( + "Id {:?} calculated {} times", + key, + resources.len() + ); } } } diff --git a/data-json/src/lib.rs b/data-json/src/lib.rs index cc663775..9d047f17 100644 --- a/data-json/src/lib.rs +++ b/data-json/src/lib.rs @@ -1,7 +1,4 @@ -use serde_json::json; -use serde_json::map::Entry; -use serde_json::Map; -use serde_json::Value; +use serde_json::{json, map::Entry, Map, Value}; pub fn merge(origin: Value, new_data: Value) -> Value { match (origin, new_data) { diff --git a/data-link/src/lib.rs b/data-link/src/lib.rs index 7e614a1d..ba925722 100644 --- a/data-link/src/lib.rs +++ b/data-link/src/lib.rs @@ -2,18 +2,20 @@ use data_error::Result; use data_resource::ResourceId; use fs_atomic_versions::atomic::AtomicFile; use fs_metadata::store_metadata; -use fs_properties::load_raw_properties; -use fs_properties::store_properties; -use fs_properties::PROPERTIES_STORAGE_FOLDER; +use fs_properties::{ + load_raw_properties, store_properties, PROPERTIES_STORAGE_FOLDER, +}; use fs_storage::{ARK_FOLDER, PREVIEWS_STORAGE_FOLDER}; use reqwest::header::HeaderValue; use scraper::{Html, Selector}; use serde::{Deserialize, Serialize}; -use std::fmt; -use std::marker::PhantomData; -use std::path::Path; -use std::str::{self, FromStr}; -use std::{io::Write, path::PathBuf}; +use std::{ + fmt, + io::Write, + marker::PhantomData, + path::{Path, PathBuf}, + str::{self, FromStr}, +}; use url::Url; #[derive(Debug, Deserialize, Serialize)] diff --git a/data-resource/src/lib.rs b/data-resource/src/lib.rs index 049b4dbb..ea7426c3 100644 --- a/data-resource/src/lib.rs +++ b/data-resource/src/lib.rs @@ -3,16 +3,16 @@ //! `data-resource` is a crate for managing resource identifiers. 
use core::{fmt::Display, str::FromStr}; use data_error::Result; -use serde::de::DeserializeOwned; -use serde::Serialize; +use serde::{de::DeserializeOwned, Serialize}; use std::{fmt::Debug, hash::Hash, path::Path}; /// This trait defines a generic type representing a resource identifier. /// -/// Resources are identified by a hash value, which is computed from the resource's data. -/// The hash value is used to uniquely identify the resource. +/// Resources are identified by a hash value, which is computed from the +/// resource's data. The hash value is used to uniquely identify the resource. /// -/// Implementors of this trait must provide a way to compute the hash value from the resource's data. +/// Implementors of this trait must provide a way to compute +/// the hash value from the resource's data. pub trait ResourceId: Debug + Display diff --git a/dev-hash/benches/blake3.rs b/dev-hash/benches/blake3.rs index cf95233b..9b3c6541 100644 --- a/dev-hash/benches/blake3.rs +++ b/dev-hash/benches/blake3.rs @@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec { (0..size).map(|_| rng.gen()).collect() } -/// Benchmarks the performance of resource ID creation from file paths and random data. +/// Benchmarks the performance of resource ID creation +/// from file paths and random data. /// /// - Measures the time taken to create a resource ID from file paths. /// - Measures the time taken to create a resource ID from random data. diff --git a/dev-hash/benches/crc32.rs b/dev-hash/benches/crc32.rs index b462035b..dab4f776 100644 --- a/dev-hash/benches/crc32.rs +++ b/dev-hash/benches/crc32.rs @@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec { (0..size).map(|_| rng.gen()).collect() } -/// Benchmarks the performance of resource ID creation from file paths and random data. +/// Benchmarks the performance of resource ID creation +/// from file paths and random data. /// /// - Measures the time taken to create a resource ID from file paths. 
/// - Measures the time taken to create a resource ID from random data. diff --git a/fs-atomic-light/src/lib.rs b/fs-atomic-light/src/lib.rs index 25288f6d..fb4bf028 100644 --- a/fs-atomic-light/src/lib.rs +++ b/fs-atomic-light/src/lib.rs @@ -1,9 +1,6 @@ use data_error::Result; -use std::env; -use std::fs; -use std::path::Path; -use std::str; +use std::{env, fs, path::Path, str}; /// Write data to a tempory file and move that written file to destination /// diff --git a/fs-atomic-versions/src/atomic/file.rs b/fs-atomic-versions/src/atomic/file.rs index bd3f2571..a2b708ab 100644 --- a/fs-atomic-versions/src/atomic/file.rs +++ b/fs-atomic-versions/src/atomic/file.rs @@ -1,8 +1,10 @@ -use std::fs::{self, File}; -use std::io::{Error, ErrorKind, Read, Result}; -#[cfg(target_os = "unix")] +#[cfg(target_family = "unix")] use std::os::unix::fs::MetadataExt; -use std::path::{Path, PathBuf}; +use std::{ + fs::{self, File}, + io::{Error, ErrorKind, Read, Result}, + path::{Path, PathBuf}, +}; use crate::app_id; @@ -229,7 +231,7 @@ impl AtomicFile { // May return `EEXIST`. let res = std::fs::hard_link(&new.path, new_path); if let Err(err) = res { - #[cfg(target_os = "unix")] + #[cfg(target_family = "unix")] // From open(2) manual page: // // "[...] 
create a unique file on the same filesystem (e.g., @@ -241,7 +243,7 @@ impl AtomicFile { if new.path.metadata()?.nlink() != 2 { Err(err)?; } - #[cfg(not(target_os = "unix"))] + #[cfg(not(target_family = "unix"))] Err(err)?; } diff --git a/fs-atomic-versions/src/lib.rs b/fs-atomic-versions/src/lib.rs index 1ce1f0b2..62935039 100644 --- a/fs-atomic-versions/src/lib.rs +++ b/fs-atomic-versions/src/lib.rs @@ -1,7 +1,8 @@ use lazy_static::lazy_static; -use std::path::PathBuf; -use std::sync::Once; -use std::sync::RwLock; +use std::{ + path::PathBuf, + sync::{Once, RwLock}, +}; pub mod app_id; pub mod atomic; diff --git a/fs-index/Cargo.toml b/fs-index/Cargo.toml index 49d0e763..0ded09dd 100644 --- a/fs-index/Cargo.toml +++ b/fs-index/Cargo.toml @@ -12,9 +12,8 @@ bench = false log = { version = "0.4.17", features = ["release_max_level_off"] } walkdir = "2.3.2" anyhow = "1.0.58" -canonical-path = "2.0.2" -pathdiff = "0.2.1" -itertools = "0.10.5" +serde_json = "1.0" +serde = { version = "1.0", features = ["derive"] } fs-storage = { path = "../fs-storage" } @@ -26,11 +25,11 @@ data-resource = { path = "../data-resource" } uuid = { version = "1.6.1", features = ["v4"] } # benchmarking criterion = { version = "0.5", features = ["html_reports"] } +tempfile = "3.10" # Depending on `dev-hash` for testing dev-hash = { path = "../dev-hash" } -fs-atomic-versions = { path = "../fs-atomic-versions" } [[bench]] -name = "index_build_benchmark" +name = "resource_index_benchmark" harness = false -path = "benches/index_build_benchmark.rs" +path = "benches/resource_index_benchmark.rs" diff --git a/fs-index/README.md b/fs-index/README.md new file mode 100644 index 00000000..9253375a --- /dev/null +++ b/fs-index/README.md @@ -0,0 +1,35 @@ +# fs-index + +`fs-index` is a Rust crate for managing and indexing file system resources. It provides a flexible and efficient way to track changes, query resources, and keep files in sync across multiple devices or locations. 
+ +Originally developed for the ARK framework to support local-first applications, `fs-index` can also be used in various scenarios including backup systems, content management, and more. + +## Features + +The most important struct in this crate is `ResourceIndex` which comes with: + +- **Reactive API** + - `update_all`: Method to update the index by rescanning files and returning changes (additions/deletions). +- **Snapshot API** + - `get_resources_by_id`: Query resources from the index by ID. + - `get_resource_by_path`: Query a resource from the index by its path. + +## Custom Serialization + +The `ResourceIndex` struct includes a custom serialization implementation to avoid writing a large repetitive index file with double maps. + +## Tests and Benchmarks + +- Unit tests are located in `src/tests.rs`. +- The benchmarking suite is in `benches/resource_index_benchmark.rs`, benchmarking all methods of `ResourceIndex`. + - Run benchmarks with `cargo bench`. + +## Examples + +To get started, take a look at the examples in the `examples/` directory. 
+ +To run a specific example: + +```shell +cargo run --example resource_index +``` diff --git a/fs-index/benches/index_build_benchmark.rs b/fs-index/benches/index_build_benchmark.rs deleted file mode 100644 index 1de0ef3f..00000000 --- a/fs-index/benches/index_build_benchmark.rs +++ /dev/null @@ -1,43 +0,0 @@ -use criterion::{ - black_box, criterion_group, criterion_main, BenchmarkId, Criterion, -}; -use dev_hash::Crc32; -use fs_index::index::ResourceIndex; - -const DIR_PATH: &str = "../test-assets/"; // Set the path to the directory containing the resources here - -fn index_build_benchmark(c: &mut Criterion) { - // assert the path exists and is a directory - assert!( - std::path::Path::new(DIR_PATH).is_dir(), - "The path: {} does not exist or is not a directory", - DIR_PATH - ); - - let mut group = c.benchmark_group("index_build"); - group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here - - let mut collisions_size = 0; - - group.bench_with_input( - BenchmarkId::new("index_build", DIR_PATH), - &DIR_PATH, - |b, path| { - b.iter(|| { - let index: ResourceIndex = - ResourceIndex::build(black_box(path.to_string())); - collisions_size = index.collisions.len(); - }); - }, - ); - group.finish(); - - println!("Collisions: {}", collisions_size); -} - -criterion_group! 
{ - name = benches; - config = Criterion::default(); - targets = index_build_benchmark -} -criterion_main!(benches); diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs new file mode 100644 index 00000000..f6d3f21a --- /dev/null +++ b/fs-index/benches/resource_index_benchmark.rs @@ -0,0 +1,142 @@ +use std::path::PathBuf; + +use criterion::{ + black_box, criterion_group, criterion_main, BenchmarkId, Criterion, +}; +use tempfile::TempDir; + +use dev_hash::Crc32; +use fs_index::ResourceIndex; + +fn resource_index_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("resource_index"); + group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here + + let benchmarks_dir = setup_temp_dir(); + let benchmarks_dir = benchmarks_dir.path(); + let benchmarks_dir_str = benchmarks_dir.to_str().unwrap(); + + // Benchmark `ResourceIndex::build()` + + let mut collisions_size = 0; + group.bench_with_input( + BenchmarkId::new("index_build", benchmarks_dir_str), + &benchmarks_dir, + |b, path| { + b.iter(|| { + let index: ResourceIndex = + ResourceIndex::build(black_box(path)).unwrap(); + collisions_size = index.collisions().len(); + }); + }, + ); + println!("Collisions: {}", collisions_size); + + // Benchmark `ResourceIndex::get_resources_by_id()` + let index: ResourceIndex = + ResourceIndex::build(benchmarks_dir).unwrap(); + let resources = index.resources(); + let resource_id = resources[0].id(); + group.bench_function("index_get_resource_by_id", |b| { + b.iter(|| { + let _resource = index.get_resources_by_id(black_box(resource_id)); + }); + }); + + // Benchmark `ResourceIndex::get_resource_by_path()` + let resource_path = resources[0].path(); + group.bench_function("index_get_resource_by_path", |b| { + b.iter(|| { + let _resource = + index.get_resource_by_path(black_box(resource_path)); + }); + }); + + // Benchmark `ResourceIndex::update_all()` + + // First, create a new temp 
directory specifically for the update_all + // benchmark since we will be creating new files, removing files, and + // modifying files + let update_all_benchmarks_dir = + TempDir::with_prefix("ark-fs-index-benchmarks-update-all").unwrap(); + let update_all_benchmarks_dir = update_all_benchmarks_dir.path(); + + group.bench_function("index_update_all", |b| { + b.iter(|| { + // Clear the directory + std::fs::remove_dir_all(&update_all_benchmarks_dir).unwrap(); + std::fs::create_dir(&update_all_benchmarks_dir).unwrap(); + + // Create 5000 new files + for i in 0..5000 { + let new_file = + update_all_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + std::fs::write(&new_file, format!("Hello, World! {}", i)) + .unwrap(); + } + let mut index: ResourceIndex = + ResourceIndex::build(black_box(&update_all_benchmarks_dir)) + .unwrap(); + + update_all_files(&update_all_benchmarks_dir.to_path_buf()); + let _update_result = index.update_all().unwrap(); + }); + }); + + group.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default(); + targets = resource_index_benchmark +} +criterion_main!(benches); + +/// A helper function to setup a temp directory for the benchmarks +fn setup_temp_dir() -> TempDir { + // Create a temp directory + let temp_dir = TempDir::with_prefix("ark-fs-index-benchmarks").unwrap(); + let benchmarks_dir = temp_dir.path(); + log::info!("Temp directory for benchmarks: {:?}", benchmarks_dir); + + // Create 10,000 files in the temp directory + for i in 0..10000 { + let new_file = benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + // We add the index `i` to the file content to make sure the content is + // unique This is to avoid collisions in the index + std::fs::write(&new_file, format!("Hello, World! 
{}", i)).unwrap(); + } + + temp_dir +} + +/// A helper function that takes a directory and creates 5000 new files, removes +/// 3000 files, and modifies 1000 files +/// +/// Note: The function assumes that the directory already contains 5000 files +/// with the names `file_0.txt` to `file_4999.txt` +fn update_all_files(dir: &PathBuf) { + // Create 5000 new files + for i in 5001..10001 { + let new_file = dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + // We add the index `i` to the file content to make sure the content is + // unique This is to avoid collisions in the index + std::fs::write(&new_file, format!("Hello, World! {}", i)).unwrap(); + } + + // Remove 3000 files + for i in 0..3000 { + let removed_file = dir.join(format!("file_{}.txt", i)); + std::fs::remove_file(&removed_file).unwrap(); + } + + // Modify 1000 files + for i in 4000..5000 { + let modified_file = dir.join(format!("file_{}.txt", i)); + std::fs::write(&modified_file, "Hello, World!").unwrap(); + } +} diff --git a/fs-index/examples/resource_index.rs b/fs-index/examples/resource_index.rs new file mode 100644 index 00000000..3f5cb797 --- /dev/null +++ b/fs-index/examples/resource_index.rs @@ -0,0 +1,46 @@ +use std::path::Path; + +use anyhow::Result; + +use dev_hash::Blake3; +use fs_index::ResourceIndex; + +/// A simple example of how to use [`ResourceIndex`] to index a directory. +fn main() -> Result<()> { + // Create a new `ResourceIndex` from the directory "test-assets" using + // blake3 as the hashing algorithm. + let mut index: ResourceIndex = + ResourceIndex::build(Path::new("test-assets"))?; + + // Print the indexed resources. + for resource in index.resources() { + println!("{:?}", resource); + } + + // Save the index to a file. + index.store()?; + + // Get resources by their id. 
+ let id = Blake3( + "172b4bf148e858b13dde0fc6613413bcb7552e5c4e5c45195ac6c80f20eb5ff5" + .to_string(), + ); + let resources = index.get_resources_by_id(&id).ok_or_else(|| { + anyhow::anyhow!("Resource with id {:?} not found", id) + })?; + for resource in resources { + println!("{:?}", resource); + } + + // Get resources by their path. + let path = Path::new("lena.jpg"); + let resource = index.get_resource_by_path(path).ok_or_else(|| { + anyhow::anyhow!("Resource with path {:?} not found", path) + })?; + println!("{:?}", resource); + + // Update the index. + index.update_all()?; + + Ok(()) +} diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs index a60287ea..0221d7ae 100644 --- a/fs-index/src/index.rs +++ b/fs-index/src/index.rs @@ -1,1120 +1,486 @@ -use anyhow::anyhow; -use canonical_path::{CanonicalPath, CanonicalPathBuf}; -use itertools::Itertools; -use std::collections::{HashMap, HashSet}; -use std::fs::{self, File, Metadata}; -use std::io::{BufRead, BufReader, Write}; -use std::ops::Add; -use std::path::{Path, PathBuf}; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use walkdir::{DirEntry, WalkDir}; +use std::{ + collections::{HashMap, HashSet}, + fs, + hash::Hash, + path::{Path, PathBuf}, + time::{Duration, SystemTime}, +}; -use log; +use serde::{Deserialize, Serialize}; -use data_error::{ArklibError, Result}; +use data_error::Result; use data_resource::ResourceId; use fs_storage::{ARK_FOLDER, INDEX_PATH}; -#[derive(Eq, Ord, PartialEq, PartialOrd, Hash, Clone, Debug)] -pub struct IndexEntry { - pub modified: SystemTime, - pub id: Id, -} +use crate::utils::{discover_paths, scan_entries}; -#[derive(PartialEq, Clone, Debug)] -pub struct ResourceIndex { - pub id2path: HashMap, - pub path2id: HashMap>, +/// The threshold for considering a resource updated +pub const RESOURCE_UPDATED_THRESHOLD: Duration = Duration::from_millis(1); - pub collisions: HashMap, - root: PathBuf, +/// Represents a resource in the index +#[derive( + PartialEq, Eq, 
PartialOrd, Ord, Hash, Clone, Debug, Serialize, Deserialize, +)] +pub struct IndexedResource { + /// The unique identifier of the resource + id: Id, + /// The path of the resource, relative to the root path + path: PathBuf, + /// The last modified time of the resource (from the file system metadata) + last_modified: SystemTime, } -#[derive(PartialEq, Debug)] -pub struct IndexUpdate { - pub deleted: HashSet, - pub added: HashMap, -} +impl IndexedResource { + /// Create a new indexed resource + pub fn new(id: Id, path: PathBuf, last_modified: SystemTime) -> Self { + IndexedResource { + id, + path, + last_modified, + } + } -pub const RESOURCE_UPDATED_THRESHOLD: Duration = Duration::from_millis(1); + /// Return the ID of the resource + pub fn id(&self) -> &Id { + &self.id + } -pub type Paths = HashSet; + /// Return the path of the resource + pub fn path(&self) -> &Path { + &self.path + } -impl ResourceIndex { - pub fn size(&self) -> usize { - //the actual size is lower in presence of collisions - self.path2id.len() + /// Return the last modified time of the resource + pub fn last_modified(&self) -> SystemTime { + self.last_modified } +} - pub fn build>(root_path: P) -> Self { - log::info!("Building the index from scratch"); - let root_path: PathBuf = root_path.as_ref().to_owned(); +/// Represents a resource ID with its last modified time +#[derive(Eq, Ord, PartialEq, PartialOrd, Hash, Clone, Debug)] +pub struct ResourceIdWithTimestamp { + /// The unique identifier of the resource + pub(crate) id: Id, + /// The last modified time of the resource (from the file system metadata) + pub(crate) last_modified: SystemTime, +} - let entries = discover_paths(&root_path); - let entries = scan_entries(entries); +/// Represents a resource path with its last modified time +#[derive(Eq, Ord, PartialEq, PartialOrd, Hash, Clone, Debug)] +pub struct ResourcePathWithTimestamp { + /// The path of the resource, relative to the root path + pub(crate) path: PathBuf, + /// The last modified 
time of the resource (from the file system metadata) + pub(crate) last_modified: SystemTime, +} - let mut index = ResourceIndex { - id2path: HashMap::new(), - path2id: HashMap::new(), - collisions: HashMap::new(), - root: root_path, - }; +/// Represents the index of resources in a directory. +/// +/// [`ResourceIndex`] provides functionality for managing a directory index, +/// including tracking changes, and querying resources. +/// +/// #### Reactive API +/// - [`ResourceIndex::update_all`]: Method to update the index by rescanning +/// files and returning changes (additions/deletions/updates). +/// +/// #### Snapshot API +/// - [`ResourceIndex::get_resources_by_id`]: Query resources from the index by +/// ID. +/// - [`ResourceIndex::get_resource_by_path`]: Query a resource from the index +/// by its path. +/// +/// #### Track API +/// Allows for fine-grained control over tracking changes in the index +/// - [`ResourceIndex::track_addition`]: Track a newly added file (checks if the +/// file exists in the file system). +/// - [`ResourceIndex::track_removal`]: Track the deletion of a file (checks if +/// the file was actually deleted). +/// - [`ResourceIndex::track_modification`]: Track an update on a single file. 
+/// +/// ## Examples +/// ```no_run +/// use std::path::Path; +/// use fs_index::{ResourceIndex, load_or_build_index}; +/// use dev_hash::Crc32; +/// +/// // Define the root path +/// let root_path = Path::new("animals"); +/// +/// // Build the index +/// let index: ResourceIndex = ResourceIndex::build(root_path).expect("Failed to build index"); +/// // Store the index +/// index.store().expect("Failed to store index"); +/// +/// // Load the stored index +/// let mut loaded_index: ResourceIndex = load_or_build_index(root_path, false).expect("Failed to load index"); +/// +/// // Update the index +/// loaded_index.update_all().expect("Failed to update index"); +/// +/// // Get a resource by path +/// let _resource = loaded_index +/// .get_resource_by_path("cat.txt") +/// .expect("Resource not found"); +/// ``` +#[derive(Clone, Debug)] +pub struct ResourceIndex +where + Id: Eq + Hash, +{ + /// The root path of the index (canonicalized) + pub(crate) root: PathBuf, + /// A map from resource IDs to paths + /// + /// Multiple resources can have the same ID (e.g., due to hash collisions + /// or files with the same content) + pub(crate) id_to_paths: HashMap>, + /// A map from resource paths to resources + pub(crate) path_to_resource: HashMap>, +} - for (path, entry) in entries { - index.insert_entry(path, entry); - } +/// Represents the result of an update operation on the ResourceIndex +#[derive(PartialEq, Debug)] +pub struct IndexUpdate { + /// Resources that were added during the update + added: HashMap>, + /// Resources that were removed during the update + removed: HashSet, +} - log::info!("Index built"); - index +impl IndexUpdate { + /// Return the resources that were added during the update + pub fn added(&self) -> &HashMap> { + &self.added } - pub fn load>(root_path: P) -> Result { - let root_path: PathBuf = root_path.as_ref().to_owned(); - - let index_path: PathBuf = root_path.join(ARK_FOLDER).join(INDEX_PATH); - log::info!("Loading the index from file {}", 
index_path.display()); - let file = File::open(&index_path)?; - let mut index = ResourceIndex { - id2path: HashMap::new(), - path2id: HashMap::new(), - collisions: HashMap::new(), - root: root_path.clone(), - }; - - // We should not return early in case of missing files - let lines = BufReader::new(file).lines(); - for line in lines { - let line = line?; + /// Return the resources that were removed during the update + pub fn removed(&self) -> &HashSet { + &self.removed + } +} - let mut parts = line.split(' '); +impl ResourceIndex { + /// Return the number of resources in the index + pub fn len(&self) -> usize { + self.path_to_resource.len() + } - let modified = { - let str = parts.next().ok_or(ArklibError::Parse)?; - UNIX_EPOCH.add(Duration::from_millis( - str.parse().map_err(|_| ArklibError::Parse)?, - )) - }; + /// Return true if the index is empty + pub fn is_empty(&self) -> bool { + self.path_to_resource.is_empty() + } - let id = { - let str = parts.next().ok_or(ArklibError::Parse)?; - Id::from_str(str).map_err(|_| ArklibError::Parse)? 
- }; + /// Return the root path of the index + pub fn root(&self) -> &Path { + &self.root + } - let path: String = - itertools::Itertools::intersperse(parts, " ").collect(); - let path: PathBuf = root_path.join(Path::new(&path)); - match CanonicalPathBuf::canonicalize(&path) { - Ok(path) => { - log::trace!("[load] {} -> {}", id, path.display()); - index.insert_entry(path, IndexEntry { modified, id }); - } - Err(_) => { - log::warn!("File {} not found", path.display()); - continue; - } - } + /// Return the resources in the index + pub fn resources(&self) -> Vec> { + // Using path_to_resource so to avoid not collecting duplicates + let mut resources = vec![]; + for (path, resource) in self.path_to_resource.iter() { + resources.push(IndexedResource::new( + resource.id.clone(), + path.clone(), + resource.last_modified, + )); } + resources + } - Ok(index) + /// Return the ID collisions + /// + /// **Note**: If you are using a cryptographic hash function, collisions + /// should be files with the same content. If you are using a + /// non-cryptographic hash function, collisions can be files with the + /// same content or files whose content hash to the same value. + pub fn collisions(&self) -> HashMap> { + // Filter out IDs with only one resource + self.id_to_paths + .iter() + .filter(|(_id, paths)| paths.len() > 1) + .map(|(id, paths)| (id.clone(), paths.clone())) + .collect() } + /// Return the number of ID collisions + /// + /// **Note**: If you are using a cryptographic hash function, collisions + /// should be files with the same content. If you are using a + /// non-cryptographic hash function, collisions can be files with the + /// same content or files whose content hash to the same value. 
+ pub fn num_collisions(&self) -> usize { + // Aggregate the number of collisions for each ID + self.id_to_paths + .values() + .filter(|paths| paths.len() > 1) + .map(|paths| paths.len()) + .sum() + } + + /// Save the index to the file system (as a JSON file in + /// /ARK_FOLDER/INDEX_PATH) pub fn store(&self) -> Result<()> { - log::info!("Storing the index to file"); - - let start = SystemTime::now(); - - let index_path = self - .root - .to_owned() - .join(ARK_FOLDER) - .join(INDEX_PATH); + let ark_folder = self.root.join(ARK_FOLDER); + let index_path = ark_folder.join(INDEX_PATH); + log::debug!("Storing index at: {:?}", index_path); - let ark_dir = index_path.parent().unwrap(); - fs::create_dir_all(ark_dir)?; + fs::create_dir_all(&ark_folder)?; + let index_file = fs::File::create(index_path)?; + serde_json::to_writer_pretty(index_file, self)?; - let mut file = File::create(index_path)?; + Ok(()) + } - let mut path2id: Vec<(&CanonicalPathBuf, &IndexEntry)> = - self.path2id.iter().collect(); - path2id.sort_by_key(|(_, entry)| *entry); + /// Get resources by their ID + /// + /// Returns None if there is no resource with the given ID + /// + /// **Note**: This can return multiple resources with the same ID in case of + /// hash collisions or files with the same content + pub fn get_resources_by_id( + &self, + id: &Id, + ) -> Option>> { + let mut resources = vec![]; + + let paths = self.id_to_paths.get(id)?; + for path in paths { + let resource = self.path_to_resource.get(path)?; + let resource = IndexedResource::new( + resource.id.clone(), + path.clone(), + resource.last_modified, + ); + resources.push(resource); + } - for (path, entry) in path2id.iter() { - log::trace!("[store] {} by path {}", entry.id, path.display()); + Some(resources) + } - let timestamp = entry - .modified - .duration_since(UNIX_EPOCH) - .map_err(|_| { - ArklibError::Other(anyhow!("Error using duration since")) - })? 
- .as_millis(); + /// Get a resource by its path + /// + /// Returns None if the resource does not exist + /// + /// **Note**: The path should be relative to the root path + pub fn get_resource_by_path>( + &self, + path: P, + ) -> Option> { + let resource = self.path_to_resource.get(path.as_ref())?; + let resource = IndexedResource::new( + resource.id.clone(), + path.as_ref().to_path_buf(), + resource.last_modified, + ); + Some(resource) + } - let path = - pathdiff::diff_paths(path.to_str().unwrap(), self.root.clone()) - .ok_or(ArklibError::Path( - "Couldn't calculate path diff".into(), - ))?; + /// Build a new index from the given root path + pub fn build>(root_path: P) -> Result { + log::debug!("Building index at root path: {:?}", root_path.as_ref()); - writeln!(file, "{} {} {}", timestamp, entry.id, path.display())?; - } + // Canonicalize the root path + let root = fs::canonicalize(&root_path)?; + let mut id_to_paths: HashMap> = HashMap::new(); + let mut path_to_resource = HashMap::new(); - log::trace!( - "Storing the index took {:?}", - start - .elapsed() - .map_err(|_| ArklibError::Other(anyhow!("SystemTime error"))) - ); - Ok(()) - } + // Discover paths in the root directory + let paths = discover_paths(&root)?; + let entries: HashMap> = + scan_entries(&root, paths); - pub fn provide>(root_path: P) -> Result { - match Self::load(&root_path) { - Ok(mut index) => { - log::debug!("Index loaded: {} entries", index.path2id.len()); + // Strip the root path from the entries + let entries: HashMap> = entries + .into_iter() + .map(|(path, resource)| { + // Update the ID to paths map + id_to_paths + .entry(resource.id.clone()) + .or_default() + .insert(path.clone()); + + (path, resource) + }) + .collect(); - match index.update_all() { - Ok(update) => { - log::debug!( - "Index updated: {} added, {} deleted", - update.added.len(), - update.deleted.len() - ); - } - Err(e) => { - log::error!( - "Failed to update index: {}", - e.to_string() - ); - } - } + // Update the path 
to resource map + path_to_resource.extend(entries.clone()); - if let Err(e) = index.store() { - log::error!("{}", e.to_string()); - } - Ok(index) - } - Err(e) => { - log::warn!("{}", e.to_string()); - Ok(Self::build(root_path)) - } - } + let index = ResourceIndex { + root, + id_to_paths, + path_to_resource, + }; + Ok(index) } + /// Update the index with the latest information from the file system pub fn update_all(&mut self) -> Result> { - log::debug!("Updating the index"); - log::trace!("[update] known paths: {:?}", self.path2id.keys()); - - let curr_entries = discover_paths(self.root.clone()); - - //assuming that collections manipulation is - // quicker than asking `path.exists()` for every path - let curr_paths: Paths = curr_entries.keys().cloned().collect(); - let prev_paths: Paths = self.path2id.keys().cloned().collect(); - let preserved_paths: Paths = curr_paths - .intersection(&prev_paths) - .cloned() - .collect(); + log::debug!("Updating index at root path: {:?}", self.root); + log::trace!("Current index: {:#?}", self); + + let mut added: HashMap> = + HashMap::new(); + let mut removed: HashSet = HashSet::new(); + + let current_paths = discover_paths(&self.root)?; + + // Assuming that collection manipulation is faster than repeated + // lookups + let current_entries: HashMap> = + scan_entries(self.root(), current_paths); + let previous_entries = self.path_to_resource.clone(); + // `preserved_entries` is the intersection of current_entries and + // previous_entries + let preserved_entries: HashMap> = + current_entries + .iter() + .filter_map(|(path, _resource)| { + previous_entries.get(path).map(|prev_resource| { + (path.clone(), prev_resource.clone()) + }) + }) + .collect(); - let created_paths: HashMap = curr_entries - .iter() - .filter_map(|(path, entry)| { - if !preserved_paths.contains(path.as_canonical_path()) { - Some((path.clone(), entry.clone())) - } else { - None - } - }) - .collect(); + // `created_entries` is the difference between current_entries 
and + // preserved_entries + let created_entries: HashMap> = + current_entries + .iter() + .filter_map(|(path, resource)| { + if preserved_entries.contains_key(path) { + None + } else { + Some((path.clone(), resource.clone())) + } + }) + .collect(); - log::debug!("Checking updated paths"); - let updated_paths: HashMap = curr_entries - .into_iter() - .filter(|(path, dir_entry)| { - if !preserved_paths.contains(path.as_canonical_path()) { - false - } else { - let our_entry = &self.path2id[path]; - let prev_modified = our_entry.modified; + // `updated_entries` is the intersection of current_entries and + // preserved_entries where the last modified time has changed + // significantly (> RESOURCE_UPDATED_THRESHOLD) + let updated_entries: HashMap> = + current_entries + .into_iter() + .filter(|(path, _entry)| { + if !preserved_entries.contains_key(path) { + false + } else { + let our_entry = &self.path_to_resource[path]; + let prev_modified = our_entry.last_modified; - let result = dir_entry.metadata(); - match result { - Err(msg) => { - log::error!( - "Couldn't retrieve metadata for {}: {}", - &path.display(), - msg - ); - false - } - Ok(metadata) => match metadata.modified() { + let entry_path = self.root.join(path); + let result = fs::metadata(&entry_path); + match result { Err(msg) => { log::error!( - "Couldn't retrieve timestamp for {}: {}", + "Couldn't retrieve metadata for {}: {}", &path.display(), msg ); false } - Ok(curr_modified) => { - let elapsed = curr_modified - .duration_since(prev_modified) - .unwrap(); - - let was_updated = - elapsed >= RESOURCE_UPDATED_THRESHOLD; - if was_updated { - log::trace!( - "[update] modified {} by path {} + Ok(metadata) => match metadata.modified() { + Err(msg) => { + log::error!( + "Couldn't retrieve timestamp for {}: {}", + &path.display(), + msg + ); + false + } + Ok(curr_modified) => { + let elapsed = curr_modified + .duration_since(prev_modified) + .unwrap(); + + let was_updated = + elapsed >= RESOURCE_UPDATED_THRESHOLD; + 
if was_updated { + log::trace!( + "[update] modified {} by path {} \twas {:?} \tnow {:?} \telapsed {:?}", - our_entry.id, - path.display(), - prev_modified, - curr_modified, - elapsed - ); + our_entry.id, + path.display(), + prev_modified, + curr_modified, + elapsed + ); + } + + was_updated } - - was_updated - } - }, + }, + } } - } - }) - .collect(); - - let mut deleted: HashSet = HashSet::new(); + }) + .collect(); - // treating both deleted and updated paths as deletions - prev_paths - .difference(&preserved_paths) - .cloned() - .chain(updated_paths.keys().cloned()) - .for_each(|path| { - if let Some(entry) = - self.path2id.remove(path.as_canonical_path()) - { - let k = self.collisions.remove(&entry.id).unwrap_or(1); - if k > 1 { - self.collisions.insert(entry.id, k - 1); + // Remove resources that are not in the current entries + let removed_entries: HashMap> = + previous_entries + .iter() + .filter_map(|(path, resource)| { + if preserved_entries.contains_key(path) { + None } else { - log::trace!( - "[delete] {} by path {}", - entry.id, - path.display() - ); - self.id2path.remove(&entry.id); - deleted.insert(entry.id); + Some((path.clone(), resource.clone())) } - } else { - log::warn!("Path {} was not known", path.display()); - } - }); - - let added: HashMap> = - scan_entries(updated_paths) - .into_iter() - .chain({ - log::debug!("Checking added paths"); - scan_entries(created_paths).into_iter() }) - .filter(|(_, entry)| !self.id2path.contains_key(&entry.id)) .collect(); - - for (path, entry) in added.iter() { - if deleted.contains(&entry.id) { - // emitting the resource as both deleted and added - // (renaming a duplicate might remain undetected) - log::trace!( - "[update] moved {} to path {}", - entry.id, - path.display() - ); - } - - self.insert_entry(path.clone(), entry.clone()); - } - - let added: HashMap = added - .into_iter() - .map(|(path, entry)| (path, entry.id)) - .collect(); - - Ok(IndexUpdate { deleted, added }) - } - - // the caller must ensure 
that: - // * the index is up-to-date except this single path - // * the path hasn't been indexed before - pub fn index_new( - &mut self, - path: &dyn AsRef, - ) -> Result> { - log::debug!("Indexing a new path"); - - if !path.as_ref().exists() { - return Err(ArklibError::Path( - "Absent paths cannot be indexed".into(), - )); - } - - let path_buf = CanonicalPathBuf::canonicalize(path)?; - let path = path_buf.as_canonical_path(); - - return match fs::metadata(path) { - Err(_) => { - return Err(ArklibError::Path( - "Couldn't to retrieve file metadata".into(), - )); - } - Ok(metadata) => match scan_entry(path, metadata) { - Err(_) => { - return Err(ArklibError::Path( - "The path points to a directory or empty file".into(), - )); - } - Ok(new_entry) => { - let id = new_entry.clone().id; - - if let Some(nonempty) = self.collisions.get_mut(&id) { - *nonempty += 1; - } - - let mut added = HashMap::new(); - added.insert(path_buf.clone(), id.clone()); - - self.id2path.insert(id, path_buf.clone()); - self.path2id.insert(path_buf, new_entry); - - Ok(IndexUpdate { - added, - deleted: HashSet::new(), - }) - } - }, - }; - } - - // the caller must ensure that: - // * the index is up-to-date except this single path - // * the path has been indexed before - // * the path maps into `old_id` - // * the content by the path has been modified - pub fn update_one( - &mut self, - path: &dyn AsRef, - old_id: Id, - ) -> Result> { - log::debug!("Updating a single entry in the index"); - - if !path.as_ref().exists() { - return self.forget_id(old_id); - } - - let path_buf = CanonicalPathBuf::canonicalize(path)?; - let path = path_buf.as_canonical_path(); - - log::trace!( - "[update] paths {:?} has id {:?}", - path, - self.path2id[path] - ); - - return match fs::metadata(path) { - Err(_) => { - // updating the index after resource removal - // is a correct scenario - self.forget_path(path, old_id) - } - Ok(metadata) => { - match scan_entry(path, metadata) { - Err(_) => { - // a directory or empty 
file exists by the path - self.forget_path(path, old_id) - } - Ok(new_entry) => { - // valid resource exists by the path - - let curr_entry = &self.path2id.get(path); - if curr_entry.is_none() { - // if the path is not indexed, then we can't have - // `old_id` if you want - // to index new path, use `index_new` method - return Err(ArklibError::Path( - "Couldn't find the path in the index".into(), - )); - } - let curr_entry = curr_entry.unwrap(); - - if curr_entry.id == new_entry.id { - // in rare cases we are here due to hash collision - if curr_entry.modified == new_entry.modified { - log::warn!("path {:?} was not modified", &path); - } else { - log::warn!("path {:?} was modified but not its content", &path); - } - - // the caller must have ensured that the path was - // indeed update - return Err(ArklibError::Collision( - "New content has the same id".into(), - )); - } - - // new resource exists by the path - self.forget_path(path, old_id).map(|mut update| { - update - .added - .insert(path_buf.clone(), new_entry.clone().id); - self.insert_entry(path_buf, new_entry); - - update - }) - } - } - } - }; - } - - pub fn forget_id(&mut self, old_id: Id) -> Result> { - let old_path = self - .path2id - .drain() - .filter_map(|(k, v)| { - if v.id == old_id { - Some(k) - } else { - None - } - }) - .collect_vec(); - for p in old_path { - self.path2id.remove(&p); - } - self.id2path.remove(&old_id); - let mut deleted = HashSet::new(); - deleted.insert(old_id); - - Ok(IndexUpdate { - added: HashMap::new(), - deleted, - }) - } - - fn insert_entry(&mut self, path: CanonicalPathBuf, entry: IndexEntry) { - log::trace!("[add] {} by path {}", entry.id, path.display()); - let id = entry.clone().id; - - if let std::collections::hash_map::Entry::Vacant(e) = - self.id2path.entry(id.clone()) - { - e.insert(path.clone()); - } else if let Some(nonempty) = self.collisions.get_mut(&id) { - *nonempty += 1; - } else { - self.collisions.insert(id, 2); - } - - self.path2id.insert(path, entry); - 
} - - fn forget_path( - &mut self, - path: &CanonicalPath, - old_id: Id, - ) -> Result> { - self.path2id.remove(path); - - if let Some(collisions) = self.collisions.get_mut(&old_id) { - debug_assert!( - *collisions > 1, - "Any collision must involve at least 2 resources" + for (path, resource) in removed_entries { + log::trace!( + "Resource removed: {:?}, last modified: {:?}", + path, + resource.last_modified ); - *collisions -= 1; - - if *collisions == 1 { - self.collisions.remove(&old_id); - } - - // minor performance issue: - // we must find path of one of the collided - // resources and use it as new value - let maybe_collided_path = - self.path2id.iter().find_map(|(path, entry)| { - if entry.id == old_id { - Some(path) - } else { - None - } - }); - - if let Some(collided_path) = maybe_collided_path { - let old_path = self - .id2path - .insert(old_id.clone(), collided_path.clone()); - - debug_assert_eq!( - old_path.unwrap().as_canonical_path(), - path, - "Must forget the requested path" - ); - } else { - return Err(ArklibError::Collision( - "Illegal state of collision tracker".into(), - )); - } - } else { - self.id2path.remove(&old_id.clone()); - } - - let mut deleted = HashSet::new(); - deleted.insert(old_id); - - Ok(IndexUpdate { - added: HashMap::new(), - deleted, - }) - } -} - -fn discover_paths>( - root_path: P, -) -> HashMap { - log::debug!( - "Discovering all files under path {}", - root_path.as_ref().display() - ); - - WalkDir::new(root_path) - .into_iter() - .filter_entry(|entry| !is_hidden(entry)) - .filter_map(|result| match result { - Ok(entry) => { - let path = entry.path(); - if !entry.file_type().is_dir() { - match CanonicalPathBuf::canonicalize(path) { - Ok(canonical_path) => Some((canonical_path, entry)), - Err(msg) => { - log::warn!( - "Couldn't canonicalize {}:\n{}", - path.display(), - msg - ); - None - } - } - } else { - None - } - } - Err(msg) => { - log::error!("Error during walking: {}", msg); - None - } - }) - .collect() -} - -fn 
scan_entry( - path: &CanonicalPath, - metadata: Metadata, -) -> Result> -where - Id: ResourceId, -{ - if metadata.is_dir() { - return Err(ArklibError::Path("Path is expected to be a file".into())); - } - - let size = metadata.len(); - if size == 0 { - Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Empty resource", - ))?; - } - let id = Id::from_path(path)?; - let modified = metadata.modified()?; - - Ok(IndexEntry { modified, id }) -} - -fn scan_entries( - entries: HashMap, -) -> HashMap> -where - Id: ResourceId, -{ - entries - .into_iter() - .filter_map(|(path_buf, entry)| { - let metadata = entry.metadata().ok()?; - - let path = path_buf.as_canonical_path(); - let result = scan_entry(path, metadata); - match result { - Err(msg) => { - log::error!( - "Couldn't retrieve metadata for {}:\n{}", - path.display(), - msg - ); - None - } - Ok(entry) => Some((path_buf, entry)), + self.path_to_resource.remove(&path); + self.id_to_paths + .get_mut(&resource.id) + .unwrap() + .remove(&path); + let id = resource.id.clone(); + // Only remove the ID if it has no paths + if self.id_to_paths[&id].is_empty() { + self.id_to_paths.remove(&id); + removed.insert(id); } - }) - .collect() -} - -fn is_hidden(entry: &DirEntry) -> bool { - entry - .file_name() - .to_str() - .map(|s| s.starts_with('.')) - .unwrap_or(false) -} - -#[cfg(test)] -mod tests { - use crate::index::{discover_paths, IndexEntry}; - use crate::ResourceIndex; - use canonical_path::CanonicalPathBuf; - use dev_hash::Crc32; - use fs_atomic_versions::initialize; - use std::fs::File; - #[cfg(target_family = "unix")] - use std::fs::Permissions; - #[cfg(target_family = "unix")] - use std::os::unix::fs::PermissionsExt; - - use std::path::PathBuf; - use std::time::SystemTime; - use uuid::Uuid; - - const FILE_SIZE_1: u64 = 10; - const FILE_SIZE_2: u64 = 11; - - const FILE_NAME_1: &str = "test1.txt"; - const FILE_NAME_2: &str = "test2.txt"; - const FILE_NAME_3: &str = "test3.txt"; - - const CRC32_1: Crc32 = 
Crc32(3817498742); - const CRC32_2: Crc32 = Crc32(1804055020); - - fn get_temp_dir() -> PathBuf { - create_dir_at(std::env::temp_dir()) - } - - fn create_dir_at(path: PathBuf) -> PathBuf { - let mut dir_path = path.clone(); - dir_path.push(Uuid::new_v4().to_string()); - std::fs::create_dir(&dir_path).expect("Could not create temp dir"); - dir_path - } - - fn create_file_at( - path: PathBuf, - size: Option, - name: Option<&str>, - ) -> (File, PathBuf) { - let mut file_path = path.clone(); - if let Some(file_name) = name { - file_path.push(file_name); - } else { - file_path.push(Uuid::new_v4().to_string()); } - let file = File::create(file_path.clone()) - .expect("Could not create temp file"); - file.set_len(size.unwrap_or(0)) - .expect("Could not set file size"); - (file, file_path) - } - fn run_test_and_clean_up( - test: impl FnOnce(PathBuf) + std::panic::UnwindSafe, - ) { - initialize(); + // added_entries = created_entries + updated_entries + let added_entries: HashMap> = + created_entries + .into_iter() + .chain(updated_entries) + .collect(); - let path = get_temp_dir(); - let result = std::panic::catch_unwind(|| test(path.clone())); - std::fs::remove_dir_all(path.clone()) - .expect("Could not clean up after test"); - if result.is_err() { - panic!("{}", result.err().map(|_| "Test panicked").unwrap()) + for (path, resource) in added_entries { + log::trace!("Resource added: {:?}", path); + self.path_to_resource + .insert(path.clone(), resource.clone()); + let id = resource.id.clone(); + self.id_to_paths + .entry(id.clone()) + .or_default() + .insert(path.clone()); + + let resource_path_with_timestamp = ResourcePathWithTimestamp { + path: path.clone(), + last_modified: resource.last_modified, + }; + // If the ID is not in the added map, add it + // If the ID is in the added map, add the path to the set + added + .entry(id) + .or_default() + .insert(resource_path_with_timestamp); } - assert!(result.is_ok()); - } - - // resource index build - - #[test] - fn 
index_build_should_process_1_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 1); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 1); - }) - } - - #[test] - fn index_build_should_process_colliding_files_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 1); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert_eq!(actual.collisions.len(), 1); - assert_eq!(actual.size(), 2); - }) - } - - // resource index update - - #[test] - fn update_all_should_handle_renamed_file_correctly() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2)); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - - // rename test2.txt to test3.txt - let mut name_from = path.clone(); - name_from.push(FILE_NAME_2); - let mut name_to = path.clone(); - name_to.push(FILE_NAME_3); - std::fs::rename(name_from, name_to) - .expect("Should rename file successfully"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 1); - }) - } - - #[test] - fn update_all_should_index_new_file_successfully() { - 
run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, expected_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 2); - assert_eq!(actual.id2path.len(), 2); - assert!(actual.id2path.contains_key(&CRC32_1)); - assert!(actual.id2path.contains_key(&CRC32_2)); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = - CanonicalPathBuf::canonicalize(expected_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) - } - - #[test] - fn index_new_should_index_new_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index: ResourceIndex = - ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index - .index_new(&new_path) - .expect("Should update index correctly"); - - assert_eq!(index.root, path.clone()); - assert_eq!(index.path2id.len(), 2); - assert_eq!(index.id2path.len(), 2); - assert!(index.id2path.contains_key(&CRC32_1)); - assert!(index.id2path.contains_key(&CRC32_2)); - assert_eq!(index.collisions.len(), 0); - assert_eq!(index.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 1); - - let added_key = CanonicalPathBuf::canonicalize(new_path.clone()) - .expect("CanonicalPathBuf should be fine"); - assert_eq!( - update - .added - .get(&added_key) - .expect("Key exists") - .clone(), - CRC32_2 - ) - }) - } - - #[test] - fn update_one_should_error_on_new_file() { - 
run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), None); - let mut index = ResourceIndex::build(path.clone()); - - let (_, new_path) = - create_file_at(path.clone(), Some(FILE_SIZE_2), None); - - let update = index.update_one(&new_path, CRC32_2); - - assert!(update.is_err()) - }) - } - - #[test] - fn update_one_should_index_delete_file_successfully() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - - let mut actual = ResourceIndex::build(path.clone()); - - let mut file_path = path.clone(); - file_path.push(FILE_NAME_1); - std::fs::remove_file(file_path.clone()) - .expect("Should remove file successfully"); - - let update = actual - .update_one(&file_path.clone(), CRC32_1) - .expect("Should update index successfully"); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 0); - assert_eq!(update.deleted.len(), 1); - assert_eq!(update.added.len(), 0); - - assert!(update.deleted.contains(&CRC32_1)) - }) - } - - #[test] - fn update_all_should_error_on_files_without_permissions() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1)); - let (file, _) = create_file_at( - path.clone(), - Some(FILE_SIZE_2), - Some(FILE_NAME_2), - ); - - let mut actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - #[cfg(target_family = "unix")] - file.set_permissions(Permissions::from_mode(0o222)) - .expect("Should be fine"); - - let update = actual - .update_all() - .expect("Should update index correctly"); - - assert_eq!(actual.collisions.len(), 0); - assert_eq!(actual.size(), 2); - assert_eq!(update.deleted.len(), 0); - assert_eq!(update.added.len(), 0); - }) - } - - // error cases - - #[test] - fn 
update_one_should_not_update_absent_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) - } - - #[test] - fn update_one_should_index_new_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let mut actual = ResourceIndex::build(path.clone()); - let old_id = Crc32(2); - let result = actual - .update_one(&missing_path, old_id.clone()) - .map(|i| i.deleted.clone().take(&old_id)) - .ok() - .flatten(); - - assert_eq!(result, Some(Crc32(2))); - }) - } - - #[test] - fn should_not_index_empty_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(0), None); - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn should_not_index_hidden_file() { - run_test_and_clean_up(|path| { - create_file_at(path.clone(), Some(FILE_SIZE_1), Some(".hidden")); - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn should_not_index_1_empty_directory() { - run_test_and_clean_up(|path| { - create_dir_at(path.clone()); - - let actual: ResourceIndex = - ResourceIndex::build(path.clone()); - - assert_eq!(actual.root, path.clone()); - assert_eq!(actual.path2id.len(), 0); - assert_eq!(actual.id2path.len(), 0); - assert_eq!(actual.collisions.len(), 0); - }) - } - - #[test] - fn 
discover_paths_should_not_walk_on_invalid_path() { - run_test_and_clean_up(|path| { - let mut missing_path = path.clone(); - missing_path.push("missing/directory"); - let actual = discover_paths(missing_path); - assert_eq!(actual.len(), 0); - }) - } - - #[test] - fn index_entry_order() { - let old1 = IndexEntry { - id: Crc32(2), - modified: SystemTime::UNIX_EPOCH, - }; - let old2 = IndexEntry { - id: Crc32(1), - modified: SystemTime::UNIX_EPOCH, - }; - - let new1 = IndexEntry { - id: Crc32(1), - modified: SystemTime::now(), - }; - let new2 = IndexEntry { - id: Crc32(2), - modified: SystemTime::now(), - }; - - assert_eq!(new1, new1); - assert_eq!(new2, new2); - assert_eq!(old1, old1); - assert_eq!(old2, old2); - - assert_ne!(new1, new2); - assert_ne!(new1, old1); - - assert!(new1 > old1); - assert!(new1 > old2); - assert!(new2 > old1); - assert!(new2 > old2); - assert!(new2 > new1); - } - - /// Test the performance of `ResourceIndex::build` on a specific directory. - /// - /// This test evaluates the performance of building a resource - /// index using the `ResourceIndex::build` method on a given directory. - /// It measures the time taken to build the resource index and prints the - /// number of collisions detected. 
- #[test] - fn test_build_resource_index() { - use std::time::Instant; - - let path = "../test-assets/"; // The path to the directory to index - assert!( - std::path::Path::new(path).is_dir(), - "The provided path is not a directory or does not exist" - ); - - let start_time = Instant::now(); - let index: ResourceIndex = - ResourceIndex::build(path.to_string()); - let elapsed_time = start_time.elapsed(); - println!("Number of paths: {}", index.id2path.len()); - println!("Number of resources: {}", index.id2path.len()); - println!("Number of collisions: {}", index.collisions.len()); - println!("Time taken: {:?}", elapsed_time); + Ok(IndexUpdate { added, removed }) } } diff --git a/fs-index/src/lib.rs b/fs-index/src/lib.rs index f475f478..59030cef 100644 --- a/fs-index/src/lib.rs +++ b/fs-index/src/lib.rs @@ -1,3 +1,10 @@ -pub mod index; +mod index; +mod serde; +mod utils; + +pub use utils::load_or_build_index; pub use index::ResourceIndex; + +#[cfg(test)] +mod tests; diff --git a/fs-index/src/serde.rs b/fs-index/src/serde.rs new file mode 100644 index 00000000..8c575127 --- /dev/null +++ b/fs-index/src/serde.rs @@ -0,0 +1,126 @@ +use std::{ + collections::{HashMap, HashSet}, + path::PathBuf, + time::SystemTime, +}; + +use anyhow::Result; +use serde::{ + ser::{SerializeStruct, Serializer}, + Deserialize, Serialize, +}; + +use data_resource::ResourceId; + +use crate::{index::ResourceIdWithTimestamp, ResourceIndex}; + +/// Data structure for serializing and deserializing the index +#[derive(Serialize, Deserialize)] +struct ResourceIndexData { + root: PathBuf, + resources: HashMap>, +} + +#[derive(Serialize, Deserialize)] +struct IndexedResourceData { + id: Id, + last_modified: u64, +} + +/// Custom implementation of [`Serialize`] for [`ResourceIndex`] +/// +/// To avoid writing a large repetitive index file with double maps, +/// we are only serializing the root path, and path_to_resource. +/// +/// Other fields can be reconstructed from the path_to_resource map. 
+impl Serialize for ResourceIndex +where + Id: ResourceId, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_struct("ResourceIndex", 2)?; + state.serialize_field("root", &self.root)?; + + let mut resources = HashMap::new(); + for (path, resource) in &self.path_to_resource { + let id = resource.id.clone(); + let last_modified = resource + .last_modified + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| { + serde::ser::Error::custom(format!( + "Failed to serialize last_modified: {}", + e + )) + })? + .as_nanos() as u64; + + let resource_data = IndexedResourceData { id, last_modified }; + resources.insert(path.clone(), resource_data); + } + + state.serialize_field("resources", &resources)?; + state.end() + } +} + +/// Custom implementation of [`Deserialize`] for [`ResourceIndex`] +/// +/// Deserializes the index from the root path and path_to_resource map. +/// Other fields are reconstructed from the path_to_resource map. +impl<'de, Id> Deserialize<'de> for ResourceIndex +where + Id: ResourceId, +{ + fn deserialize(deserializer: D) -> Result, D::Error> + where + D: serde::Deserializer<'de>, + { + let index_data: ResourceIndexData = + ResourceIndexData::deserialize(deserializer)?; + + let mut path_to_resource = HashMap::new(); + let mut id_to_paths = HashMap::new(); + for (path, resource_data) in index_data.resources { + let last_modified = SystemTime::UNIX_EPOCH + + std::time::Duration::from_nanos(resource_data.last_modified); + let resource = ResourceIdWithTimestamp { + id: resource_data.id, + last_modified, + }; + path_to_resource.insert(path.clone(), resource.clone()); + id_to_paths + .entry(resource.id.clone()) + .or_insert_with(HashSet::new) + .insert(path); + } + + Ok(ResourceIndex { + root: index_data.root, + id_to_paths, + path_to_resource, + }) + } +} + +/// Custom implementation of [`PartialEq`] for [`ResourceIndex`] +/// +/// The order of items in hashmaps is not relevant. 
+/// we just need to compare [`ResourceIndex::resources`] to check if the two +/// indexes are equal. +impl PartialEq for ResourceIndex +where + Id: ResourceId, +{ + fn eq(&self, other: &Self) -> bool { + let mut resources1 = self.resources(); + let mut resources2 = other.resources(); + resources1.sort_by(|a, b| a.path().cmp(b.path())); + resources2.sort_by(|a, b| a.path().cmp(b.path())); + + resources1 == resources2 && self.root == other.root + } +} diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs new file mode 100644 index 00000000..c1b8b065 --- /dev/null +++ b/fs-index/src/tests.rs @@ -0,0 +1,649 @@ +//! This module provides tests for the `ResourceIndex` functionality using +//! different hash algorithms. +//! +//! The tests are parameterized by various hash types, such as `Blake3` and +//! `Crc32`, to ensure the implementation works consistently across different +//! hashing algorithms. +//! +//! # Structure +//! +//! - **Macros**: +//! - `for_each_type!`: A macro that takes a list of hash function types and a +//! block of code to execute for each hash type. +//! +//! - **Test Functions**: +//! - Defined to test various aspects of `ResourceIndex`, parameterized by +//! hash type. +//! +//! - **Helper Functions**: +//! - `get_indexed_resource_from_file`: Helper to create `IndexedResource` +//! from a file path. +//! +//! # Usage +//! +//! To add a new test for a specific hash type: +//! 1. Write a block of code generic over the hash type (a Type implementing +//! ResourceId trait). +//! 2. Use the `for_each_type!` macro to execute the block of code for each +//! desired hash type. + +use dev_hash::{Blake3, Crc32}; +use std::{fs, path::Path}; + +use anyhow::{anyhow, Result}; +use tempfile::TempDir; + +use data_resource::ResourceId; + +use crate::{ + index::IndexedResource, utils::load_or_build_index, ResourceIndex, +}; + +/// A macro that takes a list of hash function types and a block of code to +/// execute for each hash type. 
+#[macro_export] +macro_rules! for_each_type { + ($($hash_type:ty),+ => $body:block) => { + $( + { + type Id = $hash_type; + $body + } + )+ + }; +} + +/// A helper function to get [`IndexedResource`] from a file path +fn get_indexed_resource_from_file>( + path: P, + parent_dir: P, +) -> Result> { + let id = Id::from_path(&path)?; + + let relative_path = path + .as_ref() + .strip_prefix(parent_dir) + .map_err(|_| anyhow!("Failed to get relative path"))?; + + Ok(IndexedResource::new( + id, + relative_path.to_path_buf(), + fs::metadata(path)?.modified()?, + )) +} + +/// Test storing and loading the resource index. +/// +/// ## Test scenario: +/// - Build a resource index in the temporary directory. +/// - Store the index. +/// - Load the stored index. +/// - Assert that the loaded index matches the original index. +#[test] +fn test_store_and_load_index() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_store_and_load_index") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1, "{:?}", index); + index.store().expect("Failed to store index"); + + let loaded_index = + load_or_build_index(root_path, false).expect("Failed to load index"); + + assert_eq!(index, loaded_index, "{:?} != {:?}", index, loaded_index); + }); +} + +/// Test storing and loading the resource index with collisions. +/// +/// ## Test scenario: +/// - Build a resource index in the temporary directory. +/// - Write duplicate files with the same content. +/// - Store the index. +/// - Load the stored index. +/// - Assert that the loaded index matches the original index. 
+#[test] +fn test_store_and_load_index_with_collisions() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = + TempDir::with_prefix("ark_test_store_and_load_index_with_collisions") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let file_path2 = root_path.join("file2.txt"); + fs::write(&file_path2, "file content").expect("Failed to write to file"); + + let file_path3 = root_path.join("file3.txt"); + fs::write(&file_path3, "file content").expect("Failed to write to file"); + + let file_path4 = root_path.join("file4.txt"); + fs::write(&file_path4, "file content").expect("Failed to write to file"); + + // Now we have 4 files with the same content (same checksum) + + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + let checksum = Id::from_path(&file_path).expect("Failed to get checksum"); + assert_eq!(index.len(), 4, "{:?}", index); + assert_eq!(index.collisions().len(), 1, "{:?}", index); + assert_eq!(index.collisions()[&checksum].len(), 4, "{:?}", index); + index.store().expect("Failed to store index"); + + let loaded_index = + load_or_build_index(root_path, false).expect("Failed to load index"); + + assert_eq!(index, loaded_index, "{:?} != {:?}", index, loaded_index); + }); +} + +/// Test building an index with a file. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains one entry. +/// - Assert that the resource retrieved by path matches the expected resource. +/// - Assert that the resource retrieved by ID matches the expected resource. 
+#[test] +fn test_build_index_with_file() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_build_index_with_file") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let expected_resource: IndexedResource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1, "{:?}", index); + + let resource = index + .get_resource_by_path("file.txt") + .expect("Failed to get resource"); + assert_eq!( + resource, expected_resource, + "{:?} != {:?}", + resource, expected_resource + ); + }); +} + +/// Test building an index with an empty file. +/// +/// ## Test scenario: +/// - Create an empty file within the temporary directory. +/// - Create a file with content within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains one entry. +#[test] +fn test_build_index_with_empty_file() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_build_index_with_empty_file") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let empty_file_path = root_path.join("empty_file.txt"); + fs::write(&empty_file_path, "").expect("Failed to write to file"); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1, "{:?}", index); + }); +} + +/// Test building an index with a directory. +/// +/// ## Test scenario: +/// - Create a subdirectory within the temporary directory. +/// - Create a file within the subdirectory. 
+/// - Build a resource index in the temporary directory. +/// - Assert that the index contains one entry. +/// - Assert that the resource retrieved by path matches the expected resource. +/// - Assert that the resource retrieved by ID matches the expected resource. +#[test] +fn test_build_index_with_directory() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_build_index_with_directory") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let dir_path = root_path.join("dir"); + fs::create_dir(&dir_path).expect("Failed to create dir"); + let file_path = dir_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let expected_resource: IndexedResource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 1, "{:?}", index); + + let resource = index + .get_resource_by_path("dir/file.txt") + .expect("Failed to get resource"); + assert_eq!( + resource, expected_resource, + "{:?} != {:?}", + resource, expected_resource + ); + }); +} + +/// Test building an index with multiple files. +/// +/// ## Test scenario: +/// - Create multiple files within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains two entries. +/// - Assert that the resource retrieved by path for each file matches the +/// expected resource. 
+#[test] +fn test_build_index_with_multiple_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = + TempDir::with_prefix("ark_test_build_index_with_multiple_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file1_path = root_path.join("file1.txt"); + fs::write(&file1_path, "file1 content").expect("Failed to write to file"); + let file2_path = root_path.join("file2.txt"); + fs::write(&file2_path, "file2 content").expect("Failed to write to file"); + + let expected_resource1: IndexedResource = + get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + let expected_resource2 = + get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 2, "{:?}", index); + + let resource = index + .get_resource_by_path("file1.txt") + .expect("Failed to get resource"); + assert_eq!(resource, expected_resource1, "{:?}", resource); + + let resource = index + .get_resource_by_path("file2.txt") + .expect("Failed to get resource"); + assert_eq!(resource, expected_resource2, "{:?}", resource); + }); +} + +/// Test building an index with multiple directories. +/// +/// ## Test scenario: +/// - Create multiple directories within the temporary directory, each +/// containing a file. +/// - Build a resource index in the temporary directory. +/// - Assert that the index contains two entries. +/// - Assert that the resources retrieved by path for each file match the +/// expected resources. 
+#[test] +fn test_build_index_with_multiple_directories() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = + TempDir::with_prefix("ark_test_build_index_with_multiple_directories") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let dir1_path = root_path.join("dir1"); + fs::create_dir(&dir1_path).expect("Failed to create dir"); + let file1_path = dir1_path.join("file1.txt"); + fs::write(&file1_path, "file1 content").expect("Failed to write to file"); + + let dir2_path = root_path.join("dir2"); + fs::create_dir(&dir2_path).expect("Failed to create dir"); + let file2_path = dir2_path.join("file2.txt"); + fs::write(&file2_path, "file2 content").expect("Failed to write to file"); + + let expected_resource1: IndexedResource = + get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + let expected_resource2 = + get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + + let index = ResourceIndex::build(root_path).expect("Failed to build index"); + assert_eq!(index.len(), 2, "{:?}", index); + + let resource = index + .get_resource_by_path("dir1/file1.txt") + .expect("Resource not found"); + assert_eq!(resource, expected_resource1, "{:?}", resource); + + let resource = index + .get_resource_by_path("dir2/file2.txt") + .expect("Resource not found"); + assert_eq!(resource, expected_resource2, "{:?}", resource); + }); +} + +/// Test updating the resource index. +/// +/// ## Test scenario: +/// - Create files within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// - Create a new file, modify an existing file, and remove another file. +/// - Update the resource index. +/// - Assert that the index contains the expected number of entries after the +/// update. 
+/// - Assert that the entries in the index match the expected state after the +/// update. +#[test] +fn test_resource_index_update() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_resource_index_update") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let image_path = root_path.join("image.png"); + fs::write(&image_path, "image content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 2, "{:?}", index); + + // create new file + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + + // modify file + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + + // remove file + fs::remove_file(&image_path).expect("Failed to remove file"); + + index + .update_all() + .expect("Failed to update index"); + // Index now contains 2 resources (file.txt and new_file.txt) + assert_eq!(index.len(), 2, "{:?}", index); + + let resource = index + .get_resource_by_path("file.txt") + .expect("Resource not found"); + let expected_resource = + get_indexed_resource_from_file(&file_path, &root_path.to_path_buf()) + .expect("Failed to get indexed resource"); + assert_eq!(resource, expected_resource, "{:?}", resource); + + let _resource = index + .get_resource_by_path("new_file.txt") + .expect("Resource not found"); + + assert!( + index.get_resource_by_path("image.png").is_none(), + "{:?}", + index + ); + }); +} + +/// Test adding colliding files to the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. 
+/// - Assert that the index initially contains the expected number of entries. +/// - Create a new file with the same checksum as the existing file. +/// - Track the addition of the new file in the index. +/// - Assert that the index contains the expected number of entries after the +/// addition. +/// - Assert index.collisions contains the expected number of entries. +#[test] +fn test_add_colliding_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_add_colliding_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1, "{:?}", index); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "file content").expect("Failed to write to file"); + + index + .update_all() + .expect("Failed to update index"); + + assert_eq!(index.len(), 2, "{:?}", index); + assert_eq!(index.collisions().len(), 1, "{:?}", index); + }); +} + +/// Test `ResourceIndex::num_collisions()` method. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// - Create 2 new files with the same checksum as the existing file. +/// - Update the index. +/// - Assert that the index contains the expected number of entries after the +/// update. 
+#[test] +fn test_num_collisions() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_num_collisions") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 1, "{:?}", index); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "file content").expect("Failed to write to file"); + + let new_file_path2 = root_path.join("new_file2.txt"); + fs::write(&new_file_path2, "file content") + .expect("Failed to write to file"); + + index + .update_all() + .expect("Failed to update index"); + + assert_eq!(index.len(), 3, "{:?}", index); + assert_eq!(index.num_collisions(), 3, "{:?}", index); + }); +} + +/// Test that we don't index hidden files. +/// +/// ## Test scenario: +/// - Create a hidden file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Assert that the index initially contains the expected number of entries. +/// (0) +#[test] +fn test_hidden_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_hidden_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join(".hidden_file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + index.store().expect("Failed to store index"); + assert_eq!(index.len(), 0, "{:?}", index); + }); +} + +/// Test that we detect added files in `update_all`. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. 
+/// - Create a new file. +/// - Update the resource index. +/// - Assert that the return from `update_all` is that `added` includes the +/// new file. +#[test] +fn test_update_all_added_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_added_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + + let update_result = index.update_all().expect("Failed to update index"); + assert_eq!(update_result.added().len(), 1, "{:?}", update_result); + }); +} + +/// Test that we detect updated files using the last modified time. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Sleep for a second to ensure the last modified time is different. +/// - Update the file. +/// - Update the resource index. +/// - Assert that the return from `update_all` is that `added` includes the +/// updated file. 
+#[test] +fn test_update_all_updated_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_updated_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + std::thread::sleep(std::time::Duration::from_secs(1)); + + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + + let update_result = index.update_all().expect("Failed to update index"); + assert_eq!(update_result.added().len(), 1, "{:?}", update_result); + }); +} + +/// Test that we detect deleted files in `update_all`. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Remove the file. +/// - Update the resource index. +/// - Assert that the return from `update_all` is that `removed` includes the +/// deleted file. +#[test] +fn test_update_all_deleted_files() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_deleted_files") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + fs::remove_file(&file_path).expect("Failed to remove file"); + + let update_result = index.update_all().expect("Failed to update index"); + assert_eq!(update_result.removed().len(), 1, "{:?}", update_result); + }); +} + +/// Test that we detect files with the same hash but different content in +/// `update_all`. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. 
+/// - Build a resource index in the temporary directory. +/// - Modify the file. +/// - Create a new file with the same content but different name (path). +/// - Update the resource index. +/// - Assert that the return from `update_all` is that `added` includes both +/// files. +#[test] +fn test_update_all_files_with_same_hash() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_files_with_same_hash") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + std::thread::sleep(std::time::Duration::from_secs(1)); + + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "updated file content") + .expect("Failed to write to file"); + + let update_result = index.update_all().expect("Failed to update index"); + // The length of `added` should be 1 because the new file has the same + // content as the updated file. + assert_eq!(update_result.added().len(), 1, "{:?}", update_result); + + // The length of `added`'s first element should be 2 + assert_eq!(update_result.added().values().next().unwrap().len(), 2); + + // The length of `collisions` should be 1 because the new file has the + // same content as the updated file. 
+ assert_eq!(index.collisions().len(), 1, "{:?}", index); + }); +} diff --git a/fs-index/src/utils.rs b/fs-index/src/utils.rs new file mode 100644 index 00000000..1f2f5988 --- /dev/null +++ b/fs-index/src/utils.rs @@ -0,0 +1,159 @@ +use std::{ + collections::HashMap, + fs, + io::BufReader, + path::{Path, PathBuf}, +}; + +use walkdir::{DirEntry, WalkDir}; + +use data_error::{ArklibError, Result}; +use data_resource::ResourceId; +use fs_storage::{ARK_FOLDER, INDEX_PATH}; + +use crate::{index::ResourceIdWithTimestamp, ResourceIndex}; + +/// Load the index from the file system +fn load_index, Id: ResourceId>( + root_path: P, +) -> Result> { + let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH); + let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path); + let index_file = fs::File::open(index_path)?; + let reader = BufReader::new(index_file); + let index = serde_json::from_reader(reader)?; + + Ok(index) +} + +/// Load the index from the file system, or build a new index if it doesn't +/// exist +/// +/// If `update` is true, the index will be updated and stored after loading +/// it. 
+pub fn load_or_build_index, Id: ResourceId>( + root_path: P, + update: bool, +) -> Result> { + log::debug!( + "Attempting to load or build index at root path: {:?}", + root_path.as_ref() + ); + + let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH); + let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path); + log::trace!("Index path: {:?}", index_path); + + if index_path.exists() { + log::trace!("Index file exists, loading index"); + + let mut index = load_index(root_path)?; + if update { + log::trace!("Updating loaded index"); + + index.update_all()?; + index.store()?; + } + Ok(index) + } else { + log::trace!("Index file does not exist, building index"); + + // Build a new index if it doesn't exist and store it + let index = ResourceIndex::build(root_path.as_ref())?; + index.store().map_err(|e| { + ArklibError::Path(format!("Failed to store index: {}", e)) + })?; + Ok(index) + } +} + +/// A helper function to discover paths in a directory +/// +/// This function walks the directory tree starting from the root path and +/// returns a list of file paths. +/// +/// Ignore hidden files and empty files. 
+pub(crate) fn discover_paths>( + root_path: P, +) -> Result> { + log::debug!("Discovering paths at root path: {:?}", root_path.as_ref()); + + let walker = WalkDir::new(&root_path) + .min_depth(1) // Skip the root directory + .into_iter() + .filter_entry(should_index); // Skip hidden files and empty files + + // Filter out directories + let paths = walker + .filter_map(|entry| { + let entry = entry.ok()?; + if entry.file_type().is_file() { + Some(entry) + } else { + None + } + }) + .collect(); + + Ok(paths) +} + +/// A helper function to scan entries and create indexed resources +pub(crate) fn scan_entries, Id: ResourceId>( + root_path: P, + paths: Vec, +) -> HashMap> { + let mut path_to_resource = HashMap::new(); + for entry in paths { + let resource = scan_entry(entry.clone()); + + let path = entry.path().to_path_buf(); + // Strip the root path from the entry path + let path = path + .strip_prefix(root_path.as_ref()) + .expect("Failed to strip prefix"); + let path = path.to_path_buf(); + + path_to_resource.insert(path, resource); + } + path_to_resource +} + +/// A helper function to scan one entry and create an indexed resource +pub(crate) fn scan_entry( + entry: DirEntry, +) -> ResourceIdWithTimestamp { + let metadata = entry.metadata().expect("Failed to get metadata"); + let last_modified = metadata + .modified() + .expect("Failed to get modified"); + + // Get the ID of the resource + let id = Id::from_path(entry.path()).expect("Failed to get ID from path"); + + ResourceIdWithTimestamp { id, last_modified } +} + +/// A helper function to check if the entry should be indexed (not hidden or +/// empty) +fn should_index(entry: &walkdir::DirEntry) -> bool { + // Check if the entry is hidden + if entry + .file_name() + .to_string_lossy() + .starts_with('.') + { + return false; + } + + // Check if the entry is empty + if entry + .metadata() + .map(|m| m.len() == 0) + .unwrap_or(false) + { + return false; + } + + true +} diff --git a/fs-metadata/src/lib.rs 
b/fs-metadata/src/lib.rs index df258ae5..881ddeab 100644 --- a/fs-metadata/src/lib.rs +++ b/fs-metadata/src/lib.rs @@ -1,9 +1,7 @@ use data_error::Result; use fs_atomic_versions::atomic::{modify_json, AtomicFile}; use serde::{de::DeserializeOwned, Serialize}; -use std::fmt::Debug; -use std::io::Read; -use std::path::Path; +use std::{fmt::Debug, io::Read, path::Path}; use data_resource::ResourceId; use fs_storage::ARK_FOLDER; diff --git a/fs-properties/src/lib.rs b/fs-properties/src/lib.rs index 81b5b04f..0c28aede 100644 --- a/fs-properties/src/lib.rs +++ b/fs-properties/src/lib.rs @@ -1,8 +1,6 @@ use serde::{de::DeserializeOwned, Serialize}; use serde_json::Value; -use std::fmt::Debug; -use std::io::Read; -use std::path::Path; +use std::{fmt::Debug, io::Read, path::Path}; use data_error::Result; use data_json::merge; diff --git a/fs-storage/examples/cli.rs b/fs-storage/examples/cli.rs index 7b2ab23d..8214f258 100644 --- a/fs-storage/examples/cli.rs +++ b/fs-storage/examples/cli.rs @@ -1,10 +1,7 @@ use anyhow::{Context, Result}; -use fs_storage::base_storage::BaseStorage; -use fs_storage::file_storage::FileStorage; +use fs_storage::{base_storage::BaseStorage, file_storage::FileStorage}; use serde_json::Value; -use std::env; -use std::fs; -use std::path::Path; +use std::{env, fs, path::Path}; fn main() { if let Err(e) = run() { diff --git a/fs-storage/src/base_storage.rs b/fs-storage/src/base_storage.rs index 576b5a8f..4eba03b0 100644 --- a/fs-storage/src/base_storage.rs +++ b/fs-storage/src/base_storage.rs @@ -30,16 +30,22 @@ impl std::fmt::Display for SyncStatus { } } -/// The `BaseStorage` trait represents a key-value mapping that is written to the file system. +/// The `BaseStorage` trait represents a key-value mapping that is written to +/// the file system. 
 ///
-/// This trait provides methods to create or update entries in the internal mapping, remove entries from the internal mapping,
-/// determine if the in-memory model or the underlying storage requires syncing, scan and load the mapping from the filesystem,
-/// write the mapping to the filesystem, and remove all stored data.
+/// This trait provides methods to create or update entries in the internal
+/// mapping, remove entries from the internal mapping, determine if the
+/// in-memory model or the underlying storage requires syncing, scan and load
+/// the mapping from the filesystem, write the mapping to the filesystem, and
+/// remove all stored data.
 ///
-/// The trait also includes a method to merge values from another key-value mapping.
+/// The trait also includes a method to merge values from another key-value
+/// mapping.
 ///
-/// Note: The trait does not write to storage by default. It is up to the implementor to decide when to read or write to storage
-/// based on `SyncStatus`. This is to allow for trading off between performance and consistency.
+/// Note: The trait does not write to storage by default. It is up to the
+/// implementor to decide when to read or write to storage
+/// based on `SyncStatus`. This is to allow for trading off between performance
+/// and consistency.
 pub trait BaseStorage<K, V>: AsRef<BTreeMap<K, V>> {
     /// Create or update an entry in the internal mapping.
     fn set(&mut self, id: K, value: V);
diff --git a/fs-storage/src/btreemap_iter.rs b/fs-storage/src/btreemap_iter.rs
index b3d7a69f..bb52d6f1 100644
--- a/fs-storage/src/btreemap_iter.rs
+++ b/fs-storage/src/btreemap_iter.rs
@@ -1,7 +1,5 @@
 use crate::base_storage::BaseStorage;
-use std::cell::RefCell;
-use std::collections::btree_map::Iter;
-use std::rc::Rc;
+use std::{cell::RefCell, collections::btree_map::Iter, rc::Rc};
 
 pub struct BTreeMapIterator<'a, K, V> {
     iter: Rc<RefCell<Iter<'a, K, V>>>,
diff --git a/fs-storage/src/file_storage.rs b/fs-storage/src/file_storage.rs
index 0e27766f..9b762bbd 100644
--- a/fs-storage/src/file_storage.rs
+++ b/fs-storage/src/file_storage.rs
@@ -1,15 +1,17 @@
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
-use std::io::Write;
-use std::time::SystemTime;
 use std::{
     collections::BTreeMap,
+    fs::{self, File},
+    io::Write,
     path::{Path, PathBuf},
+    time::SystemTime,
 };
 
-use crate::base_storage::{BaseStorage, SyncStatus};
-use crate::monoid::Monoid;
-use crate::utils::read_version_2_fs;
+use crate::{
+    base_storage::{BaseStorage, SyncStatus},
+    monoid::Monoid,
+    utils::read_version_2_fs,
+};
 use data_error::{ArklibError, Result};
 
 /*
@@ -80,8 +82,8 @@ where
     /// Create a new file storage with a diagnostic label and file path
     /// The storage will be initialized using the disk data, if the path exists
     ///
-    /// Note: if the file storage already exists, the data will be read from the file
-    /// without overwriting it.
+    /// Note: if the file storage already exists, the data will be read from the
+    /// file without overwriting it.
     pub fn new(label: String, path: &Path) -> Result<Self> {
         let time = SystemTime::now();
         let mut storage = Self {
@@ -114,7 +116,8 @@ where
         // First check if the file starts with "version: 2"
         let file_content = std::fs::read_to_string(&self.path)?;
         if file_content.starts_with("version: 2") {
-            // Attempt to parse the file using the legacy version 2 storage format of FileStorage.
+ // Attempt to parse the file using the legacy version 2 storage + // format of FileStorage. match read_version_2_fs(&self.path) { Ok(data) => { log::info!( @@ -193,14 +196,14 @@ where // Determine the synchronization status based on the modification times // Conditions: - // 1. If both the in-memory storage and the storage on disk have been modified - // since the last write, then the storage is diverged. - // 2. If only the in-memory storage has been modified since the last write, - // then the storage on disk is stale. - // 3. If only the storage on disk has been modified since the last write, - // then the in-memory storage is stale. - // 4. If neither the in-memory storage nor the storage on disk has been modified - // since the last write, then the storage is in sync. + // 1. If both the in-memory storage and the storage on disk have been + // modified since the last write, then the storage is diverged. + // 2. If only the in-memory storage has been modified since the last + // write, then the storage on disk is stale. + // 3. If only the storage on disk has been modified since the last + // write, then the in-memory storage is stale. + // 4. If neither the in-memory storage nor the storage on disk has been + // modified since the last write, then the storage is in sync. let status = match ( self.modified > self.written_to_disk, file_updated > self.written_to_disk, diff --git a/fs-storage/src/jni/btreemap_iter.rs b/fs-storage/src/jni/btreemap_iter.rs index 6ebd6d61..61573e09 100644 --- a/fs-storage/src/jni/btreemap_iter.rs +++ b/fs-storage/src/jni/btreemap_iter.rs @@ -1,5 +1,4 @@ -use crate::btreemap_iter::BTreeMapIterator; -use crate::file_storage::FileStorage; +use crate::{btreemap_iter::BTreeMapIterator, file_storage::FileStorage}; // This is the interface to the JVM that we'll call the majority of our // methods on. 
use jni::JNIEnv;
diff --git a/fs-storage/src/jni/file_storage.rs b/fs-storage/src/jni/file_storage.rs
index 20820cd0..b62ed5bb 100644
--- a/fs-storage/src/jni/file_storage.rs
+++ b/fs-storage/src/jni/file_storage.rs
@@ -1,7 +1,6 @@
 use crate::base_storage::SyncStatus;
 use jni::signature::ReturnType;
-use std::collections::BTreeMap;
-use std::path::Path;
+use std::{collections::BTreeMap, path::Path};
 // This is the interface to the JVM that we'll call the majority of our
 // methods on.
 use jni::JNIEnv;
@@ -45,7 +44,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_create<'local>(
 
     let file_storage: FileStorage<String, String> =
         FileStorage::new(label, Path::new(&path)).unwrap_or_else(|err| {
-            env.throw_new("java/lang/RuntimeException", &err.to_string())
+            env.throw_new("java/lang/RuntimeException", err.to_string())
                 .expect("Failed to throw RuntimeException");
             FileStorage::new("".to_string(), Path::new("")).unwrap()
         });
@@ -77,13 +76,13 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_remove<'local>(
     FileStorage::from_jlong(file_storage_ptr)
         .remove(&id)
         .unwrap_or_else(|err| {
-            env.throw_new("java/lang/RuntimeException", &err.to_string())
+            env.throw_new("java/lang/RuntimeException", err.to_string())
                 .unwrap();
         });
 }
 
-// A JNI function called from Java that creates a `MyData` Rust type, converts it to a Java
-// type and returns it.
+// A JNI function called from Java that creates a `MyData` Rust type, converts
+// it to a Java type and returns it.
#[no_mangle] #[allow(non_snake_case)] pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_syncStatus< @@ -114,7 +113,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_sync( FileStorage::from_jlong(file_storage_ptr) .sync() .unwrap_or_else(|err| { - env.throw_new("java/lang/RuntimeException", &err.to_string()) + env.throw_new("java/lang/RuntimeException", err.to_string()) .unwrap(); }); } @@ -129,7 +128,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_readFS( match FileStorage::from_jlong(file_storage_ptr).read_fs() { Ok(data) => data.clone(), Err(err) => { - env.throw_new("java/lang/RuntimeException", &err.to_string()) + env.throw_new("java/lang/RuntimeException", err.to_string()) .expect("Failed to throw RuntimeException"); return JObject::null().into_raw(); } @@ -202,7 +201,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_writeFS( FileStorage::from_jlong(file_storage_ptr) .write_fs() .unwrap_or_else(|err| { - env.throw_new("java/lang/RuntimeException", &err.to_string()) + env.throw_new("java/lang/RuntimeException", err.to_string()) .unwrap(); }); } @@ -219,7 +218,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_erase( Box::from_raw(file_storage_ptr as *mut FileStorage) }; file_storage.erase().unwrap_or_else(|err| { - env.throw_new("java/lang/RuntimeException", &err.to_string()) + env.throw_new("java/lang/RuntimeException", err.to_string()) .unwrap(); }); } @@ -234,7 +233,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_merge( FileStorage::from_jlong(file_storage_ptr) .merge_from(FileStorage::from_jlong(other_file_storage_ptr)) .unwrap_or_else(|err| { - env.throw_new("java/lang/RuntimeException", &err.to_string()) + env.throw_new("java/lang/RuntimeException", err.to_string()) .unwrap(); }); } diff --git a/fs-storage/src/monoid.rs b/fs-storage/src/monoid.rs index b2acf546..1c483201 100644 --- a/fs-storage/src/monoid.rs +++ b/fs-storage/src/monoid.rs @@ -1,10 +1,11 @@ -// 
Currently, we have three structures: Tags (HashSet), Properties (HashSet), Score (int).
-// In fact, HashSet already implements a union function,
+// Currently, we have three structures: Tags (HashSet), Properties (HashSet),
+// Score (int). In fact, HashSet already implements a union function,
 // so only a special function for integers is needed.
 // CRDTs can be considered later when we need to add structures that require
 // more powerful combine semantics.
 
-// Trait defining a Monoid, which represents a mathematical structure with an identity element and an associative binary operation.
+// Trait defining a Monoid, which represents a mathematical structure with an
+// identity element and an associative binary operation.
 pub trait Monoid<V> {
     // Returns the neutral element of the monoid.
     fn neutral() -> V;
@@ -13,7 +14,8 @@ pub trait Monoid<V> {
     fn combine(a: &V, b: &V) -> V;
 
     // Combines multiple elements of the monoid into a single element.
-    // Default implementation uses `neutral()` as the initial accumulator and `combine()` for folding.
+    // Default implementation uses `neutral()` as the initial accumulator and
+    // `combine()` for folding.
     fn combine_all<I: IntoIterator<Item = V>>(values: I) -> V {
         values
             .into_iter()
diff --git a/fs-storage/src/utils.rs b/fs-storage/src/utils.rs
index b5b6830a..d1e818bd 100644
--- a/fs-storage/src/utils.rs
+++ b/fs-storage/src/utils.rs
@@ -1,6 +1,5 @@
 use data_error::Result;
-use std::collections::BTreeMap;
-use std::path::Path;
+use std::{collections::BTreeMap, path::Path};
 
 /// Parses version 2 `FileStorage` format and returns the data as a BTreeMap
 ///
diff --git a/rustfmt.toml b/rustfmt.toml
index 0a869692..5b415646 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,15 +1,18 @@
+# General settings
+imports_granularity = 'Crate'
 verbose = "Verbose"
 tab_spaces = 4
-
 max_width = 80
+newline_style = "Unix"
+
+# Code style settings
 chain_width = 50
 single_line_if_else_max_width = 30
-
 force_explicit_abi = true
-
+# Import settings
 reorder_imports = true
+# Comment settings
 wrap_comments = true
-
-newline_style = "Unix"
+comment_width = 80