diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0edf7006..94591f1c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -22,6 +22,7 @@ jobs:
- name: Install Rust
uses: dtolnay/rust-toolchain@stable
with:
+ toolchain: nightly # nightly is required for fmt
components: rustfmt, clippy
- name: Check
@@ -36,7 +37,9 @@ jobs:
run: cargo build --verbose
- name: Run tests
- run: cargo test --verbose
+ run: |
+ cargo test --verbose
+ cargo test --verbose
- name: Build Release
run: cargo build --verbose --release
@@ -68,7 +71,9 @@ jobs:
run: cargo build --verbose --release
- name: Run tests
- run: cargo test --workspace --verbose
+ run: |
+ cargo test --workspace --verbose
+ cargo test --workspace --verbose
- name: Install JDK
uses: actions/setup-java@v4.2.1
@@ -97,7 +102,9 @@ jobs:
run: cargo build --verbose --release
- name: Run tests
- run: cargo test --workspace --verbose
+ run: |
+ cargo test --workspace --verbose
+ cargo test --workspace --verbose
- name: Install JDK
uses: actions/setup-java@v4.2.1
diff --git a/.gitignore b/.gitignore
index bc757308..00d8cf93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
target
Cargo.lock
**/app_id
-*.class
\ No newline at end of file
+*.class
+**/.ark
diff --git a/README.md b/README.md
index 6cf0436d..8a472b50 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ cargo bench
This command runs all benchmarks and generates a report in HTML format located at `target/criterion/report`. If you wish to run a specific benchmark, you can specify its name as an argument as in:
```bash
-cargo bench index_build
+cargo bench resource_index
```
### Benchmarking Local Files
@@ -97,10 +97,10 @@ To install `flamegraph`, run:
cargo install flamegraph
```
-To generate a flame graph for `index_build_benchmark`, use the following command:
+To generate a flame graph for `resource_index_benchmark`, use the following command:
```bash
-cargo flamegraph --bench index_build_benchmark -o index_build_benchmark.svg -- --bench
+cargo flamegraph --bench resource_index_benchmark -o resource_index_benchmark.svg -- --bench
```
> [!NOTE]
diff --git a/ark-cli/src/commands/backup.rs b/ark-cli/src/commands/backup.rs
index 398220bd..40d5d46d 100644
--- a/ark-cli/src/commands/backup.rs
+++ b/ark-cli/src/commands/backup.rs
@@ -1,5 +1,4 @@
-use std::io::Write;
-use std::path::PathBuf;
+use std::{io::Write, path::PathBuf};
use crate::{
create_dir_all, dir, discover_roots, home_dir, storages_exists, timestamp,
diff --git a/ark-cli/src/commands/file/append.rs b/ark-cli/src/commands/file/append.rs
index 46d0b132..91969928 100644
--- a/ark-cli/src/commands/file/append.rs
+++ b/ark-cli/src/commands/file/append.rs
@@ -1,9 +1,8 @@
-use std::path::PathBuf;
-use std::str::FromStr;
+use std::{path::PathBuf, str::FromStr};
use crate::{
- models::storage::Storage, models::storage::StorageType, translate_storage,
- AppError, Format, ResourceId,
+ models::storage::{Storage, StorageType},
+ translate_storage, AppError, Format, ResourceId,
};
use data_error::ArklibError;
diff --git a/ark-cli/src/commands/file/insert.rs b/ark-cli/src/commands/file/insert.rs
index ff9b1ac9..b60199e5 100644
--- a/ark-cli/src/commands/file/insert.rs
+++ b/ark-cli/src/commands/file/insert.rs
@@ -1,9 +1,8 @@
-use std::path::PathBuf;
-use std::str::FromStr;
+use std::{path::PathBuf, str::FromStr};
use crate::{
- models::storage::Storage, models::storage::StorageType, translate_storage,
- AppError, Format, ResourceId,
+ models::storage::{Storage, StorageType},
+ translate_storage, AppError, Format, ResourceId,
};
use data_error::ArklibError;
diff --git a/ark-cli/src/commands/file/read.rs b/ark-cli/src/commands/file/read.rs
index 8387d011..7b47d719 100644
--- a/ark-cli/src/commands/file/read.rs
+++ b/ark-cli/src/commands/file/read.rs
@@ -1,9 +1,8 @@
-use std::path::PathBuf;
-use std::str::FromStr;
+use std::{path::PathBuf, str::FromStr};
use crate::{
- models::storage::Storage, models::storage::StorageType, translate_storage,
- AppError, ResourceId,
+ models::storage::{Storage, StorageType},
+ translate_storage, AppError, ResourceId,
};
use data_error::ArklibError;
diff --git a/ark-cli/src/commands/file/utils.rs b/ark-cli/src/commands/file/utils.rs
index 8a3c4048..6cc993dd 100644
--- a/ark-cli/src/commands/file/utils.rs
+++ b/ark-cli/src/commands/file/utils.rs
@@ -1,6 +1,7 @@
-use crate::error::AppError;
-use crate::models::key_value_to_str;
-use crate::models::Format;
+use crate::{
+ error::AppError,
+ models::{key_value_to_str, Format},
+};
use data_error::Result as ArklibResult;
use fs_atomic_versions::atomic::{modify, modify_json, AtomicFile};
diff --git a/ark-cli/src/commands/link/utils.rs b/ark-cli/src/commands/link/utils.rs
index 122f7f1c..2d27ca9a 100644
--- a/ark-cli/src/commands/link/utils.rs
+++ b/ark-cli/src/commands/link/utils.rs
@@ -3,8 +3,7 @@ use data_link::Link;
use std::path::PathBuf;
use url::Url;
-use crate::error::AppError;
-use crate::util::provide_index; // Import your custom AppError type
+use crate::{error::AppError, util::provide_index}; // Import your custom AppError type
pub async fn create_link(
root: &PathBuf,
@@ -28,8 +27,17 @@ pub fn load_link(
) -> Result, AppError> {
let path_from_index = id.clone().map(|id| {
let index = provide_index(root);
- index.id2path[&id].as_path().to_path_buf()
+ index
+ .get_resources_by_id(&id)
+ .map(|r| r[0].path().to_owned())
+ .ok_or_else(|| {
+ AppError::IndexError(format!(
+ "Resource with id {} not found",
+ id
+ ))
+ })
});
+ let path_from_index = path_from_index.transpose()?;
let path_from_user = file_path;
let path = match (path_from_user, path_from_index) {
@@ -46,7 +54,7 @@ pub fn load_link(
}
}
(Some(path), None) => Ok(path.to_path_buf()),
- (None, Some(path)) => Ok(path),
+ (None, Some(path)) => Ok(path.to_path_buf()),
(None, None) => Err(AppError::LinkLoadError(
"Provide a path or id for request.".to_owned(),
))?,
diff --git a/ark-cli/src/commands/list.rs b/ark-cli/src/commands/list.rs
index bc557b42..ee86152b 100644
--- a/ark-cli/src/commands/list.rs
+++ b/ark-cli/src/commands/list.rs
@@ -1,5 +1,4 @@
-use std::io::Read;
-use std::path::PathBuf;
+use std::{io::Read, path::PathBuf};
use crate::{
provide_index, provide_root, read_storage_value, AppError, DateTime,
@@ -76,15 +75,17 @@ impl List {
.map_err(|_| {
AppError::IndexError("Could not read index".to_owned())
})?
- .path2id
+ .resources()
.iter()
- .filter_map(|(path, resource)| {
+ .filter_map(|indexed_resource| {
+ let path = indexed_resource.path();
+ let id = indexed_resource.id();
let tags = if self.tags {
Some(
read_storage_value(
&root,
"tags",
- &resource.id.to_string(),
+ &id.to_string(),
&None,
)
.map_or(vec![], |s| {
@@ -102,7 +103,7 @@ impl List {
read_storage_value(
&root,
"scores",
- &resource.id.to_string(),
+ &id.to_string(),
&None,
)
.map_or(0, |s| s.parse::().unwrap_or(0)),
@@ -114,7 +115,7 @@ impl List {
let datetime = if self.modified {
let format = "%b %e %H:%M %Y";
Some(
- DateTime::::from(resource.modified)
+ DateTime::::from(indexed_resource.last_modified())
.format(format)
.to_string(),
)
@@ -123,21 +124,18 @@ impl List {
};
let (path, resource, content) = match entry_output {
- EntryOutput::Both => (
- Some(path.to_owned().into_path_buf()),
- Some(resource.clone().id),
- None,
- ),
- EntryOutput::Path => {
- (Some(path.to_owned().into_path_buf()), None, None)
+ EntryOutput::Both => {
+ (Some(path.to_owned()), Some(id.to_owned()), None)
}
- EntryOutput::Id => (None, Some(resource.clone().id), None),
+ EntryOutput::Path => (Some(path.to_owned()), None, None),
+ EntryOutput::Id => (None, Some(id.to_owned()), None),
EntryOutput::Link => match File::open(path) {
Ok(mut file) => {
let mut contents = String::new();
match file.read_to_string(&mut contents) {
Ok(_) => {
- // Check if the content of the file is a valid url
+ // Check if the content
+ // of the file is a valid url
let url = contents.trim();
let url = url::Url::parse(url);
match url {
diff --git a/ark-cli/src/commands/render.rs b/ark-cli/src/commands/render.rs
index 7f3fa9e6..19014831 100644
--- a/ark-cli/src/commands/render.rs
+++ b/ark-cli/src/commands/render.rs
@@ -26,7 +26,8 @@ impl Render {
let dest_path = filepath.with_file_name(
filepath
.file_stem()
- // SAFETY: we know that the file stem is valid UTF-8 because it is a file name
+ // SAFETY: we know that the file stem is valid UTF-8
+ // because it is a file name
.unwrap()
.to_str()
.unwrap()
diff --git a/ark-cli/src/commands/storage/list.rs b/ark-cli/src/commands/storage/list.rs
index 6b0c6c4e..0fdcef7c 100644
--- a/ark-cli/src/commands/storage/list.rs
+++ b/ark-cli/src/commands/storage/list.rs
@@ -1,8 +1,8 @@
use std::path::PathBuf;
use crate::{
- models::storage::Storage, models::storage::StorageType, translate_storage,
- AppError,
+ models::storage::{Storage, StorageType},
+ translate_storage, AppError,
};
#[derive(Clone, Debug, clap::Args)]
diff --git a/ark-cli/src/index_registrar.rs b/ark-cli/src/index_registrar.rs
index fc6a2e5b..a75df080 100644
--- a/ark-cli/src/index_registrar.rs
+++ b/ark-cli/src/index_registrar.rs
@@ -2,11 +2,13 @@ use lazy_static::lazy_static;
extern crate canonical_path;
use data_error::{ArklibError, Result};
-use fs_index::ResourceIndex;
+use fs_index::{load_or_build_index, ResourceIndex};
-use std::collections::HashMap;
-use std::path::Path;
-use std::sync::{Arc, RwLock};
+use std::{
+ collections::HashMap,
+ path::Path,
+ sync::{Arc, RwLock},
+};
use crate::ResourceId;
@@ -34,7 +36,9 @@ pub fn provide_index>(
}
log::info!("Index has not been registered before");
- match ResourceIndex::provide(&root_path) {
+ // If the index has not been registered before,
+ // we need to load it, update it and register it
+ match load_or_build_index(&root_path, true) {
Ok(index) => {
let mut registrar = REGISTRAR.write().map_err(|_| {
ArklibError::Other(anyhow::anyhow!("Failed to lock registrar"))
diff --git a/ark-cli/src/main.rs b/ark-cli/src/main.rs
index c8c718ca..e8029c18 100644
--- a/ark-cli/src/main.rs
+++ b/ark-cli/src/main.rs
@@ -1,5 +1,7 @@
-use std::fs::{create_dir_all, File};
-use std::path::PathBuf;
+use std::{
+ fs::{create_dir_all, File},
+ path::PathBuf,
+};
use crate::index_registrar::provide_index;
use data_pdf::{render_preview_page, PDFQuality};
@@ -15,25 +17,23 @@ use fs_storage::ARK_FOLDER;
use anyhow::Result;
-use chrono::prelude::DateTime;
-use chrono::Utc;
+use chrono::{prelude::DateTime, Utc};
-use clap::CommandFactory;
-use clap::FromArgMatches;
+use clap::{CommandFactory, FromArgMatches};
use fs_extra::dir::{self, CopyOptions};
use home::home_dir;
-use crate::cli::Cli;
-use crate::commands::file::File::{Append, Insert, Read};
-use crate::commands::link::Link::{Create, Load};
-use crate::commands::Commands::Link;
-use crate::commands::Commands::Storage;
-use crate::commands::Commands::*;
-use crate::models::EntryOutput;
-use crate::models::Format;
-use crate::models::Sort;
+use crate::{
+ cli::Cli,
+ commands::{
+ file::File::{Append, Insert, Read},
+ link::Link::{Create, Load},
+ Commands::{Link, Storage, *},
+ },
+ models::{EntryOutput, Format, Sort},
+};
use crate::error::AppError;
diff --git a/ark-cli/src/models/storage.rs b/ark-cli/src/models/storage.rs
index 48fc0464..ef6f7f37 100644
--- a/ark-cli/src/models/storage.rs
+++ b/ark-cli/src/models/storage.rs
@@ -1,7 +1,6 @@
use crate::ResourceId;
use fs_atomic_versions::atomic::AtomicFile;
-use std::fmt::Write;
-use std::path::PathBuf;
+use std::{fmt::Write, path::PathBuf};
use crate::{
commands::{file_append, file_insert, format_file, format_line},
diff --git a/ark-cli/src/util.rs b/ark-cli/src/util.rs
index 9a370167..40eff290 100644
--- a/ark-cli/src/util.rs
+++ b/ark-cli/src/util.rs
@@ -1,24 +1,26 @@
use crate::ResourceId;
-use fs_index::index::ResourceIndex;
+use fs_index::ResourceIndex;
use fs_metadata::METADATA_STORAGE_FOLDER;
use fs_properties::PROPERTIES_STORAGE_FOLDER;
use fs_storage::{
ARK_FOLDER, PREVIEWS_STORAGE_FOLDER, SCORE_STORAGE_FILE, STATS_FOLDER,
TAG_STORAGE_FILE, THUMBNAILS_STORAGE_FOLDER,
};
-use std::env::current_dir;
-use std::fs::{canonicalize, metadata};
-use std::io::BufRead;
-use std::io::BufReader;
-use std::path::Path;
-use std::str::FromStr;
-use std::thread;
-use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
-use std::{fs::File, path::PathBuf};
+use std::{
+ env::current_dir,
+ fs::{canonicalize, metadata, File},
+ io::{BufRead, BufReader},
+ path::{Path, PathBuf},
+ str::FromStr,
+ thread,
+ time::{Duration, Instant, SystemTime, UNIX_EPOCH},
+};
-use crate::error::AppError;
-use crate::models::storage::{Storage, StorageType};
-use crate::ARK_CONFIG;
+use crate::{
+ error::AppError,
+ models::storage::{Storage, StorageType},
+ ARK_CONFIG,
+};
pub fn discover_roots(
roots_cfg: &Option,
@@ -111,11 +113,11 @@ pub fn monitor_index(
let duration = start.elapsed();
println!("Updating succeeded in {:?}\n", duration);
- if !diff.deleted.is_empty() {
- println!("Deleted: {:?}", diff.deleted);
+ if !diff.removed().is_empty() {
+ println!("Deleted: {:?}", diff.removed());
}
- if !diff.added.is_empty() {
- println!("Added: {:?}", diff.added);
+ if !diff.added().is_empty() {
+ println!("Added: {:?}", diff.added());
}
}
}
@@ -127,10 +129,14 @@ pub fn monitor_index(
)
})?;
- println!("Here are {} entries in the index", index.size());
+ println!("Here are {} entries in the index", index.len());
- for (key, count) in index.collisions.iter() {
- println!("Id {:?} calculated {} times", key, count);
+ for (key, resources) in index.collisions().iter() {
+ println!(
+ "Id {:?} calculated {} times",
+ key,
+ resources.len()
+ );
}
}
}
diff --git a/data-json/src/lib.rs b/data-json/src/lib.rs
index cc663775..9d047f17 100644
--- a/data-json/src/lib.rs
+++ b/data-json/src/lib.rs
@@ -1,7 +1,4 @@
-use serde_json::json;
-use serde_json::map::Entry;
-use serde_json::Map;
-use serde_json::Value;
+use serde_json::{json, map::Entry, Map, Value};
pub fn merge(origin: Value, new_data: Value) -> Value {
match (origin, new_data) {
diff --git a/data-link/src/lib.rs b/data-link/src/lib.rs
index 7e614a1d..ba925722 100644
--- a/data-link/src/lib.rs
+++ b/data-link/src/lib.rs
@@ -2,18 +2,20 @@ use data_error::Result;
use data_resource::ResourceId;
use fs_atomic_versions::atomic::AtomicFile;
use fs_metadata::store_metadata;
-use fs_properties::load_raw_properties;
-use fs_properties::store_properties;
-use fs_properties::PROPERTIES_STORAGE_FOLDER;
+use fs_properties::{
+ load_raw_properties, store_properties, PROPERTIES_STORAGE_FOLDER,
+};
use fs_storage::{ARK_FOLDER, PREVIEWS_STORAGE_FOLDER};
use reqwest::header::HeaderValue;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
-use std::fmt;
-use std::marker::PhantomData;
-use std::path::Path;
-use std::str::{self, FromStr};
-use std::{io::Write, path::PathBuf};
+use std::{
+ fmt,
+ io::Write,
+ marker::PhantomData,
+ path::{Path, PathBuf},
+ str::{self, FromStr},
+};
use url::Url;
#[derive(Debug, Deserialize, Serialize)]
diff --git a/data-resource/src/lib.rs b/data-resource/src/lib.rs
index 049b4dbb..ea7426c3 100644
--- a/data-resource/src/lib.rs
+++ b/data-resource/src/lib.rs
@@ -3,16 +3,16 @@
//! `data-resource` is a crate for managing resource identifiers.
use core::{fmt::Display, str::FromStr};
use data_error::Result;
-use serde::de::DeserializeOwned;
-use serde::Serialize;
+use serde::{de::DeserializeOwned, Serialize};
use std::{fmt::Debug, hash::Hash, path::Path};
/// This trait defines a generic type representing a resource identifier.
///
-/// Resources are identified by a hash value, which is computed from the resource's data.
-/// The hash value is used to uniquely identify the resource.
+/// Resources are identified by a hash value, which is computed from the
+/// resource's data. The hash value is used to uniquely identify the resource.
///
-/// Implementors of this trait must provide a way to compute the hash value from the resource's data.
+/// Implementors of this trait must provide a way to compute
+/// the hash value from the resource's data.
pub trait ResourceId:
Debug
+ Display
diff --git a/dev-hash/benches/blake3.rs b/dev-hash/benches/blake3.rs
index cf95233b..9b3c6541 100644
--- a/dev-hash/benches/blake3.rs
+++ b/dev-hash/benches/blake3.rs
@@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec {
(0..size).map(|_| rng.gen()).collect()
}
-/// Benchmarks the performance of resource ID creation from file paths and random data.
+/// Benchmarks the performance of resource ID creation
+/// from file paths and random data.
///
/// - Measures the time taken to create a resource ID from file paths.
/// - Measures the time taken to create a resource ID from random data.
diff --git a/dev-hash/benches/crc32.rs b/dev-hash/benches/crc32.rs
index b462035b..dab4f776 100644
--- a/dev-hash/benches/crc32.rs
+++ b/dev-hash/benches/crc32.rs
@@ -17,7 +17,8 @@ fn generate_random_data(size: usize) -> Vec {
(0..size).map(|_| rng.gen()).collect()
}
-/// Benchmarks the performance of resource ID creation from file paths and random data.
+/// Benchmarks the performance of resource ID creation
+/// from file paths and random data.
///
/// - Measures the time taken to create a resource ID from file paths.
/// - Measures the time taken to create a resource ID from random data.
diff --git a/fs-atomic-light/src/lib.rs b/fs-atomic-light/src/lib.rs
index 25288f6d..fb4bf028 100644
--- a/fs-atomic-light/src/lib.rs
+++ b/fs-atomic-light/src/lib.rs
@@ -1,9 +1,6 @@
use data_error::Result;
-use std::env;
-use std::fs;
-use std::path::Path;
-use std::str;
+use std::{env, fs, path::Path, str};
/// Write data to a tempory file and move that written file to destination
///
diff --git a/fs-atomic-versions/src/atomic/file.rs b/fs-atomic-versions/src/atomic/file.rs
index bd3f2571..a2b708ab 100644
--- a/fs-atomic-versions/src/atomic/file.rs
+++ b/fs-atomic-versions/src/atomic/file.rs
@@ -1,8 +1,10 @@
-use std::fs::{self, File};
-use std::io::{Error, ErrorKind, Read, Result};
-#[cfg(target_os = "unix")]
+#[cfg(target_family = "unix")]
use std::os::unix::fs::MetadataExt;
-use std::path::{Path, PathBuf};
+use std::{
+ fs::{self, File},
+ io::{Error, ErrorKind, Read, Result},
+ path::{Path, PathBuf},
+};
use crate::app_id;
@@ -229,7 +231,7 @@ impl AtomicFile {
// May return `EEXIST`.
let res = std::fs::hard_link(&new.path, new_path);
if let Err(err) = res {
- #[cfg(target_os = "unix")]
+ #[cfg(target_family = "unix")]
// From open(2) manual page:
//
// "[...] create a unique file on the same filesystem (e.g.,
@@ -241,7 +243,7 @@ impl AtomicFile {
if new.path.metadata()?.nlink() != 2 {
Err(err)?;
}
- #[cfg(not(target_os = "unix"))]
+ #[cfg(not(target_family = "unix"))]
Err(err)?;
}
diff --git a/fs-atomic-versions/src/lib.rs b/fs-atomic-versions/src/lib.rs
index 1ce1f0b2..62935039 100644
--- a/fs-atomic-versions/src/lib.rs
+++ b/fs-atomic-versions/src/lib.rs
@@ -1,7 +1,8 @@
use lazy_static::lazy_static;
-use std::path::PathBuf;
-use std::sync::Once;
-use std::sync::RwLock;
+use std::{
+ path::PathBuf,
+ sync::{Once, RwLock},
+};
pub mod app_id;
pub mod atomic;
diff --git a/fs-index/Cargo.toml b/fs-index/Cargo.toml
index 49d0e763..0ded09dd 100644
--- a/fs-index/Cargo.toml
+++ b/fs-index/Cargo.toml
@@ -12,9 +12,8 @@ bench = false
log = { version = "0.4.17", features = ["release_max_level_off"] }
walkdir = "2.3.2"
anyhow = "1.0.58"
-canonical-path = "2.0.2"
-pathdiff = "0.2.1"
-itertools = "0.10.5"
+serde_json = "1.0"
+serde = { version = "1.0", features = ["derive"] }
fs-storage = { path = "../fs-storage" }
@@ -26,11 +25,11 @@ data-resource = { path = "../data-resource" }
uuid = { version = "1.6.1", features = ["v4"] }
# benchmarking
criterion = { version = "0.5", features = ["html_reports"] }
+tempfile = "3.10"
# Depending on `dev-hash` for testing
dev-hash = { path = "../dev-hash" }
-fs-atomic-versions = { path = "../fs-atomic-versions" }
[[bench]]
-name = "index_build_benchmark"
+name = "resource_index_benchmark"
harness = false
-path = "benches/index_build_benchmark.rs"
+path = "benches/resource_index_benchmark.rs"
diff --git a/fs-index/README.md b/fs-index/README.md
new file mode 100644
index 00000000..9253375a
--- /dev/null
+++ b/fs-index/README.md
@@ -0,0 +1,35 @@
+# fs-index
+
+`fs-index` is a Rust crate for managing and indexing file system resources. It provides a flexible and efficient way to track changes, query resources, and keep files in sync across multiple devices or locations.
+
+Originally developed for the ARK framework to support local-first applications, `fs-index` can also be used in various scenarios including backup systems, content management, and more.
+
+## Features
+
+The most important struct in this crate is `ResourceIndex` which comes with:
+
+- **Reactive API**
+ - `update_all`: Method to update the index by rescanning files and returning changes (additions/deletions).
+- **Snapshot API**
+ - `get_resources_by_id`: Query resources from the index by ID.
+ - `get_resource_by_path`: Query a resource from the index by its path.
+
+## Custom Serialization
+
+The `ResourceIndex` struct includes a custom serialization implementation to avoid writing a large repetitive index file with double maps.
+
+## Tests and Benchmarks
+
+- Unit tests are located in `src/tests.rs`.
+- The benchmarking suite is in `benches/resource_index_benchmark.rs`, benchmarking all methods of `ResourceIndex`.
+ - Run benchmarks with `cargo bench`.
+
+## Examples
+
+To get started, take a look at the examples in the `examples/` directory.
+
+To run a specific example:
+
+```shell
+cargo run --example resource_index
+```
diff --git a/fs-index/benches/index_build_benchmark.rs b/fs-index/benches/index_build_benchmark.rs
deleted file mode 100644
index 1de0ef3f..00000000
--- a/fs-index/benches/index_build_benchmark.rs
+++ /dev/null
@@ -1,43 +0,0 @@
-use criterion::{
- black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
-};
-use dev_hash::Crc32;
-use fs_index::index::ResourceIndex;
-
-const DIR_PATH: &str = "../test-assets/"; // Set the path to the directory containing the resources here
-
-fn index_build_benchmark(c: &mut Criterion) {
- // assert the path exists and is a directory
- assert!(
- std::path::Path::new(DIR_PATH).is_dir(),
- "The path: {} does not exist or is not a directory",
- DIR_PATH
- );
-
- let mut group = c.benchmark_group("index_build");
- group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here
-
- let mut collisions_size = 0;
-
- group.bench_with_input(
- BenchmarkId::new("index_build", DIR_PATH),
- &DIR_PATH,
- |b, path| {
- b.iter(|| {
- let index: ResourceIndex =
- ResourceIndex::build(black_box(path.to_string()));
- collisions_size = index.collisions.len();
- });
- },
- );
- group.finish();
-
- println!("Collisions: {}", collisions_size);
-}
-
-criterion_group! {
- name = benches;
- config = Criterion::default();
- targets = index_build_benchmark
-}
-criterion_main!(benches);
diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs
new file mode 100644
index 00000000..f6d3f21a
--- /dev/null
+++ b/fs-index/benches/resource_index_benchmark.rs
@@ -0,0 +1,142 @@
+use std::path::PathBuf;
+
+use criterion::{
+ black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
+};
+use tempfile::TempDir;
+
+use dev_hash::Crc32;
+use fs_index::ResourceIndex;
+
+fn resource_index_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("resource_index");
+ group.measurement_time(std::time::Duration::from_secs(20)); // Set the measurement time here
+
+ let benchmarks_dir = setup_temp_dir();
+ let benchmarks_dir = benchmarks_dir.path();
+ let benchmarks_dir_str = benchmarks_dir.to_str().unwrap();
+
+ // Benchmark `ResourceIndex::build()`
+
+ let mut collisions_size = 0;
+ group.bench_with_input(
+ BenchmarkId::new("index_build", benchmarks_dir_str),
+ &benchmarks_dir,
+ |b, path| {
+ b.iter(|| {
+ let index: ResourceIndex =
+ ResourceIndex::build(black_box(path)).unwrap();
+ collisions_size = index.collisions().len();
+ });
+ },
+ );
+ println!("Collisions: {}", collisions_size);
+
+ // Benchmark `ResourceIndex::get_resources_by_id()`
+ let index: ResourceIndex =
+ ResourceIndex::build(benchmarks_dir).unwrap();
+ let resources = index.resources();
+ let resource_id = resources[0].id();
+ group.bench_function("index_get_resource_by_id", |b| {
+ b.iter(|| {
+ let _resource = index.get_resources_by_id(black_box(resource_id));
+ });
+ });
+
+ // Benchmark `ResourceIndex::get_resource_by_path()`
+ let resource_path = resources[0].path();
+ group.bench_function("index_get_resource_by_path", |b| {
+ b.iter(|| {
+ let _resource =
+ index.get_resource_by_path(black_box(resource_path));
+ });
+ });
+
+ // Benchmark `ResourceIndex::update_all()`
+
+ // First, create a new temp directory specifically for the update_all
+ // benchmark since we will be creating new files, removing files, and
+ // modifying files
+ let update_all_benchmarks_dir =
+ TempDir::with_prefix("ark-fs-index-benchmarks-update-all").unwrap();
+ let update_all_benchmarks_dir = update_all_benchmarks_dir.path();
+
+ group.bench_function("index_update_all", |b| {
+ b.iter(|| {
+ // Clear the directory
+ std::fs::remove_dir_all(&update_all_benchmarks_dir).unwrap();
+ std::fs::create_dir(&update_all_benchmarks_dir).unwrap();
+
+ // Create 5000 new files
+ for i in 0..5000 {
+ let new_file =
+ update_all_benchmarks_dir.join(format!("file_{}.txt", i));
+ std::fs::File::create(&new_file).unwrap();
+ std::fs::write(&new_file, format!("Hello, World! {}", i))
+ .unwrap();
+ }
+ let mut index: ResourceIndex =
+ ResourceIndex::build(black_box(&update_all_benchmarks_dir))
+ .unwrap();
+
+ update_all_files(&update_all_benchmarks_dir.to_path_buf());
+ let _update_result = index.update_all().unwrap();
+ });
+ });
+
+ group.finish();
+}
+
+criterion_group! {
+ name = benches;
+ config = Criterion::default();
+ targets = resource_index_benchmark
+}
+criterion_main!(benches);
+
+/// A helper function to setup a temp directory for the benchmarks
+fn setup_temp_dir() -> TempDir {
+ // Create a temp directory
+ let temp_dir = TempDir::with_prefix("ark-fs-index-benchmarks").unwrap();
+ let benchmarks_dir = temp_dir.path();
+ log::info!("Temp directory for benchmarks: {:?}", benchmarks_dir);
+
+ // Create 10,000 files in the temp directory
+ for i in 0..10000 {
+ let new_file = benchmarks_dir.join(format!("file_{}.txt", i));
+ std::fs::File::create(&new_file).unwrap();
+ // We add the index `i` to the file content to make sure the content is
+ // unique This is to avoid collisions in the index
+ std::fs::write(&new_file, format!("Hello, World! {}", i)).unwrap();
+ }
+
+ temp_dir
+}
+
+/// A helper function that takes a directory and creates 5000 new files, removes
+/// 3000 files, and modifies 1000 files
+///
+/// Note: The function assumes that the directory already contains 5000 files
+/// with the names `file_0.txt` to `file_4999.txt`
+fn update_all_files(dir: &PathBuf) {
+ // Create 5000 new files
+ for i in 5001..10001 {
+ let new_file = dir.join(format!("file_{}.txt", i));
+ std::fs::File::create(&new_file).unwrap();
+ // We add the index `i` to the file content to make sure the content is
+ // unique This is to avoid collisions in the index
+ std::fs::write(&new_file, format!("Hello, World! {}", i)).unwrap();
+ }
+
+ // Remove 3000 files
+ for i in 0..3000 {
+ let removed_file = dir.join(format!("file_{}.txt", i));
+ std::fs::remove_file(&removed_file).unwrap();
+ }
+
+ // Modify 1000 files
+ for i in 4000..5000 {
+ let modified_file = dir.join(format!("file_{}.txt", i));
+ std::fs::write(&modified_file, "Hello, World!").unwrap();
+ }
+}
diff --git a/fs-index/examples/resource_index.rs b/fs-index/examples/resource_index.rs
new file mode 100644
index 00000000..3f5cb797
--- /dev/null
+++ b/fs-index/examples/resource_index.rs
@@ -0,0 +1,46 @@
+use std::path::Path;
+
+use anyhow::Result;
+
+use dev_hash::Blake3;
+use fs_index::ResourceIndex;
+
+/// A simple example of how to use [`ResourceIndex`] to index a directory.
+fn main() -> Result<()> {
+ // Create a new `ResourceIndex` from the directory "test-assets" using
+ // blake3 as the hashing algorithm.
+ let mut index: ResourceIndex =
+ ResourceIndex::build(Path::new("test-assets"))?;
+
+ // Print the indexed resources.
+ for resource in index.resources() {
+ println!("{:?}", resource);
+ }
+
+ // Save the index to a file.
+ index.store()?;
+
+ // Get resources by their id.
+ let id = Blake3(
+ "172b4bf148e858b13dde0fc6613413bcb7552e5c4e5c45195ac6c80f20eb5ff5"
+ .to_string(),
+ );
+ let resources = index.get_resources_by_id(&id).ok_or_else(|| {
+ anyhow::anyhow!("Resource with id {:?} not found", id)
+ })?;
+ for resource in resources {
+ println!("{:?}", resource);
+ }
+
+ // Get resources by their path.
+ let path = Path::new("lena.jpg");
+ let resource = index.get_resource_by_path(path).ok_or_else(|| {
+ anyhow::anyhow!("Resource with path {:?} not found", path)
+ })?;
+ println!("{:?}", resource);
+
+ // Update the index.
+ index.update_all()?;
+
+ Ok(())
+}
diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs
index a60287ea..91c8228b 100644
--- a/fs-index/src/index.rs
+++ b/fs-index/src/index.rs
@@ -1,1120 +1,462 @@
-use anyhow::anyhow;
-use canonical_path::{CanonicalPath, CanonicalPathBuf};
-use itertools::Itertools;
-use std::collections::{HashMap, HashSet};
-use std::fs::{self, File, Metadata};
-use std::io::{BufRead, BufReader, Write};
-use std::ops::Add;
-use std::path::{Path, PathBuf};
-use std::time::{Duration, SystemTime, UNIX_EPOCH};
-use walkdir::{DirEntry, WalkDir};
+use std::{
+ collections::{HashMap, HashSet},
+ fs,
+ hash::Hash,
+ path::{Path, PathBuf},
+ time::{Duration, SystemTime},
+};
-use log;
+use serde::{Deserialize, Serialize};
-use data_error::{ArklibError, Result};
+use data_error::Result;
use data_resource::ResourceId;
use fs_storage::{ARK_FOLDER, INDEX_PATH};
-#[derive(Eq, Ord, PartialEq, PartialOrd, Hash, Clone, Debug)]
-pub struct IndexEntry {
- pub modified: SystemTime,
- pub id: Id,
-}
-
-#[derive(PartialEq, Clone, Debug)]
-pub struct ResourceIndex {
- pub id2path: HashMap,
- pub path2id: HashMap>,
+use crate::utils::{discover_paths, scan_entries};
- pub collisions: HashMap,
- root: PathBuf,
-}
+/// The threshold for considering a resource updated
+pub const RESOURCE_UPDATED_THRESHOLD: Duration = Duration::from_millis(1);
-#[derive(PartialEq, Debug)]
-pub struct IndexUpdate {
- pub deleted: HashSet,
- pub added: HashMap,
+/// Represents a resource in the index
+#[derive(
+ PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Debug, Serialize, Deserialize,
+)]
+pub struct IndexedResource {
+ /// The unique identifier of the resource
+ id: Id,
+ /// The path of the resource, relative to the root path
+ path: PathBuf,
+ /// The last modified time of the resource (from the file system metadata)
+ last_modified: SystemTime,
}
-pub const RESOURCE_UPDATED_THRESHOLD: Duration = Duration::from_millis(1);
-
-pub type Paths = HashSet;
+impl IndexedResource {
+ /// Create a new indexed resource
+ pub fn new(id: Id, path: PathBuf, last_modified: SystemTime) -> Self {
+ IndexedResource {
+ id,
+ path,
+ last_modified,
+ }
+ }
-impl ResourceIndex {
- pub fn size(&self) -> usize {
- //the actual size is lower in presence of collisions
- self.path2id.len()
+ /// Return the ID of the resource
+ pub fn id(&self) -> &Id {
+ &self.id
}
- pub fn build>(root_path: P) -> Self {
- log::info!("Building the index from scratch");
- let root_path: PathBuf = root_path.as_ref().to_owned();
+ /// Return the path of the resource
+ pub fn path(&self) -> &Path {
+ &self.path
+ }
- let entries = discover_paths(&root_path);
- let entries = scan_entries(entries);
+ /// Return the last modified time of the resource
+ pub fn last_modified(&self) -> SystemTime {
+ self.last_modified
+ }
+}
- let mut index = ResourceIndex {
- id2path: HashMap::new(),
- path2id: HashMap::new(),
- collisions: HashMap::new(),
- root: root_path,
- };
+/// Represents the index of resources in a directory.
+///
+/// [`ResourceIndex`] provides functionality for managing a directory index,
+/// including tracking changes, and querying resources.
+///
+/// #### Reactive API
+/// - [`ResourceIndex::update_all`]: Method to update the index by rescanning
+/// files and returning changes (additions/deletions/updates).
+///
+/// #### Snapshot API
+/// - [`ResourceIndex::get_resources_by_id`]: Query resources from the index by
+/// ID.
+/// - [`ResourceIndex::get_resource_by_path`]: Query a resource from the index
+/// by its path.
+///
+/// #### Track API
+/// Allows for fine-grained control over tracking changes in the index
+/// - [`ResourceIndex::track_addition`]: Track a newly added file (checks if the
+/// file exists in the file system).
+/// - [`ResourceIndex::track_removal`]: Track the deletion of a file (checks if
+/// the file was actually deleted).
+/// - [`ResourceIndex::track_modification`]: Track an update on a single file.
+///
+/// ## Examples
+/// ```no_run
+/// use std::path::Path;
+/// use fs_index::{ResourceIndex, load_or_build_index};
+/// use dev_hash::Crc32;
+///
+/// // Define the root path
+/// let root_path = Path::new("animals");
+///
+/// // Build the index
+/// let index: ResourceIndex = ResourceIndex::build(root_path).expect("Failed to build index");
+/// // Store the index
+/// index.store().expect("Failed to store index");
+///
+/// // Load the stored index
+/// let mut loaded_index: ResourceIndex = load_or_build_index(root_path, false).expect("Failed to load index");
+///
+/// // Update the index
+/// loaded_index.update_all().expect("Failed to update index");
+///
+/// // Get a resource by path
+/// let _resource = loaded_index
+/// .get_resource_by_path("cat.txt")
+/// .expect("Resource not found");
+/// ```
+#[derive(Clone, Debug)]
+pub struct ResourceIndex
+where
+ Id: Eq + Hash,
+{
+ /// The root path of the index (canonicalized)
+ pub(crate) root: PathBuf,
+ /// A map from resource IDs to paths
+ ///
+ /// Multiple resources can have the same ID (e.g., due to hash collisions
+ /// or files with the same content)
+ pub(crate) id_to_paths: HashMap>,
+ /// A map from resource paths to resources
+ pub(crate) path_to_resource: HashMap>,
+}
- for (path, entry) in entries {
- index.insert_entry(path, entry);
- }
+/// Represents the result of an update operation on the ResourceIndex
+#[derive(PartialEq, Debug)]
+pub struct IndexUpdate {
+ /// Resources that were added during the update
+ added: HashMap>,
+ /// Resources that were removed during the update
+ removed: HashSet,
+}
- log::info!("Index built");
- index
+impl IndexUpdate {
+ /// Return the resources that were added during the update
+ pub fn added(&self) -> &HashMap> {
+ &self.added
}
- pub fn load>(root_path: P) -> Result {
- let root_path: PathBuf = root_path.as_ref().to_owned();
-
- let index_path: PathBuf = root_path.join(ARK_FOLDER).join(INDEX_PATH);
- log::info!("Loading the index from file {}", index_path.display());
- let file = File::open(&index_path)?;
- let mut index = ResourceIndex {
- id2path: HashMap::new(),
- path2id: HashMap::new(),
- collisions: HashMap::new(),
- root: root_path.clone(),
- };
-
- // We should not return early in case of missing files
- let lines = BufReader::new(file).lines();
- for line in lines {
- let line = line?;
+ /// Return the resources that were removed during the update
+ pub fn removed(&self) -> &HashSet {
+ &self.removed
+ }
+}
- let mut parts = line.split(' ');
+impl ResourceIndex {
+ /// Return the number of resources in the index
+ pub fn len(&self) -> usize {
+ self.path_to_resource.len()
+ }
- let modified = {
- let str = parts.next().ok_or(ArklibError::Parse)?;
- UNIX_EPOCH.add(Duration::from_millis(
- str.parse().map_err(|_| ArklibError::Parse)?,
- ))
- };
+ /// Return true if the index is empty
+ pub fn is_empty(&self) -> bool {
+ self.path_to_resource.is_empty()
+ }
- let id = {
- let str = parts.next().ok_or(ArklibError::Parse)?;
- Id::from_str(str).map_err(|_| ArklibError::Parse)?
- };
+ /// Return the root path of the index
+ pub fn root(&self) -> &Path {
+ &self.root
+ }
- let path: String =
- itertools::Itertools::intersperse(parts, " ").collect();
- let path: PathBuf = root_path.join(Path::new(&path));
- match CanonicalPathBuf::canonicalize(&path) {
- Ok(path) => {
- log::trace!("[load] {} -> {}", id, path.display());
- index.insert_entry(path, IndexEntry { modified, id });
- }
- Err(_) => {
- log::warn!("File {} not found", path.display());
- continue;
- }
- }
- }
+ /// Return the resources in the index
+ pub fn resources(&self) -> Vec> {
+ // Using path_to_resource so to avoid not collecting duplicates
+ self.path_to_resource.values().cloned().collect()
+ }
- Ok(index)
+ /// Return the ID collisions
+ ///
+ /// **Note**: If you are using a cryptographic hash function, collisions
+ /// should be files with the same content. If you are using a
+ /// non-cryptographic hash function, collisions can be files with the
+ /// same content or files whose content hash to the same value.
+ pub fn collisions(&self) -> HashMap> {
+ // Filter out IDs with only one resource
+ self.id_to_paths
+ .iter()
+ .filter(|(_id, paths)| paths.len() > 1)
+ .map(|(id, paths)| (id.clone(), paths.clone()))
+ .collect()
}
+ /// Return the number of ID collisions
+ ///
+ /// **Note**: If you are using a cryptographic hash function, collisions
+ /// should be files with the same content. If you are using a
+ /// non-cryptographic hash function, collisions can be files with the
+ /// same content or files whose content hash to the same value.
+ pub fn num_collisions(&self) -> usize {
+ // Aggregate the number of collisions for each ID
+ self.id_to_paths
+ .values()
+ .filter(|paths| paths.len() > 1)
+ .map(|paths| paths.len())
+ .sum()
+ }
+
+ /// Save the index to the file system (as a JSON file in
+ /// /ARK_FOLDER/INDEX_PATH)
pub fn store(&self) -> Result<()> {
- log::info!("Storing the index to file");
-
- let start = SystemTime::now();
-
- let index_path = self
- .root
- .to_owned()
- .join(ARK_FOLDER)
- .join(INDEX_PATH);
-
- let ark_dir = index_path.parent().unwrap();
- fs::create_dir_all(ark_dir)?;
-
- let mut file = File::create(index_path)?;
-
- let mut path2id: Vec<(&CanonicalPathBuf, &IndexEntry)> =
- self.path2id.iter().collect();
- path2id.sort_by_key(|(_, entry)| *entry);
-
- for (path, entry) in path2id.iter() {
- log::trace!("[store] {} by path {}", entry.id, path.display());
-
- let timestamp = entry
- .modified
- .duration_since(UNIX_EPOCH)
- .map_err(|_| {
- ArklibError::Other(anyhow!("Error using duration since"))
- })?
- .as_millis();
+ let ark_folder = self.root.join(ARK_FOLDER);
+ let index_path = ark_folder.join(INDEX_PATH);
+ log::debug!("Storing index at: {:?}", index_path);
- let path =
- pathdiff::diff_paths(path.to_str().unwrap(), self.root.clone())
- .ok_or(ArklibError::Path(
- "Couldn't calculate path diff".into(),
- ))?;
+ fs::create_dir_all(&ark_folder)?;
+ let index_file = fs::File::create(index_path)?;
+ serde_json::to_writer_pretty(index_file, self)?;
- writeln!(file, "{} {} {}", timestamp, entry.id, path.display())?;
- }
-
- log::trace!(
- "Storing the index took {:?}",
- start
- .elapsed()
- .map_err(|_| ArklibError::Other(anyhow!("SystemTime error")))
- );
Ok(())
}
- pub fn provide>(root_path: P) -> Result {
- match Self::load(&root_path) {
- Ok(mut index) => {
- log::debug!("Index loaded: {} entries", index.path2id.len());
+ /// Get resources by their ID
+ ///
+ /// Returns None if there is no resource with the given ID
+ ///
+ /// **Note**: This can return multiple resources with the same ID in case of
+ /// hash collisions or files with the same content
+ pub fn get_resources_by_id(
+ &self,
+ id: &Id,
+ ) -> Option>> {
+ let paths = self.id_to_paths.get(id)?;
+ let resources = paths
+ .iter()
+ .filter_map(|path| self.path_to_resource.get(path))
+ .collect();
+ Some(resources)
+ }
- match index.update_all() {
- Ok(update) => {
- log::debug!(
- "Index updated: {} added, {} deleted",
- update.added.len(),
- update.deleted.len()
- );
- }
- Err(e) => {
- log::error!(
- "Failed to update index: {}",
- e.to_string()
- );
- }
- }
+ /// Get a resource by its path
+ ///
+ /// Returns None if the resource does not exist
+ ///
+ /// **Note**: The path should be relative to the root path
+ pub fn get_resource_by_path>(
+ &self,
+ path: P,
+ ) -> Option<&IndexedResource> {
+ self.path_to_resource.get(path.as_ref())
+ }
+
+ /// Build a new index from the given root path
+ pub fn build>(root_path: P) -> Result {
+ log::debug!("Building index at root path: {:?}", root_path.as_ref());
+
+ // Canonicalize the root path
+ let root = fs::canonicalize(&root_path)?;
+ let mut id_to_paths: HashMap> = HashMap::new();
+ let mut path_to_resource = HashMap::new();
+
+ // Discover paths in the root directory
+ let paths = discover_paths(&root)?;
+ let entries: HashMap> =
+ scan_entries(paths);
+
+ // Strip the root path from the entries
+ let entries: HashMap> = entries
+ .into_iter()
+ .map(|(path, resource)| {
+ let relative_path =
+ path.strip_prefix(&root).unwrap().to_path_buf();
+ let resource = IndexedResource::new(
+ resource.id().clone(),
+ relative_path.clone(),
+ resource.last_modified(),
+ );
+ (relative_path, resource)
+ })
+ .collect();
- if let Err(e) = index.store() {
- log::error!("{}", e.to_string());
- }
- Ok(index)
- }
- Err(e) => {
- log::warn!("{}", e.to_string());
- Ok(Self::build(root_path))
- }
+ // Update the path to resource map
+ path_to_resource.extend(entries.clone());
+
+ // Update the ID to paths map
+ for resource in entries.values() {
+ let id = resource.id().clone();
+ id_to_paths
+ .entry(id)
+ .or_default()
+ .insert(resource.path().to_path_buf());
}
+
+ let index = ResourceIndex {
+ root,
+ id_to_paths,
+ path_to_resource,
+ };
+ Ok(index)
}
+ /// Update the index with the latest information from the file system
pub fn update_all(&mut self) -> Result> {
- log::debug!("Updating the index");
- log::trace!("[update] known paths: {:?}", self.path2id.keys());
-
- let curr_entries = discover_paths(self.root.clone());
-
- //assuming that collections manipulation is
- // quicker than asking `path.exists()` for every path
- let curr_paths: Paths = curr_entries.keys().cloned().collect();
- let prev_paths: Paths = self.path2id.keys().cloned().collect();
- let preserved_paths: Paths = curr_paths
- .intersection(&prev_paths)
- .cloned()
- .collect();
+ log::debug!("Updating index at root path: {:?}", self.root);
+ log::trace!("Current index: {:#?}", self);
+
+ let mut added = HashMap::new();
+ let mut removed = HashSet::new();
+
+ let current_paths = discover_paths(&self.root)?;
+
+ // Assuming that collection manipulation is faster than repeated
+ // lookups
+ let current_entries: HashMap> =
+ scan_entries(current_paths.clone());
+ let previous_entries = self.path_to_resource.clone();
+ // `preserved_entries` is the intersection of current_entries and
+ // previous_entries
+ let preserved_entries: HashMap> =
+ current_entries
+ .iter()
+ .filter_map(|(path, _resource)| {
+ previous_entries.get(path).map(|prev_resource| {
+ (path.clone(), prev_resource.clone())
+ })
+ })
+ .collect();
- let created_paths: HashMap = curr_entries
- .iter()
- .filter_map(|(path, entry)| {
- if !preserved_paths.contains(path.as_canonical_path()) {
- Some((path.clone(), entry.clone()))
- } else {
- None
- }
- })
- .collect();
+ // `created_entries` is the difference between current_entries and
+ // preserved_entries
+ let created_entries: HashMap> =
+ current_entries
+ .iter()
+ .filter_map(|(path, resource)| {
+ if preserved_entries.contains_key(path) {
+ None
+ } else {
+ Some((path.clone(), resource.clone()))
+ }
+ })
+ .collect();
- log::debug!("Checking updated paths");
- let updated_paths: HashMap = curr_entries
- .into_iter()
- .filter(|(path, dir_entry)| {
- if !preserved_paths.contains(path.as_canonical_path()) {
- false
- } else {
- let our_entry = &self.path2id[path];
- let prev_modified = our_entry.modified;
+ // `updated_entries` is the intersection of current_entries and
+ // preserved_entries where the last modified time has changed
+ // significantly (> RESOURCE_UPDATED_THRESHOLD)
+ let updated_entries: HashMap> =
+ current_entries
+ .into_iter()
+ .filter(|(path, entry)| {
+ if !preserved_entries.contains_key(path) {
+ false
+ } else {
+ let our_entry = &self.path_to_resource[path];
+ let prev_modified = our_entry.last_modified();
- let result = dir_entry.metadata();
- match result {
- Err(msg) => {
- log::error!(
- "Couldn't retrieve metadata for {}: {}",
- &path.display(),
- msg
- );
- false
- }
- Ok(metadata) => match metadata.modified() {
+ let result = entry.path().metadata();
+ match result {
Err(msg) => {
log::error!(
- "Couldn't retrieve timestamp for {}: {}",
+ "Couldn't retrieve metadata for {}: {}",
&path.display(),
msg
);
false
}
- Ok(curr_modified) => {
- let elapsed = curr_modified
- .duration_since(prev_modified)
- .unwrap();
-
- let was_updated =
- elapsed >= RESOURCE_UPDATED_THRESHOLD;
- if was_updated {
- log::trace!(
- "[update] modified {} by path {}
+ Ok(metadata) => match metadata.modified() {
+ Err(msg) => {
+ log::error!(
+ "Couldn't retrieve timestamp for {}: {}",
+ &path.display(),
+ msg
+ );
+ false
+ }
+ Ok(curr_modified) => {
+ let elapsed = curr_modified
+ .duration_since(prev_modified)
+ .unwrap();
+
+ let was_updated =
+ elapsed >= RESOURCE_UPDATED_THRESHOLD;
+ if was_updated {
+ log::trace!(
+ "[update] modified {} by path {}
\twas {:?}
\tnow {:?}
\telapsed {:?}",
- our_entry.id,
- path.display(),
- prev_modified,
- curr_modified,
- elapsed
- );
+ our_entry.id,
+ path.display(),
+ prev_modified,
+ curr_modified,
+ elapsed
+ );
+ }
+
+ was_updated
}
-
- was_updated
- }
- },
+ },
+ }
}
- }
- })
- .collect();
-
- let mut deleted: HashSet = HashSet::new();
+ })
+ .collect();
- // treating both deleted and updated paths as deletions
- prev_paths
- .difference(&preserved_paths)
- .cloned()
- .chain(updated_paths.keys().cloned())
- .for_each(|path| {
- if let Some(entry) =
- self.path2id.remove(path.as_canonical_path())
- {
- let k = self.collisions.remove(&entry.id).unwrap_or(1);
- if k > 1 {
- self.collisions.insert(entry.id, k - 1);
+ // Remove resources that are not in the current entries
+ let removed_entries: HashMap> =
+ previous_entries
+ .iter()
+ .filter_map(|(path, resource)| {
+ if preserved_entries.contains_key(path) {
+ None
} else {
- log::trace!(
- "[delete] {} by path {}",
- entry.id,
- path.display()
- );
- self.id2path.remove(&entry.id);
- deleted.insert(entry.id);
+ Some((path.clone(), resource.clone()))
}
- } else {
- log::warn!("Path {} was not known", path.display());
- }
- });
-
- let added: HashMap> =
- scan_entries(updated_paths)
- .into_iter()
- .chain({
- log::debug!("Checking added paths");
- scan_entries(created_paths).into_iter()
})
- .filter(|(_, entry)| !self.id2path.contains_key(&entry.id))
.collect();
-
- for (path, entry) in added.iter() {
- if deleted.contains(&entry.id) {
- // emitting the resource as both deleted and added
- // (renaming a duplicate might remain undetected)
- log::trace!(
- "[update] moved {} to path {}",
- entry.id,
- path.display()
- );
- }
-
- self.insert_entry(path.clone(), entry.clone());
- }
-
- let added: HashMap = added
- .into_iter()
- .map(|(path, entry)| (path, entry.id))
- .collect();
-
- Ok(IndexUpdate { deleted, added })
- }
-
- // the caller must ensure that:
- // * the index is up-to-date except this single path
- // * the path hasn't been indexed before
- pub fn index_new(
- &mut self,
- path: &dyn AsRef,
- ) -> Result> {
- log::debug!("Indexing a new path");
-
- if !path.as_ref().exists() {
- return Err(ArklibError::Path(
- "Absent paths cannot be indexed".into(),
- ));
- }
-
- let path_buf = CanonicalPathBuf::canonicalize(path)?;
- let path = path_buf.as_canonical_path();
-
- return match fs::metadata(path) {
- Err(_) => {
- return Err(ArklibError::Path(
- "Couldn't to retrieve file metadata".into(),
- ));
- }
- Ok(metadata) => match scan_entry(path, metadata) {
- Err(_) => {
- return Err(ArklibError::Path(
- "The path points to a directory or empty file".into(),
- ));
- }
- Ok(new_entry) => {
- let id = new_entry.clone().id;
-
- if let Some(nonempty) = self.collisions.get_mut(&id) {
- *nonempty += 1;
- }
-
- let mut added = HashMap::new();
- added.insert(path_buf.clone(), id.clone());
-
- self.id2path.insert(id, path_buf.clone());
- self.path2id.insert(path_buf, new_entry);
-
- Ok(IndexUpdate {
- added,
- deleted: HashSet::new(),
- })
- }
- },
- };
- }
-
- // the caller must ensure that:
- // * the index is up-to-date except this single path
- // * the path has been indexed before
- // * the path maps into `old_id`
- // * the content by the path has been modified
- pub fn update_one(
- &mut self,
- path: &dyn AsRef,
- old_id: Id,
- ) -> Result> {
- log::debug!("Updating a single entry in the index");
-
- if !path.as_ref().exists() {
- return self.forget_id(old_id);
- }
-
- let path_buf = CanonicalPathBuf::canonicalize(path)?;
- let path = path_buf.as_canonical_path();
-
- log::trace!(
- "[update] paths {:?} has id {:?}",
- path,
- self.path2id[path]
- );
-
- return match fs::metadata(path) {
- Err(_) => {
- // updating the index after resource removal
- // is a correct scenario
- self.forget_path(path, old_id)
- }
- Ok(metadata) => {
- match scan_entry(path, metadata) {
- Err(_) => {
- // a directory or empty file exists by the path
- self.forget_path(path, old_id)
- }
- Ok(new_entry) => {
- // valid resource exists by the path
-
- let curr_entry = &self.path2id.get(path);
- if curr_entry.is_none() {
- // if the path is not indexed, then we can't have
- // `old_id` if you want
- // to index new path, use `index_new` method
- return Err(ArklibError::Path(
- "Couldn't find the path in the index".into(),
- ));
- }
- let curr_entry = curr_entry.unwrap();
-
- if curr_entry.id == new_entry.id {
- // in rare cases we are here due to hash collision
- if curr_entry.modified == new_entry.modified {
- log::warn!("path {:?} was not modified", &path);
- } else {
- log::warn!("path {:?} was modified but not its content", &path);
- }
-
- // the caller must have ensured that the path was
- // indeed update
- return Err(ArklibError::Collision(
- "New content has the same id".into(),
- ));
- }
-
- // new resource exists by the path
- self.forget_path(path, old_id).map(|mut update| {
- update
- .added
- .insert(path_buf.clone(), new_entry.clone().id);
- self.insert_entry(path_buf, new_entry);
-
- update
- })
- }
- }
- }
- };
- }
-
- pub fn forget_id(&mut self, old_id: Id) -> Result> {
- let old_path = self
- .path2id
- .drain()
- .filter_map(|(k, v)| {
- if v.id == old_id {
- Some(k)
- } else {
- None
- }
- })
- .collect_vec();
- for p in old_path {
- self.path2id.remove(&p);
- }
- self.id2path.remove(&old_id);
- let mut deleted = HashSet::new();
- deleted.insert(old_id);
-
- Ok(IndexUpdate {
- added: HashMap::new(),
- deleted,
- })
- }
-
- fn insert_entry(&mut self, path: CanonicalPathBuf, entry: IndexEntry) {
- log::trace!("[add] {} by path {}", entry.id, path.display());
- let id = entry.clone().id;
-
- if let std::collections::hash_map::Entry::Vacant(e) =
- self.id2path.entry(id.clone())
- {
- e.insert(path.clone());
- } else if let Some(nonempty) = self.collisions.get_mut(&id) {
- *nonempty += 1;
- } else {
- self.collisions.insert(id, 2);
- }
-
- self.path2id.insert(path, entry);
- }
-
- fn forget_path(
- &mut self,
- path: &CanonicalPath,
- old_id: Id,
- ) -> Result> {
- self.path2id.remove(path);
-
- if let Some(collisions) = self.collisions.get_mut(&old_id) {
- debug_assert!(
- *collisions > 1,
- "Any collision must involve at least 2 resources"
+ for (path, resource) in removed_entries {
+ log::trace!(
+ "Resource removed: {:?}, last modified: {:?}",
+ path,
+ resource.last_modified()
);
- *collisions -= 1;
- if *collisions == 1 {
- self.collisions.remove(&old_id);
+ self.path_to_resource.remove(&path);
+ self.id_to_paths
+ .get_mut(resource.id())
+ .unwrap()
+ .remove(&path);
+ let id = resource.id().clone();
+ // Only remove the ID if it has no paths
+ if self.id_to_paths[&id].is_empty() {
+ self.id_to_paths.remove(&id);
+ removed.insert(id);
}
-
- // minor performance issue:
- // we must find path of one of the collided
- // resources and use it as new value
- let maybe_collided_path =
- self.path2id.iter().find_map(|(path, entry)| {
- if entry.id == old_id {
- Some(path)
- } else {
- None
- }
- });
-
- if let Some(collided_path) = maybe_collided_path {
- let old_path = self
- .id2path
- .insert(old_id.clone(), collided_path.clone());
-
- debug_assert_eq!(
- old_path.unwrap().as_canonical_path(),
- path,
- "Must forget the requested path"
- );
- } else {
- return Err(ArklibError::Collision(
- "Illegal state of collision tracker".into(),
- ));
- }
- } else {
- self.id2path.remove(&old_id.clone());
}
- let mut deleted = HashSet::new();
- deleted.insert(old_id);
-
- Ok(IndexUpdate {
- added: HashMap::new(),
- deleted,
- })
- }
-}
-
-fn discover_paths>(
- root_path: P,
-) -> HashMap {
- log::debug!(
- "Discovering all files under path {}",
- root_path.as_ref().display()
- );
-
- WalkDir::new(root_path)
- .into_iter()
- .filter_entry(|entry| !is_hidden(entry))
- .filter_map(|result| match result {
- Ok(entry) => {
- let path = entry.path();
- if !entry.file_type().is_dir() {
- match CanonicalPathBuf::canonicalize(path) {
- Ok(canonical_path) => Some((canonical_path, entry)),
- Err(msg) => {
- log::warn!(
- "Couldn't canonicalize {}:\n{}",
- path.display(),
- msg
- );
- None
- }
+ let added_entries: HashMap> =
+ updated_entries
+ .iter()
+ .chain(created_entries.iter())
+ .filter_map(|(path, resource)| {
+ if self.path_to_resource.contains_key(path) {
+ None
+ } else {
+ Some((path.clone(), resource.clone()))
}
- } else {
- None
- }
- }
- Err(msg) => {
- log::error!("Error during walking: {}", msg);
- None
- }
- })
- .collect()
-}
-
-fn scan_entry(
- path: &CanonicalPath,
- metadata: Metadata,
-) -> Result>
-where
- Id: ResourceId,
-{
- if metadata.is_dir() {
- return Err(ArklibError::Path("Path is expected to be a file".into()));
- }
-
- let size = metadata.len();
- if size == 0 {
- Err(std::io::Error::new(
- std::io::ErrorKind::InvalidData,
- "Empty resource",
- ))?;
- }
-
- let id = Id::from_path(path)?;
- let modified = metadata.modified()?;
-
- Ok(IndexEntry { modified, id })
-}
-
-fn scan_entries(
- entries: HashMap,
-) -> HashMap>
-where
- Id: ResourceId,
-{
- entries
- .into_iter()
- .filter_map(|(path_buf, entry)| {
- let metadata = entry.metadata().ok()?;
-
- let path = path_buf.as_canonical_path();
- let result = scan_entry(path, metadata);
- match result {
- Err(msg) => {
- log::error!(
- "Couldn't retrieve metadata for {}:\n{}",
- path.display(),
- msg
- );
- None
- }
- Ok(entry) => Some((path_buf, entry)),
- }
- })
- .collect()
-}
-
-fn is_hidden(entry: &DirEntry) -> bool {
- entry
- .file_name()
- .to_str()
- .map(|s| s.starts_with('.'))
- .unwrap_or(false)
-}
-
-#[cfg(test)]
-mod tests {
- use crate::index::{discover_paths, IndexEntry};
- use crate::ResourceIndex;
- use canonical_path::CanonicalPathBuf;
- use dev_hash::Crc32;
- use fs_atomic_versions::initialize;
- use std::fs::File;
- #[cfg(target_family = "unix")]
- use std::fs::Permissions;
- #[cfg(target_family = "unix")]
- use std::os::unix::fs::PermissionsExt;
-
- use std::path::PathBuf;
- use std::time::SystemTime;
- use uuid::Uuid;
-
- const FILE_SIZE_1: u64 = 10;
- const FILE_SIZE_2: u64 = 11;
-
- const FILE_NAME_1: &str = "test1.txt";
- const FILE_NAME_2: &str = "test2.txt";
- const FILE_NAME_3: &str = "test3.txt";
-
- const CRC32_1: Crc32 = Crc32(3817498742);
- const CRC32_2: Crc32 = Crc32(1804055020);
-
- fn get_temp_dir() -> PathBuf {
- create_dir_at(std::env::temp_dir())
- }
-
- fn create_dir_at(path: PathBuf) -> PathBuf {
- let mut dir_path = path.clone();
- dir_path.push(Uuid::new_v4().to_string());
- std::fs::create_dir(&dir_path).expect("Could not create temp dir");
- dir_path
- }
-
- fn create_file_at(
- path: PathBuf,
- size: Option,
- name: Option<&str>,
- ) -> (File, PathBuf) {
- let mut file_path = path.clone();
- if let Some(file_name) = name {
- file_path.push(file_name);
- } else {
- file_path.push(Uuid::new_v4().to_string());
- }
- let file = File::create(file_path.clone())
- .expect("Could not create temp file");
- file.set_len(size.unwrap_or(0))
- .expect("Could not set file size");
- (file, file_path)
- }
-
- fn run_test_and_clean_up(
- test: impl FnOnce(PathBuf) + std::panic::UnwindSafe,
- ) {
- initialize();
-
- let path = get_temp_dir();
- let result = std::panic::catch_unwind(|| test(path.clone()));
- std::fs::remove_dir_all(path.clone())
- .expect("Could not clean up after test");
- if result.is_err() {
- panic!("{}", result.err().map(|_| "Test panicked").unwrap())
- }
- assert!(result.is_ok());
- }
-
- // resource index build
-
- #[test]
- fn index_build_should_process_1_file_successfully() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
-
- let actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 1);
- assert_eq!(actual.id2path.len(), 1);
- assert!(actual.id2path.contains_key(&CRC32_1));
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 1);
- })
- }
-
- #[test]
- fn index_build_should_process_colliding_files_correctly() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
-
- let actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 2);
- assert_eq!(actual.id2path.len(), 1);
- assert!(actual.id2path.contains_key(&CRC32_1));
- assert_eq!(actual.collisions.len(), 1);
- assert_eq!(actual.size(), 2);
- })
- }
-
- // resource index update
-
- #[test]
- fn update_all_should_handle_renamed_file_correctly() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1));
- create_file_at(path.clone(), Some(FILE_SIZE_2), Some(FILE_NAME_2));
-
- let mut actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 2);
-
- // rename test2.txt to test3.txt
- let mut name_from = path.clone();
- name_from.push(FILE_NAME_2);
- let mut name_to = path.clone();
- name_to.push(FILE_NAME_3);
- std::fs::rename(name_from, name_to)
- .expect("Should rename file successfully");
-
- let update = actual
- .update_all()
- .expect("Should update index correctly");
-
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 2);
- assert_eq!(update.deleted.len(), 1);
- assert_eq!(update.added.len(), 1);
- })
- }
-
- #[test]
- fn update_all_should_index_new_file_successfully() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
-
- let mut actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- let (_, expected_path) =
- create_file_at(path.clone(), Some(FILE_SIZE_2), None);
-
- let update = actual
- .update_all()
- .expect("Should update index correctly");
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 2);
- assert_eq!(actual.id2path.len(), 2);
- assert!(actual.id2path.contains_key(&CRC32_1));
- assert!(actual.id2path.contains_key(&CRC32_2));
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 2);
- assert_eq!(update.deleted.len(), 0);
- assert_eq!(update.added.len(), 1);
-
- let added_key =
- CanonicalPathBuf::canonicalize(expected_path.clone())
- .expect("CanonicalPathBuf should be fine");
- assert_eq!(
- update
- .added
- .get(&added_key)
- .expect("Key exists")
- .clone(),
- CRC32_2
- )
- })
- }
-
- #[test]
- fn index_new_should_index_new_file_successfully() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
- let mut index: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- let (_, new_path) =
- create_file_at(path.clone(), Some(FILE_SIZE_2), None);
-
- let update = index
- .index_new(&new_path)
- .expect("Should update index correctly");
-
- assert_eq!(index.root, path.clone());
- assert_eq!(index.path2id.len(), 2);
- assert_eq!(index.id2path.len(), 2);
- assert!(index.id2path.contains_key(&CRC32_1));
- assert!(index.id2path.contains_key(&CRC32_2));
- assert_eq!(index.collisions.len(), 0);
- assert_eq!(index.size(), 2);
- assert_eq!(update.deleted.len(), 0);
- assert_eq!(update.added.len(), 1);
-
- let added_key = CanonicalPathBuf::canonicalize(new_path.clone())
- .expect("CanonicalPathBuf should be fine");
- assert_eq!(
- update
- .added
- .get(&added_key)
- .expect("Key exists")
- .clone(),
- CRC32_2
- )
- })
- }
-
- #[test]
- fn update_one_should_error_on_new_file() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), None);
- let mut index = ResourceIndex::build(path.clone());
-
- let (_, new_path) =
- create_file_at(path.clone(), Some(FILE_SIZE_2), None);
-
- let update = index.update_one(&new_path, CRC32_2);
-
- assert!(update.is_err())
- })
- }
-
- #[test]
- fn update_one_should_index_delete_file_successfully() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1));
-
- let mut actual = ResourceIndex::build(path.clone());
-
- let mut file_path = path.clone();
- file_path.push(FILE_NAME_1);
- std::fs::remove_file(file_path.clone())
- .expect("Should remove file successfully");
-
- let update = actual
- .update_one(&file_path.clone(), CRC32_1)
- .expect("Should update index successfully");
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 0);
- assert_eq!(actual.id2path.len(), 0);
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 0);
- assert_eq!(update.deleted.len(), 1);
- assert_eq!(update.added.len(), 0);
-
- assert!(update.deleted.contains(&CRC32_1))
- })
- }
+ })
+ .collect();
- #[test]
- fn update_all_should_error_on_files_without_permissions() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), Some(FILE_NAME_1));
- let (file, _) = create_file_at(
- path.clone(),
- Some(FILE_SIZE_2),
- Some(FILE_NAME_2),
+ for (path, resource) in added_entries {
+ log::trace!("Resource added: {:?}", path);
+
+ // strip the root path from the path
+ let relative_path = path
+ .strip_prefix(&self.root)
+ .unwrap()
+ .to_path_buf();
+ let resource = IndexedResource::new(
+ resource.id().clone(),
+ relative_path.clone(),
+ resource.last_modified(),
);
- let mut actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 2);
- #[cfg(target_family = "unix")]
- file.set_permissions(Permissions::from_mode(0o222))
- .expect("Should be fine");
-
- let update = actual
- .update_all()
- .expect("Should update index correctly");
-
- assert_eq!(actual.collisions.len(), 0);
- assert_eq!(actual.size(), 2);
- assert_eq!(update.deleted.len(), 0);
- assert_eq!(update.added.len(), 0);
- })
- }
-
- // error cases
-
- #[test]
- fn update_one_should_not_update_absent_path() {
- run_test_and_clean_up(|path| {
- let mut missing_path = path.clone();
- missing_path.push("missing/directory");
- let mut actual = ResourceIndex::build(path.clone());
- let old_id = Crc32(2);
- let result = actual
- .update_one(&missing_path, old_id.clone())
- .map(|i| i.deleted.clone().take(&old_id))
- .ok()
- .flatten();
-
- assert_eq!(result, Some(Crc32(2)));
- })
- }
-
- #[test]
- fn update_one_should_index_new_path() {
- run_test_and_clean_up(|path| {
- let mut missing_path = path.clone();
- missing_path.push("missing/directory");
- let mut actual = ResourceIndex::build(path.clone());
- let old_id = Crc32(2);
- let result = actual
- .update_one(&missing_path, old_id.clone())
- .map(|i| i.deleted.clone().take(&old_id))
- .ok()
- .flatten();
-
- assert_eq!(result, Some(Crc32(2)));
- })
- }
-
- #[test]
- fn should_not_index_empty_file() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(0), None);
- let actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 0);
- assert_eq!(actual.id2path.len(), 0);
- assert_eq!(actual.collisions.len(), 0);
- })
- }
-
- #[test]
- fn should_not_index_hidden_file() {
- run_test_and_clean_up(|path| {
- create_file_at(path.clone(), Some(FILE_SIZE_1), Some(".hidden"));
- let actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 0);
- assert_eq!(actual.id2path.len(), 0);
- assert_eq!(actual.collisions.len(), 0);
- })
- }
-
- #[test]
- fn should_not_index_1_empty_directory() {
- run_test_and_clean_up(|path| {
- create_dir_at(path.clone());
-
- let actual: ResourceIndex =
- ResourceIndex::build(path.clone());
-
- assert_eq!(actual.root, path.clone());
- assert_eq!(actual.path2id.len(), 0);
- assert_eq!(actual.id2path.len(), 0);
- assert_eq!(actual.collisions.len(), 0);
- })
- }
-
- #[test]
- fn discover_paths_should_not_walk_on_invalid_path() {
- run_test_and_clean_up(|path| {
- let mut missing_path = path.clone();
- missing_path.push("missing/directory");
- let actual = discover_paths(missing_path);
- assert_eq!(actual.len(), 0);
- })
- }
-
- #[test]
- fn index_entry_order() {
- let old1 = IndexEntry {
- id: Crc32(2),
- modified: SystemTime::UNIX_EPOCH,
- };
- let old2 = IndexEntry {
- id: Crc32(1),
- modified: SystemTime::UNIX_EPOCH,
- };
-
- let new1 = IndexEntry {
- id: Crc32(1),
- modified: SystemTime::now(),
- };
- let new2 = IndexEntry {
- id: Crc32(2),
- modified: SystemTime::now(),
- };
-
- assert_eq!(new1, new1);
- assert_eq!(new2, new2);
- assert_eq!(old1, old1);
- assert_eq!(old2, old2);
-
- assert_ne!(new1, new2);
- assert_ne!(new1, old1);
-
- assert!(new1 > old1);
- assert!(new1 > old2);
- assert!(new2 > old1);
- assert!(new2 > old2);
- assert!(new2 > new1);
- }
-
- /// Test the performance of `ResourceIndex::build` on a specific directory.
- ///
- /// This test evaluates the performance of building a resource
- /// index using the `ResourceIndex::build` method on a given directory.
- /// It measures the time taken to build the resource index and prints the
- /// number of collisions detected.
- #[test]
- fn test_build_resource_index() {
- use std::time::Instant;
-
- let path = "../test-assets/"; // The path to the directory to index
- assert!(
- std::path::Path::new(path).is_dir(),
- "The provided path is not a directory or does not exist"
- );
-
- let start_time = Instant::now();
- let index: ResourceIndex =
- ResourceIndex::build(path.to_string());
- let elapsed_time = start_time.elapsed();
+ self.path_to_resource
+ .insert(relative_path.clone(), resource.clone());
+ let id = resource.id().clone();
+ self.id_to_paths
+ .entry(id.clone())
+ .or_default()
+ .insert(relative_path.clone());
+ added.insert(id, resource);
+ }
- println!("Number of paths: {}", index.id2path.len());
- println!("Number of resources: {}", index.id2path.len());
- println!("Number of collisions: {}", index.collisions.len());
- println!("Time taken: {:?}", elapsed_time);
+ Ok(IndexUpdate { added, removed })
}
}
diff --git a/fs-index/src/lib.rs b/fs-index/src/lib.rs
index f475f478..59030cef 100644
--- a/fs-index/src/lib.rs
+++ b/fs-index/src/lib.rs
@@ -1,3 +1,10 @@
-pub mod index;
+mod index;
+mod serde;
+mod utils;
+
+pub use utils::load_or_build_index;
pub use index::ResourceIndex;
+
+#[cfg(test)]
+mod tests;
diff --git a/fs-index/src/serde.rs b/fs-index/src/serde.rs
new file mode 100644
index 00000000..a220d6a4
--- /dev/null
+++ b/fs-index/src/serde.rs
@@ -0,0 +1,127 @@
+use std::{
+ collections::{HashMap, HashSet},
+ path::PathBuf,
+ time::SystemTime,
+};
+
+use anyhow::Result;
+use serde::{
+ ser::{SerializeStruct, Serializer},
+ Deserialize, Serialize,
+};
+
+use data_resource::ResourceId;
+
+use crate::{index::IndexedResource, ResourceIndex};
+
+/// Data structure for serializing and deserializing the index
+#[derive(Serialize, Deserialize)]
+struct ResourceIndexData {
+ root: PathBuf,
+ resources: HashMap>,
+}
+
+#[derive(Serialize, Deserialize)]
+struct IndexedResourceData {
+ id: Id,
+ last_modified: u64,
+}
+
+/// Custom implementation of [`Serialize`] for [`ResourceIndex`]
+///
+/// To avoid writing a large repetitive index file with double maps,
+/// we are only serializing the root path, and path_to_resource.
+///
+/// Other fields can be reconstructed from the path_to_resource map.
+impl Serialize for ResourceIndex
+where
+ Id: ResourceId,
+{
+ fn serialize(&self, serializer: S) -> Result
+ where
+ S: Serializer,
+ {
+ let mut state = serializer.serialize_struct("ResourceIndex", 2)?;
+ state.serialize_field("root", &self.root)?;
+
+ let mut resources = HashMap::new();
+ for (path, resource) in &self.path_to_resource {
+ let id = resource.id().clone();
+ let last_modified = resource
+ .last_modified()
+ .duration_since(SystemTime::UNIX_EPOCH)
+ .map_err(|e| {
+ serde::ser::Error::custom(format!(
+ "Failed to serialize last_modified: {}",
+ e
+ ))
+ })?
+ .as_nanos() as u64;
+
+ let resource_data = IndexedResourceData { id, last_modified };
+ resources.insert(path.clone(), resource_data);
+ }
+
+ state.serialize_field("resources", &resources)?;
+ state.end()
+ }
+}
+
+/// Custom implementation of [`Deserialize`] for [`ResourceIndex`]
+///
+/// Deserializes the index from the root path and path_to_resource map.
+/// Other fields are reconstructed from the path_to_resource map.
+impl<'de, Id> Deserialize<'de> for ResourceIndex
+where
+ Id: ResourceId,
+{
+ fn deserialize(deserializer: D) -> Result, D::Error>
+ where
+ D: serde::Deserializer<'de>,
+ {
+ let index_data: ResourceIndexData =
+ ResourceIndexData::deserialize(deserializer)?;
+
+ let mut path_to_resource = HashMap::new();
+ let mut id_to_paths = HashMap::new();
+ for (path, resource_data) in index_data.resources {
+ let last_modified = SystemTime::UNIX_EPOCH
+ + std::time::Duration::from_nanos(resource_data.last_modified);
+ let resource = IndexedResource::new(
+ resource_data.id,
+ path.clone(),
+ last_modified,
+ );
+ path_to_resource.insert(path.clone(), resource.clone());
+ id_to_paths
+ .entry(resource.id().clone())
+ .or_insert_with(HashSet::new)
+ .insert(path);
+ }
+
+ Ok(ResourceIndex {
+ root: index_data.root,
+ id_to_paths,
+ path_to_resource,
+ })
+ }
+}
+
+/// Custom implementation of [`PartialEq`] for [`ResourceIndex`]
+///
+/// The order of items in hashmaps is not relevant.
+/// we just need to compare [`ResourceIndex::resources`] to check if the two
+/// indexes are equal.
+impl PartialEq for ResourceIndex
+where
+ Id: ResourceId,
+{
+ fn eq(&self, other: &Self) -> bool {
+ let mut resources1 = self.resources();
+ let mut resources2 = other.resources();
+ resources1.sort_by(|a, b| a.path().cmp(b.path()));
+ resources2.sort_by(|a, b| a.path().cmp(b.path()));
+
+ resources1 == resources2 && self.root == other.root
+ }
+}
diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs
new file mode 100644
index 00000000..465aa51e
--- /dev/null
+++ b/fs-index/src/tests.rs
@@ -0,0 +1,502 @@
+//! This module provides tests for the `ResourceIndex` functionality using
+//! different hash algorithms.
+//!
+//! The tests are parameterized by various hash types, such as `Blake3` and
+//! `Crc32`, to ensure the implementation works consistently across different
+//! hashing algorithms.
+//!
+//! # Structure
+//!
+//! - **Macros**:
+//! - `hash_tests!`: Generates test functions for pairs of test functions and
+//! hash types.
+//!
+//! - **Test Functions**:
+//! - Defined to test various aspects of `ResourceIndex`, parameterized by
+//! hash type.
+//!
+//! - **Helper Functions**:
+//! - `get_indexed_resource_from_file`: Helper to create `IndexedResource`
+//! from a file path.
+//!
+//! # Usage
+//!
+//! To add a new test for a specific hash type, add a new entry in the
+//! `hash_tests!` macro invocation with the appropriate function and hash type.
+
+use dev_hash::{Blake3, Crc32};
+use std::{fs, path::Path};
+
+use anyhow::{anyhow, Result};
+use tempfile::TempDir;
+
+use data_resource::ResourceId;
+
+use crate::{
+ index::IndexedResource, utils::load_or_build_index, ResourceIndex,
+};
+
+/// A macro to generate tests for function and hash type pairs.
+#[macro_export]
+macro_rules! hash_tests {
+ ($($name:ident: ($func:ident, $hash_type:ty),)*) => {
+ $(
+ #[test]
+ fn $name() {
+ $func::<$hash_type>();
+ }
+ )*
+ };
+}
+
+// Use the macro to generate tests for the specified function and hash type
+// pairs
+hash_tests! {
+ // CRC32
+ test_store_and_load_index_crc32: (test_store_and_load_index, Crc32),
+ test_store_and_load_index_with_collisions_crc32: (test_store_and_load_index_with_collisions, Crc32),
+ test_build_index_with_file_crc32: (test_build_index_with_file, Crc32),
+ test_build_index_with_empty_file_crc32: (test_build_index_with_empty_file, Crc32),
+ test_build_index_with_directory_crc32: (test_build_index_with_directory, Crc32),
+ test_build_index_with_multiple_files_crc32: (test_build_index_with_multiple_files, Crc32),
+ test_build_index_with_multiple_directories_crc32: (test_build_index_with_multiple_directories, Crc32),
+ test_resource_index_update_crc32: (test_resource_index_update, Crc32),
+ test_add_colliding_files_crc32: (test_add_colliding_files, Crc32),
+ test_num_collisions_crc32: (test_num_collisions, Crc32),
+ test_hidden_files_crc32: (test_hidden_files, Crc32),
+
+ // Blake3
+ test_store_and_load_index_blake3: (test_store_and_load_index, Blake3),
+ test_store_and_load_index_with_collisions_blake3: (test_store_and_load_index_with_collisions, Blake3),
+ test_build_index_with_file_blake3: (test_build_index_with_file, Blake3),
+ test_build_index_with_empty_file_blake3: (test_build_index_with_empty_file, Blake3),
+ test_build_index_with_directory_blake3: (test_build_index_with_directory, Blake3),
+ test_build_index_with_multiple_files_blake3: (test_build_index_with_multiple_files, Blake3),
+ test_build_index_with_multiple_directories_blake3: (test_build_index_with_multiple_directories, Blake3),
+ test_resource_index_update_blake3: (test_resource_index_update, Blake3),
+ test_add_colliding_files_blake3: (test_add_colliding_files, Blake3),
+ test_num_collisions_blake3: (test_num_collisions, Blake3),
+ test_hidden_files_blake3: (test_hidden_files, Blake3),
+}
+
+/// A helper function to get [`IndexedResource`] from a file path
+fn get_indexed_resource_from_file>(
+ path: P,
+ parent_dir: P,
+) -> Result> {
+ let id = H::from_path(&path)?;
+
+ let relative_path = path
+ .as_ref()
+ .strip_prefix(parent_dir)
+ .map_err(|_| anyhow!("Failed to get relative path"))?;
+
+ Ok(IndexedResource::new(
+ id,
+ relative_path.to_path_buf(),
+ fs::metadata(path)?.modified()?,
+ ))
+}
+
+/// Test storing and loading the resource index.
+///
+/// ## Test scenario:
+/// - Build a resource index in the temporary directory.
+/// - Store the index.
+/// - Load the stored index.
+/// - Assert that the loaded index matches the original index.
+fn test_store_and_load_index() {
+ let temp_dir = TempDir::with_prefix("ark_test_store_and_load_index")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+ index.store().expect("Failed to store index");
+
+ let loaded_index =
+ load_or_build_index(root_path, false).expect("Failed to load index");
+
+ assert_eq!(index, loaded_index, "{:?} != {:?}", index, loaded_index);
+}
+
+/// Test storing and loading the resource index with collisions.
+///
+/// ## Test scenario:
+/// - Build a resource index in the temporary directory.
+/// - Write duplicate files with the same content.
+/// - Store the index.
+/// - Load the stored index.
+/// - Assert that the loaded index matches the original index.
+fn test_store_and_load_index_with_collisions() {
+ let temp_dir =
+ TempDir::with_prefix("ark_test_store_and_load_index_with_collisions")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let file_path2 = root_path.join("file2.txt");
+ fs::write(&file_path2, "file content").expect("Failed to write to file");
+
+ let file_path3 = root_path.join("file3.txt");
+ fs::write(&file_path3, "file content").expect("Failed to write to file");
+
+ let file_path4 = root_path.join("file4.txt");
+ fs::write(&file_path4, "file content").expect("Failed to write to file");
+
+ // Now we have 4 files with the same content (same checksum)
+
+ let index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ let checksum = H::from_path(&file_path).expect("Failed to get checksum");
+ assert_eq!(index.len(), 4, "{:?}", index);
+ assert_eq!(index.collisions().len(), 1, "{:?}", index);
+ assert_eq!(index.collisions()[&checksum].len(), 4, "{:?}", index);
+ index.store().expect("Failed to store index");
+
+ let loaded_index =
+ load_or_build_index(root_path, false).expect("Failed to load index");
+
+ assert_eq!(index, loaded_index, "{:?} != {:?}", index, loaded_index);
+}
+
+/// Test building an index with a file.
+///
+/// ## Test scenario:
+/// - Create a file within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index contains one entry.
+/// - Assert that the resource retrieved by path matches the expected resource.
+/// - Assert that the resource retrieved by ID matches the expected resource.
+fn test_build_index_with_file() {
+ let temp_dir = TempDir::with_prefix("ark_test_build_index_with_file")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+ let expected_resource: IndexedResource =
+ get_indexed_resource_from_file(&file_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+
+ let index = ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+
+ let resource = index
+ .get_resource_by_path("file.txt")
+ .expect("Failed to get resource");
+ assert_eq!(
+ resource, &expected_resource,
+ "{:?} != {:?}",
+ resource, expected_resource
+ );
+}
+
+/// Test building an index with an empty file.
+///
+/// ## Test scenario:
+/// - Create an empty file within the temporary directory.
+/// - Create a file with content within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index contains one entries.
+fn test_build_index_with_empty_file() {
+ let temp_dir = TempDir::with_prefix("ark_test_build_index_with_empty_file")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let empty_file_path = root_path.join("empty_file.txt");
+ fs::write(&empty_file_path, "").expect("Failed to write to file");
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+}
+
+/// Test building an index with a directory.
+///
+/// ## Test scenario:
+/// - Create a subdirectory within the temporary directory.
+/// - Create a file within the subdirectory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index contains one entry.
+/// - Assert that the resource retrieved by path matches the expected resource.
+/// - Assert that the resource retrieved by ID matches the expected resource.
+fn test_build_index_with_directory() {
+ let temp_dir = TempDir::with_prefix("ark_test_build_index_with_directory")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let dir_path = root_path.join("dir");
+ fs::create_dir(&dir_path).expect("Failed to create dir");
+ let file_path = dir_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+ let expected_resource: IndexedResource =
+ get_indexed_resource_from_file(&file_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+
+ let index = ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+
+ let resource = index
+ .get_resource_by_path("dir/file.txt")
+ .expect("Failed to get resource");
+ assert_eq!(
+ resource, &expected_resource,
+ "{:?} != {:?}",
+ resource, expected_resource
+ );
+}
+
+/// Test building an index with multiple files.
+///
+/// ## Test scenario:
+/// - Create multiple files within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index contains two entries.
+/// - Assert that the resource retrieved by path for each file matches the
+/// expected resource.
+fn test_build_index_with_multiple_files() {
+ let temp_dir =
+ TempDir::with_prefix("ark_test_build_index_with_multiple_files")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file1_path = root_path.join("file1.txt");
+ fs::write(&file1_path, "file1 content").expect("Failed to write to file");
+ let file2_path = root_path.join("file2.txt");
+ fs::write(&file2_path, "file2 content").expect("Failed to write to file");
+
+ let expected_resource1: IndexedResource =
+ get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+ let expected_resource2 =
+ get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+
+ let index = ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 2, "{:?}", index);
+
+ let resource = index
+ .get_resource_by_path("file1.txt")
+ .expect("Failed to get resource");
+ assert_eq!(resource, &expected_resource1, "{:?}", resource);
+
+ let resource = index
+ .get_resource_by_path("file2.txt")
+ .expect("Failed to get resource");
+ assert_eq!(resource, &expected_resource2, "{:?}", resource);
+}
+
+/// Test building an index with multiple directories.
+///
+/// ## Test scenario:
+/// - Create multiple directories within the temporary directory, each
+/// containing a file.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index contains two entries.
+/// - Assert that the resources retrieved by path for each file match the
+/// expected resources.
+fn test_build_index_with_multiple_directories() {
+ let temp_dir =
+ TempDir::with_prefix("ark_test_build_index_with_multiple_directories")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let dir1_path = root_path.join("dir1");
+ fs::create_dir(&dir1_path).expect("Failed to create dir");
+ let file1_path = dir1_path.join("file1.txt");
+ fs::write(&file1_path, "file1 content").expect("Failed to write to file");
+
+ let dir2_path = root_path.join("dir2");
+ fs::create_dir(&dir2_path).expect("Failed to create dir");
+ let file2_path = dir2_path.join("file2.txt");
+ fs::write(&file2_path, "file2 content").expect("Failed to write to file");
+
+ let expected_resource1: IndexedResource =
+ get_indexed_resource_from_file(&file1_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+ let expected_resource2 =
+ get_indexed_resource_from_file(&file2_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+
+ let index = ResourceIndex::build(root_path).expect("Failed to build index");
+ assert_eq!(index.len(), 2, "{:?}", index);
+
+ let resource = index
+ .get_resource_by_path("dir1/file1.txt")
+ .expect("Resource not found");
+ assert_eq!(resource, &expected_resource1, "{:?}", resource);
+
+ let resource = index
+ .get_resource_by_path("dir2/file2.txt")
+ .expect("Resource not found");
+ assert_eq!(resource, &expected_resource2, "{:?}", resource);
+}
+
+/// Test updating the resource index.
+///
+/// ## Test scenario:
+/// - Create files within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index initially contains the expected number of entries.
+/// - Create a new file, modify an existing file, and remove another file.
+/// - Update the resource index.
+/// - Assert that the index contains the expected number of entries after the
+/// update.
+/// - Assert that the entries in the index match the expected state after the
+/// update.
+fn test_resource_index_update() {
+ let temp_dir = TempDir::with_prefix("ark_test_resource_index_update")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let image_path = root_path.join("image.png");
+ fs::write(&image_path, "image content").expect("Failed to write to file");
+
+ let mut index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ index.store().expect("Failed to store index");
+ assert_eq!(index.len(), 2, "{:?}", index);
+
+ // create new file
+ let new_file_path = root_path.join("new_file.txt");
+ fs::write(&new_file_path, "new file content")
+ .expect("Failed to write to file");
+
+ // modify file
+ fs::write(&file_path, "updated file content")
+ .expect("Failed to write to file");
+
+ // remove file
+ fs::remove_file(&image_path).expect("Failed to remove file");
+
+ index
+ .update_all()
+ .expect("Failed to update index");
+ // Index now contains 2 resources (file.txt and new_file.txt)
+ assert_eq!(index.len(), 2, "{:?}", index);
+
+ let resource = index
+ .get_resource_by_path("file.txt")
+ .expect("Resource not found");
+ let expected_resource =
+ get_indexed_resource_from_file(&file_path, &root_path.to_path_buf())
+ .expect("Failed to get indexed resource");
+ assert_eq!(resource, &expected_resource, "{:?}", resource);
+
+ let _resource = index
+ .get_resource_by_path("new_file.txt")
+ .expect("Resource not found");
+
+ assert!(
+ index.get_resource_by_path("image.png").is_none(),
+ "{:?}",
+ index
+ );
+}
+
+/// Test adding colliding files to the index.
+///
+/// ## Test scenario:
+/// - Create a file within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index initially contains the expected number of entries.
+/// - Create a new file with the same checksum as the existing file.
+/// - Track the addition of the new file in the index.
+/// - Assert that the index contains the expected number of entries after the
+/// addition.
+/// - Assert index.collisions contains the expected number of entries.
+fn test_add_colliding_files() {
+ let temp_dir = TempDir::with_prefix("ark_test_add_colliding_files")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let mut index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ index.store().expect("Failed to store index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+
+ let new_file_path = root_path.join("new_file.txt");
+ fs::write(&new_file_path, "file content").expect("Failed to write to file");
+
+ index
+ .update_all()
+ .expect("Failed to update index");
+
+ assert_eq!(index.len(), 2, "{:?}", index);
+ assert_eq!(index.collisions().len(), 1, "{:?}", index);
+}
+
+/// Test `ResourceIndex::num_collisions()` method.
+///
+/// ## Test scenario:
+/// - Create a file within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index initially contains the expected number of entries.
+/// - Create 2 new files with the same checksum as the existing file.
+/// - Update the index.
+/// - Assert that the index contains the expected number of entries after the
+/// update.
+fn test_num_collisions() {
+ let temp_dir = TempDir::with_prefix("ark_test_num_collisions")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join("file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let mut index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ index.store().expect("Failed to store index");
+ assert_eq!(index.len(), 1, "{:?}", index);
+
+ let new_file_path = root_path.join("new_file.txt");
+ fs::write(&new_file_path, "file content").expect("Failed to write to file");
+
+ let new_file_path2 = root_path.join("new_file2.txt");
+ fs::write(&new_file_path2, "file content")
+ .expect("Failed to write to file");
+
+ index
+ .update_all()
+ .expect("Failed to update index");
+
+ assert_eq!(index.len(), 3, "{:?}", index);
+ assert_eq!(index.num_collisions(), 3, "{:?}", index);
+}
+
+/// Test that we don't index hidden files.
+///
+/// ## Test scenario:
+/// - Create a hidden file within the temporary directory.
+/// - Build a resource index in the temporary directory.
+/// - Assert that the index initially contains the expected number of entries.
+/// (0)
+fn test_hidden_files() {
+ let temp_dir = TempDir::with_prefix("ark_test_hidden_files")
+ .expect("Failed to create temp dir");
+ let root_path = temp_dir.path();
+
+ let file_path = root_path.join(".hidden_file.txt");
+ fs::write(&file_path, "file content").expect("Failed to write to file");
+
+ let index: ResourceIndex =
+ ResourceIndex::build(root_path).expect("Failed to build index");
+ index.store().expect("Failed to store index");
+ assert_eq!(index.len(), 0, "{:?}", index);
+}
diff --git a/fs-index/src/utils.rs b/fs-index/src/utils.rs
new file mode 100644
index 00000000..52f96bc4
--- /dev/null
+++ b/fs-index/src/utils.rs
@@ -0,0 +1,150 @@
+use std::{
+ collections::HashMap,
+ fs,
+ io::BufReader,
+ path::{Path, PathBuf},
+};
+
+use walkdir::{DirEntry, WalkDir};
+
+use data_error::{ArklibError, Result};
+use data_resource::ResourceId;
+use fs_storage::{ARK_FOLDER, INDEX_PATH};
+
+use crate::{index::IndexedResource, ResourceIndex};
+
+/// Load the index from the file system
+fn load_index, Id: ResourceId>(
+ root_path: P,
+) -> Result> {
+ let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH);
+ let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path);
+ let index_file = fs::File::open(index_path)?;
+ let reader = BufReader::new(index_file);
+ let index = serde_json::from_reader(reader)?;
+
+ Ok(index)
+}
+
+/// Load the index from the file system, or build a new index if it doesn't
+/// exist
+///
+/// If `update` is true, the index will be updated and stored after loading
+/// it.
+pub fn load_or_build_index, Id: ResourceId>(
+ root_path: P,
+ update: bool,
+) -> Result> {
+ log::debug!(
+ "Attempting to load or build index at root path: {:?}",
+ root_path.as_ref()
+ );
+
+ let index_path = Path::new(ARK_FOLDER).join(INDEX_PATH);
+ let index_path = fs::canonicalize(root_path.as_ref())?.join(index_path);
+ log::trace!("Index path: {:?}", index_path);
+
+ if index_path.exists() {
+ log::trace!("Index file exists, loading index");
+
+ let mut index = load_index(root_path)?;
+ if update {
+ log::trace!("Updating loaded index");
+
+ index.update_all()?;
+ index.store()?;
+ }
+ Ok(index)
+ } else {
+ log::trace!("Index file does not exist, building index");
+
+ // Build a new index if it doesn't exist and store it
+ let index = ResourceIndex::build(root_path.as_ref())?;
+ index.store().map_err(|e| {
+ ArklibError::Path(format!("Failed to store index: {}", e))
+ })?;
+ Ok(index)
+ }
+}
+
+/// A helper function to discover paths in a directory
+///
+/// This function walks the directory tree starting from the root path and
+/// returns a list of file paths.
+///
+/// Ignore hidden files and empty files.
+pub(crate) fn discover_paths>(
+ root_path: P,
+) -> Result> {
+ log::debug!("Discovering paths at root path: {:?}", root_path.as_ref());
+
+ let walker = WalkDir::new(&root_path)
+ .min_depth(1) // Skip the root directory
+ .into_iter()
+ .filter_entry(should_index); // Skip hidden files and empty files
+
+ // Filter out directories
+ let paths = walker
+ .filter_map(|entry| {
+ let entry = entry.ok()?;
+ if entry.file_type().is_file() {
+ Some(entry)
+ } else {
+ None
+ }
+ })
+ .collect();
+
+ Ok(paths)
+}
+
+/// A helper function to scan entries and create indexed resources
+pub(crate) fn scan_entries(
+ paths: Vec,
+) -> HashMap> {
+ let mut path_to_resource = HashMap::new();
+ for entry in paths {
+ let resource = scan_entry(entry);
+ path_to_resource.insert(resource.path().to_path_buf(), resource);
+ }
+ path_to_resource
+}
+
+/// A helper function to scan one entry and create an indexed resource
+pub(crate) fn scan_entry(
+ entry: DirEntry,
+) -> IndexedResource {
+ let path = entry.path().to_path_buf();
+ let metadata = entry.metadata().unwrap();
+ let last_modified = metadata.modified().unwrap();
+
+ // Get the ID of the resource
+ let id = Id::from_path(&path).unwrap();
+
+ // Create the indexed resource
+ IndexedResource::new(id, path, last_modified)
+}
+
+/// A helper function to check if the entry should be indexed (not hidden or
+/// empty)
+fn should_index(entry: &walkdir::DirEntry) -> bool {
+ // Check if the entry is hidden
+ if entry
+ .file_name()
+ .to_string_lossy()
+ .starts_with('.')
+ {
+ return false;
+ }
+
+ // Check if the entry is empty
+ if entry
+ .metadata()
+ .map(|m| m.len() == 0)
+ .unwrap_or(false)
+ {
+ return false;
+ }
+
+ true
+}
diff --git a/fs-metadata/src/lib.rs b/fs-metadata/src/lib.rs
index df258ae5..881ddeab 100644
--- a/fs-metadata/src/lib.rs
+++ b/fs-metadata/src/lib.rs
@@ -1,9 +1,7 @@
use data_error::Result;
use fs_atomic_versions::atomic::{modify_json, AtomicFile};
use serde::{de::DeserializeOwned, Serialize};
-use std::fmt::Debug;
-use std::io::Read;
-use std::path::Path;
+use std::{fmt::Debug, io::Read, path::Path};
use data_resource::ResourceId;
use fs_storage::ARK_FOLDER;
diff --git a/fs-properties/src/lib.rs b/fs-properties/src/lib.rs
index 81b5b04f..0c28aede 100644
--- a/fs-properties/src/lib.rs
+++ b/fs-properties/src/lib.rs
@@ -1,8 +1,6 @@
use serde::{de::DeserializeOwned, Serialize};
use serde_json::Value;
-use std::fmt::Debug;
-use std::io::Read;
-use std::path::Path;
+use std::{fmt::Debug, io::Read, path::Path};
use data_error::Result;
use data_json::merge;
diff --git a/fs-storage/examples/cli.rs b/fs-storage/examples/cli.rs
index 7b2ab23d..8214f258 100644
--- a/fs-storage/examples/cli.rs
+++ b/fs-storage/examples/cli.rs
@@ -1,10 +1,7 @@
use anyhow::{Context, Result};
-use fs_storage::base_storage::BaseStorage;
-use fs_storage::file_storage::FileStorage;
+use fs_storage::{base_storage::BaseStorage, file_storage::FileStorage};
use serde_json::Value;
-use std::env;
-use std::fs;
-use std::path::Path;
+use std::{env, fs, path::Path};
fn main() {
if let Err(e) = run() {
diff --git a/fs-storage/src/base_storage.rs b/fs-storage/src/base_storage.rs
index 576b5a8f..4eba03b0 100644
--- a/fs-storage/src/base_storage.rs
+++ b/fs-storage/src/base_storage.rs
@@ -30,16 +30,22 @@ impl std::fmt::Display for SyncStatus {
}
}
-/// The `BaseStorage` trait represents a key-value mapping that is written to the file system.
+/// The `BaseStorage` trait represents a key-value mapping that is written to
+/// the file system.
///
-/// This trait provides methods to create or update entries in the internal mapping, remove entries from the internal mapping,
-/// determine if the in-memory model or the underlying storage requires syncing, scan and load the mapping from the filesystem,
-/// write the mapping to the filesystem, and remove all stored data.
+/// This trait provides methods to create or update entries in the internal
+/// mapping, remove entries from the internal mapping, determine if the
+/// in-memory model or the underlying storage requires syncing, scan and load
+/// the mapping from the filesystem, write the mapping to the filesystem, and
+/// remove all stored data.
///
-/// The trait also includes a method to merge values from another key-value mapping.
+/// The trait also includes a method to merge values from another key-value
+/// mapping.
///
-/// Note: The trait does not write to storage by default. It is up to the implementor to decide when to read or write to storage
-/// based on `SyncStatus`. This is to allow for trading off between performance and consistency.
+/// Note: The trait does not write to storage by default. It is up to the
+/// implementor to decide when to read or write to storage
+/// based on `SyncStatus`. This is to allow for trading off between performance
+/// and consistency.
pub trait BaseStorage: AsRef> {
/// Create or update an entry in the internal mapping.
fn set(&mut self, id: K, value: V);
diff --git a/fs-storage/src/btreemap_iter.rs b/fs-storage/src/btreemap_iter.rs
index b3d7a69f..bb52d6f1 100644
--- a/fs-storage/src/btreemap_iter.rs
+++ b/fs-storage/src/btreemap_iter.rs
@@ -1,7 +1,5 @@
use crate::base_storage::BaseStorage;
-use std::cell::RefCell;
-use std::collections::btree_map::Iter;
-use std::rc::Rc;
+use std::{cell::RefCell, collections::btree_map::Iter, rc::Rc};
pub struct BTreeMapIterator<'a, K, V> {
iter: Rc>>,
diff --git a/fs-storage/src/file_storage.rs b/fs-storage/src/file_storage.rs
index 0e27766f..9b762bbd 100644
--- a/fs-storage/src/file_storage.rs
+++ b/fs-storage/src/file_storage.rs
@@ -1,15 +1,17 @@
use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
-use std::io::Write;
-use std::time::SystemTime;
use std::{
collections::BTreeMap,
+ fs::{self, File},
+ io::Write,
path::{Path, PathBuf},
+ time::SystemTime,
};
-use crate::base_storage::{BaseStorage, SyncStatus};
-use crate::monoid::Monoid;
-use crate::utils::read_version_2_fs;
+use crate::{
+ base_storage::{BaseStorage, SyncStatus},
+ monoid::Monoid,
+ utils::read_version_2_fs,
+};
use data_error::{ArklibError, Result};
/*
@@ -80,8 +82,8 @@ where
/// Create a new file storage with a diagnostic label and file path
/// The storage will be initialized using the disk data, if the path exists
///
- /// Note: if the file storage already exists, the data will be read from the file
- /// without overwriting it.
+ /// Note: if the file storage already exists, the data will be read from the
+ /// file without overwriting it.
pub fn new(label: String, path: &Path) -> Result {
let time = SystemTime::now();
let mut storage = Self {
@@ -114,7 +116,8 @@ where
// First check if the file starts with "version: 2"
let file_content = std::fs::read_to_string(&self.path)?;
if file_content.starts_with("version: 2") {
- // Attempt to parse the file using the legacy version 2 storage format of FileStorage.
+ // Attempt to parse the file using the legacy version 2 storage
+ // format of FileStorage.
match read_version_2_fs(&self.path) {
Ok(data) => {
log::info!(
@@ -193,14 +196,14 @@ where
// Determine the synchronization status based on the modification times
// Conditions:
- // 1. If both the in-memory storage and the storage on disk have been modified
- // since the last write, then the storage is diverged.
- // 2. If only the in-memory storage has been modified since the last write,
- // then the storage on disk is stale.
- // 3. If only the storage on disk has been modified since the last write,
- // then the in-memory storage is stale.
- // 4. If neither the in-memory storage nor the storage on disk has been modified
- // since the last write, then the storage is in sync.
+ // 1. If both the in-memory storage and the storage on disk have been
+ // modified since the last write, then the storage is diverged.
+ // 2. If only the in-memory storage has been modified since the last
+ // write, then the storage on disk is stale.
+ // 3. If only the storage on disk has been modified since the last
+ // write, then the in-memory storage is stale.
+ // 4. If neither the in-memory storage nor the storage on disk has been
+ // modified since the last write, then the storage is in sync.
let status = match (
self.modified > self.written_to_disk,
file_updated > self.written_to_disk,
diff --git a/fs-storage/src/jni/btreemap_iter.rs b/fs-storage/src/jni/btreemap_iter.rs
index 6ebd6d61..61573e09 100644
--- a/fs-storage/src/jni/btreemap_iter.rs
+++ b/fs-storage/src/jni/btreemap_iter.rs
@@ -1,5 +1,4 @@
-use crate::btreemap_iter::BTreeMapIterator;
-use crate::file_storage::FileStorage;
+use crate::{btreemap_iter::BTreeMapIterator, file_storage::FileStorage};
// This is the interface to the JVM that we'll call the majority of our
// methods on.
use jni::JNIEnv;
diff --git a/fs-storage/src/jni/file_storage.rs b/fs-storage/src/jni/file_storage.rs
index 20820cd0..b62ed5bb 100644
--- a/fs-storage/src/jni/file_storage.rs
+++ b/fs-storage/src/jni/file_storage.rs
@@ -1,7 +1,6 @@
use crate::base_storage::SyncStatus;
use jni::signature::ReturnType;
-use std::collections::BTreeMap;
-use std::path::Path;
+use std::{collections::BTreeMap, path::Path};
// This is the interface to the JVM that we'll call the majority of our
// methods on.
use jni::JNIEnv;
@@ -45,7 +44,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_create<'local>(
let file_storage: FileStorage =
FileStorage::new(label, Path::new(&path)).unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.expect("Failed to throw RuntimeException");
FileStorage::new("".to_string(), Path::new("")).unwrap()
});
@@ -77,13 +76,13 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_remove<'local>(
FileStorage::from_jlong(file_storage_ptr)
.remove(&id)
.unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.unwrap();
});
}
-// A JNI function called from Java that creates a `MyData` Rust type, converts it to a Java
-// type and returns it.
+// A JNI function called from Java that creates a `MyData` Rust type, converts
+// it to a Java type and returns it.
#[no_mangle]
#[allow(non_snake_case)]
pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_syncStatus<
@@ -114,7 +113,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_sync(
FileStorage::from_jlong(file_storage_ptr)
.sync()
.unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.unwrap();
});
}
@@ -129,7 +128,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_readFS(
match FileStorage::from_jlong(file_storage_ptr).read_fs() {
Ok(data) => data.clone(),
Err(err) => {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.expect("Failed to throw RuntimeException");
return JObject::null().into_raw();
}
@@ -202,7 +201,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_writeFS(
FileStorage::from_jlong(file_storage_ptr)
.write_fs()
.unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.unwrap();
});
}
@@ -219,7 +218,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_erase(
Box::from_raw(file_storage_ptr as *mut FileStorage)
};
file_storage.erase().unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.unwrap();
});
}
@@ -234,7 +233,7 @@ pub extern "system" fn Java_dev_arkbuilders_core_FileStorage_merge(
FileStorage::from_jlong(file_storage_ptr)
.merge_from(FileStorage::from_jlong(other_file_storage_ptr))
.unwrap_or_else(|err| {
- env.throw_new("java/lang/RuntimeException", &err.to_string())
+ env.throw_new("java/lang/RuntimeException", err.to_string())
.unwrap();
});
}
diff --git a/fs-storage/src/monoid.rs b/fs-storage/src/monoid.rs
index b2acf546..1c483201 100644
--- a/fs-storage/src/monoid.rs
+++ b/fs-storage/src/monoid.rs
@@ -1,10 +1,11 @@
-// Currently, we have three structures: Tags (HashSet), Properties (HashSet), Score (int).
-// In fact, HashSet already implements a union function,
+// Currently, we have three structures: Tags (HashSet), Properties (HashSet),
+// Score (int). In fact, HashSet already implements a union function,
// so only a special function for integers is needed.
// CRDTs can be considered later when we need to add structures that require
// more powerful combine semantics.
-// Trait defining a Monoid, which represents a mathematical structure with an identity element and an associative binary operation.
+// Trait defining a Monoid, which represents a mathematical structure with an
+// identity element and an associative binary operation.
pub trait Monoid {
// Returns the neutral element of the monoid.
fn neutral() -> V;
@@ -13,7 +14,8 @@ pub trait Monoid {
fn combine(a: &V, b: &V) -> V;
// Combines multiple elements of the monoid into a single element.
- // Default implementation uses `neutral()` as the initial accumulator and `combine()` for folding.
+ // Default implementation uses `neutral()` as the initial accumulator and
+ // `combine()` for folding.
fn combine_all>(values: I) -> V {
values
.into_iter()
diff --git a/fs-storage/src/utils.rs b/fs-storage/src/utils.rs
index b5b6830a..d1e818bd 100644
--- a/fs-storage/src/utils.rs
+++ b/fs-storage/src/utils.rs
@@ -1,6 +1,5 @@
use data_error::Result;
-use std::collections::BTreeMap;
-use std::path::Path;
+use std::{collections::BTreeMap, path::Path};
/// Parses version 2 `FileStorage` format and returns the data as a BTreeMap
///
diff --git a/rustfmt.toml b/rustfmt.toml
index 0a869692..5b415646 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,15 +1,18 @@
+# General settings
+imports_granularity = 'Crate'
verbose = "Verbose"
tab_spaces = 4
-
max_width = 80
+newline_style = "Unix"
+
+# Code style settings
chain_width = 50
single_line_if_else_max_width = 30
-
force_explicit_abi = true
-
+# Import settings
reorder_imports = true
+# Comment settings
wrap_comments = true
-
-newline_style = "Unix"
+comment_width = 80