Skip to content

Commit

Permalink
WIP: deleted files
Browse files Browse the repository at this point in the history
  • Loading branch information
tatref committed Oct 20, 2024
1 parent 86b2bd9 commit 5b1c8a3
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 9 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ directories = "4"
sysinfo = "0.27"
ctrlc = "3.4"
chrono = "0.4"
procfs = { version = "0.17.0", default-features = false }

[target.'cfg(not(target_has_atomic = "64"))'.dependencies]
portable-atomic = "1.4"
Expand Down
120 changes: 118 additions & 2 deletions src/dir_walker.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
use std::cmp::Ordering;
use std::fs;
use std::fs::Metadata;
use std::os::linux::fs::MetadataExt;
use std::sync::Arc;
use std::sync::Mutex;

use crate::node::Node;
use crate::platform::InodeAndDevice;
use crate::progress::Operation;
use crate::progress::PAtomicInfo;
use crate::progress::RuntimeErrors;
use crate::progress::ORDERING;
use crate::utils::is_filtered_out_due_to_file_time;
use crate::utils::is_filtered_out_due_to_invert_regex;
use crate::utils::is_filtered_out_due_to_regex;
use procfs::process::FDTarget;
use rayon::iter::ParallelBridge;
use rayon::prelude::ParallelIterator;
use regex::Regex;
Expand Down Expand Up @@ -48,9 +52,44 @@ pub struct WalkData<'a> {
pub errors: Arc<Mutex<RuntimeErrors>>,
}

/// Return deleted files still held open by some process, found by walking
/// /proc/$PID/fd/$FD. A deleted-but-still-open file has a link count of 0.
///
/// Processes or descriptors that disappear mid-scan (or that we lack
/// permission to inspect) are silently skipped; if /proc itself cannot be
/// enumerated, an empty list is returned instead of panicking.
fn get_deleted_files() -> Vec<(PathBuf, Metadata)> {
    let mut deleted_files = Vec::new();

    // /proc may be unreadable (restricted containers, hardened kernels);
    // treat that as "no deleted files" rather than crashing the whole scan.
    let Ok(processes) = procfs::process::all_processes() else {
        return deleted_files;
    };

    for p in processes {
        let Ok(p) = p else {
            continue;
        };
        let Ok(fds) = p.fd() else {
            continue;
        };

        for fd in fds {
            let Ok(fd) = fd else {
                continue;
            };

            if let FDTarget::Path(path) = &fd.target {
                // stat through the /proc/<pid>/fd/<fd> symlink: it remains
                // valid even though the original path was unlinked.
                let proc_fd = format!("/proc/{}/fd/{}", p.pid, fd.fd);
                let Ok(metadata) = std::fs::metadata(&proc_fd) else {
                    continue;
                };

                // nlink == 0: the file was deleted but is kept alive by
                // this open descriptor.
                if metadata.st_nlink() == 0 {
                    // The kernel appends " (deleted)" to the link target
                    // (see proc(5)); strip it so we report the file's real
                    // former name. Non-UTF-8 paths are kept as-is.
                    let path = match path.to_str() {
                        Some(s) => PathBuf::from(s.strip_suffix(" (deleted)").unwrap_or(s)),
                        None => path.clone(),
                    };
                    deleted_files.push((path, metadata));
                }
            }
        }
    }

    deleted_files
}

pub fn walk_it(dirs: HashSet<PathBuf>, walk_data: &WalkData) -> Vec<Node> {
let mut inodes = HashSet::new();
let top_level_nodes: Vec<_> = dirs
let mut top_level_nodes: Vec<_> = dirs
.into_iter()
.filter_map(|d| {
let prog_data = &walk_data.progress_data;
Expand All @@ -62,11 +101,88 @@ pub fn walk_it(dirs: HashSet<PathBuf>, walk_data: &WalkData) -> Vec<Node> {
clean_inodes(node, &mut inodes, walk_data)
})
.collect();

// TODO: use a flag
let handle_deleted_files = true;

if handle_deleted_files {
let deleted_files: Vec<_> = get_deleted_files()
.into_iter()
.filter(|(_path, metadata)| {
let inode_and_device = (metadata.st_ino(), metadata.st_dev());
// ignore inodes already collected as part of regular files
!inodes.contains(&inode_and_device)
})
.collect();

// we try to insert deleted files in the node tree
for (path, m) in &deleted_files {
for mut top_level_node in &mut top_level_nodes {
// deleted files are always absolute, need to canonicalize the node tree
let absolute_path = top_level_node.name.canonicalize().unwrap();
if path.starts_with(absolute_path.components().next().unwrap()) {
insert_deleted_file_in_node(
path.clone(),
m,
&mut top_level_node,

Check failure on line 127 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

this expression creates a reference which is immediately dereferenced by the compiler
&walk_data,

Check failure on line 128 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

this expression creates a reference which is immediately dereferenced by the compiler
0,
);
}
}
}
}

top_level_nodes
}

fn insert_deleted_file_in_node(
path: PathBuf,
m: &Metadata,
root: &mut Node,
walk_data: &WalkData,
depth: usize,
) -> bool {
if path.parent().unwrap() == root.name {
// we found the node that represents the parent dir
// TODO: filecount, filetime, regex...
let size = if walk_data.use_apparent_size {
m.st_size()
} else {
m.st_blocks() * 512
};

root.size += size;

let node = Node {
name: path.clone(),
size,
children: vec![],
inode_device: Some((m.st_ino(), m.st_dev())),
depth,
};

root.children.push(node);
println!("inserted {:?} in {:?}", path, root.name);

return true;
}

for child in &mut root.children {
if path.starts_with(&child.name) {
return insert_deleted_file_in_node(path, m, child, &walk_data, depth + 1);

Check failure on line 173 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

this expression creates a reference which is immediately dereferenced by the compiler
}
}

return false;

Check failure on line 177 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

unneeded `return` statement
}

// Remove files which have the same inode, we don't want to double count them.
fn clean_inodes(x: Node, inodes: &mut HashSet<(u64, u64)>, walk_data: &WalkData) -> Option<Node> {
fn clean_inodes(
x: Node,
inodes: &mut HashSet<InodeAndDevice>,
walk_data: &WalkData,
) -> Option<Node> {
if !walk_data.use_apparent_size {
if let Some(id) = x.inode_device {
if !inodes.insert(id) {
Expand Down
13 changes: 7 additions & 6 deletions src/node.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::dir_walker::WalkData;
use crate::platform::get_metadata;
use crate::platform::InodeAndDevice;
use crate::utils::is_filtered_out_due_to_file_time;
use crate::utils::is_filtered_out_due_to_invert_regex;
use crate::utils::is_filtered_out_due_to_regex;
Expand All @@ -12,7 +13,7 @@ pub struct Node {
pub name: PathBuf,
pub size: u64,
pub children: Vec<Node>,
pub inode_device: Option<(u64, u64)>,
pub inode_device: Option<InodeAndDevice>,
pub depth: usize,
}

Expand All @@ -25,7 +26,7 @@ pub enum FileTime {

#[allow(clippy::too_many_arguments)]
pub fn build_node(
dir: PathBuf,
path: PathBuf,
children: Vec<Node>,
is_symlink: bool,
is_file: bool,
Expand All @@ -37,15 +38,15 @@ pub fn build_node(
let by_filetime = &walk_data.by_filetime;

get_metadata(
&dir,
&path,
use_apparent_size,
walk_data.follow_links && is_symlink,
)
.map(|data| {
let inode_device = data.1;

let size = if is_filtered_out_due_to_regex(walk_data.filter_regex, &dir)
|| is_filtered_out_due_to_invert_regex(walk_data.invert_filter_regex, &dir)
let size = if is_filtered_out_due_to_regex(walk_data.filter_regex, &path)
|| is_filtered_out_due_to_invert_regex(walk_data.invert_filter_regex, &path)
|| by_filecount && !is_file
|| [
(&walk_data.filter_modified_time, data.2 .0),
Expand All @@ -71,7 +72,7 @@ pub fn build_node(
};

Node {
name: dir,
name: path,
size,
children,
inode_device,
Expand Down
2 changes: 1 addition & 1 deletion src/platform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fn get_block_size() -> u64 {
512
}

type InodeAndDevice = (u64, u64);
pub(crate) type InodeAndDevice = (u64, u64);
type FileTime = (i64, i64, i64);

#[cfg(target_family = "unix")]
Expand Down

0 comments on commit 5b1c8a3

Please sign in to comment.