Skip to content

Commit

Permalink
replace the new file module with inherent methods on Hasher
Browse files Browse the repository at this point in the history
New methods:
- update_reader
- update_mmap
- update_mmap_rayon

These are more discoverable, more convenient, and safer.

There are two problems I want to avoid by taking a `Path` instead of a
`File`. First, exposing `Mmap` objects to the caller is fundamentally
unsafe, and making `maybe_mmap_file` private avoids that issue. Second,
taking a `File` raises questions about whether memory mapped reads
should behave like regular file reads. (Should they respect the current
seek position? Should they update the seek position?) Taking a `Path`
from the caller and opening the `File` internally avoids these
questions.
  • Loading branch information
oconnor663 committed Sep 17, 2023
1 parent e0bb915 commit cb32f0b
Show file tree
Hide file tree
Showing 9 changed files with 466 additions and 244 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ jobs:
run: cargo test --features=rayon,traits-preview,zeroize
env:
RAYON_NUM_THREADS: 1
# The mmap feature by itself (update_mmap_rayon is omitted).
- run: cargo test --features=mmap
# All public features put together.
- run: cargo test --features=mmap,rayon,traits-preview,zeroize
# no_std tests.
- run: cargo test --no-default-features

Expand Down
21 changes: 13 additions & 8 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,17 @@ neon = []
# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
std = []

# The "rayon" feature (defined below as an optional dependency) enables the
# `Hasher::update_rayon` method, for multithreaded hashing. However, even if
# this feature is enabled, all other APIs remain single-threaded.
# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the
# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon`
# methods, for multithreaded hashing. However, even if this feature is enabled,
# all other APIs remain single-threaded.
rayon = ["dep:rayon", "std"]

# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the
# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon`
# helper methods for memory-mapped IO.
mmap = ["std", "dep:memmap2"]

# Implement the zeroize::Zeroize trait for types in this crate.
zeroize = ["dep:zeroize", "arrayvec/zeroize"]

Expand Down Expand Up @@ -81,11 +87,9 @@ no_avx2 = []
no_avx512 = []
no_neon = []

file = ["memmap2", "rayon", "std"]

[package.metadata.docs.rs]
# Document Hasher::update_rayon on docs.rs.
features = ["rayon", "zeroize"]
# Document the rayon/mmap methods and the Zeroize impls on docs.rs.
features = ["mmap", "rayon", "zeroize"]

[dependencies]
arrayref = "0.3.5"
Expand All @@ -98,12 +102,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive
memmap2 = { version = "0.7.1", optional = true }

[dev-dependencies]
hmac = "0.12.0"
hex = "0.4.2"
page_size = "0.6.0"
rand = "0.8.0"
rand_chacha = "0.3.0"
reference_impl = { path = "./reference_impl" }
hmac = "0.12.0"
tempfile = "3.8.0"

[build-dependencies]
cc = "1.0.4"
24 changes: 12 additions & 12 deletions b3sum/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion b3sum/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pure = ["blake3/pure"]

[dependencies]
anyhow = "1.0.25"
blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
Expand Down
110 changes: 33 additions & 77 deletions b3sum/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,73 +163,22 @@ impl Args {
}
}

enum Input {
Mmap(io::Cursor<memmap2::Mmap>),
File(File),
Stdin,
}

impl Input {
// Open an input file, using mmap if appropriate. "-" means stdin. Note
// that this convention applies both to command line arguments, and to
// filepaths that appear in a checkfile.
fn open(path: &Path, args: &Args) -> Result<Self> {
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
return Ok(Self::Stdin);
}
let file = File::open(path)?;
if !args.no_mmap() {
if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
return Ok(Self::Mmap(io::Cursor::new(mmap)));
}
}
Ok(Self::File(file))
}

fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
match self {
// The fast path: If we mmapped the file successfully, hash using
// multiple threads. This doesn't work on stdin, or on some files,
// and it can also be disabled with --no-mmap.
Self::Mmap(cursor) => {
hasher.update_rayon(cursor.get_ref());
}
// The slower paths, for stdin or files we didn't/couldn't mmap.
// This is currently all single-threaded. Doing multi-threaded
// hashing without memory mapping is tricky, since all your worker
// threads have to stop every time you refill the buffer, and that
// ends up being a lot of overhead. To solve that, we need a more
// complicated double-buffering strategy where a background thread
// fills one buffer while the worker threads are hashing the other
// one. We might implement that in the future, but since this is
// the slow path anyway, it's not high priority.
Self::File(file) => {
blake3::copy_wide(file, &mut hasher)?;
}
Self::Stdin => {
let stdin = io::stdin();
let lock = stdin.lock();
blake3::copy_wide(lock, &mut hasher)?;
}
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}
}

impl Read for Input {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
Self::Mmap(cursor) => cursor.read(buf),
Self::File(file) => file.read(buf),
Self::Stdin => io::stdin().read(buf),
fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
hasher.update_reader(io::stdin().lock())?;
} else if args.no_mmap() {
hasher.update_reader(File::open(path)?)?;
} else {
// The fast path: Try to mmap the file and hash it with multiple threads.
hasher.update_mmap_rayon(path)?;
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}

fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
Expand Down Expand Up @@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
}

fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
let mut input = Input::open(path, args)?;
let output = input.hash(args)?;
let output = hash_path(args, path)?;
if args.raw() {
write_raw_output(output, args)?;
return Ok(());
Expand Down Expand Up @@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
} else {
file_string
};
let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
.and_then(|mut input| input.hash(args))
.map(|mut hash_output| {
let found_hash: blake3::Hash;
match hash_path(args, &file_path) {
Ok(mut output) => {
let mut found_hash_bytes = [0; blake3::OUT_LEN];
hash_output.fill(&mut found_hash_bytes);
found_hash_bytes.into()
});
let found_hash: blake3::Hash = match hash_result {
Ok(hash) => hash,
output.fill(&mut found_hash_bytes);
found_hash = found_hash_bytes.into();
}
Err(e) => {
println!("{}: FAILED ({})", file_string, e);
return false;
Expand All @@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
}

fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
let checkfile_input = Input::open(path, args)?;
let mut bufreader = io::BufReader::new(checkfile_input);
let mut file;
let stdin;
let mut stdin_lock;
let mut bufreader: io::BufReader<&mut dyn Read>;
if path == Path::new("-") {
stdin = io::stdin();
stdin_lock = stdin.lock();
bufreader = io::BufReader::new(&mut stdin_lock);
} else {
file = File::open(path)?;
bufreader = io::BufReader::new(&mut file);
}
let mut line = String::new();
loop {
line.clear();
Expand Down
67 changes: 0 additions & 67 deletions src/file.rs

This file was deleted.

Loading

0 comments on commit cb32f0b

Please sign in to comment.