Merge 6968ae3 into 7b9b9fa
wcampbell0x2a authored Aug 29, 2024
2 parents 7b9b9fa + 6968ae3 commit d1c6030
Showing 8 changed files with 150 additions and 39 deletions.
21 changes: 14 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion backhand-test/tests/raw.rs
@@ -74,7 +74,7 @@ fn test_raw_00() {
frag_count: 0x1,
compressor: Compressor::Xz,
block_log: 0x11,
flags: 0x0,
flags: backhand::Flags::DataHasBeenDeduplicated as u16,
id_count: 0x2,
version_major: 0x4,
version_minor: 0x0,
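The raw superblock test now expects the DataHasBeenDeduplicated bit, since duplicate-file checking is on by default in the writer. As a minimal sketch (assuming SuperBlock's flags field stays readable from outside the crate, as the test above implies), the bit can be checked on a parsed superblock like this:

    use backhand::{Flags, SuperBlock};

    // Illustrative helper, not part of the backhand API: report whether an
    // archive's superblock advertises deduplicated data.
    fn data_has_been_deduplicated(sb: &SuperBlock) -> bool {
        sb.flags & (Flags::DataHasBeenDeduplicated as u16) != 0
    }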
3 changes: 2 additions & 1 deletion backhand/Cargo.toml
@@ -25,8 +25,9 @@ xz2 = { version = "0.1.7", optional = true }
rust-lzo = { version = "0.6.2", optional = true }
zstd = { version = "0.13.1", optional = true }
zstd-safe = { version = "7.2.1", optional = true }
rustc-hash = "2.0.0"
document-features = { version = "0.2.10", optional = true }
xxhash-rust = { version = "0.8.12", features = ["xxh64"] }
solana-nohash-hasher = "0.2.1"

[features]
default = ["xz", "gzip", "zstd"]
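The two new dependencies back the duplicate-file cache: xxhash-rust (with the xxh64 feature) supplies the content hash, and solana-nohash-hasher supplies IntMap, a HashMap whose integer key is used directly as its own hash. A minimal sketch of the hashing half, using the same seed of 0 as data.rs:

    use xxhash_rust::xxh64::xxh64;

    fn main() {
        // Hash a file's contents with xxh64, seed 0; the resulting u64 is the
        // inner key of the duplicate-file cache.
        let contents = b"file contents about to be written";
        let hash: u64 = xxh64(contents, 0);
        println!("content hash: {hash:#018x}");
    }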
122 changes: 104 additions & 18 deletions backhand/src/data.rs
@@ -1,8 +1,12 @@
//! File Data
use std::collections::HashMap;
use std::io::{Read, Seek, Write};

use deku::prelude::*;
use solana_nohash_hasher::IntMap;
use tracing::trace;
use xxhash_rust::xxh64::xxh64;

use crate::compressor::CompressionAction;
use crate::error::BackhandError;
@@ -100,6 +104,9 @@ pub(crate) struct DataWriter<'a> {
kind: &'a dyn CompressionAction,
block_size: u32,
fs_compressor: FilesystemCompressor,
/// If `Some`, a cache of HashMap<file_len, HashMap<hash, (file_len, Added)>>
#[allow(clippy::type_complexity)]
dup_cache: Option<IntMap<u64, IntMap<u64, (usize, Added)>>>,
/// Un-written fragment_bytes
pub(crate) fragment_bytes: Vec<u8>,
pub(crate) fragment_table: Vec<Fragment>,
@@ -110,11 +117,13 @@ impl<'a> DataWriter<'a> {
kind: &'a dyn CompressionAction,
fs_compressor: FilesystemCompressor,
block_size: u32,
no_duplicate_files: bool,
) -> Self {
Self {
kind,
block_size,
fs_compressor,
dup_cache: no_duplicate_files.then_some(HashMap::default()),
fragment_bytes: Vec::with_capacity(block_size as usize),
fragment_table: vec![],
}
@@ -186,6 +195,9 @@ impl<'a> DataWriter<'a> {
}

/// Add to data writer, either a Data or Fragment
///
/// If `self.dup_cache` is enabled, return the already-added `(usize, Added)` if a duplicate
/// is found
// TODO: support tail-end fragments (off by default in squashfs-tools/mksquashfs)
pub(crate) fn add_bytes<W: WriteSeek>(
&mut self,
@@ -197,6 +209,8 @@ impl<'a> DataWriter<'a> {
file_len: 0,
reader,
};

// read entire chunk (file)
let mut chunk = chunk_reader.read_chunk()?;

// chunk size not exactly the size of the block
@@ -212,29 +226,58 @@ impl<'a> DataWriter<'a> {
let block_offset = self.fragment_bytes.len() as u32;
self.fragment_bytes.write_all(chunk)?;

Ok((chunk_reader.file_len, Added::Fragment { frag_index, block_offset }))
} else {
// Add to data bytes
let blocks_start = writer.stream_position()? as u32;
let mut block_sizes = vec![];
while !chunk.is_empty() {
let cb = self.kind.compress(chunk, self.fs_compressor, self.block_size)?;
return Ok((chunk_reader.file_len, Added::Fragment { frag_index, block_offset }));
}

// compression didn't reduce size
if cb.len() > chunk.len() {
// store uncompressed
block_sizes.push(DataSize::new_uncompressed(chunk.len() as u32));
writer.write_all(chunk)?;
} else {
// store compressed
block_sizes.push(DataSize::new_compressed(cb.len() as u32));
writer.write_all(&cb)?;
// Add to data bytes
let blocks_start = writer.stream_position()? as u32;
let mut block_sizes = vec![];

// If duplicate file checking is enabled, reuse the old data position for this file if it hashes the same
if let Some(dup_cache) = &self.dup_cache {
if let Some(c) = dup_cache.get(&(chunk.len() as u64)) {
let hash = xxh64(chunk, 0);
if let Some(res) = c.get(&hash) {
trace!("duplicate file data found");
return Ok(res.clone());
}
chunk = chunk_reader.read_chunk()?;
}
}

// Save information needed to add to duplicate_cache later
let chunk_len = chunk.len();
let hash = xxh64(chunk, 0);

while !chunk.is_empty() {
let cb = self.kind.compress(chunk, self.fs_compressor, self.block_size)?;

// compression didn't reduce size
if cb.len() > chunk.len() {
// store uncompressed
block_sizes.push(DataSize::new_uncompressed(chunk.len() as u32));
writer.write_all(chunk)?;
} else {
// store compressed
block_sizes.push(DataSize::new_compressed(cb.len() as u32));
writer.write_all(&cb)?;
}
chunk = chunk_reader.read_chunk()?;
}

Ok((chunk_reader.file_len, Added::Data { blocks_start, block_sizes }))
// Add to duplicate information cache
let added = (chunk_reader.file_len, Added::Data { blocks_start, block_sizes });

// If duplicate file checking is enabled, then add this entry to its cache
if let Some(dup_cache) = &mut self.dup_cache {
if let Some(entry) = dup_cache.get_mut(&(chunk_len as u64)) {
entry.insert(hash, added.clone());
} else {
let mut hashmap = IntMap::default();
hashmap.insert(hash, added.clone());
dup_cache.insert(chunk_len as u64, hashmap);
}
}
Ok(added)
}

/// Compress the fragments that were under length, write to data, add to fragment table, clear
Expand All @@ -258,3 +301,46 @@ impl<'a> DataWriter<'a> {
Ok(())
}
}

#[cfg(test)]
mod tests {
use std::io::Cursor;

use super::*;
use crate::{
compression::{Compressor, DefaultCompressor},
DEFAULT_BLOCK_SIZE,
};

#[test]
#[cfg(feature = "gzip")]
fn test_duplicate_check() {
let mut data_writer = DataWriter::new(
&DefaultCompressor,
FilesystemCompressor::new(Compressor::Gzip, None).unwrap(),
DEFAULT_BLOCK_SIZE,
true,
);
let bytes = [0xff_u8; DEFAULT_BLOCK_SIZE as usize * 2];
let mut writer = Cursor::new(vec![]);
let added_1 = data_writer.add_bytes(&bytes[..], &mut writer).unwrap();
let added_2 = data_writer.add_bytes(&bytes[..], &mut writer).unwrap();
assert_eq!(added_1, added_2);
}

#[test]
#[cfg(feature = "gzip")]
fn test_no_duplicate_check() {
let mut data_writer = DataWriter::new(
&DefaultCompressor,
FilesystemCompressor::new(Compressor::Gzip, None).unwrap(),
DEFAULT_BLOCK_SIZE,
false,
);
let bytes = [0xff_u8; DEFAULT_BLOCK_SIZE as usize * 2];
let mut writer = Cursor::new(vec![]);
let added_1 = data_writer.add_bytes(&bytes[..], &mut writer).unwrap();
let added_2 = data_writer.add_bytes(&bytes[..], &mut writer).unwrap();
assert_ne!(added_1, added_2);
}
}
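The cache added above is a two-level map: the outer key is the file length, the inner key is the xxh64 hash of the contents, and the value is the (file_len, Added) pair returned to the caller. Below is a standalone sketch of that lookup-then-insert flow, with a hypothetical dedup_offset helper standing in for add_bytes and a plain u64 offset standing in for Added:

    use solana_nohash_hasher::IntMap;
    use xxhash_rust::xxh64::xxh64;

    // Hypothetical, simplified version of the dup_cache flow in DataWriter::add_bytes:
    // key by file length first, then by content hash, and reuse the previous result on a hit.
    fn dedup_offset(
        cache: &mut IntMap<u64, IntMap<u64, u64>>,
        contents: &[u8],
        new_offset: u64,
    ) -> u64 {
        let len = contents.len() as u64;
        let hash = xxh64(contents, 0);
        if let Some(by_hash) = cache.get(&len) {
            if let Some(offset) = by_hash.get(&hash) {
                // duplicate file data found: reuse the previously written blocks
                return *offset;
            }
        }
        cache.entry(len).or_default().insert(hash, new_offset);
        new_offset
    }

    fn main() {
        let mut cache = IntMap::default();
        let first = dedup_offset(&mut cache, b"same bytes", 0x100);
        let second = dedup_offset(&mut cache, b"same bytes", 0x200);
        assert_eq!(first, 0x100);
        assert_eq!(second, 0x100); // second call hits the cache
    }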
20 changes: 18 additions & 2 deletions backhand/src/filesystem/writer.rs
@@ -78,6 +78,7 @@ pub struct FilesystemWriter<'a, 'b, 'c> {
/// The log2 of the block size. If the two fields do not agree, the archive is considered corrupted.
pub(crate) block_log: u16,
pub(crate) pad_len: u32,
pub(crate) no_duplicate_files: bool,
}

impl Default for FilesystemWriter<'_, '_, '_> {
@@ -96,6 +97,7 @@ impl Default for FilesystemWriter<'_, '_, '_> {
root: Nodes::new_root(NodeHeader::default()),
block_log: (block_size as f32).log2() as u16,
pad_len: DEFAULT_PAD_LEN,
no_duplicate_files: true,
}
}
}
@@ -193,6 +195,11 @@ impl<'a, 'b, 'c> FilesystemWriter<'a, 'b, 'c> {
self.pad_len = 0;
}

/// Set whether duplicate file checking is performed (on by default)
pub fn set_no_duplicate_files(&mut self, value: bool) {
self.no_duplicate_files = value;
}

/// Inherit filesystem structure and properties from `reader`
pub fn from_fs_reader(reader: &'a FilesystemReader<'b>) -> Result<Self, BackhandError> {
let mut root: Vec<Node<_>> = reader
@@ -228,6 +235,7 @@ impl<'a, 'b, 'c> FilesystemWriter<'a, 'b, 'c> {
id_table: reader.id_table.clone(),
root: Nodes { nodes: root },
pad_len: DEFAULT_PAD_LEN,
no_duplicate_files: true,
})
}

@@ -637,6 +645,10 @@ impl<'a, 'b, 'c> FilesystemWriter<'a, 'b, 'c> {
let mut superblock =
SuperBlock::new(self.fs_compressor.id, Kind { inner: self.kind.inner.clone() });

if self.no_duplicate_files {
superblock.flags |= Flags::DataHasBeenDeduplicated as u16;
}

trace!("{:#02x?}", self.root);

// Empty Squashfs Superblock
@@ -674,8 +686,12 @@ impl<'a, 'b, 'c> FilesystemWriter<'a, 'b, 'c> {
metadata.finalize(&mut w)?;
}

let mut data_writer =
DataWriter::new(self.kind.inner.compressor, self.fs_compressor, self.block_size);
let mut data_writer = DataWriter::new(
self.kind.inner.compressor,
self.fs_compressor,
self.block_size,
self.no_duplicate_files,
);
let mut inode_writer = MetadataWriter::new(
self.fs_compressor,
self.block_size,
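On the user-facing side, duplicate-file checking is controlled by the new FilesystemWriter::set_no_duplicate_files and is enabled by default (both in Default and in from_fs_reader). A minimal usage sketch, omitting the actual file additions and final write:

    use backhand::FilesystemWriter;

    fn main() {
        // Deduplication is on by default; opting out trades a larger archive
        // for skipping the per-file hash lookups while writing.
        let mut fs = FilesystemWriter::default();
        fs.set_no_duplicate_files(false);
        // ...push files and write the image as usual (omitted here).
    }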
3 changes: 2 additions & 1 deletion backhand/src/lib.rs
@@ -92,7 +92,8 @@ pub use crate::id::Id;
pub use crate::inode::{BasicFile, Inode};
pub use crate::reader::BufReadSeek;
pub use crate::squashfs::{
Squashfs, SuperBlock, DEFAULT_BLOCK_SIZE, DEFAULT_PAD_LEN, MAX_BLOCK_SIZE, MIN_BLOCK_SIZE,
Flags, Squashfs, SuperBlock, DEFAULT_BLOCK_SIZE, DEFAULT_PAD_LEN, MAX_BLOCK_SIZE,
MIN_BLOCK_SIZE,
};

/// Support the wonderful world of vendor formats
8 changes: 4 additions & 4 deletions backhand/src/reader.rs
@@ -4,7 +4,7 @@ use std::collections::HashMap;
use std::io::{BufRead, Cursor, Read, Seek, SeekFrom, Write};

use deku::prelude::*;
use rustc_hash::FxHashMap;
use solana_nohash_hasher::IntMap;
use tracing::{error, trace};

use crate::error::BackhandError;
@@ -86,15 +86,15 @@ pub trait SquashFsReader: BufReadSeek + Sized {
&mut self,
superblock: &SuperBlock,
kind: &Kind,
) -> Result<(Inode, FxHashMap<u32, Inode>), BackhandError> {
) -> Result<(Inode, IntMap<u32, Inode>), BackhandError> {
let (map, bytes) = self.uncompress_metadatas(
superblock.inode_table,
superblock,
superblock.dir_table,
kind,
)?;

let mut inodes = FxHashMap::default();
let mut inodes = IntMap::default();
inodes.try_reserve(superblock.inode_count as usize)?;

let byte_len = bytes.len();
@@ -152,7 +152,7 @@ pub trait SquashFsReader: BufReadSeek + Sized {
superblock: &SuperBlock,
end_ptr: u64,
kind: &Kind,
) -> Result<(FxHashMap<u64, u64>, Vec<u8>), BackhandError> {
) -> Result<(IntMap<u64, u64>, Vec<u8>), BackhandError> {
self.seek(SeekFrom::Start(seek))?;
let mut map = HashMap::default();
let mut all_bytes = vec![];
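In the reader, FxHashMap is swapped for IntMap on tables keyed by inode numbers and metadata offsets; the no-hash hasher uses the integer key as the hash value, so lookups skip the hashing step FxHashMap would otherwise perform. A small sketch of the pattern, mirroring the try_reserve call above:

    use solana_nohash_hasher::IntMap;

    fn main() -> Result<(), std::collections::TryReserveError> {
        // Map of metadata offsets, keyed directly by u64 with no re-hashing.
        let mut metadata_positions: IntMap<u64, u64> = IntMap::default();
        metadata_positions.try_reserve(1024)?;
        metadata_positions.insert(0x60, 0);
        assert_eq!(metadata_positions.get(&0x60), Some(&0));
        Ok(())
    }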
