Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use inline storage for small hashes #47

Merged
merged 10 commits into from
Feb 21, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ edition = "2018"
[dependencies]
blake2b_simd = { version = "0.5.9", default-features = false }
blake2s_simd = { version = "0.5.9", default-features = false }
bytes = "0.5"
sha1 = "0.5"
sha2 = { version = "0.7", default-features = false }
tiny-keccak = "1.4"
unsigned-varint = "0.3"

[dev-dependencies]
quickcheck = "0.9.2"
67 changes: 43 additions & 24 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,22 @@

mod errors;
mod hashes;
mod storage;

use std::convert::TryFrom;
use std::fmt::Debug;
use std::hash;

use blake2b_simd::{blake2b, Params as Blake2bVariable};
use blake2s_simd::{blake2s, Params as Blake2sVariable};
use bytes::{BufMut, Bytes, BytesMut};
use sha2::Digest;
use tiny_keccak::Keccak;
use unsigned_varint::{decode, encode};

pub use errors::{DecodeError, DecodeOwnedError, EncodeError};
pub use hashes::Hash;
use std::fmt;
use storage::Storage;

// Helper macro for encoding input into output using sha1, sha2, tiny_keccak, or blake2
macro_rules! encode {
Expand Down Expand Up @@ -104,15 +108,8 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {
let code = encode::u16(hash.code(), &mut buf);
let mut len_buf = encode::u32_buffer();
let size = encode::u32(input.len() as u32, &mut len_buf);

let total_len = code.len() + size.len() + input.len();

let mut output = BytesMut::with_capacity(total_len);
output.put_slice(code);
output.put_slice(size);
output.put_slice(input);
Ok(Multihash {
bytes: output.freeze(),
storage: Storage::from_slices(&[&code, &size, &input]),
})
} else {
let (offset, mut output) = encode_hash(hash);
Expand All @@ -135,31 +132,51 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {
});

Ok(Multihash {
bytes: output.freeze(),
storage: Storage::from_slice(&output),
rklaehn marked this conversation as resolved.
Show resolved Hide resolved
})
}
}

// Encode the given [`Hash`] value and ensure the returned [`BytesMut`]
// Encode the given [`Hash`] value and ensure the returned [`Vec<u8>`]
// has enough capacity to hold the actual digest.
fn encode_hash(hash: Hash) -> (usize, BytesMut) {
fn encode_hash(hash: Hash) -> (usize, Vec<u8>) {
let mut buf = encode::u16_buffer();
let code = encode::u16(hash.code(), &mut buf);

let len = code.len() + 1 + usize::from(hash.size());

let mut output = BytesMut::with_capacity(len);
output.put_slice(code);
output.put_u8(hash.size());
let mut output = Vec::with_capacity(len);
output.extend_from_slice(code);
output.push(hash.size());
output.resize(len, 0);

(code.len() + 1, output)
}

/// Represents a valid multihash.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Clone)]
pub struct Multihash {
bytes: Bytes,
storage: Storage,
}

impl Debug for Multihash {
    /// Formats the multihash as `Multihash([b0, b1, ...])`, listing the raw bytes.
    ///
    /// Including the bytes keeps distinct multihashes distinguishable in logs and in
    /// `assert_eq!` failure messages, both of which rely on `Debug`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("Multihash")
            .field(&self.storage.bytes())
            .finish()
    }
}

/// Equality is decided purely by the byte representation held in the storage.
impl PartialEq for Multihash {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.storage.bytes(), other.storage.bytes());
        lhs == rhs
    }
}

// Marker impl: the comparison above is plain byte-slice equality.
impl Eq for Multihash {}

/// Hashes exactly the bytes returned by the storage, in step with the `PartialEq` impl.
impl hash::Hash for Multihash {
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        let bytes = self.storage.bytes();
        hash::Hash::hash(bytes, state);
    }
}

impl Multihash {
Expand All @@ -172,7 +189,7 @@ impl Multihash {
});
}
Ok(Multihash {
bytes: Bytes::from(bytes),
storage: Storage::from_slice(&bytes),
rklaehn marked this conversation as resolved.
Show resolved Hide resolved
})
}

Expand All @@ -183,17 +200,19 @@ impl Multihash {

/// Returns the bytes representation of the multihash.
pub fn to_vec(&self) -> Vec<u8> {
Vec::from(&self.bytes[..])
Vec::from(self.as_bytes())
}

/// Returns the bytes representation of this multihash.
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
self.storage.bytes()
}

/// Builds a `MultihashRef` corresponding to this `Multihash`.
pub fn as_ref(&self) -> MultihashRef {
MultihashRef { bytes: &self.bytes }
MultihashRef {
bytes: self.as_bytes(),
}
}

/// Returns which hashing algorithm is used in this multihash.
Expand All @@ -215,7 +234,7 @@ impl AsRef<[u8]> for Multihash {

impl<'a> PartialEq<MultihashRef<'a>> for Multihash {
fn eq(&self, other: &MultihashRef<'a>) -> bool {
&*self.bytes == other.bytes
&*self.as_bytes() == other.as_bytes()
}
}

Expand Down Expand Up @@ -290,7 +309,7 @@ impl<'a> MultihashRef<'a> {
/// This operation allocates.
pub fn to_owned(&self) -> Multihash {
Multihash {
bytes: Bytes::copy_from_slice(self.bytes),
storage: Storage::from_slice(self.bytes),
}
}

Expand All @@ -302,7 +321,7 @@ impl<'a> MultihashRef<'a> {

impl<'a> PartialEq<Multihash> for MultihashRef<'a> {
fn eq(&self, other: &Multihash) -> bool {
self.bytes == &*other.bytes
self.as_bytes() == &*other.as_bytes()
}
}

Expand Down
107 changes: 107 additions & 0 deletions src/storage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
use std::sync::Arc;

/// MAX_INLINE is the maximum size, in bytes, of a multihash that can be stored inline.
///
/// We want the currently most common multihashes, those using 256-bit hashes, to be stored
/// inline. These multihashes are 34 bytes long. An inline capacity of 38 bytes seems like a good
/// compromise: it allows storing any 256-bit hash with some room to spare and gives an overall
/// size for Storage of 40 bytes, which is a multiple of 8. The 2 extra bytes are one for the
/// stored length and one for the enum discriminator.
const MAX_INLINE: usize = 38;
rklaehn marked this conversation as resolved.
Show resolved Hide resolved

/// Backing storage for the bytes of a multihash: either a fixed-size inline array or a
/// reference-counted slice on the heap.
#[derive(Clone)]
pub(crate) enum Storage {
/// The hash is stored inline; used when it is at most MAX_INLINE bytes long. The `u8` is the
/// number of leading bytes of the array that are in use.
Inline(u8, [u8; MAX_INLINE]),
/// The hash is stored on the heap. This must only be used if the hash is actually larger than
/// MAX_INLINE bytes, to ensure a unique representation.
Heap(Arc<[u8]>),
}

impl Storage {
    /// Borrows the stored bytes, wherever they are kept.
    pub fn bytes(&self) -> &[u8] {
        match self {
            Storage::Heap(shared) => &shared[..],
            Storage::Inline(used, buffer) => {
                // Only the first `used` bytes of the inline buffer belong to the value.
                let used = usize::from(*used);
                &buffer[..used]
            }
        }
    }

    /// Creates storage holding a copy of `slice`.
    ///
    /// Inputs of up to MAX_INLINE bytes are copied into the inline buffer and do not allocate;
    /// longer inputs are copied to the heap.
    pub fn from_slice(slice: &[u8]) -> Self {
        if slice.len() > MAX_INLINE {
            return Storage::Heap(slice.into());
        }
        let mut buffer = [0u8; MAX_INLINE];
        buffer[..slice.len()].copy_from_slice(slice);
        // The guard above bounds the length by MAX_INLINE, so it fits in a u8.
        Storage::Inline(slice.len() as u8, buffer)
    }

    /// Creates storage holding the concatenation of `slices`, in order.
    ///
    /// A combined length of up to MAX_INLINE bytes is stored inline and does not allocate;
    /// anything longer is assembled on the heap.
    pub fn from_slices(slices: &[&[u8]]) -> Self {
        let total = slices
            .iter()
            .map(|part| part.len())
            .fold(0usize, usize::saturating_add);

        if total > MAX_INLINE {
            let mut joined = Vec::with_capacity(total);
            for part in slices.iter().copied() {
                joined.extend_from_slice(part);
            }
            return Storage::Heap(joined.into());
        }

        // Copy each part behind the previous one; `total <= MAX_INLINE` keeps every range in
        // bounds.
        let mut buffer = [0u8; MAX_INLINE];
        let mut offset = 0;
        for part in slices.iter().copied() {
            let end = offset + part.len();
            buffer[offset..end].copy_from_slice(part);
            offset = end;
        }
        Storage::Inline(total as u8, buffer)
    }
}

#[cfg(test)]
mod tests {
    use super::{Storage, MAX_INLINE};
    use quickcheck::quickcheck;

    /// The in-memory size of `Storage` is pinned to 40 bytes.
    #[test]
    fn struct_size() {
        // this should be true for both 32 and 64 bit archs
        let actual = std::mem::size_of::<Storage>();
        assert_eq!(actual, 40);
    }

    /// `.bytes()` must hand back exactly what the storage was built from, for lengths on both
    /// sides of the inline limit.
    #[test]
    fn roundtrip() {
        let upper = (MAX_INLINE + 10) as u8;
        (0..upper).for_each(|len| {
            let data: Vec<u8> = (0..len).collect();
            let storage = Storage::from_slice(&data);
            assert_eq!(data, storage.bytes());
        });
    }

    /// Inline values never claim more than MAX_INLINE bytes; heap values are always longer.
    fn check_invariants(storage: Storage) -> bool {
        match &storage {
            Storage::Heap(shared) => shared.len() > MAX_INLINE,
            Storage::Inline(used, _) => usize::from(*used) <= MAX_INLINE,
        }
    }

    quickcheck! {
        fn roundtrip_check(data: Vec<u8>) -> bool {
            let storage = Storage::from_slice(&data);
            let same_bytes = storage.bytes() == data.as_slice();
            same_bytes && check_invariants(storage)
        }

        fn from_slices_roundtrip_check(data: Vec<Vec<u8>>) -> bool {
            let slices: Vec<&[u8]> = data.iter().map(Vec::as_slice).collect();
            let expected = data.concat();
            let storage = Storage::from_slices(&slices);
            let same_bytes = storage.bytes() == expected.as_slice();
            same_bytes && check_invariants(storage)
        }
    }
}