Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for dictionary compression. #51

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions lz4-sys/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ pub struct LZ4StreamEncode(c_void);
#[repr(C)]
pub struct LZ4StreamDecode(c_void);

/// Opaque handle to a digested LZ4 frame compression dictionary.
///
/// Values of this type are returned by `LZ4F_createCDict` and released with
/// `LZ4F_freeCDict`. The type is `Copy`, so copying it duplicates only the raw
/// pointer, never the dictionary it points to.
#[derive(Clone, Copy)]
#[repr(C)]
pub struct LZ4FCDict(pub *mut c_void);
// SAFETY: the liblz4 notes quoted beside the `LZ4F_createCDict` declaration state
// that a CDict can be created once and shared by multiple threads concurrently,
// since its usage is read-only.
unsafe impl Send for LZ4FCDict {}
unsafe impl Sync for LZ4FCDict {}

pub const LZ4F_VERSION: c_uint = 100;

extern "C" {
Expand Down Expand Up @@ -136,6 +142,16 @@ extern "C" {
// LZ4F_compressionContext_t LZ4F_compressionContext);
pub fn LZ4F_freeCompressionContext(ctx: LZ4FCompressionContext) -> LZ4FErrorCode;

// LZ4F_createCDict() :
// When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
// LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
// LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
// `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict.
// LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
// void LZ4F_freeCDict(LZ4F_CDict* CDict);
pub fn LZ4F_createCDict(dictBuffer: *const u8, dictSize: size_t) -> LZ4FCDict;
pub fn LZ4F_freeCDict(CDict: LZ4FCDict);

// LZ4F_compressBegin() :
// will write the frame header into dstBuffer.
// dstBuffer must be large enough to accommodate a header (dstMaxSize). Maximum header
Expand All @@ -155,6 +171,25 @@ extern "C" {
preferencesPtr: *const LZ4FPreferences)
-> LZ4FErrorCode;

// LZ4F_compressBegin_usingCDict() :
// Inits streaming dictionary compression, and writes the frame header into dstBuffer.
// dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
// `prefsPtr` is optional : you may provide NULL as argument,
// however, it's the only way to provide dictID in the frame header.
// @return : number of bytes written into dstBuffer for the header,
// or an error code (which can be tested using LZ4F_isError())
// size_t LZ4F_compressBegin_usingCDict(
// LZ4F_cctx* cctx,
// void* dstBuffer, size_t dstCapacity,
// const LZ4F_CDict* cdict,
// const LZ4F_preferences_t* prefsPtr);
pub fn LZ4F_compressBegin_usingCDict(ctx: LZ4FCompressionContext,
dstBuffer: *mut u8,
dstCapacity: size_t,
cdict: LZ4FCDict,
prefsPtr: *const LZ4FPreferences)
-> LZ4FErrorCode;

// LZ4F_compressBound() :
// Provides the minimum size of Dst buffer given srcSize to handle worst case situations.
// preferencesPtr is optional : you can provide NULL as argument, all preferences will then
Expand Down Expand Up @@ -317,6 +352,26 @@ extern "C" {
optionsPtr: *const LZ4FDecompressOptions)
-> LZ4FErrorCode;

// LZ4F_decompress_usingDict() :
// Same as LZ4F_decompress(), using a predefined dictionary.
// Dictionary is used "in place", without any preprocessing.
// It must remain accessible throughout the entire frame decoding.
// size_t LZ4F_decompress_usingDict(
// LZ4F_dctx* dctxPtr,
// void* dstBuffer, size_t* dstSizePtr,
// const void* srcBuffer, size_t* srcSizePtr,
// const void* dict, size_t dictSize,
// const LZ4F_decompressOptions_t* decompressOptionsPtr);
pub fn LZ4F_decompress_usingDict(ctx: LZ4FDecompressionContext,
dstBuffer: *mut u8,
dstSizePtr: &mut size_t,
srcBuffer: *const u8,
srcSizePtr: &mut size_t,
dict: *const u8,
dictSize: size_t,
optionsPtr: *const LZ4FDecompressOptions)
-> LZ4FErrorCode;

// int LZ4_versionNumber(void)
pub fn LZ4_versionNumber() -> c_int;

Expand Down
80 changes: 66 additions & 14 deletions src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,21 @@ struct DecoderContext {
c: LZ4FDecompressionContext,
}

pub struct Decoder<R> {
pub struct Decoder<'a, R> {
c: DecoderContext,
r: R,
buf: Box<[u8]>,
pos: usize,
len: usize,
next: usize,
dict: Option<&'a [u8]>,
}

impl<R: Read> Decoder<R> {
impl<'a, R: Read> Decoder<'a, R> {
/// Creates a new decoder which will read its compressed input from the given
/// input stream. The input stream can be re-acquired by calling
/// `finish()`
pub fn new(r: R) -> Result<Decoder<R>> {
pub fn new(r: R) -> Result<Decoder<'a, R>> {
Ok(Decoder {
r: r,
c: try!(DecoderContext::new()),
Expand All @@ -31,6 +32,22 @@ impl<R: Read> Decoder<R> {
len: BUFFER_SIZE,
// Minimal LZ4 stream size
next: 11,
dict: None,
})
}

/// Creates a new decoder with a dictionary, which should be the one which
/// was used in encoding.
pub fn with_dictionary(r: R, dict: &'a [u8]) -> Result<Decoder<'a, R>> {
Ok(Decoder {
r: r,
c: try!(DecoderContext::new()),
buf: vec![0; BUFFER_SIZE].into_boxed_slice(),
pos: BUFFER_SIZE,
len: BUFFER_SIZE,
// Minimal LZ4 stream size
next: 11,
dict: Some(dict),
})
}

Expand All @@ -53,7 +70,7 @@ impl<R: Read> Decoder<R> {
}
}

impl<R: Read> Read for Decoder<R> {
impl<'a, R: Read> Read for Decoder<'a, R> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
if self.next == 0 || buf.len() == 0 {
return Ok(0);
Expand All @@ -76,14 +93,26 @@ impl<R: Read> Read for Decoder<R> {
let mut src_size = (self.len - self.pos) as size_t;
let mut dst_size = (buf.len() - dst_offset) as size_t;
let len = try!(check_error(unsafe {
LZ4F_decompress(
self.c.c,
buf[dst_offset..].as_mut_ptr(),
&mut dst_size,
self.buf[self.pos..].as_ptr(),
&mut src_size,
ptr::null(),
)
match self.dict {
Some(dict) => LZ4F_decompress_usingDict(
self.c.c,
buf[dst_offset..].as_mut_ptr(),
&mut dst_size,
self.buf[self.pos..].as_ptr(),
&mut src_size,
dict.as_ptr(),
dict.len(),
ptr::null(),
),
None => LZ4F_decompress(
self.c.c,
buf[dst_offset..].as_mut_ptr(),
&mut dst_size,
self.buf[self.pos..].as_ptr(),
&mut src_size,
ptr::null(),
),
}
}));
self.pos += src_size as usize;
dst_offset += dst_size as usize;
Expand Down Expand Up @@ -119,9 +148,9 @@ impl Drop for DecoderContext {
mod test {
extern crate rand;

use self::rand::Rng;
use self::rand::rngs::StdRng;
use super::super::encoder::{Encoder, EncoderBuilder};
use self::rand::Rng;
use super::super::encoder::{Encoder, EncoderBuilder, EncoderDictionary};
use super::Decoder;
use std::io::{Cursor, Error, ErrorKind, Read, Result, Write};

Expand Down Expand Up @@ -289,6 +318,29 @@ mod test {
finish_decode(decoder);
}

/// Round-trips a short payload through an encoder and a decoder that share
/// the same dictionary and checks that the decoded bytes match the input.
#[test]
fn test_decoder_smoke_dictionary() {
    let dict_data = b"dictionary with some data";
    let dict = EncoderDictionary::new(dict_data).unwrap();
    let mut encoder = EncoderBuilder::new()
        .level(1)
        .dictionary(&dict)
        .build(Vec::new())
        .unwrap();

    let expected = b"some data".to_vec();
    // `write_all` rather than `write`: `write` may accept only part of the
    // slice and the returned count was being discarded. Feed the payload in
    // two pieces to exercise more than one write call.
    encoder.write_all(&expected[..4]).unwrap();
    encoder.write_all(&expected[4..]).unwrap();
    let buffer = finish_encode(encoder);

    let mut decoder = Decoder::with_dictionary(Cursor::new(buffer), dict_data).unwrap();
    let mut actual = Vec::new();

    decoder.read_to_end(&mut actual).unwrap();
    assert_eq!(expected, actual);
    finish_decode(decoder);
}

fn random() -> StdRng {
let seed: [u8; 32] = [
157, 164, 190, 237, 231, 103, 60, 22, 197, 108, 51, 176, 30, 170, 155, 21, 163, 249,
Expand Down
83 changes: 80 additions & 3 deletions src/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,27 @@ use libc::size_t;
use std::cmp;
use std::io::Result;
use std::io::Write;
use std::io::{Error, ErrorKind};
use std::ptr;

struct EncoderContext {
c: LZ4FCompressionContext,
}

/// A digested compression dictionary that can be handed to
/// `EncoderBuilder::dictionary`.
///
/// The wrapped handle is created with `LZ4F_createCDict` in
/// `EncoderDictionary::new` and released with `LZ4F_freeCDict` when the value
/// is dropped.
pub struct EncoderDictionary {
// Native dictionary handle owned by this value.
cdict: LZ4FCDict,
}

#[derive(Clone)]
pub struct EncoderBuilder {
pub struct EncoderBuilder<'a> {
block_size: BlockSize,
block_mode: BlockMode,
checksum: ContentChecksum,
// 0 == default (fast mode); values above 16 count as 16; values below 0 count as 0
level: u32,
// 1 == always flush (reduce need for tmp buffer)
auto_flush: bool,
dictionary: Option<&'a EncoderDictionary>,
}

pub struct Encoder<W> {
Expand All @@ -27,14 +33,38 @@ pub struct Encoder<W> {
buffer: Vec<u8>,
}

impl EncoderBuilder {
impl EncoderDictionary {
    /// Builds a digested dictionary from the raw bytes in `dict`.
    ///
    /// # Errors
    ///
    /// Returns an `ErrorKind::Other` error wrapping an `LZ4Error` when
    /// `LZ4F_createCDict` hands back a null handle.
    pub fn new(dict: &[u8]) -> Result<EncoderDictionary> {
        // SAFETY: `dict.as_ptr()` is valid for reads of `dict.len()` bytes for
        // the whole call. Per the liblz4 notes quoted in lz4-sys, the content
        // is copied into the CDict, so the slice need not outlive it.
        let handle = unsafe { LZ4F_createCDict(dict.as_ptr(), dict.len()) };

        if !handle.0.is_null() {
            return Ok(EncoderDictionary { cdict: handle });
        }

        let reason = LZ4Error::new(String::from("Failed to create CDict."));
        Err(Error::new(ErrorKind::Other, reason))
    }
}

impl Drop for EncoderDictionary {
    /// Hands the native dictionary back to liblz4 via `LZ4F_freeCDict`.
    fn drop(&mut self) {
        let handle = self.cdict;
        // SAFETY: `handle` is the value stored in this `EncoderDictionary`;
        // `drop` runs at most once per value, so the handle is freed exactly
        // once through this path.
        unsafe { LZ4F_freeCDict(handle) };
    }
}

impl<'a> EncoderBuilder<'a> {
pub fn new() -> Self {
EncoderBuilder {
block_size: BlockSize::Default,
block_mode: BlockMode::Linked,
checksum: ContentChecksum::ChecksumEnabled,
level: 0,
auto_flush: false,
dictionary: None,
}
}

Expand Down Expand Up @@ -63,6 +93,11 @@ impl EncoderBuilder {
self
}

/// Selects the dictionary that encoders created by `build` will use.
///
/// The builder only stores a reference, so `dictionary` must outlive the
/// builder (lifetime `'a`). Returns `self` so calls can be chained.
pub fn dictionary(&mut self, dictionary: &'a EncoderDictionary) -> &mut Self {
self.dictionary = Some(dictionary);
self
}

pub fn build<W: Write>(&self, w: W) -> Result<Encoder<W>> {
let block_size = self.block_size.get_size();
let preferences = LZ4FPreferences {
Expand All @@ -87,7 +122,15 @@ impl EncoderBuilder {
LZ4F_compressBound(block_size as size_t, &preferences)
}))),
};
try!(encoder.write_header(&preferences));
match &self.dictionary {
Some(dict) => {
try!(encoder.write_header_with_dict(&preferences, dict));
}
None => {
try!(encoder.write_header(&preferences));
}
}

Ok(encoder)
}
}
Expand All @@ -106,6 +149,24 @@ impl<W: Write> Encoder<W> {
self.w.write_all(&self.buffer)
}

fn write_header_with_dict(
&mut self,
preferences: &LZ4FPreferences,
dict: &EncoderDictionary,
) -> Result<()> {
unsafe {
let len = try!(check_error(LZ4F_compressBegin_usingCDict(
self.c.c,
self.buffer.as_mut_ptr(),
self.buffer.capacity() as size_t,
dict.cdict,
preferences
)));
self.buffer.set_len(len);
}
self.w.write_all(&self.buffer)
}

fn write_end(&mut self) -> Result<()> {
unsafe {
let len = try!(check_error(LZ4F_compressEnd(
Expand Down Expand Up @@ -194,6 +255,7 @@ impl Drop for EncoderContext {
#[cfg(test)]
mod test {
use super::EncoderBuilder;
use super::EncoderDictionary;
use std::io::Write;

#[test]
Expand Down Expand Up @@ -225,4 +287,19 @@ mod test {
let enc = EncoderBuilder::new().build(Vec::new());
check_send(&enc);
}

/// Checks that an encoder configured with a dictionary accepts input and
/// finishes without reporting an error.
#[test]
fn test_encoder_dictionary() {
    let dictionary = EncoderDictionary::new(b"dictionary").unwrap();
    let mut encoder = EncoderBuilder::new()
        .level(1)
        .dictionary(&dictionary)
        .build(Vec::new())
        .unwrap();
    // `write_all` rather than `write`: `write` may accept only part of the
    // slice and the returned count was being discarded.
    encoder
        .write_all(b"dictionary compression is good for small files")
        .unwrap();
    let (_, result) = encoder.finish();
    result.unwrap();
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod block;
pub use decoder::Decoder;
pub use encoder::Encoder;
pub use encoder::EncoderBuilder;
pub use encoder::EncoderDictionary;
pub use liblz4::version;
pub use liblz4::BlockMode;
pub use liblz4::BlockSize;
Expand Down
8 changes: 7 additions & 1 deletion src/liblz4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ use std::str;
pub use lz4_sys::*;

#[derive(Debug)]
pub struct LZ4Error(String);
pub struct LZ4Error(pub String);

impl LZ4Error {
/// Creates an `LZ4Error` that carries `reason` as its message.
pub fn new(reason: String) -> LZ4Error {
LZ4Error(reason)
}
}

impl Display for LZ4Error {
fn fmt(&self, f: &mut Formatter) -> Result<(), ::std::fmt::Error> {
Expand Down