bozaro · thoren-d · Mar 31, 2020
diff --git a/lz4-sys/src/lib.rs b/lz4-sys/src/lib.rs
@@ -88,6 +88,12 @@ pub struct LZ4StreamEncode(c_void);
 #[repr(C)]
 pub struct LZ4StreamDecode(c_void);
 
+#[derive(Clone, Copy)]
+#[repr(C)]
+pub struct LZ4FCDict(pub *mut c_void); // raw dictionary handle, as returned by LZ4F_createCDict
+unsafe impl Send for LZ4FCDict {} // relies on the upstream note that a CDict is read-only once created
+unsafe impl Sync for LZ4FCDict {} // same reasoning: shared use by several threads is documented as allowed
+
 pub const LZ4F_VERSION: c_uint = 100;
 
 extern "C" {
@@ -136,6 +142,16 @@ extern "C" {
     //                                  LZ4F_compressionContext_t LZ4F_compressionContext);
     pub fn LZ4F_freeCompressionContext(ctx: LZ4FCompressionContext) -> LZ4FErrorCode;
 
+    // LZ4F_createCDict() :
+    // When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+    // LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+    // LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+    // `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict.
+    // LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
+    // void        LZ4F_freeCDict(LZ4F_CDict* CDict);
+    pub fn LZ4F_createCDict(dictBuffer: *const u8, dictSize: size_t) -> LZ4FCDict;
+    pub fn LZ4F_freeCDict(CDict: LZ4FCDict);
+
     // LZ4F_compressBegin() :
     // will write the frame header into dstBuffer.
     // dstBuffer must be large enough to accommodate a header (dstMaxSize). Maximum header
@@ -155,6 +171,25 @@ extern "C" {
                               preferencesPtr: *const LZ4FPreferences)
                               -> LZ4FErrorCode;
 
+    // LZ4F_compressBegin_usingCDict() :
+    // Inits streaming dictionary compression, and writes the frame header into dstBuffer.
+    // dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+    // `prefsPtr` is optional : you may provide NULL as argument,
+    // however, it's the only way to provide dictID in the frame header.
+    // @return : number of bytes written into dstBuffer for the header,
+    //           or an error code (which can be tested using LZ4F_isError())
+    // size_t LZ4F_compressBegin_usingCDict(
+    //         LZ4F_cctx* cctx,
+    //         void* dstBuffer, size_t dstCapacity,
+    //         const LZ4F_CDict* cdict,
+    //         const LZ4F_preferences_t* prefsPtr);
+    pub fn LZ4F_compressBegin_usingCDict(ctx: LZ4FCompressionContext,
+                                         dstBuffer: *mut u8,
+                                         dstCapacity: size_t,
+                                         cdict: LZ4FCDict,
+                                         prefsPtr: *const LZ4FPreferences)
+                                         -> LZ4FErrorCode;
+
     // LZ4F_compressBound() :
     // Provides the minimum size of Dst buffer given srcSize to handle worst case situations.
     // preferencesPtr is optional : you can provide NULL as argument, all preferences will then
@@ -317,6 +352,26 @@ extern "C" {
                            optionsPtr: *const LZ4FDecompressOptions)
                            -> LZ4FErrorCode;
 
+    // LZ4F_decompress_usingDict() :
+    // Same as LZ4F_decompress(), using a predefined dictionary.
+    // Dictionary is used "in place", without any preprocessing.
+    // It must remain accessible throughout the entire frame decoding.
+    // size_t LZ4F_decompress_usingDict(
+    //         LZ4F_dctx* dctxPtr,
+    //         void* dstBuffer, size_t* dstSizePtr,
+    //         const void* srcBuffer, size_t* srcSizePtr,
+    //         const void* dict, size_t dictSize,
+    //         const LZ4F_decompressOptions_t* decompressOptionsPtr);
+    pub fn LZ4F_decompress_usingDict(ctx: LZ4FDecompressionContext,
+                                     dstBuffer: *mut u8,
+                                     dstSizePtr: &mut size_t,
+                                     srcBuffer: *const u8,
+                                     srcSizePtr: &mut size_t,
+                                     dict: *const u8,
+                                     dictSize: size_t,
+                                     optionsPtr: *const LZ4FDecompressOptions)
+                                     -> LZ4FErrorCode;
+
     // int LZ4_versionNumber(void)
     pub fn LZ4_versionNumber() -> c_int;
 

diff --git a/src/decoder.rs b/src/decoder.rs
@@ -9,20 +9,21 @@ struct DecoderContext {
     c: LZ4FDecompressionContext,
 }
 
-pub struct Decoder<R> {
+pub struct Decoder<'a, R> {
     c: DecoderContext,
     r: R,
     buf: Box<[u8]>,
     pos: usize,
     len: usize,
     next: usize,
+    dict: Option<&'a [u8]>, // Some: read() calls LZ4F_decompress_usingDict with it; None: plain LZ4F_decompress
 }
 
-impl<R: Read> Decoder<R> {
+impl<'a, R: Read> Decoder<'a, R> {
     /// Creates a new encoder which will have its output written to the given
     /// output stream. The output stream can be re-acquired by calling
     /// `finish()`
-    pub fn new(r: R) -> Result<Decoder<R>> {
+    pub fn new(r: R) -> Result<Decoder<'a, R>> {
         Ok(Decoder {
             r: r,
             c: try!(DecoderContext::new()),
@@ -31,6 +32,22 @@ impl<R: Read> Decoder<R> {
             len: BUFFER_SIZE,
             // Minimal LZ4 stream size
             next: 11,
+            dict: None,
+        })
+    }
+
+    /// Builds a decoder that decompresses with `dict` as the predefined
+    /// dictionary; it should be the dictionary the data was encoded with.
+    pub fn with_dictionary(r: R, dict: &'a [u8]) -> Result<Decoder<'a, R>> {
+        let context = try!(DecoderContext::new());
+        Ok(Decoder {
+            r,
+            c: context,
+            buf: vec![0; BUFFER_SIZE].into_boxed_slice(),
+            pos: BUFFER_SIZE,
+            len: BUFFER_SIZE,
+            next: 11, // Minimal LZ4 stream size
+            dict: Some(dict),
         })
     }
 
@@ -53,7 +70,7 @@ impl<R: Read> Decoder<R> {
     }
 }
 
-impl<R: Read> Read for Decoder<R> {
+impl<'a, R: Read> Read for Decoder<'a, R> {
     fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
         if self.next == 0 || buf.len() == 0 {
             return Ok(0);
@@ -76,14 +93,26 @@ impl<R: Read> Read for Decoder<R> {
                 let mut src_size = (self.len - self.pos) as size_t;
                 let mut dst_size = (buf.len() - dst_offset) as size_t;
                 let len = try!(check_error(unsafe {
-                    LZ4F_decompress(
-                        self.c.c,
-                        buf[dst_offset..].as_mut_ptr(),
-                        &mut dst_size,
-                        self.buf[self.pos..].as_ptr(),
-                        &mut src_size,
-                        ptr::null(),
-                    )
+                    match self.dict {
+                        Some(dict) => LZ4F_decompress_usingDict(
+                            self.c.c,
+                            buf[dst_offset..].as_mut_ptr(),
+                            &mut dst_size,
+                            self.buf[self.pos..].as_ptr(),
+                            &mut src_size,
+                            dict.as_ptr(),
+                            dict.len(),
+                            ptr::null(),
+                        ),
+                        None => LZ4F_decompress(
+                            self.c.c,
+                            buf[dst_offset..].as_mut_ptr(),
+                            &mut dst_size,
+                            self.buf[self.pos..].as_ptr(),
+                            &mut src_size,
+                            ptr::null(),
+                        ),
+                    }
                 }));
                 self.pos += src_size as usize;
                 dst_offset += dst_size as usize;
@@ -119,9 +148,9 @@ impl Drop for DecoderContext {
 mod test {
     extern crate rand;
 
-    use self::rand::Rng;
     use self::rand::rngs::StdRng;
-    use super::super::encoder::{Encoder, EncoderBuilder};
+    use self::rand::Rng;
+    use super::super::encoder::{Encoder, EncoderBuilder, EncoderDictionary};
     use super::Decoder;
     use std::io::{Cursor, Error, ErrorKind, Read, Result, Write};
 
@@ -289,6 +318,29 @@ mod test {
         finish_decode(decoder);
     }
 
+    /// Round-trips a payload, split across two writes, through an encoder and
+    /// a decoder that are both given the same dictionary bytes.
+    #[test]
+    fn test_decoder_smoke_dictionary() {
+        let dict_data = b"dictionary with some data";
+        let dict = EncoderDictionary::new(dict_data).unwrap();
+        let mut encoder = EncoderBuilder::new()
+            .level(1)
+            .dictionary(&dict)
+            .build(Vec::new())
+            .unwrap();
+        let expected = b"some data".to_vec();
+        // write_all, not write: a bare write() may accept only part of the slice.
+        encoder.write_all(&expected[..4]).unwrap();
+        encoder.write_all(&expected[4..]).unwrap();
+        let buffer = finish_encode(encoder);
+
+        let mut decoder = Decoder::with_dictionary(Cursor::new(buffer), dict_data).unwrap();
+        let mut actual = Vec::new();
+        decoder.read_to_end(&mut actual).unwrap();
+        assert_eq!(expected, actual);
+        finish_decode(decoder);
+    }
+
     fn random() -> StdRng {
         let seed: [u8; 32] = [
             157, 164, 190, 237, 231, 103, 60, 22, 197, 108, 51, 176, 30, 170, 155, 21, 163, 249,

diff --git a/src/encoder.rs b/src/encoder.rs
@@ -3,21 +3,27 @@ use libc::size_t;
 use std::cmp;
 use std::io::Result;
 use std::io::Write;
+use std::io::{Error, ErrorKind};
 use std::ptr;
 
 struct EncoderContext {
     c: LZ4FCompressionContext,
 }
 
+pub struct EncoderDictionary {
+    cdict: LZ4FCDict, // handle obtained from LZ4F_createCDict; released with LZ4F_freeCDict on drop
+}
+
 #[derive(Clone)]
-pub struct EncoderBuilder {
+pub struct EncoderBuilder<'a> {
     block_size: BlockSize,
     block_mode: BlockMode,
     checksum: ContentChecksum,
     // 0 == default (fast mode); values above 16 count as 16; values below 0 count as 0
     level: u32,
     // 1 == always flush (reduce need for tmp buffer)
     auto_flush: bool,
+    dictionary: Option<&'a EncoderDictionary>, // when Some, build() starts the frame via write_header_with_dict
 }
 
 pub struct Encoder<W> {
@@ -27,14 +33,38 @@ pub struct Encoder<W> {
     buffer: Vec<u8>,
 }
 
-impl EncoderBuilder {
+impl EncoderDictionary {
+    /// Digests `dict` into an LZ4F compression dictionary.
+    ///
+    /// Returns an `ErrorKind::Other` error if the library hands back a null handle.
+    pub fn new(dict: &[u8]) -> Result<EncoderDictionary> {
+        // SAFETY: the pointer and the length describe the same live slice.
+        let cdict = unsafe { LZ4F_createCDict(dict.as_ptr(), dict.len()) };
+        if !cdict.0.is_null() {
+            return Ok(EncoderDictionary { cdict });
+        }
+        let reason = String::from("Failed to create CDict.");
+        Err(Error::new(ErrorKind::Other, LZ4Error::new(reason)))
+    }
+}
+
+impl Drop for EncoderDictionary {
+    /// Returns the dictionary handle to the library via `LZ4F_freeCDict`.
+    fn drop(&mut self) {
+        // SAFETY: the only constructor in view, `new`, rejects null handles.
+        unsafe { LZ4F_freeCDict(self.cdict) };
+    }
+}
+
+impl<'a> EncoderBuilder<'a> {
     pub fn new() -> Self {
         EncoderBuilder {
             block_size: BlockSize::Default,
             block_mode: BlockMode::Linked,
             checksum: ContentChecksum::ChecksumEnabled,
             level: 0,
             auto_flush: false,
+            dictionary: None, // no dictionary until dictionary() supplies one
         }
     }
 
@@ -63,6 +93,11 @@ impl EncoderBuilder {
         self
     }
 
+    pub fn dictionary(&mut self, dictionary: &'a EncoderDictionary) -> &mut Self {
+        self.dictionary = Some(dictionary); // build() checks this to choose the header-writing call
+        self
+    }
+
     pub fn build<W: Write>(&self, w: W) -> Result<Encoder<W>> {
         let block_size = self.block_size.get_size();
         let preferences = LZ4FPreferences {
@@ -87,7 +122,15 @@ impl EncoderBuilder {
                 LZ4F_compressBound(block_size as size_t, &preferences)
             }))),
         };
-        try!(encoder.write_header(&preferences));
+        match &self.dictionary {
+            Some(dict) => {
+                try!(encoder.write_header_with_dict(&preferences, dict));
+            }
+            None => {
+                try!(encoder.write_header(&preferences));
+            }
+        }
+
         Ok(encoder)
     }
 }
@@ -106,6 +149,24 @@ impl<W: Write> Encoder<W> {
         self.w.write_all(&self.buffer)
     }
 
+    /// Begins a dictionary-compressed frame: LZ4F writes the frame header into
+    /// `self.buffer`, whose contents are then passed to the inner writer.
+    fn write_header_with_dict(
+        &mut self,
+        preferences: &LZ4FPreferences,
+        dict: &EncoderDictionary,
+    ) -> Result<()> {
+        let (dst, cap) = (self.buffer.as_mut_ptr(), self.buffer.capacity() as size_t);
+        unsafe {
+            let status =
+                LZ4F_compressBegin_usingCDict(self.c.c, dst, cap, dict.cdict, preferences);
+            // On success the status value is the header length in bytes.
+            let header_len = try!(check_error(status));
+            self.buffer.set_len(header_len);
+        }
+        self.w.write_all(&self.buffer)
+    }
+
     fn write_end(&mut self) -> Result<()> {
         unsafe {
             let len = try!(check_error(LZ4F_compressEnd(
@@ -194,6 +255,7 @@ impl Drop for EncoderContext {
 #[cfg(test)]
 mod test {
     use super::EncoderBuilder;
+    use super::EncoderDictionary;
     use std::io::Write;
 
     #[test]
@@ -225,4 +287,19 @@ mod test {
         let enc = EncoderBuilder::new().build(Vec::new());
         check_send(&enc);
     }
+
+    /// Checks that an encoder built with a dictionary accepts data and finishes.
+    #[test]
+    fn test_encoder_dictionary() {
+        let dictionary = EncoderDictionary::new(b"dictionary").unwrap();
+        let mut encoder = EncoderBuilder::new()
+            .level(1)
+            .dictionary(&dictionary)
+            .build(Vec::new())
+            .unwrap();
+        let payload = b"dictionary compression is good for small files";
+        encoder.write_all(payload).unwrap(); // write() alone could stop short
+        let (_, result) = encoder.finish();
+        result.unwrap();
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -11,6 +11,7 @@ pub mod block;
 pub use decoder::Decoder;
 pub use encoder::Encoder;
 pub use encoder::EncoderBuilder;
+pub use encoder::EncoderDictionary;
 pub use liblz4::version;
 pub use liblz4::BlockMode;
 pub use liblz4::BlockSize;

diff --git a/src/liblz4.rs b/src/liblz4.rs
@@ -8,7 +8,13 @@ use std::str;
 pub use lz4_sys::*;
 
 #[derive(Debug)]
-pub struct LZ4Error(String);
+pub struct LZ4Error(pub String); // NOTE(review): field is now public as well as having new(); confirm both are wanted
+
+impl LZ4Error {
+    pub fn new(reason: String) -> LZ4Error {
+        LZ4Error(reason) // `reason` is stored as-is in the tuple field
+    }
+}
 
 impl Display for LZ4Error {
     fn fmt(&self, f: &mut Formatter) -> Result<(), ::std::fmt::Error> {