diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index ac6cfb3..76e5fdd 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -36,7 +36,7 @@ jobs:
         target:
           - aarch64-unknown-linux-gnu
           - i686-unknown-linux-gnu
-          - mips64-unknown-linux-gnuabi64
+          - powerpc-unknown-linux-gnu
     steps:
       - uses: actions/checkout@v1
       - uses: actions-rs/toolchain@v1
@@ -90,7 +90,7 @@ jobs:
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: 1.54.0
+          toolchain: 1.66.0
           override: true
       - run: rustup component add clippy
      - uses: actions-rs/cargo@v1
diff --git a/.gitignore b/.gitignore
index c452573..1f076f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .*
 *.bk
 target
+/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
index b5e028b..38dea09 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,8 @@
 [package]
 name = "finalfusion"
-version = "0.17.1"
-edition = "2018"
-authors = ["Daniël de Kok ", "Sebastian Pütz "]
+version = "0.18.0"
+edition = "2021"
+rust-version = "1.66"
 description = "Reader and writer for common word embedding formats"
 documentation = "https://docs.rs/finalfusion/"
 keywords = ["embeddings", "word2vec", "glove", "finalfusion", "fasttext"]
@@ -10,28 +10,25 @@ homepage = "https://github.com/finalfusion/finalfusion-rust"
 repository = "https://github.com/finalfusion/finalfusion-rust"
 license = "MIT OR Apache-2.0"
 readme = "README.md"
-exclude = [
-    ".gitignore",
-    ".travis.yml"
-]
+exclude = [".gitignore"]
 
 [dependencies]
 byteorder = "1"
 fnv = "1"
-itertools = "0.10"
+itertools = "0.11"
 murmur3 = "0.5"
-ndarray = "0.15"
-ordered-float = "2"
+ndarray = { version = "0.15", features = ["approx-0_5"] }
+ordered-float = "4"
 rand = "0.8"
 rand_chacha = "0.3"
 reductive = "0.9"
 serde = { version = "1", features = ["derive"] }
 smallvec = "1.7"
 thiserror = "1"
-toml = "0.5"
+toml = "0.8"
 
 [dependencies.memmap2]
-version = "0.5"
+version = "0.9"
 optional = true
 
 [features]
@@ -39,8 +36,8 @@ default = ["memmap"]
 memmap = ["memmap2"]
 
 [dev-dependencies]
-approx = "0.4"
-criterion = "0.3"
+approx = "0.5"
+criterion = "0.5"
 lazy_static = "1"
 maplit = "1"
 tempfile = "3"
diff --git a/benches/array.rs b/benches/array.rs
index 32c79ea..b849ae7 100644
--- a/benches/array.rs
+++ b/benches/array.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
     corpus.into_iter()
 }
 
-fn known_iter<'a>(
-    embeds: &'a Embeddings,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn known_iter(
+    embeds: &Embeddings,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Word(_)) => Some(w),
         _ => None,
     })
 }
 
-fn unknown_iter<'a>(
-    embeds: &'a Embeddings,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn unknown_iter(
+    embeds: &Embeddings,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Subword(_)) => Some(w),
         _ => None,
diff --git a/benches/quantized.rs b/benches/quantized.rs
index 655725b..5029818 100644
--- a/benches/quantized.rs
+++ b/benches/quantized.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
     corpus.into_iter()
 }
 
-fn known_iter<'a>(
-    embeds: &'a Embeddings,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn known_iter(
+    embeds: &Embeddings,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Word(_)) => Some(w),
         _ => None,
     })
 }
 
-fn unknown_iter<'a>(
-    embeds: &'a Embeddings,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn unknown_iter(
+    embeds: &Embeddings,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Subword(_)) => Some(w),
         _ => None,
diff --git a/benches/subword.rs b/benches/subword.rs
index 5b47c8b..eafb0b0 100644
--- a/benches/subword.rs
+++ b/benches/subword.rs
@@ -13,7 +13,6 @@ fn subwords(string: &str, min_n: usize, max_n: usize, indexer: &impl Indexer) ->
     // evaluates them.
     string
         .subword_indices(min_n, max_n, indexer)
-        .into_iter()
         .fold(0, |sum, v| sum.wrapping_add(v))
 }
 
diff --git a/src/chunks/io.rs b/src/chunks/io.rs
index d59de49..c25a25f 100644
--- a/src/chunks/io.rs
+++ b/src/chunks/io.rs
@@ -1,4 +1,3 @@
-use std::convert::TryFrom;
 use std::fmt::{self, Display};
 use std::fs::File;
 use std::io::{BufReader, Read, Seek, Write};
diff --git a/src/chunks/metadata.rs b/src/chunks/metadata.rs
index 0b0fffa..eef1048 100644
--- a/src/chunks/metadata.rs
+++ b/src/chunks/metadata.rs
@@ -5,7 +5,7 @@ use std::mem;
 use std::ops::{Deref, DerefMut};
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use toml::Value;
+use toml::Table;
 
 use crate::chunks::io::{ChunkIdentifier, Header, ReadChunk, WriteChunk};
 use crate::error::{Error, Result};
@@ -16,18 +16,18 @@ use crate::io::ReadMetadata;
 /// finalfusion metadata in TOML format.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Metadata {
-    inner: Value,
+    inner: Table,
 }
 
 impl Metadata {
     /// Construct new `Metadata`.
-    pub fn new(data: Value) -> Self {
-        Metadata { inner: data }
+    pub fn new(inner: Table) -> Self {
+        Metadata { inner }
     }
 }
 
 impl Deref for Metadata {
-    type Target = Value;
+    type Target = Table;
 
     fn deref(&self) -> &Self::Target {
         &self.inner
@@ -40,9 +40,9 @@ impl DerefMut for Metadata {
     }
 }
 
-impl From<Value> for Metadata {
-    fn from(value: Value) -> Self {
-        Metadata { inner: value }
+impl From<Table> for Metadata {
+    fn from(inner: Table) -> Self {
+        Metadata { inner }
     }
 }
 
@@ -69,7 +69,7 @@ impl ReadChunk for Metadata {
 
         Ok(Metadata::new(
             buf_str
-                .parse::<Value>()
+                .parse::<Table>()
                 .map_err(|e| Error::Format(format!("Cannot deserialize TOML metadata: {}", e)))
                 .map_err(Error::from)?,
         ))
diff --git a/src/chunks/norms.rs b/src/chunks/norms.rs
index 45d962c..05cd924 100644
--- a/src/chunks/norms.rs
+++ b/src/chunks/norms.rs
@@ -1,6 +1,5 @@
 //! Norms chunk
 
-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -71,7 +70,7 @@ impl ReadChunk for NdNorms {
         f32::ensure_data_type(read)?;
 
         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
@@ -109,12 +108,12 @@ impl WriteChunk for NdNorms {
         write
             .write_u32::<LittleEndian>(ChunkIdentifier::NdNorms as u32)
             .map_err(|e| Error::write_error("Cannot write norms chunk identifier", e))?;
 
-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);
 
         let remaining_chunk_len =
-            self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
+            self.chunk_len(write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?) - (size_of::<u32>() + size_of::<u64>()) as u64;
diff --git a/src/chunks/storage/array.rs b/src/chunks/storage/array.rs
index 85ce794..7d0c632 100644
--- a/src/chunks/storage/array.rs
+++ b/src/chunks/storage/array.rs
@@ -1,4 +1,3 @@
-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -13,7 +12,6 @@ use crate::util::padding;
 
 #[cfg(feature = "memmap")]
 mod mmap {
-    use std::convert::TryInto;
     use std::fs::File;
     #[cfg(target_endian = "little")]
     use std::io::Write;
@@ -134,7 +132,7 @@ mod mmap {
         // The components of the embedding matrix should be of type f32.
         f32::ensure_data_type(read)?;
 
-        let n_padding = padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(read.stream_position().map_err(|e| {
             Error::read_error("Cannot get file position for computing padding", e)
         })?);
         read.seek(SeekFrom::Current(n_padding as i64))
@@ -142,7 +140,7 @@ mod mmap {
 
         // Set up memory mapping.
         let matrix_len = shape.size() * size_of::<f32>();
-        let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let offset = read.stream_position().map_err(|e| {
             Error::read_error(
                 "Cannot get file position for memory mapping embedding matrix",
                 e,
@@ -153,7 +151,7 @@ mod mmap {
             mmap_opts
                 .offset(offset)
                 .len(matrix_len)
-                .map(&*read.get_ref())
+                .map(read.get_ref())
                 .map_err(|e| Error::read_error("Cannot memory map embedding matrix", e))?
         };
 
@@ -218,13 +216,13 @@ impl NdArray {
         write
             .write_u32::<LittleEndian>(ChunkIdentifier::NdArray as u32)
             .map_err(|e| Error::write_error("Cannot write embedding matrix chunk identifier", e))?;
-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);
 
         let remaining_chunk_len = Self::chunk_len(
             data.view(),
-            write.seek(SeekFrom::Current(0)).map_err(|e| {
+            write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?,
         ) - (size_of::<u32>() + size_of::<u64>()) as u64;
@@ -346,7 +344,7 @@ impl ReadChunk for NdArray {
         f32::ensure_data_type(read)?;
 
         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
diff --git a/src/chunks/storage/quantized.rs b/src/chunks/storage/quantized.rs
index 50db780..3d26c4a 100644
--- a/src/chunks/storage/quantized.rs
+++ b/src/chunks/storage/quantized.rs
@@ -1,4 +1,3 @@
-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -117,7 +116,7 @@ impl QuantizedArray {
         f32::ensure_data_type(read)?;
 
         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
@@ -171,12 +170,12 @@ impl QuantizedArray {
             quantizer,
             quantized.view(),
             norms,
-            write.seek(SeekFrom::Current(0)).map_err(|e| {
+            write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?,
         ) - (size_of::<u32>() + size_of::<u64>()) as u64;
 
-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);
 
@@ -562,7 +561,7 @@ mod mmap {
         n_embeddings: usize,
         quantized_len: usize,
     ) -> Result {
-        let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let offset = read.stream_position().map_err(|e| {
             Error::read_error(
                 "Cannot get file position for memory mapping embedding matrix",
                 e,
@@ -574,7 +573,7 @@ mod mmap {
             mmap_opts
                 .offset(offset)
                 .len(matrix_len)
-                .map(&*read.get_ref())
+                .map(read.get_ref())
                 .map_err(|e| {
                     Error::read_error("Cannot memory map quantized embedding matrix", e)
                 })?
diff --git a/src/chunks/storage/wrappers.rs b/src/chunks/storage/wrappers.rs
index 9f81cc1..f08472b 100644
--- a/src/chunks/storage/wrappers.rs
+++ b/src/chunks/storage/wrappers.rs
@@ -1,4 +1,3 @@
-use std::convert::TryFrom;
 #[cfg(feature = "memmap")]
 use std::fs::File;
 #[cfg(feature = "memmap")]
@@ -126,7 +125,7 @@ impl ReadChunk for StorageWrap {
         R: Read + Seek,
     {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;
 
         let chunk_id = read
@@ -156,7 +155,7 @@ impl MmapChunk for StorageWrap {
     fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;
 
         let chunk_id = read
@@ -306,7 +305,7 @@ impl ReadChunk for StorageViewWrap {
         R: Read + Seek,
     {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;
 
         let chunk_id = read
@@ -361,7 +360,7 @@ impl WriteChunk for StorageViewWrap {
 impl MmapChunk for StorageViewWrap {
     fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;
 
         let chunk_id = read
diff --git a/src/chunks/vocab/simple.rs b/src/chunks/vocab/simple.rs
index 2915bb8..d9922dd 100644
--- a/src/chunks/vocab/simple.rs
+++ b/src/chunks/vocab/simple.rs
@@ -1,6 +1,5 @@
 use std::collections::HashMap;
-use std::convert::TryInto;
-use std::io::{Read, Seek, SeekFrom, Write};
+use std::io::{Read, Seek, Write};
 use std::mem::size_of;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
@@ -103,7 +102,7 @@ impl WriteChunk for SimpleVocab {
             .map_err(|e| Error::write_error("Cannot write vocabulary chunk identifier", e))?;
 
         let remaining_chunk_len =
-            self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
+            self.chunk_len(write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?) - (size_of::<u32>() + size_of::<u64>()) as u64;
 
diff --git a/src/chunks/vocab/subword.rs b/src/chunks/vocab/subword.rs
index 325422d..253da20 100644
--- a/src/chunks/vocab/subword.rs
+++ b/src/chunks/vocab/subword.rs
@@ -1,5 +1,4 @@
 use std::collections::HashMap;
-use std::convert::TryFrom;
 use std::io;
 use std::io::{ErrorKind, Read, Seek, Write};
 use std::mem::size_of;
@@ -400,7 +399,7 @@ where
         .read_u32::<LittleEndian>()
         .map_err(|e| Error::read_error("Cannot read number of buckets", e))?;
 
-    let words = read_vocab_items(read, vocab_len as usize)?;
+    let words = read_vocab_items(read, vocab_len)?;
 
     Ok(SubwordVocab::new(
         words,
diff --git a/src/chunks/vocab/wrappers.rs b/src/chunks/vocab/wrappers.rs
index e9563b6..4df6527 100644
--- a/src/chunks/vocab/wrappers.rs
+++ b/src/chunks/vocab/wrappers.rs
@@ -1,4 +1,3 @@
-use std::convert::TryFrom;
 use std::io::{Read, Seek, SeekFrom, Write};
 
 use byteorder::{LittleEndian, ReadBytesExt};
@@ -110,7 +109,7 @@ impl ReadChunk for VocabWrap {
         R: Read + Seek,
     {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get vocabulary chunk start position", e))?;
         let chunk_id = read
             .read_u32::<LittleEndian>()
diff --git a/src/compat/fasttext/io.rs b/src/compat/fasttext/io.rs
index 560c92a..6706df5 100644
--- a/src/compat/fasttext/io.rs
+++ b/src/compat/fasttext/io.rs
@@ -1,11 +1,10 @@
-use std::convert::TryInto;
 use std::io::{BufRead, Write};
 use std::ops::Mul;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use ndarray::{s, Array2, ErrorKind as ShapeErrorKind, ShapeError};
 use serde::Serialize;
-use toml::Value;
+use toml::Table;
 
 use crate::chunks::metadata::Metadata;
 use crate::chunks::norms::NdNorms;
@@ -107,7 +106,7 @@ impl ReadFastTextPrivate for Embeddings {
             )));
         }
 
-        let metadata = Value::try_from(config).map_err(|e| {
+        let metadata = Table::try_from(config).map_err(|e| {
             Error::Format(format!("Cannot serialize model metadata to TOML: {}", e))
         })?;
 
diff --git a/src/similarity.rs b/src/similarity.rs
index e594850..67f246f 100644
--- a/src/similarity.rs
+++ b/src/similarity.rs
@@ -362,7 +362,7 @@ where
 
 #[cfg(test)]
 mod tests {
-
+    use std::f32;
     use std::fs::File;
     use std::io::BufReader;
@@ -482,7 +482,7 @@ mod tests {
         .abs_diff_eq(&1f32, 1e-5));
         assert!((WordSimilarityResult {
             word: "test",
-            similarity: NotNan::new(0.70710678).unwrap()
+            similarity: NotNan::new(f32::consts::FRAC_1_SQRT_2).unwrap()
         })
         .angular_similarity()
         .abs_diff_eq(&0.75, 1e-5));
@@ -510,7 +510,7 @@ mod tests {
         .abs_diff_eq(&0f32, 1e-5));
         assert!((WordSimilarityResult {
             word: "test",
-            similarity: NotNan::new(0.70710678).unwrap()
+            similarity: NotNan::new(f32::consts::FRAC_1_SQRT_2).unwrap()
         })
         .euclidean_distance()
         .abs_diff_eq(&0.76537, 1e-5));
@@ -538,7 +538,7 @@ mod tests {
         .abs_diff_eq(&1f32, 1e-5));
         assert!((WordSimilarityResult {
             word: "test",
-            similarity: NotNan::new(0.70710678).unwrap()
+            similarity: NotNan::new(f32::consts::FRAC_1_SQRT_2).unwrap()
         })
         .euclidean_similarity()
         .abs_diff_eq(&0.61732, 1e-5));
@@ -602,7 +602,7 @@ mod tests {
         assert_eq!(10, result.len());
         assert_eq!(result.next().unwrap().word, "Berlin");
 
-        for (idx, word_similarity) in result.into_iter().enumerate() {
+        for (idx, word_similarity) in result.enumerate() {
             assert_eq!(SIMILARITY_ORDER[idx], word_similarity.word)
         }
     }
diff --git a/src/subword.rs b/src/subword.rs
index 6d1c326..4a91502 100644
--- a/src/subword.rs
+++ b/src/subword.rs
@@ -86,17 +86,13 @@ where
     }
 
     fn buckets(&self) -> usize {
-        self.buckets_exp as usize
+        self.buckets_exp
     }
 }
 
 impl Clone for HashIndexer {
     fn clone(&self) -> Self {
-        HashIndexer {
-            buckets_exp: self.buckets_exp,
-            mask: self.mask,
-            _phantom: PhantomData,
-        }
+        *self
     }
 }
 
@@ -699,8 +695,7 @@ mod tests {
            .subword_indices_with_ngrams(3, 6, &indexer)
            .collect::<Vec<_>>();
         ngrams_indices_test.sort_by_key(|ngrams_indices_pairs| ngrams_indices_pairs.1.clone());
-        for (iter_check, iter_test) in ngrams_indices_check.into_iter().zip(ngrams_indices_test)
-        {
+        for (iter_check, iter_test) in ngrams_indices_check.iter().zip(ngrams_indices_test) {
             assert_eq!(iter_check.0, iter_test.0);
         }
     }
diff --git a/src/util.rs b/src/util.rs
index 63f7f20..30e8f7f 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -20,7 +20,7 @@ impl<T> FromIteratorWithCapacity<T> for Vec<T> {
         I: IntoIterator<Item = T>,
     {
         let mut v = Vec::with_capacity(capacity);
-        v.extend(iter.into_iter());
+        v.extend(iter);
         v
     }
 }
@@ -31,7 +31,7 @@ impl<T> FromIteratorWithCapacity<T> for VecDeque<T> {
         I: IntoIterator<Item = T>,
    {
         let mut v = VecDeque::with_capacity(capacity);
-        v.extend(iter.into_iter());
+        v.extend(iter);
         v
     }
 }
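
Note (not part of the diff): the change that recurs across the chunk readers and writers above swaps `seek(SeekFrom::Current(0))` for `Seek::stream_position()`, which has been part of the standard library since Rust 1.51 and so is well within the new 1.66 MSRV. A minimal standalone sketch of the equivalence; the `Cursor` here is just a stand-in for the crate's real readers and writers:

```rust
use std::io::{Cursor, Seek, SeekFrom};

fn main() -> std::io::Result<()> {
    // Any reader/writer that implements `Seek` works the same way.
    let mut cursor = Cursor::new(vec![0u8; 16]);
    cursor.seek(SeekFrom::Start(8))?;

    // Old spelling: a zero-byte relative seek, used only to learn the offset.
    let via_seek = cursor.seek(SeekFrom::Current(0))?;

    // New spelling used throughout this diff: asks for the offset directly,
    // without the no-op seek.
    let via_stream_position = cursor.stream_position()?;

    assert_eq!(via_seek, via_stream_position);
    assert_eq!(via_stream_position, 8);
    Ok(())
}
```

Both calls return the same `u64` offset; `stream_position` simply states the intent, which is why it can be substituted mechanically in every padding and chunk-length computation touched here.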