
Update dependencies #199

Merged: 15 commits, Oct 10, 2023
4 changes: 2 additions & 2 deletions .github/workflows/rust.yml
@@ -36,7 +36,7 @@ jobs:
         target:
           - aarch64-unknown-linux-gnu
           - i686-unknown-linux-gnu
-          - mips64-unknown-linux-gnuabi64
+          - powerpc-unknown-linux-gnu
     steps:
       - uses: actions/checkout@v1
       - uses: actions-rs/toolchain@v1
@@ -90,7 +90,7 @@ jobs:
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: 1.54.0
+          toolchain: 1.66.0
           override: true
       - run: rustup component add clippy
       - uses: actions-rs/cargo@v1
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
 .*
 *.bk
 target
+/Cargo.lock
25 changes: 11 additions & 14 deletions Cargo.toml
@@ -1,46 +1,43 @@
 [package]
 name = "finalfusion"
-version = "0.17.1"
-edition = "2018"
 authors = ["Daniël de Kok <[email protected]>", "Sebastian Pütz <[email protected]>"]
+version = "0.18.0"
+edition = "2021"
+rust-version = "1.66"
 description = "Reader and writer for common word embedding formats"
 documentation = "https://docs.rs/finalfusion/"
 keywords = ["embeddings", "word2vec", "glove", "finalfusion", "fasttext"]
 homepage = "https://github.com/finalfusion/finalfusion-rust"
 repository = "https://github.com/finalfusion/finalfusion-rust"
 license = "MIT OR Apache-2.0"
 readme = "README.md"
-exclude = [
-    ".gitignore",
-    ".travis.yml"
-]
+exclude = [".gitignore"]

 [dependencies]
 byteorder = "1"
 fnv = "1"
-itertools = "0.10"
+itertools = "0.11"
 murmur3 = "0.5"
-ndarray = "0.15"
-ordered-float = "2"
+ndarray = { version = "0.15", features = ["approx-0_5"] }
+ordered-float = "4"
 rand = "0.8"
 rand_chacha = "0.3"
 reductive = "0.9"
 serde = { version = "1", features = ["derive"] }
 smallvec = "1.7"
 thiserror = "1"
-toml = "0.5"
+toml = "0.8"

 [dependencies.memmap2]
-version = "0.5"
+version = "0.9"
 optional = true

 [features]
 default = ["memmap"]
 memmap = ["memmap2"]

 [dev-dependencies]
-approx = "0.4"
-criterion = "0.3"
+approx = "0.5"
+criterion = "0.5"
 lazy_static = "1"
 maplit = "1"
 tempfile = "3"
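
Note: the approx bump to 0.5 and ndarray's new "approx-0_5" feature go together, since that feature is what implements the approx 0.5 comparison traits for ndarray's array types. A minimal sketch of what the feature enables (toy values, not from this PR):

    use approx::assert_abs_diff_eq;
    use ndarray::arr1;

    fn main() {
        // With ndarray's "approx-0_5" feature, arrays implement AbsDiffEq,
        // so approximate equality works element-wise on whole arrays.
        let a = arr1(&[1.0f32, 2.0]);
        let b = arr1(&[1.0f32, 2.0 + 1e-7]);
        assert_abs_diff_eq!(a, b, epsilon = 1e-5);
    }
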
12 changes: 6 additions & 6 deletions benches/array.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
     corpus.into_iter()
 }

-fn known_iter<'a>(
-    embeds: &'a Embeddings<VocabWrap, StorageWrap>,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn known_iter(
+    embeds: &Embeddings<VocabWrap, StorageWrap>,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Word(_)) => Some(w),
         _ => None,
     })
 }

-fn unknown_iter<'a>(
-    embeds: &'a Embeddings<VocabWrap, StorageWrap>,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn unknown_iter(
+    embeds: &Embeddings<VocabWrap, StorageWrap>,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Subword(_)) => Some(w),
         _ => None,
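
Note: the benchmark helpers now use the anonymous lifetime '_ instead of a named 'a; both spellings tie the returned iterator to the borrow of embeds. A standalone sketch of the equivalence (hypothetical functions, not from the benchmarks):

    // Named lifetime: the returned iterator borrows `data` for 'a.
    fn evens_named<'a>(data: &'a [i32]) -> impl 'a + Iterator<Item = i32> {
        data.iter().copied().filter(|n| n % 2 == 0)
    }

    // Equivalent with the anonymous lifetime: '_ resolves to the single
    // input lifetime, so no lifetime parameter needs to be declared.
    fn evens_elided(data: &[i32]) -> impl '_ + Iterator<Item = i32> {
        data.iter().copied().filter(|n| n % 2 == 0)
    }

    fn main() {
        assert!(evens_named(&[1, 2, 3]).eq(evens_elided(&[1, 2, 3])));
    }
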
12 changes: 6 additions & 6 deletions benches/quantized.rs
@@ -25,18 +25,18 @@ fn allround_iter() -> impl Iterator<Item = String> + Clone {
     corpus.into_iter()
 }

-fn known_iter<'a>(
-    embeds: &'a Embeddings<VocabWrap, StorageWrap>,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn known_iter(
+    embeds: &Embeddings<VocabWrap, StorageWrap>,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Word(_)) => Some(w),
         _ => None,
     })
 }

-fn unknown_iter<'a>(
-    embeds: &'a Embeddings<VocabWrap, StorageWrap>,
-) -> impl 'a + Iterator<Item = String> + Clone {
+fn unknown_iter(
+    embeds: &Embeddings<VocabWrap, StorageWrap>,
+) -> impl '_ + Iterator<Item = String> + Clone {
     allround_iter().filter_map(move |w| match embeds.vocab().idx(&w) {
         Some(WordIndex::Subword(_)) => Some(w),
         _ => None,
1 change: 0 additions & 1 deletion benches/subword.rs
@@ -13,7 +13,6 @@ fn subwords(string: &str, min_n: usize, max_n: usize, indexer: &impl Indexer) ->
     // evaluates them.
     string
         .subword_indices(min_n, max_n, indexer)
-        .into_iter()
         .fold(0, |sum, v| sum.wrapping_add(v))
 }
1 change: 0 additions & 1 deletion src/chunks/io.rs
@@ -1,4 +1,3 @@
-use std::convert::TryFrom;
 use std::fmt::{self, Display};
 use std::fs::File;
 use std::io::{BufReader, Read, Seek, Write};
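
Note: the dropped import follows from the edition bump above: TryFrom and TryInto are part of the Rust 2021 prelude, so edition 2021 code can use them without importing. For illustration:

    // Edition 2021: TryFrom comes from the prelude, no import needed.
    fn main() {
        let n = u32::try_from(42u64).expect("value fits in u32");
        assert_eq!(n, 42);
    }
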
18 changes: 9 additions & 9 deletions src/chunks/metadata.rs
@@ -5,7 +5,7 @@ use std::mem;
 use std::ops::{Deref, DerefMut};

 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use toml::Value;
+use toml::Table;

 use crate::chunks::io::{ChunkIdentifier, Header, ReadChunk, WriteChunk};
 use crate::error::{Error, Result};
@@ -16,18 +16,18 @@ use crate::io::ReadMetadata;
 /// finalfusion metadata in TOML format.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Metadata {
-    inner: Value,
+    inner: Table,
 }

 impl Metadata {
     /// Construct new `Metadata`.
-    pub fn new(data: Value) -> Self {
-        Metadata { inner: data }
+    pub fn new(inner: Table) -> Self {
+        Metadata { inner }
     }
 }

 impl Deref for Metadata {
-    type Target = Value;
+    type Target = Table;

     fn deref(&self) -> &Self::Target {
         &self.inner
@@ -40,9 +40,9 @@ impl DerefMut for Metadata {
     }
 }

-impl From<Value> for Metadata {
-    fn from(value: Value) -> Self {
-        Metadata { inner: value }
+impl From<Table> for Metadata {
+    fn from(inner: Table) -> Self {
+        Metadata { inner }
     }
 }

@@ -69,7 +69,7 @@ impl ReadChunk for Metadata {

         Ok(Metadata::new(
             buf_str
-                .parse::<Value>()
+                .parse::<Table>()
                 .map_err(|e| Error::Format(format!("Cannot deserialize TOML metadata: {}", e)))
                 .map_err(Error::from)?,
         ))
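
Note: switching from toml::Value to toml::Table encodes at the type level that the root of a TOML document is always a table. A minimal sketch of the parse (hypothetical metadata keys, assuming toml 0.8):

    use toml::Table;

    fn main() {
        // Table is a map from String to Value; parsing rejects any
        // document whose root is not a table.
        let meta: Table = "corpus = \"wikipedia\"\ndims = 300\n"
            .parse()
            .expect("valid TOML");
        assert_eq!(meta["dims"].as_integer(), Some(300));
    }
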
7 changes: 3 additions & 4 deletions src/chunks/norms.rs
@@ -1,6 +1,5 @@
 //! Norms chunk

-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -71,7 +70,7 @@ impl ReadChunk for NdNorms {
         f32::ensure_data_type(read)?;

         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
@@ -109,12 +108,12 @@ impl WriteChunk for NdNorms {
         write
             .write_u32::<LittleEndian>(ChunkIdentifier::NdNorms as u32)
             .map_err(|e| Error::write_error("Cannot write norms chunk identifier", e))?;
-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);

         let remaining_chunk_len =
-            self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
+            self.chunk_len(write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?) - (size_of::<u32>() + size_of::<u64>()) as u64;
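
Note: Seek::stream_position(), stable since Rust 1.51, replaces the old seek(SeekFrom::Current(0)) idiom used throughout these chunks; both return the current offset without moving it. A standalone sketch:

    use std::io::{Cursor, Seek, SeekFrom};

    fn main() -> std::io::Result<()> {
        let mut cur = Cursor::new(vec![0u8; 16]);
        cur.seek(SeekFrom::Start(8))?;
        // Same offset as seek(SeekFrom::Current(0)), clearer intent.
        assert_eq!(cur.stream_position()?, 8);
        Ok(())
    }
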
14 changes: 6 additions & 8 deletions src/chunks/storage/array.rs
@@ -1,4 +1,3 @@
-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -13,7 +12,6 @@ use crate::util::padding;

 #[cfg(feature = "memmap")]
 mod mmap {
-    use std::convert::TryInto;
     use std::fs::File;
     #[cfg(target_endian = "little")]
     use std::io::Write;
@@ -134,15 +132,15 @@ mod mmap {
         // The components of the embedding matrix should be of type f32.
         f32::ensure_data_type(read)?;

-        let n_padding = padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(read.stream_position().map_err(|e| {
             Error::read_error("Cannot get file position for computing padding", e)
         })?);
         read.seek(SeekFrom::Current(n_padding as i64))
             .map_err(|e| Error::read_error("Cannot skip padding", e))?;

         // Set up memory mapping.
         let matrix_len = shape.size() * size_of::<f32>();
-        let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let offset = read.stream_position().map_err(|e| {
             Error::read_error(
                 "Cannot get file position for memory mapping embedding matrix",
                 e,
@@ -153,7 +151,7 @@ mod mmap {
             mmap_opts
                 .offset(offset)
                 .len(matrix_len)
-                .map(&*read.get_ref())
+                .map(read.get_ref())
                 .map_err(|e| Error::read_error("Cannot memory map embedding matrix", e))?
         };

@@ -218,13 +216,13 @@ impl NdArray {
         write
             .write_u32::<LittleEndian>(ChunkIdentifier::NdArray as u32)
             .map_err(|e| Error::write_error("Cannot write embedding matrix chunk identifier", e))?;
-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);

         let remaining_chunk_len = Self::chunk_len(
             data.view(),
-            write.seek(SeekFrom::Current(0)).map_err(|e| {
+            write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?,
         ) - (size_of::<u32>() + size_of::<u64>()) as u64;
@@ -346,7 +344,7 @@ impl ReadChunk for NdArray {
         f32::ensure_data_type(read)?;

         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
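
Note: in memmap2 0.9, MmapOptions::map accepts any type implementing its MmapAsRawDesc trait, including &File, so the &* reborrow of read.get_ref() is no longer needed. A minimal sketch of mapping a region at the reader's current offset (hypothetical helper, assuming memmap2 0.9):

    use std::fs::File;
    use std::io::{BufReader, Result, Seek};

    use memmap2::{Mmap, MmapOptions};

    // Map `len` bytes starting at the reader's current file offset.
    // Safety: the caller must ensure the file is not truncated or
    // mutated while the mapping is alive.
    fn map_at_current_offset(read: &mut BufReader<File>, len: usize) -> Result<Mmap> {
        let offset = read.stream_position()?;
        unsafe { MmapOptions::new().offset(offset).len(len).map(read.get_ref()) }
    }
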
11 changes: 5 additions & 6 deletions src/chunks/storage/quantized.rs
@@ -1,4 +1,3 @@
-use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::mem;
 use std::mem::size_of;
@@ -117,7 +116,7 @@ impl QuantizedArray {
         f32::ensure_data_type(read)?;

         let n_padding =
-            padding::<f32>(read.seek(SeekFrom::Current(0)).map_err(|e| {
+            padding::<f32>(read.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?);
         read.seek(SeekFrom::Current(n_padding as i64))
@@ -171,12 +170,12 @@ impl QuantizedArray {
             quantizer,
             quantized.view(),
             norms,
-            write.seek(SeekFrom::Current(0)).map_err(|e| {
+            write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?,
         ) - (size_of::<u32>() + size_of::<u64>()) as u64;

-        let n_padding = padding::<f32>(write.seek(SeekFrom::Current(0)).map_err(|e| {
+        let n_padding = padding::<f32>(write.stream_position().map_err(|e| {
             Error::write_error("Cannot get file position for computing padding", e)
         })?);

@@ -562,7 +561,7 @@ mod mmap {
         n_embeddings: usize,
         quantized_len: usize,
     ) -> Result<Mmap> {
-        let offset = read.seek(SeekFrom::Current(0)).map_err(|e| {
+        let offset = read.stream_position().map_err(|e| {
             Error::read_error(
                 "Cannot get file position for memory mapping embedding matrix",
                 e,
@@ -574,7 +573,7 @@
             mmap_opts
                 .offset(offset)
                 .len(matrix_len)
-                .map(&*read.get_ref())
+                .map(read.get_ref())
                 .map_err(|e| {
                     Error::read_error("Cannot memory map quantized embedding matrix", e)
                 })?
9 changes: 4 additions & 5 deletions src/chunks/storage/wrappers.rs
@@ -1,4 +1,3 @@
-use std::convert::TryFrom;
 #[cfg(feature = "memmap")]
 use std::fs::File;
 #[cfg(feature = "memmap")]
@@ -126,7 +125,7 @@ impl ReadChunk for StorageWrap {
         R: Read + Seek,
     {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

         let chunk_id = read
@@ -156,7 +155,7 @@
 impl MmapChunk for StorageWrap {
     fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

         let chunk_id = read
@@ -306,7 +305,7 @@ impl ReadChunk for StorageViewWrap {
         R: Read + Seek,
     {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

         let chunk_id = read
@@ -361,7 +360,7 @@
 impl MmapChunk for StorageViewWrap {
     fn mmap_chunk(read: &mut BufReader<File>) -> Result<Self> {
         let chunk_start_pos = read
-            .seek(SeekFrom::Current(0))
+            .stream_position()
             .map_err(|e| Error::read_error("Cannot get storage chunk start position", e))?;

         let chunk_id = read
5 changes: 2 additions & 3 deletions src/chunks/vocab/simple.rs
@@ -1,6 +1,5 @@
 use std::collections::HashMap;
-use std::convert::TryInto;
-use std::io::{Read, Seek, SeekFrom, Write};
+use std::io::{Read, Seek, Write};
 use std::mem::size_of;

 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
@@ -103,7 +102,7 @@ impl WriteChunk for SimpleVocab {
             .map_err(|e| Error::write_error("Cannot write vocabulary chunk identifier", e))?;

         let remaining_chunk_len =
-            self.chunk_len(write.seek(SeekFrom::Current(0)).map_err(|e| {
+            self.chunk_len(write.stream_position().map_err(|e| {
                 Error::read_error("Cannot get file position for computing padding", e)
             })?) - (size_of::<u32>() + size_of::<u64>()) as u64;