Use binary search for sparse vec dot product in beam search
This leads to a significant speed-up during prediction.
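
The sparse-matrix branch of `Mat::dot_vec` previously scattered the feature vector into a dense buffer and ran a CSR matrix-vector product over all stored weight entries; as the diff below shows, it now dots the sparse feature vector directly against each weight row, doing O(M log N) work per row (M and N being the two vectors' non-zero counts) and no longer materializing a dense copy of the feature vector. The `SparseDenseVec` wrapper, which existed only to carry that dense copy, is removed.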
tomtung committed Mar 10, 2019
1 parent bac2f65 · commit 6049701
Showing 3 changed files with 67 additions and 49 deletions.
90 changes: 56 additions & 34 deletions src/mat_util.rs
@@ -12,23 +12,6 @@ pub type SparseMatView<'a> = sprs::CsMatViewI<'a, f32, Index>;
 pub type DenseVec = ndarray::Array1<f32>;
 pub type DenseMat = ndarray::Array2<f32>;
 
-/// A vector, with both its sparse & dense forms available.
-pub struct SparseDenseVec {
-    sparse: SparseVec,
-    dense: DenseVec,
-}
-
-impl SparseDenseVec {
-    pub fn from_sparse(sparse: SparseVec) -> Self {
-        let mut dense = DenseVec::zeros(sparse.dim());
-        sparse.scatter(dense.as_slice_mut().expect(
-            "Dense vector should have been created to be contiguous and in standard order",
-        ));
-
-        Self { sparse, dense }
-    }
-}
-
 /// A matrix, can be either dense or sparse.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum Mat {
@@ -37,28 +20,54 @@ pub enum Mat {
 }
 
 impl Mat {
-    pub fn dot_vec(&self, vec: &SparseDenseVec) -> DenseVec {
+    pub fn dot_vec(&self, vec: &SparseVec) -> DenseVec {
         match self {
-            // If the matrix is dense, use the sparse vector; otherwise, use the dense vector
-            Mat::Dense(mat) => {
-                DenseVec::from_iter(mat.outer_iter().map(|w| vec.sparse.dot_dense(w)))
-            }
-            Mat::Sparse(mat) => {
-                let mut prod = DenseVec::zeros(mat.rows());
-                sprs::prod::mul_acc_mat_vec_csr(
-                    mat.view(),
-                    vec.dense.as_slice()
-                        .expect("Dense vector must be contiguous and in standard order"),
-                    prod
-                        .as_slice_mut()
-                        .expect("We should've created dense vector `prod` to be contiguous and in standard order"),
-                );
-                prod
-            }
+            Mat::Dense(mat) => DenseVec::from_iter(mat.outer_iter().map(|w| vec.dot_dense(w))),
+            Mat::Sparse(mat) => DenseVec::from_iter(
+                mat.outer_iterator()
+                    .map(|row| csvec_dot_by_binary_search(vec.view(), row)),
+            ),
         }
     }
 }

+/// Compute the dot product of two sparse vectors, using binary search to find matching indices.
+///
+/// Runs in O(M log N) time, where M is the smaller and N the larger of the two vectors' non-zero counts.
+pub fn csvec_dot_by_binary_search<'a, N, I>(
+    vec1: sprs::CsVecViewI<N, I>,
+    vec2: sprs::CsVecViewI<N, I>,
+) -> N
+where
+    I: SpIndex,
+    N: Num + Copy + AddAssign,
+{
+    let (mut idx1, mut val1, mut idx2, mut val2) = if vec1.nnz() < vec2.nnz() {
+        (vec1.indices(), vec1.data(), vec2.indices(), vec2.data())
+    } else {
+        (vec2.indices(), vec2.data(), vec1.indices(), vec1.data())
+    };
+
+    let mut sum = N::zero();
+    while !idx1.is_empty() && !idx2.is_empty() {
+        debug_assert_eq!(idx1.len(), val1.len());
+        debug_assert_eq!(idx2.len(), val2.len());
+
+        let (found, i) = match idx2.binary_search(&idx1[0]) {
+            Ok(i) => (true, i),
+            Err(i) => (false, i),
+        };
+        if found {
+            sum += val1[0] * val2[i];
+        }
+        idx1 = &idx1[1..];
+        val1 = &val1[1..];
+        idx2 = &idx2[i..];
+        val2 = &val2[i..];
+    }
+    sum
+}
+
 pub trait IndexValuePairs<IndexT: SpIndex + Unsigned, ValueT: Copy>:
     Deref<Target = [(IndexT, ValueT)]>
 {
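
For comparison, the conventional alternative is a two-pointer merge over both index lists, which runs in O(M + N); binary search wins when one vector has far fewer non-zeros than the other, as is typical for a feature vector dotted against many weight rows. A minimal sketch of the merge variant on plain slices (illustrative only, not part of the commit):

use std::cmp::Ordering;

/// Dot product of two sparse vectors given as sorted (indices, values) slices: O(M + N).
fn csvec_dot_by_merge(idx1: &[usize], val1: &[f32], idx2: &[usize], val2: &[f32]) -> f32 {
    let (mut i, mut j) = (0, 0);
    let mut sum = 0.;
    while i < idx1.len() && j < idx2.len() {
        match idx1[i].cmp(&idx2[j]) {
            Ordering::Less => i += 1,    // index present only in vec1, contributes nothing
            Ordering::Greater => j += 1, // index present only in vec2, contributes nothing
            Ordering::Equal => {
                // Matching index: accumulate the product and advance both cursors.
                sum += val1[i] * val2[j];
                i += 1;
                j += 1;
            }
        }
    }
    sum
}

fn main() {
    // Same vectors as vec1 and vec3 in the test below: they overlap at indices 2 and 6.
    let (idx1, val1) = (vec![0, 2, 4, 6], vec![1.; 4]);
    let (idx3, val3) = (vec![1, 2, 5, 6], vec![3.; 4]);
    assert_eq!(6., csvec_dot_by_merge(&idx1, &val1, &idx3, &val3));
}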
@@ -326,6 +335,19 @@ mod tests {
     use ndarray::array;
     use sprs::CsVecI;
 
+    #[test]
+    fn test_csvec_dot_by_binary_search() {
+        let vec1 = CsVecI::new(8, vec![0, 2, 4, 6], vec![1.; 4]);
+        let vec2 = CsVecI::new(8, vec![1, 3, 5, 7], vec![2.; 4]);
+        let vec3 = CsVecI::new(8, vec![1, 2, 5, 6], vec![3.; 4]);
+
+        assert_eq!(0., csvec_dot_by_binary_search(vec1.view(), vec2.view()));
+        assert_eq!(4., csvec_dot_by_binary_search(vec1.view(), vec1.view()));
+        assert_eq!(16., csvec_dot_by_binary_search(vec2.view(), vec2.view()));
+        assert_eq!(6., csvec_dot_by_binary_search(vec1.view(), vec3.view()));
+        assert_eq!(12., csvec_dot_by_binary_search(vec2.view(), vec3.view()));
+    }
+
     #[test]
     fn test_is_valid_sparse_vec() {
         assert!(Vec::<(usize, f64)>::new().is_valid_sparse_vec(0));
2 changes: 1 addition & 1 deletion src/model/liblinear.rs
@@ -133,7 +133,7 @@ pub struct MultiLabelClassifier {
 
 impl MultiLabelClassifier {
     /// Predict scores for each label.
-    pub fn predict(&self, feature_vec: &SparseDenseVec) -> DenseVec {
+    pub fn predict(&self, feature_vec: &SparseVec) -> DenseVec {
         let mut scores = self.weights.dot_vec(feature_vec);
         match self.loss_type {
             LossType::Log => scores.mapv_inplace(|v| -(-v).exp().ln_1p()),
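
A note on the `LossType::Log` arm above: `-(-v).exp().ln_1p()` computes -ln(1 + e^(-v)) = ln σ(v), turning the raw margin into the log of the logistic probability of the positive class. A standalone check of that identity (the `log_sigmoid` helper name is illustrative, not from the crate):

/// Log of the logistic sigmoid: ln σ(v) = -ln(1 + e^(-v)).
fn log_sigmoid(v: f32) -> f32 {
    -(-v).exp().ln_1p()
}

fn main() {
    // σ(0) = 0.5, so the log-probability at a zero margin is ln 0.5.
    assert!((log_sigmoid(0.) - 0.5f32.ln()).abs() < 1e-6);
    // A large positive margin gives probability ≈ 1, log-probability ≈ 0.
    assert!(log_sigmoid(10.) > -1e-3 && log_sigmoid(10.) < 0.);
}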
24 changes: 10 additions & 14 deletions src/model/mod.rs
@@ -59,27 +59,23 @@ impl Model {
     }
 
     /// Prepare the feature vector in both dense and sparse forms to make prediction more efficient.
-    fn prepare_feature_vec(&self, sparse_vec: &[(Index, f32)]) -> SparseDenseVec {
+    fn prepare_feature_vec(&self, sparse_vec: &[(Index, f32)]) -> SparseVec {
         let norm = sparse_vec
             .iter()
             .map(|(_, v)| v.powi(2))
             .sum::<f32>()
             .sqrt();
 
-        let sparse_vec = {
-            let (mut indices, mut data): (Vec<_>, Vec<_>) = sparse_vec
-                .iter()
-                .cloned()
-                .map(|(i, v)| (i, v / norm))
-                .unzip();
-
-            indices.push(self.n_features as Index);
-            data.push(1.);
+        let (mut indices, mut data): (Vec<_>, Vec<_>) = sparse_vec
+            .iter()
+            .cloned()
+            .map(|(i, v)| (i, v / norm))
+            .unzip();
 
-            SparseVec::new(self.n_features + 1, indices, data)
-        };
+        indices.push(self.n_features as Index);
+        data.push(1.);
 
-        SparseDenseVec::from_sparse(sparse_vec)
+        SparseVec::new(self.n_features + 1, indices, data)
     }
 
     /// Serialize model.
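
To make concrete what `prepare_feature_vec` now returns: the input (index, value) pairs are divided by their L2 norm, then a constant bias feature of 1.0 is appended at index `n_features`, so the resulting `SparseVec` has dimension `n_features + 1`. A standalone sketch with made-up numbers (`n_features = 4` is hypothetical):

use sprs::CsVecI;

fn main() {
    let n_features = 4;
    let input = [(0usize, 3.0f32), (2, 4.0)];

    // L2 norm of the values: sqrt(3^2 + 4^2) = 5.
    let norm = input.iter().map(|&(_, v)| v * v).sum::<f32>().sqrt();

    // Normalize each value, then append the bias feature.
    let (mut indices, mut data): (Vec<_>, Vec<_>) =
        input.iter().map(|&(i, v)| (i, v / norm)).unzip();
    indices.push(n_features);
    data.push(1.0);

    let vec = CsVecI::new(n_features + 1, indices, data);
    assert_eq!(vec.get(0), Some(&0.6)); // 3 / 5
    assert_eq!(vec.get(2), Some(&0.8)); // 4 / 5
    assert_eq!(vec.get(4), Some(&1.0)); // the bias feature
}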
@@ -140,7 +136,7 @@ impl TreeNode {
 }
 
 impl Tree {
-    fn predict(&self, feature_vec: &SparseDenseVec, beam_size: usize) -> IndexValueVec {
+    fn predict(&self, feature_vec: &SparseVec, beam_size: usize) -> IndexValueVec {
         assert!(beam_size > 0);
         let mut curr_level = Vec::<(&TreeNode, f32)>::with_capacity(beam_size * 2);
         let mut next_level = Vec::<(&TreeNode, f32)>::with_capacity(beam_size * 2);
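
The hunk above only shows `Tree::predict`'s setup: beam search keeps at most `beam_size` candidate nodes per tree level, scoring children with their node classifiers (presumably `MultiLabelClassifier::predict` from the previous file) and pruning before descending. A generic, hypothetical sketch of that pattern (the node type and score accumulation below are stand-ins, not the crate's actual definitions):

/// Stand-in node: a leaf carries a label; an internal node carries scored children.
struct Node {
    score: f32,          // score contributed by this node's classifier (stand-in)
    label: Option<u32>,  // Some(..) iff this is a leaf
    children: Vec<Node>,
}

/// Generic beam search keeping the `beam_size` highest-scoring nodes per level.
fn beam_search(root: &Node, beam_size: usize) -> Vec<(u32, f32)> {
    assert!(beam_size > 0);
    let mut curr_level: Vec<(&Node, f32)> = vec![(root, root.score)];
    let mut predictions = Vec::new();
    while !curr_level.is_empty() {
        let mut next_level: Vec<(&Node, f32)> = Vec::with_capacity(beam_size * 2);
        for &(node, score) in &curr_level {
            match node.label {
                // Leaf reached: emit its label with the accumulated score.
                Some(label) => predictions.push((label, score)),
                // Internal node: expand all children, accumulating their scores.
                None => next_level.extend(node.children.iter().map(|c| (c, score + c.score))),
            }
        }
        // Prune the beam to the top `beam_size` candidates.
        next_level.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
        next_level.truncate(beam_size);
        curr_level = next_level;
    }
    predictions
}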
