dalek-cryptography · hdevalence · Jun 4, 2019 · May 21, 2019 · May 21, 2019 · May 21, 2019
diff --git a/src/backend/serial/scalar_mul/mod.rs b/src/backend/serial/scalar_mul/mod.rs
@@ -26,3 +26,6 @@ pub mod straus;
 
 #[cfg(feature = "alloc")]
 pub mod precomputed_straus;
+
+#[cfg(feature = "alloc")]
+pub mod pippenger;
diff --git a/src/backend/serial/scalar_mul/pippenger.rs b/src/backend/serial/scalar_mul/pippenger.rs
@@ -0,0 +1,205 @@
+// -*- mode: rust; -*-
+//
+// This file is part of curve25519-dalek.
+// Copyright (c) 2019 Oleg Andreev
+// See LICENSE for licensing information.
+//
+// Authors:
+// - Oleg Andreev <[email protected]>
+
+//! Implementation of a variant of Pippenger's algorithm.
+
+#![allow(non_snake_case)]
+
+use core::borrow::Borrow;
+
+use edwards::EdwardsPoint;
+use scalar::Scalar;
+use traits::VartimeMultiscalarMul;
+
+#[allow(unused_imports)]
+use prelude::*;
+
+/// Implements a version of Pippenger's algorithm.
+///
+/// The algorithm works as follows:
+///
+/// Let `n` be a number of point-scalar pairs.
+/// Let `w` be a window of bits (6..8, chosen based on `n`, see cost factor).
+///
+/// 1. Prepare `2^(w-1) - 1` buckets with indices `[1..2^(w-1))` initialized with identity points.
+///    Bucket 0 is not needed as it would contain points multiplied by 0.
+/// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`.
+///    Note: only the last digit may equal `2^w/2`.
+/// 3. Starting with the last window, for each point `i=[0..n)` add it to a a bucket indexed by
+///    the point's scalar's value in the window.
+/// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each
+///    by their index. Efficient way of doing it is to start with the last bucket and compute two sums:
+///    intermediate sum from the last to the first, and the full sum made of all intermediate sums.
+/// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings.
+/// 6. Add to the return value.
+/// 7. Repeat the loop.
+///
+/// Approximate cost w/o wNAF optimizations (A = addition, D = doubling):
+///
+/// ```ascii
+/// cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w
+///          |          |       |     |   |
+///          |          |       |     |   looping over 256/w windows
+///          |          |       |     adding to the result
+///    sorting points   |       shifting the sum by w bits (to the next window, starting from last window)
+///    one by one       |
+///    into buckets     adding/subtracting all buckets
+///                     multiplied by their indexes
+///                     using a sum of intermediate sums
+/// ```
+///
+/// For large `n`, dominant factor is (n*256/w) additions.
+/// However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate.
+/// Therefore, the optimal choice of `w` grows slowly as `n` grows.
+///
+/// This algorithm is adapted from section 4 of https://eprint.iacr.org/2012/549.pdf.
+pub struct Pippenger;
+
+#[cfg(any(feature = "alloc", feature = "std"))]
+impl VartimeMultiscalarMul for Pippenger {
+    type Point = EdwardsPoint;
+
+    fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
+    where
+        I: IntoIterator,
+        I::Item: Borrow<Scalar>,
+        J: IntoIterator<Item = Option<EdwardsPoint>>,
+    {
+        use traits::Identity;
+
+        let mut scalars = scalars.into_iter();
+        let size = scalars.by_ref().size_hint().0;
+
+        // Digit width in bits. As digit width grows,
+        // number of point additions goes down, but amount of
+        // buckets and bucket additions grows exponentially.
+        let w = if size < 500 {
+            6
+        } else if size < 800 {
+            7
+        } else {
+            8
+        };
+
+        let max_digit: usize = 1 << w;
+        let digits_count: usize = (256 + w - 1) / w; // == ceil(256/w)
+        let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket
+
+        // Collect optimized scalars and points in buffers for repeated access
+        // (scanning the whole set per digit position).
+        let scalars = scalars
+            .into_iter()
+            .map(|s| s.borrow().to_radix_2w(w).0);
+
+        let points = points
+            .into_iter()
+            .map(|p| p.map(|P| P.to_projective_niels()));
+
+        let scalars_points = scalars.zip(points).map(|(s,maybe_p)| maybe_p.map(|p| (s,p) ) )
+            .collect::<Option<Vec<_>>>();
+        let scalars_points = match scalars_points {
+            Some(sp) => sp,
+            None => return None,
+        };
+
+        // Prepare 2^w/2 buckets.
+        // buckets[i] corresponds to a multiplication factor (i+1).
+        let mut buckets: Vec<_> = (0..buckets_count)
+            .map(|_| EdwardsPoint::identity())
+            .collect();
+
+        let mut columns = (0..digits_count).rev().map(|digit_index| {
+            // Clear the buckets when processing another digit.
+            for i in 0..buckets_count {
+                buckets[i] = EdwardsPoint::identity();
+            }
+
+            // Iterate over pairs of (point, scalar)
+            // and add/sub the point to the corresponding bucket.
+            // Note: if we add support for precomputed lookup tables,
+            // we'll be adding/subtracting point premultiplied by `digits[i]` to buckets[0].
+            for (digits, pt) in scalars_points.iter() {
+                let digit = digits[digit_index];
+                if digit > 0 {
+                    let b = (digit - 1) as usize;
+                    buckets[b] = (&buckets[b] + pt).to_extended();
+                } else if digit < 0 {
+                    let b = (-digit - 1) as usize;
+                    buckets[b] = (&buckets[b] - pt).to_extended();
+                }
+            }
+
+            // Add the buckets applying the multiplication factor to each bucket.
+            // The most efficient way to do that is to have a single sum with two running sums:
+            // an intermediate sum from last bucket to the first, and a sum of intermediate sums.
+            //
+            // For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
+            //   C
+            //   C B
+            //   C B A   Sum = C + (C+B) + (C+B+A)
+            let mut buckets_intermediate_sum = buckets[buckets_count - 1];
+            let mut buckets_sum = buckets[buckets_count - 1];
+            for i in (0..(buckets_count - 1)).rev() {
+                buckets_intermediate_sum += buckets[i];
+                buckets_sum += buckets_intermediate_sum;
+            }
+
+            buckets_sum
+        });
+
+        // Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
+        // `unwrap()` always succeeds because we know we have more than zero digits.
+        let hi_column = columns.next().unwrap();
+
+        Some(
+            columns
+                .fold(hi_column, |total, p| total.mul_by_pow_2(w as u32) + p)
+                .into(),
+        )
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use constants;
+    use scalar::Scalar;
+
+    #[test]
+    fn test_vartime_pippenger() {
+        // Reuse points across different tests
+        let mut n = 512;
+        let x = Scalar::from(2128506u64).invert();
+        let y = Scalar::from(4443282u64).invert();
+        let points: Vec<_> = (0..n)
+            .map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
+            .collect();
+        let scalars: Vec<_> = (0..n)
+            .map(|i| x + (Scalar::from(i as u64) * y)) // fast way to make ~random but deterministic scalars
+            .collect();
+
+        let premultiplied: Vec<EdwardsPoint> = scalars
+            .iter()
+            .zip(points.iter())
+            .map(|(sc, pt)| sc * pt)
+            .collect();
+
+        while n > 0 {
+            let scalars = &scalars[0..n].to_vec();
+            let points = &points[0..n].to_vec();
+            let control: EdwardsPoint = premultiplied[0..n].iter().sum();
+
+            let subject = Pippenger::vartime_multiscalar_mul(scalars.clone(), points.clone());
+
+            assert_eq!(subject.compress(), control.compress());
+
+            n = n / 2;
+        }
+    }
+}
diff --git a/src/backend/vector/scalar_mul/mod.rs b/src/backend/vector/scalar_mul/mod.rs
@@ -17,3 +17,6 @@ pub mod straus;
 
 #[cfg(feature = "alloc")]
 pub mod precomputed_straus;
+
+#[cfg(feature = "alloc")]
+pub mod pippenger;
diff --git a/src/backend/vector/scalar_mul/pippenger.rs b/src/backend/vector/scalar_mul/pippenger.rs
@@ -0,0 +1,165 @@
+// -*- mode: rust; -*-
+//
+// This file is part of curve25519-dalek.
+// Copyright (c) 2019 Oleg Andreev
+// See LICENSE for licensing information.
+//
+// Authors:
+// - Oleg Andreev <[email protected]>
+
+#![allow(non_snake_case)]
+
+use core::borrow::Borrow;
+
+use backend::vector::{CachedPoint, ExtendedPoint};
+use edwards::EdwardsPoint;
+use scalar::Scalar;
+use traits::{Identity, VartimeMultiscalarMul};
+
+#[allow(unused_imports)]
+use prelude::*;
+
+/// Implements a version of Pippenger's algorithm.
+///
+/// See the documentation in the serial `scalar_mul::pippenger` module for details.
+pub struct Pippenger;
+
+#[cfg(any(feature = "alloc", feature = "std"))]
+impl VartimeMultiscalarMul for Pippenger {
+    type Point = EdwardsPoint;
+
+    fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
+    where
+        I: IntoIterator,
+        I::Item: Borrow<Scalar>,
+        J: IntoIterator<Item = Option<EdwardsPoint>>,
+    {
+        let mut scalars = scalars.into_iter();
+        let size = scalars.by_ref().size_hint().0;
+        let w = if size < 500 {
+            6
+        } else if size < 800 {
+            7
+        } else {
+            8
+        };
+
+        let max_digit: usize = 1 << w;
+        let digits_count: usize = (256 + w - 1) / w; // == ceil(256/w)
+        let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket
+
+        // Collect optimized scalars and points in a buffer for repeated access
+        // (scanning the whole collection per each digit position).
+        let scalars = scalars
+            .into_iter()
+            .map(|s| s.borrow().to_radix_2w(w).0);
+
+        let points = points
+            .into_iter()
+            .map(|p| p.map(|P| CachedPoint::from(ExtendedPoint::from(P))));
+
+        let scalars_points = scalars.zip(points).map(|(s,maybe_p)| maybe_p.map(|p| (s,p) ) )
+            .collect::<Option<Vec<_>>>();
+        let scalars_points = match scalars_points {
+            Some(sp) => sp,
+            None => return None,
+        };
+
+        // Prepare 2^w/2 buckets.
+        // buckets[i] corresponds to a multiplication factor (i+1).
+        let mut buckets: Vec<ExtendedPoint> = (0..buckets_count)
+            .map(|_| ExtendedPoint::identity())
+            .collect();
+
+        let mut columns = (0..digits_count).rev().map(|digit_index| {
+            // Clear the buckets when processing another digit.
+            for i in 0..buckets_count {
+                buckets[i] = ExtendedPoint::identity();
+            }
+
+            // Iterate over pairs of (point, scalar)
+            // and add/sub the point to the corresponding bucket.
+            // Note: if we add support for precomputed lookup tables,
+            // we'll be adding/subtractiong point premultiplied by `digits[i]` to buckets[0].
+            for (digits, pt) in scalars_points.iter() {
+                let digit = digits[digit_index];
+                if digit > 0 {
+                    let b = (digit - 1) as usize;
+                    buckets[b] = &buckets[b] + pt;
+                } else if digit < 0 {
+                    let b = (-digit - 1) as usize;
+                    buckets[b] = &buckets[b] - pt;
+                }
+            }
+
+            // Add the buckets applying the multiplication factor to each bucket.
+            // The most efficient way to do that is to have a single sum with two running sums:
+            // an intermediate sum from last bucket to the first, and a sum of intermediate sums.
+            //
+            // For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
+            //   C
+            //   C B
+            //   C B A   Sum = C + (C+B) + (C+B+A)
+            let mut buckets_intermediate_sum = buckets[buckets_count - 1];
+            let mut buckets_sum = buckets[buckets_count - 1];
+            for i in (0..(buckets_count - 1)).rev() {
+                buckets_intermediate_sum =
+                    &buckets_intermediate_sum + &CachedPoint::from(buckets[i]);
+                buckets_sum = &buckets_sum + &CachedPoint::from(buckets_intermediate_sum);
+            }
+
+            buckets_sum
+        });
+
+        // Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
+        // `unwrap()` always succeeds because we know we have more than zero digits.
+        let hi_column = columns.next().unwrap();
+
+        Some(
+            columns
+                .fold(hi_column, |total, p| {
+                    &total.mul_by_pow_2(w as u32) + &CachedPoint::from(p)
+                })
+                .into(),
+        )
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use constants;
+    use scalar::Scalar;
+
+    #[test]
+    fn test_vartime_pippenger() {
+        // Reuse points across different tests
+        let mut n = 512;
+        let x = Scalar::from(2128506u64).invert();
+        let y = Scalar::from(4443282u64).invert();
+        let points: Vec<_> = (0..n)
+            .map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
+            .collect();
+        let scalars: Vec<_> = (0..n)
+            .map(|i| x + (Scalar::from(i as u64) * y)) // fast way to make ~random but deterministic scalars
+            .collect();
+
+        let premultiplied: Vec<EdwardsPoint> = scalars
+            .iter()
+            .zip(points.iter())
+            .map(|(sc, pt)| sc * pt)
+            .collect();
+
+        while n > 0 {
+            let scalars = &scalars[0..n].to_vec();
+            let points = &points[0..n].to_vec();
+            let control: EdwardsPoint = premultiplied[0..n].iter().sum();
+
+            let subject = Pippenger::vartime_multiscalar_mul(scalars.clone(), points.clone());
+
+            assert_eq!(subject.compress(), control.compress());
+
+            n = n / 2;
+        }
+    }
+}