Double-Precision towering (#155)

* consistent naming for dbl-width
* Isolate double-width Fp2 mul
* Implement double-width complex multiplication
* Lay out Fp4 double-width mul
* Off by p in square Fp4 as well :/
* less copies and stack space in addition chains
* Address #154 partly
* Fix #154, faster Fp4 square: less non-residue, no Mul, only square (bit more ops total)
* Fix typo
* better assembly scheduling for add/sub
* Double-width -> Double-precision
* Unred -> Unr
* double-precision modular addition
* Replace canUseNoCarryMontyMul and canUseNoCarryMontySquare by getSpareBits
* Complete the double-precision implementation
* Use double-precision path for Fp4 squaring and mul
* remove mixin annotations
* Lazy reduction in Fp4 prod
* Fix assembly for sum2xMod
* Assembly for double-precision negation
* reduce white spaces in pairing benchmarks
* ADX implies BMI2
mratsim · Feb 9, 2021 · 5806cc4 · 5806cc4
1 parent 491b4d4
commit 5806cc4
Show file tree

Hide file tree

Showing 31 changed files with 1,572 additions and 699 deletions.
diff --git a/benchmarks/bench_blueprint.nim b/benchmarks/bench_blueprint.nim
@@ -88,7 +88,6 @@ proc notes*() =
   echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
   echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
   echo "  - The simplest operations might be optimized away by the compiler."
-  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 
 template measure*(iters: int,
                startTime, stopTime: untyped,

diff --git a/benchmarks/bench_fp_double_width.nim → benchmarks/bench_fp_double_precision.nim b/benchmarks/bench_fp_double_width.nim → benchmarks/bench_fp_double_precision.nim
@@ -89,11 +89,13 @@ proc notes*() =
   echo "Notes:"
   echo "  - Compilers:"
   echo "    Compilers are severely limited on multiprecision arithmetic."
-  echo "    Inline Assembly is used by default (nimble bench_fp)."
-  echo "    Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
+  echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
   echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
+  echo "    GCC also seems to have issues with large temporaries and register spilling."
+  echo "    This is somewhat alleviated by Constantine compile-time assembler."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
+  echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
   echo "  - The simplest operations might be optimized away by the compiler."
-  echo "  - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"
 
 template bench(op: string, desc: string, iters: int, body: untyped): untyped =
   let start = getMonotime()
@@ -121,12 +123,12 @@ func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
   for i in 0 ..< aHi.mres.limbs.len:
     a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]
 
-proc sumNoReduce(T: typedesc, iters: int) =
+proc sumUnr(T: typedesc, iters: int) =
   var r: T
   let a = rng.random_unsafe(T)
   let b = rng.random_unsafe(T)
-  bench("Addition no reduce", $T, iters):
-    r.sumNoReduce(a, b)
+  bench("Addition unreduced", $T, iters):
+    r.sumUnr(a, b)
 
 proc sum(T: typedesc, iters: int) =
   var r: T
@@ -135,12 +137,12 @@ proc sum(T: typedesc, iters: int) =
   bench("Addition", $T, iters):
     r.sum(a, b)
 
-proc diffNoReduce(T: typedesc, iters: int) =
+proc diffUnr(T: typedesc, iters: int) =
   var r: T
   let a = rng.random_unsafe(T)
   let b = rng.random_unsafe(T)
-  bench("Substraction no reduce", $T, iters):
-    r.diffNoReduce(a, b)
+  bench("Substraction unreduced", $T, iters):
+    r.diffUnr(a, b)
 
 proc diff(T: typedesc, iters: int) =
   var r: T
@@ -149,52 +151,86 @@ proc diff(T: typedesc, iters: int) =
   bench("Substraction", $T, iters):
     r.diff(a, b)
 
-proc diff2xNoReduce(T: typedesc, iters: int) =
-  var r, a, b: doubleWidth(T)
+proc neg(T: typedesc, iters: int) =
+  var r: T
+  let a = rng.random_unsafe(T)
+  bench("Negation", $T, iters):
+    r.neg(a)
+
+proc sum2xUnreduce(T: typedesc, iters: int) =
+  var r, a, b: doublePrec(T)
+  rng.random_unsafe(r, T)
+  rng.random_unsafe(a, T)
+  rng.random_unsafe(b, T)
+  bench("Addition 2x unreduced", $doublePrec(T), iters):
+    r.sum2xUnr(a, b)
+
+proc sum2x(T: typedesc, iters: int) =
+  var r, a, b: doublePrec(T)
+  rng.random_unsafe(r, T)
+  rng.random_unsafe(a, T)
+  rng.random_unsafe(b, T)
+  bench("Addition 2x reduced", $doublePrec(T), iters):
+    r.sum2xMod(a, b)
+
+proc diff2xUnreduce(T: typedesc, iters: int) =
+  var r, a, b: doublePrec(T)
   rng.random_unsafe(r, T)
   rng.random_unsafe(a, T)
   rng.random_unsafe(b, T)
-  bench("Substraction 2x no reduce", $doubleWidth(T), iters):
-    r.diffNoReduce(a, b)
+  bench("Substraction 2x unreduced", $doublePrec(T), iters):
+    r.diff2xUnr(a, b)
 
 proc diff2x(T: typedesc, iters: int) =
-  var r, a, b: doubleWidth(T)
+  var r, a, b: doublePrec(T)
   rng.random_unsafe(r, T)
   rng.random_unsafe(a, T)
   rng.random_unsafe(b, T)
-  bench("Substraction 2x", $doubleWidth(T), iters):
-    r.diff(a, b)
+  bench("Substraction 2x reduced", $doublePrec(T), iters):
+    r.diff2xMod(a, b)
 
-proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
+proc neg2x(T: typedesc, iters: int) =
+  var r, a: doublePrec(T)
+  rng.random_unsafe(a, T)
+  bench("Negation 2x reduced", $doublePrec(T), iters):
+    r.neg2xMod(a)
+
+proc prod2xBench*(rLen, aLen, bLen: static int, iters: int) =
   var r: BigInt[rLen]
   let a = rng.random_unsafe(BigInt[aLen])
   let b = rng.random_unsafe(BigInt[bLen])
-  bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
+  bench("Multiplication 2x", $rLen & " <- " & $aLen & " x " & $bLen, iters):
     r.prod(a, b)
 
 proc square2xBench*(rLen, aLen: static int, iters: int) =
   var r: BigInt[rLen]
   let a = rng.random_unsafe(BigInt[aLen])
-  bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
+  bench("Squaring 2x", $rLen & " <- " & $aLen & "²", iters):
     r.square(a)
 
 proc reduce2x*(T: typedesc, iters: int) =
   var r: T
-  var t: doubleWidth(T)
+  var t: doublePrec(T)
   rng.random_unsafe(t, T)
 
-  bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters):
-    r.reduce(t)
+  bench("Redc 2x", $T & " <- " & $doublePrec(T), iters):
+    r.redc2x(t)
 
 proc main() =
   separator()
-  sumNoReduce(Fp[BLS12_381], iters = 10_000_000)
-  diffNoReduce(Fp[BLS12_381], iters = 10_000_000)
   sum(Fp[BLS12_381], iters = 10_000_000)
+  sumUnr(Fp[BLS12_381], iters = 10_000_000)
   diff(Fp[BLS12_381], iters = 10_000_000)
+  diffUnr(Fp[BLS12_381], iters = 10_000_000)
+  neg(Fp[BLS12_381], iters = 10_000_000)
+  separator()
+  sum2x(Fp[BLS12_381], iters = 10_000_000)
+  sum2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
   diff2x(Fp[BLS12_381], iters = 10_000_000)
-  diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
-  mul2xBench(768, 384, 384, iters = 10_000_000)
+  diff2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
+  neg2x(Fp[BLS12_381], iters = 10_000_000)
+  separator()
+  prod2xBench(768, 384, 384, iters = 10_000_000)
   square2xBench(768, 384, iters = 10_000_000)
   reduce2x(Fp[BLS12_381], iters = 10_000_000)
   separator()

diff --git a/benchmarks/bench_pairing_template.nim b/benchmarks/bench_pairing_template.nim
@@ -32,15 +32,15 @@ import
   ./bench_blueprint
 
 export notes
-proc separator*() = separator(177)
+proc separator*() = separator(132)
 
 proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
   let ns = inNanoseconds((stopTime-startTime) div iters)
   let throughput = 1e9 / float64(ns)
   when SupportsGetTicks:
-    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+    echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
   else:
-    echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+    echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 
 template bench(op: string, C: static Curve, iters: int, body: untyped): untyped =
   measure(iters, startTime, stopTime, startClk, stopClk, body)

diff --git a/constantine.nimble b/constantine.nimble
@@ -43,13 +43,14 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
   ("tests/t_finite_fields_powinv.nim", false),
   ("tests/t_finite_fields_vs_gmp.nim", true),
   ("tests/t_fp_cubic_root.nim", false),
-  # Double-width finite fields
+  # Double-precision finite fields
   # ----------------------------------------------------------
-  ("tests/t_finite_fields_double_width.nim", false),
+  ("tests/t_finite_fields_double_precision.nim", false),
   # Towers of extension fields
   # ----------------------------------------------------------
   ("tests/t_fp2.nim", false),
   ("tests/t_fp2_sqrt.nim", false),
+  ("tests/t_fp4.nim", false),
   ("tests/t_fp6_bn254_snarks.nim", false),
   ("tests/t_fp6_bls12_377.nim", false),
   ("tests/t_fp6_bls12_381.nim", false),
@@ -259,7 +260,7 @@ proc buildAllBenches() =
   echo "\n\n------------------------------------------------------\n"
   echo "Building benchmarks to ensure they stay relevant ..."
   buildBench("bench_fp")
-  buildBench("bench_fp_double_width")
+  buildBench("bench_fp_double_precision")
   buildBench("bench_fp2")
   buildBench("bench_fp6")
   buildBench("bench_fp12")
@@ -400,19 +401,19 @@ task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
   runBench("bench_fp", "clang", useAsm = false)
 
 task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
-  runBench("bench_fp_double_width")
+  runBench("bench_fp_double_precision")
 
 task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
-  runBench("bench_fp_double_width", "gcc")
+  runBench("bench_fp_double_precision", "gcc")
 
 task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
-  runBench("bench_fp_double_width", "clang")
+  runBench("bench_fp_double_precision", "clang")
 
 task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
-  runBench("bench_fp_double_width", "gcc", useAsm = false)
+  runBench("bench_fp_double_precision", "gcc", useAsm = false)
 
 task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
-  runBench("bench_fp_double_width", "clang", useAsm = false)
+  runBench("bench_fp_double_precision", "clang", useAsm = false)
 
 task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
   runBench("bench_fp2")

diff --git a/constantine/arithmetic.nim b/constantine/arithmetic.nim
@@ -12,7 +12,7 @@ import
     finite_fields,
     finite_fields_inversion,
     finite_fields_square_root,
-    finite_fields_double_width
+    finite_fields_double_precision
   ]
 
 export
@@ -21,4 +21,4 @@ export
   finite_fields,
   finite_fields_inversion,
   finite_fields_square_root,
-  finite_fields_double_width
+  finite_fields_double_precision