Skip to content

Commit

Permalink
Double-Precision towering (#155)
Browse files Browse the repository at this point in the history
* consistent naming for dbl-width

* Isolate double-width Fp2 mul

* Implement double-width complex multiplication

* Lay out Fp4 double-width mul

* Off by p in square Fp4 as well :/

* fewer copies and less stack space in addition chains

* Address #154 partly

* Fix #154, faster Fp4 square: fewer non-residue multiplications, no Mul, only squarings (a bit more ops in total)

* Fix typo

* better assembly scheduling for add/sub

* Double-width -> Double-precision

* Unred -> Unr

* double-precision modular addition

* Replace canUseNoCarryMontyMul and canUseNoCarryMontySquare by getSpareBits

* Complete the double-precision implementation

* Use double-precision path for Fp4 squaring and mul

* remove mixin annotations

* Lazy reduction in Fp4 prod

* Fix assembly for sum2xMod

* Assembly for double-precision negation

* reduce whitespace in pairing benchmarks

* ADX implies BMI2
  • Loading branch information
mratsim authored Feb 9, 2021
1 parent 491b4d4 commit 5806cc4
Show file tree
Hide file tree
Showing 31 changed files with 1,572 additions and 699 deletions.
1 change: 0 additions & 1 deletion benchmarks/bench_blueprint.nim
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ proc notes*() =
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
echo " - The simplest operations might be optimized away by the compiler."
echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template measure*(iters: int,
startTime, stopTime: untyped,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,13 @@ proc notes*() =
echo "Notes:"
echo " - Compilers:"
echo " Compilers are severely limited on multiprecision arithmetic."
echo " Inline Assembly is used by default (nimble bench_fp)."
echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
echo " Constantine compile-time assembler is used by default (nimble bench_fp)."
echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
echo " GCC also seems to have issues with large temporaries and register spilling."
echo " This is somewhat alleviated by Constantine compile-time assembler."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
echo " - The simplest operations might be optimized away by the compiler."
echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template bench(op: string, desc: string, iters: int, body: untyped): untyped =
let start = getMonotime()
Expand Down Expand Up @@ -121,12 +123,12 @@ func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
for i in 0 ..< aHi.mres.limbs.len:
a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]

proc sumNoReduce(T: typedesc, iters: int) =
proc sumUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Addition no reduce", $T, iters):
r.sumNoReduce(a, b)
bench("Addition unreduced", $T, iters):
r.sumUnr(a, b)

proc sum(T: typedesc, iters: int) =
var r: T
Expand All @@ -135,12 +137,12 @@ proc sum(T: typedesc, iters: int) =
bench("Addition", $T, iters):
r.sum(a, b)

proc diffNoReduce(T: typedesc, iters: int) =
proc diffUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Substraction no reduce", $T, iters):
r.diffNoReduce(a, b)
bench("Substraction unreduced", $T, iters):
r.diffUnr(a, b)

proc diff(T: typedesc, iters: int) =
var r: T
Expand All @@ -149,52 +151,86 @@ proc diff(T: typedesc, iters: int) =
bench("Substraction", $T, iters):
r.diff(a, b)

proc diff2xNoReduce(T: typedesc, iters: int) =
var r, a, b: doubleWidth(T)
proc neg(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
bench("Negation", $T, iters):
r.neg(a)

proc sum2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x unreduced", $doublePrec(T), iters):
r.sum2xUnr(a, b)

proc sum2x(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x reduced", $doublePrec(T), iters):
r.sum2xMod(a, b)

proc diff2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x no reduce", $doubleWidth(T), iters):
r.diffNoReduce(a, b)
bench("Substraction 2x unreduced", $doublePrec(T), iters):
r.diff2xUnr(a, b)

proc diff2x(T: typedesc, iters: int) =
var r, a, b: doubleWidth(T)
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x", $doubleWidth(T), iters):
r.diff(a, b)
bench("Substraction 2x reduced", $doublePrec(T), iters):
r.diff2xMod(a, b)

proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
proc neg2x(T: typedesc, iters: int) =
var r, a: doublePrec(T)
rng.random_unsafe(a, T)
bench("Negation 2x reduced", $doublePrec(T), iters):
r.neg2xMod(a)

proc prod2xBench*(rLen, aLen, bLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
let b = rng.random_unsafe(BigInt[bLen])
bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
bench("Multiplication 2x", $rLen & " <- " & $aLen & " x " & $bLen, iters):
r.prod(a, b)

proc square2xBench*(rLen, aLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
bench("Squaring 2x", $rLen & " <- " & $aLen & "²", iters):
r.square(a)

proc reduce2x*(T: typedesc, iters: int) =
var r: T
var t: doubleWidth(T)
var t: doublePrec(T)
rng.random_unsafe(t, T)

bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters):
r.reduce(t)
bench("Redc 2x", $T & " <- " & $doublePrec(T), iters):
r.redc2x(t)

proc main() =
separator()
sumNoReduce(Fp[BLS12_381], iters = 10_000_000)
diffNoReduce(Fp[BLS12_381], iters = 10_000_000)
sum(Fp[BLS12_381], iters = 10_000_000)
sumUnr(Fp[BLS12_381], iters = 10_000_000)
diff(Fp[BLS12_381], iters = 10_000_000)
diffUnr(Fp[BLS12_381], iters = 10_000_000)
neg(Fp[BLS12_381], iters = 10_000_000)
separator()
sum2x(Fp[BLS12_381], iters = 10_000_000)
sum2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
diff2x(Fp[BLS12_381], iters = 10_000_000)
diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
mul2xBench(768, 384, 384, iters = 10_000_000)
diff2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
neg2x(Fp[BLS12_381], iters = 10_000_000)
separator()
prod2xBench(768, 384, 384, iters = 10_000_000)
square2xBench(768, 384, iters = 10_000_000)
reduce2x(Fp[BLS12_381], iters = 10_000_000)
separator()
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/bench_pairing_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ import
./bench_blueprint

export notes
proc separator*() = separator(177)
proc separator*() = separator(132)

proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
let ns = inNanoseconds((stopTime-startTime) div iters)
let throughput = 1e9 / float64(ns)
when SupportsGetTicks:
echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
else:
echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op"
echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op"

template bench(op: string, C: static Curve, iters: int, body: untyped): untyped =
measure(iters, startTime, stopTime, startClk, stopClk, body)
Expand Down
17 changes: 9 additions & 8 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_finite_fields_powinv.nim", false),
("tests/t_finite_fields_vs_gmp.nim", true),
("tests/t_fp_cubic_root.nim", false),
# Double-width finite fields
# Double-precision finite fields
# ----------------------------------------------------------
("tests/t_finite_fields_double_width.nim", false),
("tests/t_finite_fields_double_precision.nim", false),
# Towers of extension fields
# ----------------------------------------------------------
("tests/t_fp2.nim", false),
("tests/t_fp2_sqrt.nim", false),
("tests/t_fp4.nim", false),
("tests/t_fp6_bn254_snarks.nim", false),
("tests/t_fp6_bls12_377.nim", false),
("tests/t_fp6_bls12_381.nim", false),
Expand Down Expand Up @@ -259,7 +260,7 @@ proc buildAllBenches() =
echo "\n\n------------------------------------------------------\n"
echo "Building benchmarks to ensure they stay relevant ..."
buildBench("bench_fp")
buildBench("bench_fp_double_width")
buildBench("bench_fp_double_precision")
buildBench("bench_fp2")
buildBench("bench_fp6")
buildBench("bench_fp12")
Expand Down Expand Up @@ -400,19 +401,19 @@ task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp", "clang", useAsm = false)

task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
runBench("bench_fp_double_width")
runBench("bench_fp_double_precision")

task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp_double_width", "gcc")
runBench("bench_fp_double_precision", "gcc")

task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp_double_width", "clang")
runBench("bench_fp_double_precision", "clang")

task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp_double_width", "gcc", useAsm = false)
runBench("bench_fp_double_precision", "gcc", useAsm = false)

task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp_double_width", "clang", useAsm = false)
runBench("bench_fp_double_precision", "clang", useAsm = false)

task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
runBench("bench_fp2")
Expand Down
4 changes: 2 additions & 2 deletions constantine/arithmetic.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import
finite_fields,
finite_fields_inversion,
finite_fields_square_root,
finite_fields_double_width
finite_fields_double_precision
]

export
Expand All @@ -21,4 +21,4 @@ export
finite_fields,
finite_fields_inversion,
finite_fields_square_root,
finite_fields_double_width
finite_fields_double_precision
Loading

0 comments on commit 5806cc4

Please sign in to comment.