Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Double-Precision towering #155

Merged
merged 22 commits into from
Feb 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion benchmarks/bench_blueprint.nim
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ proc notes*() =
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
echo " - The simplest operations might be optimized away by the compiler."
echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template measure*(iters: int,
startTime, stopTime: untyped,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,13 @@ proc notes*() =
echo "Notes:"
echo " - Compilers:"
echo " Compilers are severely limited on multiprecision arithmetic."
echo " Inline Assembly is used by default (nimble bench_fp)."
echo " Bench without assembly can use \"nimble bench_fp_gcc\" or \"nimble bench_fp_clang\"."
echo " Constantine compile-time assembler is used by default (nimble bench_fp)."
echo " GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
echo " GCC also seems to have issues with large temporaries and register spilling."
echo " This is somewhat alleviated by Constantine compile-time assembler."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
echo " Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
echo " - The simplest operations might be optimized away by the compiler."
echo " - Fast Squaring and Fast Multiplication are possible if there are spare bits in the prime representation (i.e. the prime uses 254 bits out of 256 bits)"

template bench(op: string, desc: string, iters: int, body: untyped): untyped =
let start = getMonotime()
Expand Down Expand Up @@ -121,12 +123,12 @@ func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
for i in 0 ..< aHi.mres.limbs.len:
a.limbs2x[aLo.mres.limbs.len+i] = aHi.mres.limbs[i]

proc sumNoReduce(T: typedesc, iters: int) =
proc sumUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Addition no reduce", $T, iters):
r.sumNoReduce(a, b)
bench("Addition unreduced", $T, iters):
r.sumUnr(a, b)

proc sum(T: typedesc, iters: int) =
var r: T
Expand All @@ -135,12 +137,12 @@ proc sum(T: typedesc, iters: int) =
bench("Addition", $T, iters):
r.sum(a, b)

proc diffNoReduce(T: typedesc, iters: int) =
proc diffUnr(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
let b = rng.random_unsafe(T)
bench("Substraction no reduce", $T, iters):
r.diffNoReduce(a, b)
bench("Substraction unreduced", $T, iters):
r.diffUnr(a, b)

proc diff(T: typedesc, iters: int) =
var r: T
Expand All @@ -149,52 +151,86 @@ proc diff(T: typedesc, iters: int) =
bench("Substraction", $T, iters):
r.diff(a, b)

proc diff2xNoReduce(T: typedesc, iters: int) =
var r, a, b: doubleWidth(T)
# Benchmark entry for the `neg` operation on type `T`.
#   T     - type under test; also stringified (`$T`) as the report description.
#   iters - iteration count forwarded to the `bench` template.
# Declares a result `r`, draws one operand `a` via `rng.random_unsafe(T)`
# (`rng` is declared outside this excerpt), then passes `r.neg(a)` to `bench`
# under the label "Negation".
# NOTE(review): indentation is missing in this excerpt; the lines after the
# header are presumably the proc body and `r.neg(a)` the body of the
# `bench(...)` call - confirm against the real file.
proc neg(T: typedesc, iters: int) =
var r: T
let a = rng.random_unsafe(T)
bench("Negation", $T, iters):
# Called with two values (`r`, `a`), so this presumably resolves to the
# arithmetic `neg` routine rather than this (typedesc, int) benchmark proc.
r.neg(a)

# Benchmark entry for `sum2xUnr` on the `doublePrec(T)` type.
#   T     - base type; operands and result are declared as `doublePrec(T)`.
#   iters - iteration count forwarded to the `bench` template.
# `r`, `a` and `b` are each filled through `rng.random_unsafe(x, T)`, in that
# order, before `r.sum2xUnr(a, b)` is passed to `bench` under the label
# "Addition 2x unreduced" with `$doublePrec(T)` as description.
# NOTE(review): "unreduced" presumably means no modular reduction is applied
# to the sum; the semantics of `sum2xUnr` are not visible here - confirm.
# NOTE(review): indentation is missing in this excerpt; `r.sum2xUnr(a, b)` is
# presumably the body of the `bench(...)` call.
proc sum2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
# `r` is randomized as well, although it is only used as the destination below.
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x unreduced", $doublePrec(T), iters):
r.sum2xUnr(a, b)

# Benchmark entry for `sum2xMod` on the `doublePrec(T)` type.
#   T     - base type; operands and result are declared as `doublePrec(T)`.
#   iters - iteration count forwarded to the `bench` template.
# `r`, `a` and `b` are each filled through `rng.random_unsafe(x, T)`, in that
# order, before `r.sum2xMod(a, b)` is passed to `bench` under the label
# "Addition 2x reduced" with `$doublePrec(T)` as description.
# NOTE(review): the "Mod"/"reduced" wording suggests a modular reduction of
# the double-precision sum; `sum2xMod` itself is not visible here - confirm.
# NOTE(review): indentation is missing in this excerpt; `r.sum2xMod(a, b)` is
# presumably the body of the `bench(...)` call.
proc sum2x(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
# `r` is randomized as well, although it is only used as the destination below.
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Addition 2x reduced", $doublePrec(T), iters):
r.sum2xMod(a, b)

proc diff2xUnreduce(T: typedesc, iters: int) =
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x no reduce", $doubleWidth(T), iters):
r.diffNoReduce(a, b)
bench("Substraction 2x unreduced", $doublePrec(T), iters):
r.diff2xUnr(a, b)

proc diff2x(T: typedesc, iters: int) =
var r, a, b: doubleWidth(T)
var r, a, b: doublePrec(T)
rng.random_unsafe(r, T)
rng.random_unsafe(a, T)
rng.random_unsafe(b, T)
bench("Substraction 2x", $doubleWidth(T), iters):
r.diff(a, b)
bench("Substraction 2x reduced", $doublePrec(T), iters):
r.diff2xMod(a, b)

proc mul2xBench*(rLen, aLen, bLen: static int, iters: int) =
# Benchmark entry for `neg2xMod` on the `doublePrec(T)` type.
#   T     - base type; operand and result are declared as `doublePrec(T)`.
#   iters - iteration count forwarded to the `bench` template.
# Only the operand `a` is filled through `rng.random_unsafe(a, T)`; `r` keeps
# its default-initialized value until `r.neg2xMod(a)` writes to it. The call is
# passed to `bench` under the label "Negation 2x reduced" with
# `$doublePrec(T)` as description.
# NOTE(review): indentation is missing in this excerpt; `r.neg2xMod(a)` is
# presumably the body of the `bench(...)` call - confirm against the real file.
proc neg2x(T: typedesc, iters: int) =
var r, a: doublePrec(T)
rng.random_unsafe(a, T)
bench("Negation 2x reduced", $doublePrec(T), iters):
r.neg2xMod(a)

proc prod2xBench*(rLen, aLen, bLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
let b = rng.random_unsafe(BigInt[bLen])
bench("Multiplication", $rLen & " <- " & $aLen & " x " & $bLen, iters):
bench("Multiplication 2x", $rLen & " <- " & $aLen & " x " & $bLen, iters):
r.prod(a, b)

proc square2xBench*(rLen, aLen: static int, iters: int) =
var r: BigInt[rLen]
let a = rng.random_unsafe(BigInt[aLen])
bench("Squaring", $rLen & " <- " & $aLen & "²", iters):
bench("Squaring 2x", $rLen & " <- " & $aLen & "²", iters):
r.square(a)

proc reduce2x*(T: typedesc, iters: int) =
var r: T
var t: doubleWidth(T)
var t: doublePrec(T)
rng.random_unsafe(t, T)

bench("Reduce 2x-width", $T & " <- " & $doubleWidth(T), iters):
r.reduce(t)
bench("Redc 2x", $T & " <- " & $doublePrec(T), iters):
r.redc2x(t)

proc main() =
separator()
sumNoReduce(Fp[BLS12_381], iters = 10_000_000)
diffNoReduce(Fp[BLS12_381], iters = 10_000_000)
sum(Fp[BLS12_381], iters = 10_000_000)
sumUnr(Fp[BLS12_381], iters = 10_000_000)
diff(Fp[BLS12_381], iters = 10_000_000)
diffUnr(Fp[BLS12_381], iters = 10_000_000)
neg(Fp[BLS12_381], iters = 10_000_000)
separator()
sum2x(Fp[BLS12_381], iters = 10_000_000)
sum2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
diff2x(Fp[BLS12_381], iters = 10_000_000)
diff2xNoReduce(Fp[BLS12_381], iters = 10_000_000)
mul2xBench(768, 384, 384, iters = 10_000_000)
diff2xUnreduce(Fp[BLS12_381], iters = 10_000_000)
neg2x(Fp[BLS12_381], iters = 10_000_000)
separator()
prod2xBench(768, 384, 384, iters = 10_000_000)
square2xBench(768, 384, iters = 10_000_000)
reduce2x(Fp[BLS12_381], iters = 10_000_000)
separator()
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/bench_pairing_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ import
./bench_blueprint

export notes
proc separator*() = separator(177)
proc separator*() = separator(132)

proc report(op, curve: string, startTime, stopTime: MonoTime, startClk, stopClk: int64, iters: int) =
let ns = inNanoseconds((stopTime-startTime) div iters)
let throughput = 1e9 / float64(ns)
when SupportsGetTicks:
echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
else:
echo &"{op:<60} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op"
echo &"{op:<40} {curve:<15} {throughput:>15.3f} ops/s {ns:>9} ns/op"

template bench(op: string, C: static Curve, iters: int, body: untyped): untyped =
measure(iters, startTime, stopTime, startClk, stopClk, body)
Expand Down
17 changes: 9 additions & 8 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
("tests/t_finite_fields_powinv.nim", false),
("tests/t_finite_fields_vs_gmp.nim", true),
("tests/t_fp_cubic_root.nim", false),
# Double-width finite fields
# Double-precision finite fields
# ----------------------------------------------------------
("tests/t_finite_fields_double_width.nim", false),
("tests/t_finite_fields_double_precision.nim", false),
# Towers of extension fields
# ----------------------------------------------------------
("tests/t_fp2.nim", false),
("tests/t_fp2_sqrt.nim", false),
("tests/t_fp4.nim", false),
("tests/t_fp6_bn254_snarks.nim", false),
("tests/t_fp6_bls12_377.nim", false),
("tests/t_fp6_bls12_381.nim", false),
Expand Down Expand Up @@ -259,7 +260,7 @@ proc buildAllBenches() =
echo "\n\n------------------------------------------------------\n"
echo "Building benchmarks to ensure they stay relevant ..."
buildBench("bench_fp")
buildBench("bench_fp_double_width")
buildBench("bench_fp_double_precision")
buildBench("bench_fp2")
buildBench("bench_fp6")
buildBench("bench_fp12")
Expand Down Expand Up @@ -400,19 +401,19 @@ task bench_fp_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp", "clang", useAsm = false)

task bench_fpdbl, "Run benchmark 𝔽pDbl with your default compiler":
runBench("bench_fp_double_width")
runBench("bench_fp_double_precision")

task bench_fpdbl_gcc, "Run benchmark 𝔽p with gcc":
runBench("bench_fp_double_width", "gcc")
runBench("bench_fp_double_precision", "gcc")

task bench_fpdbl_clang, "Run benchmark 𝔽p with clang":
runBench("bench_fp_double_width", "clang")
runBench("bench_fp_double_precision", "clang")

task bench_fpdbl_gcc_noasm, "Run benchmark 𝔽p with gcc - no Assembly":
runBench("bench_fp_double_width", "gcc", useAsm = false)
runBench("bench_fp_double_precision", "gcc", useAsm = false)

task bench_fpdbl_clang_noasm, "Run benchmark 𝔽p with clang - no Assembly":
runBench("bench_fp_double_width", "clang", useAsm = false)
runBench("bench_fp_double_precision", "clang", useAsm = false)

task bench_fp2, "Run benchmark with 𝔽p2 your default compiler":
runBench("bench_fp2")
Expand Down
4 changes: 2 additions & 2 deletions constantine/arithmetic.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import
finite_fields,
finite_fields_inversion,
finite_fields_square_root,
finite_fields_double_width
finite_fields_double_precision
]

export
Expand All @@ -21,4 +21,4 @@ export
finite_fields,
finite_fields_inversion,
finite_fields_square_root,
finite_fields_double_width
finite_fields_double_precision
Loading