Skip to content

Commit

Permalink
Reuse stint primitives for limbs
Browse files Browse the repository at this point in the history
This PR makes bncurve less slow by reusing stint integer primitives and
unrolling a few loops and arrays to avoid array length checks and the
like.

To give an idea, it brings the time to process 8k nimbus-eth1 blocks around
the 18M block height mark down from 24 to 16 minutes. This is quite
significant given that a lot of time in eth1 is spent reading the
database: it implies at least an order of magnitude of bncurve improvement,
and probably quite a lot more. Exactly how much doesn't greatly matter, but now
there's at least a decent baseline for any future performance work ;)

Of course, reusing private primitives from `stint` is not pretty - the
plan is to extract them to a separate library, work started in
status-im/nim-stew#187.
  • Loading branch information
arnetheduck committed Aug 8, 2024
1 parent 9c10dec commit 1920adc
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 104 deletions.
3 changes: 2 additions & 1 deletion bncurve.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ skipDirs = @["tests", "Nim", "nim"]
### Dependencies

requires "nim >= 1.6.0",
"nimcrypto"
"nimcrypto",
"stint"

task test, "Run all tests":
for tprog in @[
Expand Down
132 changes: 37 additions & 95 deletions bncurve/arith.nim
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ import options, endians
import nimcrypto/[utils, sysrand]
export options

{.deadCodeElim: on.}
# TODO replace private stint operations with an integer primitive library
import stint/private/primitives/[addcarry_subborrow, extended_precision]
import stint/private/datatypes

type
BNU256* = array[4, uint64]
Expand Down Expand Up @@ -68,16 +70,7 @@ proc getBit*(a: openArray[uint64], n: int): bool {.inline, noinit.} =
let bit = n - (part shl 6)
result = ((a[part] and (1'u64 shl bit)) != 0)

template splitU64(n: uint64, hi, lo: untyped) =
## Split 64bit unsigned integer to 32bit parts
hi = n shr 32
lo = n and 0xFFFF_FFFF'u64

template combineU64(hi, lo: untyped): uint64 =
## Combine 64bit unsigned integer from 32bit parts
(hi shl 32) or lo

proc div2*(a: var BNU256) {.inline.} =
proc div2(a: var BNU256) {.inline.} =
## Divide integer ``a`` in place by ``2``.
var t = a[3] shl 63
a[3] = a[3] shr 1
Expand All @@ -90,7 +83,7 @@ proc div2*(a: var BNU256) {.inline.} =
a[0] = a[0] shr 1
a[0] = a[0] or t

proc mul2*(a: var BNU256) {.inline.} =
proc mul2(a: var BNU256) {.inline.} =
## Multiply integer ``a`` in place by ``2``.
var last = 0'u64
for i in a.mitems():
Expand All @@ -99,92 +92,42 @@ proc mul2*(a: var BNU256) {.inline.} =
i = i or last
last = tmp

proc adc(a, b: uint64, carry: var uint64): uint64 {.inline, noinit.} =
## Calculate ``a + b`` and return result, set ``carry`` to addition
## operation carry.
var a0, a1, b0, b1, c, r0, r1: uint64
splitU64(a, a1, a0)
splitU64(b, b1, b0)
let tmp0 = a0 + b0 + carry
splitU64(tmp0, c, r0)
let tmp1 = a1 + b1 + c
splitU64(tmp1, c, r1)
carry = c
result = combineU64(r1, r0)

proc addNoCarry*(a: var BNU256, b: BNU256) {.inline.} =
proc addNoCarry(a: var BNU256, b: BNU256) {.inline.} =
## Calculate integer addition ``a = a + b``.
var carry = 0'u64
a[0] = adc(a[0], b[0], carry)
a[1] = adc(a[1], b[1], carry)
a[2] = adc(a[2], b[2], carry)
a[3] = adc(a[3], b[3], carry)
doAssert(carry == 0)
var carry: Carry
staticFor i, 0, 4:
addC(carry, a[i], a[i], b[i], carry)

proc subNoBorrow*(a: var BNU256, b: BNU256) {.inline.} =
proc subNoBorrow(a: var BNU256, b: BNU256) {.inline.} =
## Calculate integer subtraction ``a = a - b``.
proc sbb(a: uint64, b: uint64,
borrow: var uint64): uint64 {.inline, noinit.}=
var a0, a1, b0, b1, t0, r0, r1: uint64
splitU64(a, a1, a0)
splitU64(b, b1, b0)
let tmp0 = (1'u64 shl 32) + a0 - b0 - borrow
splitU64(tmp0, t0, r0)
let tmp1 = (1'u64 shl 32) + a1 - b1 - uint64(t0 == 0'u64)
splitU64(tmp1, t0, r1)
borrow = uint64(t0 == 0)
result = combineU64(r1, r0)
var borrow = 0'u64
a[0] = sbb(a[0], b[0], borrow)
a[1] = sbb(a[1], b[1], borrow)
a[2] = sbb(a[2], b[2], borrow)
a[3] = sbb(a[3], b[3], borrow)
doAssert(borrow == 0)

proc macDigit(acc: var openArray[uint64], pos: int, b: openArray[uint64],
c: uint64) =
proc macWithCarry(a, b, c: uint64, carry: var uint64): uint64 {.noinit.} =
var
bhi, blo, chi, clo, ahi, alo, carryhi, carrylo: uint64
xhi, xlo, yhi, ylo, zhi, zlo, rhi, rlo: uint64
splitU64(b, bhi, blo)
splitU64(c, chi, clo)
splitU64(a, ahi, alo)
splitU64(carry, carryhi, carrylo)
splitU64(blo * clo + alo + carrylo, xhi, xlo)
splitU64(blo * chi, yhi, ylo)
splitU64(bhi * clo, zhi, zlo)
splitU64(xhi + ylo + zlo + ahi + carryhi, rhi, rlo)
carry = (bhi * chi) + rhi + yhi + zhi
result = combineU64(rlo, xlo)
var borrow: Borrow
staticFor i, 0, 4:
subB(borrow, a[i], a[i], b[i], borrow)

proc macDigit[N, N2: static int](
acc: var array[N, uint64], pos: static int, b: array[N2, uint64], c: uint64) =
if c == 0'u64:
return

var carry = 0'u64
for i in pos..<len(acc):
if (i - pos) < len(b):
acc[i] = macWithCarry(acc[i], b[i - pos], c, carry)
elif carry != 0:
acc[i] = macWithCarry(acc[i], 0'u64, c, carry)

staticFor i, pos, N:
when (i - pos) < len(b):
muladd2(carry, acc[i], b[i-pos], c, acc[i], carry)
else:
break
doAssert(carry == 0)
muladd2(carry, acc[i], 0, c, acc[i], carry)

proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256,
inv: uint64) =
proc mulReduce(a: var BNU256, by: BNU256, modulus: BNU256, inv: uint64) =
var res: array[4 * 2, uint64]
var k: uint64
macDigit(res, 0, by, a[0])
macDigit(res, 1, by, a[1])
macDigit(res, 2, by, a[2])
macDigit(res, 3, by, a[3])
for i in 0..<4:
k = inv * res[i]
staticFor i, 0, 4:
macDigit(res, i, by, a[i])

staticFor i, 0, 4:
let k = inv * res[i]
macDigit(res, i, modulus, k)
a[0] = res[4]
a[1] = res[5]
a[2] = res[6]
a[3] = res[7]

staticFor i, 0, 4:
a[i] = res[i + 4]

proc compare*(a: BNU256, b: BNU256): int {.noinit, inline.}=
## Compare integers ``a`` and ``b``.
Expand Down Expand Up @@ -267,15 +210,14 @@ proc into*(t: typedesc[BNU512], c1: BNU256,
macDigit(result, 1, modulo, c1[1])
macDigit(result, 2, modulo, c1[2])
macDigit(result, 3, modulo, c1[3])
var carry = 0'u64
for i in 0..<len(result):
if len(c0) > i:
result[i] = adc(result[i], c0[i], carry)
elif carry != 0'u64:
result[i] = adc(result[i], 0'u64, carry)
var carry: Carry
staticFor i, 0, len(result):
when len(c0) > i:
addC(carry, result[i], result[i], c0[i], carry)
else:
break
doAssert(carry == 0'u64)
addC(carry, result[i], result[i], 0'u64, carry)

doAssert(carry == 0)

proc fromBytes*(dst: var BNU256, src: openArray[byte]): bool =
## Create 256bit integer from big-endian bytes representation ``src``.
Expand Down
2 changes: 0 additions & 2 deletions bncurve/fp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# those terms.
import arith, options

{.deadCodeElim: on.}

template fieldImplementation(finame, fimodulus, firsquared, fircubed,
fionep, fiinv: untyped): untyped {.dirty.} =
type finame* = distinct BNU256
Expand Down
2 changes: 0 additions & 2 deletions bncurve/fq12.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import options
import fq6, fq2, fp, arith

{.deadCodeElim: on.}

const frobeniusCoeffsC1: array[4, FQ2] = [
FQ2.one(),
FQ2(
Expand Down
2 changes: 0 additions & 2 deletions bncurve/fq2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import options
import fp, arith

{.deadCodeElim: on.}

type
FQ2* = object
c0*: FQ
Expand Down
2 changes: 0 additions & 2 deletions bncurve/fq6.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import options
import fq2, fp, arith

{.deadCodeElim: on.}

const frobeniusCoeffsC1: array[4, FQ2] = [
FQ2.one(),
FQ2(
Expand Down

0 comments on commit 1920adc

Please sign in to comment.