gpu.cl

/*
 * OCLExplorer bitcoin addresses brute-force tool
 * Copyright (C) 2017 Stanislav V. Tretyakov <svtrostov@yandex.ru>
 *
 * Based on OpenCL script:
 * Vanitygen, vanity bitcoin address generator
 * Copyright (C) 2011 <samr7@cs.washington.edu>
 *
 * This file contains an OpenCL kernel for performing certain parts of
 * the bitcoin address calculation process.
 */

/* Byte-swapping and endianness */
#define bswap32(v) \
  (((v) >> 24) | (((v) >> 8) & 0xff00) | (((v) << 8) & 0xff0000) | ((v) << 24))

#if __ENDIAN_LITTLE__ != 1
#define load_le32(v) bswap32(v)
#define load_be32(v) (v)
#else
#define load_le32(v) (v)
#define load_be32(v) bswap32(v)
#endif

/*
 * Loop unrolling macros
 *
 * In most cases, preprocessor unrolling works best.
 * The exception is NVIDIA's compiler, which seems to take unreasonably
 * long to compile a loop with a larger iteration count, or a loop with
 * a body of >50 PTX instructions, with preprocessor unrolling.
 * However, it does not seem to take as long with pragma unroll, and
 * produces good output.
 */

/* Explicit loop unrolling */
#define unroll_5(a)          \
  do {                       \
    a(0) a(1) a(2) a(3) a(4) \
  } while (0)
#define unroll_8(a)                         \
  do {                                      \
    a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
  } while (0)
#define unroll_1_7(a)                  \
  do {                                 \
    a(1) a(2) a(3) a(4) a(5) a(6) a(7) \
  } while (0)
#define unroll_7(a)                    \
  do {                                 \
    a(0) a(1) a(2) a(3) a(4) a(5) a(6) \
  } while (0)
#define unroll_7_0(a)                       \
  do {                                      \
    a(7) a(6) a(5) a(4) a(3) a(2) a(1) a(0) \
  } while (0)
#define unroll_7_1(a)                  \
  do {                                 \
    a(7) a(6) a(5) a(4) a(3) a(2) a(1) \
  } while (0)
#define unroll_16(a)                                                          \
  do {                                                                        \
    a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) a(8) a(9) a(10) a(11) a(12) a(13) \
        a(14) a(15)                                                           \
  } while (0)
#define unroll_64(a)                                                          \
  do {                                                                        \
    a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) a(8) a(9) a(10) a(11) a(12) a(13) \
        a(14) a(15) a(16) a(17) a(18) a(19) a(20) a(21) a(22) a(23) a(24)     \
            a(25) a(26) a(27) a(28) a(29) a(30) a(31) a(32) a(33) a(34) a(35) \
                a(36) a(37) a(38) a(39) a(40) a(41) a(42) a(43) a(44) a(45)   \
                    a(46) a(47) a(48) a(49) a(50) a(51) a(52) a(53) a(54)     \
                        a(55) a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \
  } while (0)

/* Conditional loop unrolling */
#if defined(DEEP_PREPROC_UNROLL)
#define iter_5(a) unroll_5(a)
#define iter_8(a) unroll_8(a)
#define iter_16(a) unroll_16(a)
#define iter_64(a) unroll_64(a)
#else
#define iter_5(a)                \
  do {                           \
    int _i;                      \
    for (_i = 0; _i < 5; _i++) { \
      a(_i)                      \
    }                            \
  } while (0)
#define iter_8(a)                \
  do {                           \
    int _i;                      \
    for (_i = 0; _i < 8; _i++) { \
      a(_i)                      \
    }                            \
  } while (0)
#define iter_16(a)                \
  do {                            \
    int _i;                       \
    for (_i = 0; _i < 16; _i++) { \
      a(_i)                       \
    }                             \
  } while (0)
#define iter_64(a)                \
  do {                            \
    int _i;                       \
    for (_i = 0; _i < 64; _i++) { \
      a(_i)                       \
    }                             \
  } while (0)
#endif

/*
* BIGNUM mini-library
* This module deals with fixed-size 256-bit bignums.
* Where modular arithmetic is performed, the SECP256k1 prime
* modulus (below) is assumed.
*
* Methods include:
* - bn_is_zero/bn_is_one/bn_is_odd/bn_is_even/bn_is_bit_set
* - bn_rshift[1]/bn_lshift[1]
* - bn_neg
* - bn_uadd/bn_uadd_p
* - bn_usub/bn_usub_p
*/

typedef uint bn_word;
#define BN_NBITS 256
#define BN_WSHIFT 5
#define BN_WBITS (1 << BN_WSHIFT)
#define BN_NWORDS ((BN_NBITS / 8) / sizeof(bn_word))
#define BN_WORDMAX 0xffffffff

#define MODULUS_BYTES                                                     \
  0xfffffc2f, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
      0xffffffff, 0xffffffff

typedef struct { bn_word d[BN_NWORDS]; } bignum;

__constant bn_word modulus[] = {MODULUS_BYTES};
__constant bignum bn_zero = {{0}};

__constant bn_word mont_rr[BN_NWORDS] = {
    0xe90a1, 0x7a2, 0x1, 0,
};
__constant bn_word mont_n0[2] = {0xd2253531, 0xd838091d};

#define bn_is_odd(bn) (bn.d[0] & 1)
#define bn_is_even(bn) (!bn_is_odd(bn))
#define bn_is_zero(bn)                                                     \
  (!bn.d[0] && !bn.d[1] && !bn.d[2] && !bn.d[3] && !bn.d[4] && !bn.d[5] && \
   !bn.d[6] && !bn.d[7])
#define bn_is_one(bn)                                                \
  ((bn.d[0] == 1) && !bn.d[1] && !bn.d[2] && !bn.d[3] && !bn.d[4] && \
   !bn.d[5] && !bn.d[6] && !bn.d[7])
#define bn_is_bit_set(bn, n) \
  ((((bn_word *)&bn)[n >> BN_WSHIFT]) & (1 << (n & (BN_WBITS - 1))))

#define bn_unroll(e) unroll_8(e)
#define bn_unroll_sf(e) unroll_1_7(e)
#define bn_unroll_sl(e) unroll_7(e)
#define bn_unroll_reverse(e) unroll_7_0(e)
#define bn_unroll_reverse_sl(e) unroll_7_1(e)

#define bn_unroll_arg(e, arg)                                           \
  e(arg, 0) e(arg, 1) e(arg, 2) e(arg, 3) e(arg, 4) e(arg, 5) e(arg, 6) \
      e(arg, 7)
#define bn_unroll_arg_sf(e, arg) \
  e(arg, 1) e(arg, 2) e(arg, 3) e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7)

#define bn_iter(e) iter_8(e)

/*
 * Bitwise shift
 */

void bn_lshift1(bignum *bn)
{
#define bn_lshift1_inner1(i) bn->d[i] = (bn->d[i] << 1) | (bn->d[i - 1] >> 31);
    bn_unroll_reverse_sl(bn_lshift1_inner1);
    bn->d[0] <<= 1;
}

void bn_rshift(bignum *bn, int shift)
{
    int wd, iws, iwr;
    bn_word ihw, ilw;
    iws = (shift & (BN_WBITS - 1));
    iwr = BN_WBITS - iws;
    wd = (shift >> BN_WSHIFT);
    ihw = (wd < BN_WBITS) ? bn->d[wd] : 0;

#define bn_rshift_inner1(i)              \
  wd++;                                  \
  ilw = ihw;                             \
  ihw = (wd < BN_WBITS) ? bn->d[wd] : 0; \
  bn->d[i] = (ilw >> iws) | (ihw << iwr);
    bn_unroll_sl(bn_rshift_inner1);
    bn->d[BN_NWORDS - 1] = (ihw >> iws);
}

void bn_rshift1(bignum *bn)
{
#define bn_rshift1_inner1(i) bn->d[i] = (bn->d[i + 1] << 31) | (bn->d[i] >> 1);
    bn_unroll_sl(bn_rshift1_inner1);
    bn->d[BN_NWORDS - 1] >>= 1;
}

void bn_rshift1_2(bignum *bna, bignum *bnb)
{
#define bn_rshift1_2_inner1(i)                          \
  bna->d[i] = (bna->d[i + 1] << 31) | (bna->d[i] >> 1); \
  bnb->d[i] = (bnb->d[i + 1] << 31) | (bnb->d[i] >> 1);
    bn_unroll_sl(bn_rshift1_2_inner1);
    bna->d[BN_NWORDS - 1] >>= 1;
    bnb->d[BN_NWORDS - 1] >>= 1;
}

/*
 * Unsigned comparison
 */

int bn_ucmp_ge(bignum *a, bignum *b)
{
    int l = 0, g = 0;

#define bn_ucmp_ge_inner1(i)            \
  if (a->d[i] < b->d[i]) l |= (1 << i); \
  if (a->d[i] > b->d[i]) g |= (1 << i);
    bn_unroll_reverse(bn_ucmp_ge_inner1);
    return (l > g) ? 0 : 1;
}

int bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
{
    int l = 0, g = 0;

#define bn_ucmp_ge_c_inner1(i)       \
  if (a->d[i] < b[i]) l |= (1 << i); \
  if (a->d[i] > b[i]) g |= (1 << i);
    bn_unroll_reverse(bn_ucmp_ge_c_inner1);
    return (l > g) ? 0 : 1;
}

/*
 * Negate
 */

void bn_neg(bignum *n)
{
    int c = 1;

#define bn_neg_inner1(i) c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
    bn_unroll(bn_neg_inner1);
}

/*
 * Add/subtract
 */

#define bn_add_word(r, a, b, t, c) \
  do {                             \
    t = a + b;                     \
    c = (t < a) ? 1 : 0;           \
    r = t;                         \
  } while (0)

#define bn_addc_word(r, a, b, t, c)             \
  do {                                          \
    t = a + b + c;                              \
    c = (t < a) ? 1 : ((c & (t == a)) ? 1 : 0); \
    r = t;                                      \
  } while (0)

bn_word bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
    bn_word t, c = 0;

#define bn_uadd_words_seq_inner1(i) bn_addc_word(r[i], a[i], b[i], t, c);
    bn_add_word(r[0], a[0], b[0], t, c);
    bn_unroll_sf(bn_uadd_words_seq_inner1);
    return c;
}

bn_word bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
    bn_word t, c = 0;

    bn_add_word(r[0], a[0], b[0], t, c);
    bn_unroll_sf(bn_uadd_words_seq_inner1);
    return c;
}

#define bn_sub_word(r, a, b, t, c) \
  do {                             \
    t = a - b;                     \
    c = (a < b) ? 1 : 0;           \
    r = t;                         \
  } while (0)

#define bn_subb_word(r, a, b, t, c) \
  do {                              \
    t = a - (b + c);                \
    c = (!(a) && c) ? 1 : 0;        \
    c |= (a < b) ? 1 : 0;           \
    r = t;                          \
  } while (0)

bn_word bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b)
{
    bn_word t, c = 0;

#define bn_usub_words_seq_inner1(i) bn_subb_word(r[i], a[i], b[i], t, c);

    bn_sub_word(r[0], a[0], b[0], t, c);
    bn_unroll_sf(bn_usub_words_seq_inner1);
    return c;
}

bn_word bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b)
{
    bn_word t, c = 0;

    bn_sub_word(r[0], a[0], b[0], t, c);
    bn_unroll_sf(bn_usub_words_seq_inner1);
    return c;
}

/*
 * Add/subtract better suited for AMD's VLIW architecture
 */
bn_word bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
    bignum x;
    bn_word c = 0, cp = 0;

#define bn_uadd_words_vliw_inner1(i) x.d[i] = a[i] + b[i];

#define bn_uadd_words_vliw_inner2(i)   \
  c |= (a[i] > x.d[i]) ? (1 << i) : 0; \
  cp |= (!~x.d[i]) ? (1 << i) : 0;

#define bn_uadd_words_vliw_inner3(i) r[i] = x.d[i] + ((c >> i) & 1);

    bn_unroll(bn_uadd_words_vliw_inner1);
    bn_unroll(bn_uadd_words_vliw_inner2);
    c = ((cp + (c << 1)) ^ cp);
    r[0] = x.d[0];
    bn_unroll_sf(bn_uadd_words_vliw_inner3);
    return c >> BN_NWORDS;
}

bn_word bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
    bignum x;
    bn_word c = 0, cp = 0;

    bn_unroll(bn_uadd_words_vliw_inner1);
    bn_unroll(bn_uadd_words_vliw_inner2);
    c = ((cp + (c << 1)) ^ cp);
    r[0] = x.d[0];
    bn_unroll_sf(bn_uadd_words_vliw_inner3);
    return c >> BN_NWORDS;
}

bn_word bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b)
{
    bignum x;
    bn_word c = 0, cp = 0;

#define bn_usub_words_vliw_inner1(i) x.d[i] = a[i] - b[i];

#define bn_usub_words_vliw_inner2(i) \
  c |= (a[i] < b[i]) ? (1 << i) : 0; \
  cp |= (!x.d[i]) ? (1 << i) : 0;

#define bn_usub_words_vliw_inner3(i) r[i] = x.d[i] - ((c >> i) & 1);

    bn_unroll(bn_usub_words_vliw_inner1);
    bn_unroll(bn_usub_words_vliw_inner2);
    c = ((cp + (c << 1)) ^ cp);
    r[0] = x.d[0];
    bn_unroll_sf(bn_usub_words_vliw_inner3);
    return c >> BN_NWORDS;
}

bn_word bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b)
{
    bignum x;
    bn_word c = 0, cp = 0;

    bn_unroll(bn_usub_words_vliw_inner1);
    bn_unroll(bn_usub_words_vliw_inner2);
    c = ((cp + (c << 1)) ^ cp);
    r[0] = x.d[0];
    bn_unroll_sf(bn_usub_words_vliw_inner3);
    return c >> BN_NWORDS;
}

#if defined(DEEP_VLIW)
#define bn_uadd_words bn_uadd_words_vliw
#define bn_uadd_words_c bn_uadd_words_c_vliw
#define bn_usub_words bn_usub_words_vliw
#define bn_usub_words_c bn_usub_words_c_vliw
#else
#define bn_uadd_words bn_uadd_words_seq
#define bn_uadd_words_c bn_uadd_words_c_seq
#define bn_usub_words bn_usub_words_seq
#define bn_usub_words_c bn_usub_words_c_seq
#endif

#define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d)
#define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b)
#define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d)
#define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b)

/*
 * Modular add/sub
 */

void bn_mod_add(bignum *r, bignum *a, bignum *b)
{
    if (bn_uadd(r, a, b) || (bn_ucmp_ge_c(r, modulus)))
        bn_usub_c(r, r, modulus);
}

void bn_mod_sub(bignum *r, bignum *a, bignum *b)
{
    if (bn_usub(r, a, b))
        bn_uadd_c(r, r, modulus);
}

void bn_mod_lshift1(bignum *bn)
{
    bn_word c = (bn->d[BN_NWORDS - 1] & 0x80000000);
    bn_lshift1(bn);
    if (c || (bn_ucmp_ge_c(bn, modulus)))
        bn_usub_c(bn, bn, modulus);
}

/*
 * Montgomery multiplication
 *
 * This includes normal multiplication of two "Montgomeryized"
 * bignums, and bn_from_mont for de-Montgomeryizing a bignum.
 */

#define bn_mul_word(r, a, w, c, p, s) \
  do {                                \
    r = (a * w) + c;                  \
    p = mul_hi(a, w);                 \
    c = (r < c) ? p + 1 : p;          \
  } while (0)

#define bn_mul_add_word(r, a, w, c, p, s) \
  do {                                    \
    s = r + c;                            \
    p = mul_hi(a, w);                     \
    r = (a * w) + s;                      \
    c = (s < c) ? p + 1 : p;              \
    if (r < s) c++;                       \
  } while (0)
void bn_mul_mont(bignum *r, bignum *a, bignum *b)
{
    bignum t;
    bn_word tea, teb, c, p, s, m;

#if !defined(VERY_EXPENSIVE_BRANCHES)
    int q;
#endif

    c = 0;
#define bn_mul_mont_inner1(j) bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
    bn_unroll(bn_mul_mont_inner1);
    tea = c;
    teb = 0;

    c = 0;
    m = t.d[0] * mont_n0[0];
    bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#define bn_mul_mont_inner2(j)                      \
  bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
  t.d[j - 1] = t.d[j];
    bn_unroll_sf(bn_mul_mont_inner2);
    t.d[BN_NWORDS - 1] = tea + c;
    tea = teb + ((t.d[BN_NWORDS - 1] < c) ? 1 : 0);

#define bn_mul_mont_inner3_1(i, j) \
  bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
#define bn_mul_mont_inner3_2(i, j)                 \
  bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \
  t.d[j - 1] = t.d[j];
#define bn_mul_mont_inner3(i)                      \
  c = 0;                                           \
  bn_unroll_arg(bn_mul_mont_inner3_1, i);          \
  tea += c;                                        \
  teb = ((tea < c) ? 1 : 0);                       \
  c = 0;                                           \
  m = t.d[0] * mont_n0[0];                         \
  bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); \
  bn_unroll_arg_sf(bn_mul_mont_inner3_2, i);       \
  t.d[BN_NWORDS - 1] = tea + c;                    \
  tea = teb + ((t.d[BN_NWORDS - 1] < c) ? 1 : 0);

    /*
     * The outer loop here is quite long, and we won't unroll it
     * unless VERY_EXPENSIVE_BRANCHES is set.
     */
#if defined(VERY_EXPENSIVE_BRANCHES)
    bn_unroll_sf(bn_mul_mont_inner3);
    c = tea | !bn_usub_c(r, &t, modulus);
    if (!c)
        *r = t;

#else
    for (q = 1; q < BN_NWORDS; q++) {
        bn_mul_mont_inner3(q);
    }
    c = tea || (t.d[BN_NWORDS - 1] >= modulus[BN_NWORDS - 1]);
    if (c) {
        c = tea | !bn_usub_c(r, &t, modulus);
        if (c)
            return;
    }
    *r = t;
#endif
}

void bn_from_mont(bignum *rb, bignum *b)
{
#define WORKSIZE ((2 * BN_NWORDS) + 1)
    bn_word r[WORKSIZE];
    bn_word m, c, p, s;
#if defined(PRAGMA_UNROLL)
    int i;
#endif

    /* Copy the input to the working area */
    /* Zero the upper words */
#define bn_from_mont_inner1(i) r[i] = b->d[i];
#define bn_from_mont_inner2(i) r[BN_NWORDS + i] = 0;

    bn_unroll(bn_from_mont_inner1);
    bn_unroll(bn_from_mont_inner2);
    r[WORKSIZE - 1] = 0;

    /* Multiply (long) by modulus */
#define bn_from_mont_inner3_1(i, j) \
  bn_mul_add_word(r[i + j], modulus[j], m, c, p, s);

#if !defined(VERY_EXPENSIVE_BRANCHES)
#define bn_from_mont_inner3_2(i) \
  if (r[BN_NWORDS + i] < c) r[BN_NWORDS + i + 1] += 1;
#else
#define bn_from_mont_inner3_2(i) \
  r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0;
#endif

#define bn_from_mont_inner3(i)             \
  m = r[i] * mont_n0[0];                   \
  c = 0;                                   \
  bn_unroll_arg(bn_from_mont_inner3_1, i); \
  r[BN_NWORDS + i] += c;                   \
  bn_from_mont_inner3_2(i)

    /*
     * The outer loop here is not very long, so we will unroll
     * it by default.  However, it's just complicated enough to
     * cause NVIDIA's compiler to take unreasonably long to compile
     * it, unless we use pragma unroll.
     */
#if !defined(PRAGMA_UNROLL)
    bn_iter(bn_from_mont_inner3);
#else
#pragma unroll 8
    for (i = 0; i < BN_NWORDS; i++) {
        bn_from_mont_inner3(i)
    }
#endif

    /*
    * Make sure the result is less than the modulus.
    * Subtracting is not much more expensive than compare, so
    * subtract always and assign based on the carry out value.
    */
    c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus);
    if (c) {
#define bn_from_mont_inner4(i) rb->d[i] = r[BN_NWORDS + i];
        bn_unroll(bn_from_mont_inner4);
    }
}

/*
 * Modular inversion
 */

void bn_mod_inverse(bignum *r, bignum *n)
{
    bignum a, b, x, y;
    int shift;
    bn_word xc, yc;
    for (shift = 0; shift < BN_NWORDS; shift++) {
        a.d[shift] = modulus[shift];
        x.d[shift] = 0;
        y.d[shift] = 0;
    }
    b = *n;
    x.d[0] = 1;
    xc = 0;
    yc = 0;
    while (!bn_is_zero(b)) {
        shift = 0;
        while (!bn_is_odd(b)) {
            if (bn_is_odd(x))
                xc += bn_uadd_c(&x, &x, modulus);
            bn_rshift1_2(&x, &b);
            x.d[7] |= (xc << 31);
            xc >>= 1;
        }

        while (!bn_is_odd(a)) {
            if (bn_is_odd(y))
                yc += bn_uadd_c(&y, &y, modulus);
            bn_rshift1_2(&y, &a);
            y.d[7] |= (yc << 31);
            yc >>= 1;
        }

        if (bn_ucmp_ge(&b, &a)) {
            xc += yc + bn_uadd(&x, &x, &y);
            bn_usub(&b, &b, &a);
        } else {
            yc += xc + bn_uadd(&y, &y, &x);
            bn_usub(&a, &a, &b);
        }
    }

    if (!bn_is_one(a)) {
        /* no modular inverse */
        *r = bn_zero;
    } else {
        /* Compute y % m as cheaply as possible */
        while (yc < 0x80000000)
            yc -= bn_usub_c(&y, &y, modulus);
        bn_neg(&y);
        *r = y;
    }
}

/*
 * HASH FUNCTIONS
 *
 * BYTE ORDER NOTE: None of the hash functions below deal with byte
 * order.  The caller is expected to be aware of this when it stuffs
 * data into in the native integer.
 *
 * NOTE #2: Endianness of the OpenCL device makes no difference here.
 */

#define hash256_unroll(a) unroll_8(a)
#define hash160_unroll(a) unroll_5(a)
#define hash256_iter(a) iter_8(a)
#define hash160_iter(a) iter_5(a)

/*
 * SHA-2 256
 *
 * CAUTION: Input buffer will be overwritten/mangled.
 * Data expected in big-endian format.
 * This implementation is designed for space efficiency more than
 * raw speed.
 */

__constant uint sha2_init[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                                0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
                               };

__constant uint sha2_k[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

void sha2_256_init(uint *out)
{
#define sha2_256_init_inner_1(i) out[i] = sha2_init[i];

    hash256_unroll(sha2_256_init_inner_1);
}

/* The state variable remapping is really contorted */
#define sha2_stvar(vals, i, v) vals[(64 + v - i) % 8]
#define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
#define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
#if defined(AMD_BFI_INT)
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define sha2_ch(a, b, c) amd_bytealign(a, b, c)
#define sha2_ma(a, b, c) amd_bytealign((a ^ c), b, a)
#else
#define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
#define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
#endif

void sha2_256_block(uint *out, uint *in)
{
    uint state[8], t1, t2;
#if defined(PRAGMA_UNROLL)
    int i;
#endif

#define sha2_256_block_inner_1(i) state[i] = out[i];
    hash256_unroll(sha2_256_block_inner_1);

#define sha2_256_block_inner_2(i)                                             \
  if (i >= 16) {                                                              \
    t1 = in[(i + 1) % 16];                                                    \
    t2 = in[(i + 14) % 16];                                                   \
    in[i % 16] +=                                                             \
        (in[(i + 9) % 16] + (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + \
         (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10)));                   \
  }                                                                           \
  t1 = (sha2_stvar(state, i, 7) + sha2_s1(sha2_stvar(state, i, 4)) +          \
        sha2_ch(sha2_stvar(state, i, 4), sha2_stvar(state, i, 5),             \
                sha2_stvar(state, i, 6)) +                                    \
        sha2_k[i] + in[i % 16]);                                              \
  t2 = (sha2_s0(sha2_stvar(state, i, 0)) + sha2_ma(sha2_stvar(state, i, 0),   \
                                                   sha2_stvar(state, i, 1),   \
                                                   sha2_stvar(state, i, 2))); \
  sha2_stvar(state, i, 3) += t1;                                              \
  sha2_stvar(state, i, 7) = t1 + t2;

#if !defined(PRAGMA_UNROLL)
    iter_64(sha2_256_block_inner_2);
#else
#pragma unroll 64
    for (i = 0; i < 64; i++) {
        sha2_256_block_inner_2(i)
    }
#endif

#define sha2_256_block_inner_3(i) out[i] += state[i];

    hash256_unroll(sha2_256_block_inner_3);
}

/*
* RIPEMD160
*
* Data expected in little-endian format.
*/

__constant uint ripemd160_iv[] = {0x67452301, 0xEFCDAB89, 0x98BADCFE,
                                  0x10325476, 0xC3D2E1F0
                                 };
__constant uint ripemd160_k[] = {0x00000000, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC,
                                 0xA953FD4E
                                };
__constant uint ripemd160_kp[] = {0x50A28BE6, 0x5C4DD124, 0x6D703EF3,
                                  0x7A6D76E9, 0x00000000
                                 };
__constant uchar ripemd160_ws[] = {
    0,  1, 2,  3, 4,  5,  6, 7,  8, 9,  10, 11, 12, 13, 14, 15, 7,  4,  13, 1,
    10, 6, 15, 3, 12, 0,  9, 5,  2, 14, 11, 8,  3,  10, 14, 4,  9,  15, 8,  1,
    2,  7, 0,  6, 13, 11, 5, 12, 1, 9,  11, 10, 0,  8,  12, 4,  13, 3,  7,  15,
    14, 5, 6,  2, 4,  0,  5, 9,  7, 12, 2,  10, 14, 1,  3,  8,  11, 6,  15, 13,
};
__constant uchar ripemd160_wsp[] = {
    5,  14, 7,  0,  9,  2,  11, 4,  13, 6, 15, 8, 1,  10, 3,  12, 6, 11, 3, 7,
    0,  13, 5,  10, 14, 15, 8,  12, 4,  9, 1,  2, 15, 5,  1,  3,  7, 14, 6, 9,
    11, 8,  12, 2,  10, 0,  4,  13, 8,  6, 4,  1, 3,  11, 15, 0,  5, 12, 2, 13,
    9,  7,  10, 14, 12, 15, 10, 4,  1,  5, 8,  7, 6,  2,  13, 14, 0, 3,  9, 11
};
__constant uchar ripemd160_rl[] = {
    11, 14, 15, 12, 5,  8,  7,  9,  11, 13, 14, 15, 6,  7,  9,  8,
    7,  6,  8,  13, 11, 9,  7,  15, 7,  12, 15, 9,  11, 7,  13, 12,
    11, 13, 6,  7,  14, 9,  13, 15, 14, 8,  13, 6,  5,  12, 7,  5,
    11, 12, 14, 15, 14, 15, 9,  8,  9,  14, 5,  6,  8,  6,  5,  12,
    9,  15, 5,  11, 6,  8,  13, 12, 5,  12, 13, 14, 11, 8,  5,  6,
};
__constant uchar ripemd160_rlp[] = {
    8,  9,  9,  11, 13, 15, 15, 5,  7,  7,  8,  11, 14, 14, 12, 6,
    9,  13, 15, 7,  12, 8,  9,  11, 7,  7,  12, 7,  6,  15, 13, 11,
    9,  7,  15, 11, 8,  6,  6,  14, 12, 13, 5,  14, 13, 13, 7,  5,
    15, 5,  8,  11, 14, 14, 6,  14, 6,  9,  12, 9,  12, 5,  15, 8,
    8,  5,  12, 9,  12, 5,  14, 6,  8,  13, 6,  5,  15, 13, 11, 11
};

#define ripemd160_val(v, i, n) (v)[(80 + (n) - (i)) % 5]
#define ripemd160_valp(v, i, n) (v)[5 + ((80 + (n) - (i)) % 5)]
#if defined(AMD_BFI_INT)
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
#define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
#define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
#else
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
#endif
#define ripemd160_round(i, in, vals, f, fp, t)                                \
  do {                                                                        \
    ripemd160_val(vals, i, 0) =                                               \
        rotate(ripemd160_val(vals, i, 0) +                                    \
                   f(ripemd160_val(vals, i, 1), ripemd160_val(vals, i, 2),    \
                     ripemd160_val(vals, i, 3)) +                             \
                   in[ripemd160_ws[i]] + ripemd160_k[i / 16],                 \
               (uint)ripemd160_rl[i]) +                                       \
        ripemd160_val(vals, i, 4);                                            \
    ripemd160_val(vals, i, 2) = rotate(ripemd160_val(vals, i, 2), 10U);       \
    ripemd160_valp(vals, i, 0) =                                              \
        rotate(ripemd160_valp(vals, i, 0) +                                   \
                   fp(ripemd160_valp(vals, i, 1), ripemd160_valp(vals, i, 2), \
                      ripemd160_valp(vals, i, 3)) +                           \
                   in[ripemd160_wsp[i]] + ripemd160_kp[i / 16],               \
               (uint)ripemd160_rlp[i]) +                                      \
        ripemd160_valp(vals, i, 4);                                           \
    ripemd160_valp(vals, i, 2) = rotate(ripemd160_valp(vals, i, 2), 10U);     \
  } while (0)

void ripemd160_init(uint *out)
{
#define ripemd160_init_inner_1(i) out[i] = ripemd160_iv[i];

    hash160_unroll(ripemd160_init_inner_1);
}

void ripemd160_block(uint *out, uint *in)
{
    uint vals[10], t;
#if defined(PRAGMA_UNROLL)
    int i;
#endif

#define ripemd160_block_inner_1(i) vals[i] = vals[i + 5] = out[i];

    hash160_unroll(ripemd160_block_inner_1);

#define ripemd160_block_inner_p0(i) \
  ripemd160_round(i, in, vals, ripemd160_f0, ripemd160_f4, t);
#define ripemd160_block_inner_p1(i) \
  ripemd160_round((16 + i), in, vals, ripemd160_f1, ripemd160_f3, t);
#define ripemd160_block_inner_p2(i) \
  ripemd160_round((32 + i), in, vals, ripemd160_f2, ripemd160_f2, t);
#define ripemd160_block_inner_p3(i) \
  ripemd160_round((48 + i), in, vals, ripemd160_f3, ripemd160_f1, t);
#define ripemd160_block_inner_p4(i) \
  ripemd160_round((64 + i), in, vals, ripemd160_f4, ripemd160_f0, t);

#if !defined(PRAGMA_UNROLL)
    iter_16(ripemd160_block_inner_p0);
    iter_16(ripemd160_block_inner_p1);
    iter_16(ripemd160_block_inner_p2);
    iter_16(ripemd160_block_inner_p3);
    iter_16(ripemd160_block_inner_p4);
#else
#pragma unroll 16
    for (i = 0; i < 16; i++) {
        ripemd160_block_inner_p0(i);
    }
#pragma unroll 16
    for (i = 0; i < 16; i++) {
        ripemd160_block_inner_p1(i);
    }
#pragma unroll 16
    for (i = 0; i < 16; i++) {
        ripemd160_block_inner_p2(i);
    }
#pragma unroll 16
    for (i = 0; i < 16; i++) {
        ripemd160_block_inner_p3(i);
    }
#pragma unroll 16
    for (i = 0; i < 16; i++) {
        ripemd160_block_inner_p4(i);
    }
#endif

    t = out[1] + vals[2] + vals[8];
    out[1] = out[2] + vals[3] + vals[9];
    out[2] = out[3] + vals[4] + vals[5];
    out[3] = out[4] + vals[0] + vals[6];
    out[4] = out[0] + vals[1] + vals[7];
    out[0] = t;
}

#define ACCESS_BUNDLE 1024
#define ACCESS_STRIDE (ACCESS_BUNDLE / BN_NWORDS)

__kernel void ec_add_grid(__global bn_word *points_out,
                          __global bn_word *z_heap, __global bn_word *row_in,
                          __global bignum *col_in)
{
    bignum rx, ry;
    bignum x1, y1, a, b, c, d, e, z;
    bn_word cy;
    int i, cell, start;

    /* Load the row increment point */
    i = 2 * get_global_id(1);
    rx = col_in[i];
    ry = col_in[i + 1];

    cell = get_global_id(0);
    start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
             (cell % (ACCESS_STRIDE / 2)));

#define ec_add_grid_inner_1(i) x1.d[i] = row_in[start + (i * ACCESS_STRIDE)];

    bn_unroll(ec_add_grid_inner_1);
    start += (ACCESS_STRIDE / 2);

#define ec_add_grid_inner_2(i) y1.d[i] = row_in[start + (i * ACCESS_STRIDE)];

    bn_unroll(ec_add_grid_inner_2);

    bn_mod_sub(&z, &x1, &rx);

    cell += (get_global_id(1) * get_global_size(0));
    start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE));

#define ec_add_grid_inner_3(i) z_heap[start + (i * ACCESS_STRIDE)] = z.d[i];

    bn_unroll(ec_add_grid_inner_3);

    bn_mod_sub(&b, &y1, &ry);
    bn_mod_add(&c, &x1, &rx);
    bn_mod_add(&d, &y1, &ry);
    bn_mul_mont(&y1, &b, &b);
    bn_mul_mont(&x1, &z, &z);
    bn_mul_mont(&e, &c, &x1);
    bn_mod_sub(&y1, &y1, &e);

    /*
    * This disgusting code caters to the global memory unit on
    * various GPUs, by giving it a nice contiguous patch to write
    * per warp/wavefront.
    */
    start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
             (cell % (ACCESS_STRIDE / 2)));

#define ec_add_grid_inner_4(i) \
  points_out[start + (i * ACCESS_STRIDE)] = y1.d[i];

    bn_unroll(ec_add_grid_inner_4);

    bn_mod_lshift1(&y1);
    bn_mod_sub(&y1, &e, &y1);
    bn_mul_mont(&y1, &y1, &b);
    bn_mul_mont(&a, &x1, &z);
    bn_mul_mont(&c, &d, &a);
    bn_mod_sub(&y1, &y1, &c);
    cy = 0;
    if (bn_is_odd(y1))
        cy = bn_uadd_c(&y1, &y1, modulus);
    bn_rshift1(&y1);
    y1.d[BN_NWORDS - 1] |= (cy ? 0x80000000 : 0);

    start += (ACCESS_STRIDE / 2);

    bn_unroll(ec_add_grid_inner_4);
}

__kernel void heap_invert(__global bn_word *z_heap, int batch)
{
    bignum a, b, c, z;
    int i, off, lcell, hcell, start;

#define heap_invert_inner_load_a(j) a.d[j] = z_heap[start + j * ACCESS_STRIDE];
#define heap_invert_inner_load_b(j) b.d[j] = z_heap[start + j * ACCESS_STRIDE];
#define heap_invert_inner_load_z(j) z.d[j] = z_heap[start + j * ACCESS_STRIDE];
#define heap_invert_inner_store_z(j) z_heap[start + j * ACCESS_STRIDE] = z.d[j];
#define heap_invert_inner_store_c(j) z_heap[start + j * ACCESS_STRIDE] = c.d[j];

    off = get_global_size(0);
    lcell = get_global_id(0);
    hcell = (off * batch) + lcell;
    for (i = 0; i < (batch - 1); i++) {
        start =
                (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));

        bn_unroll(heap_invert_inner_load_a);

        lcell += off;
        start =
                (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));

        bn_unroll(heap_invert_inner_load_b);

        bn_mul_mont(&z, &a, &b);

        start =
                (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (hcell % ACCESS_STRIDE));

        bn_unroll(heap_invert_inner_store_z);

        lcell += off;
        hcell += off;
    }

    /* Invert the root, fix up 1/ZR -> R/Z */
    bn_mod_inverse(&z, &z);

#define heap_invert_inner_1(i) a.d[i] = mont_rr[i];

    bn_unroll(heap_invert_inner_1);

    bn_mul_mont(&z, &z, &a);
    bn_mul_mont(&z, &z, &a);

    /* Unroll the first iteration to avoid a load/store on the root */
    lcell -= (off << 1);
    hcell -= (off << 1);

    start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
    bn_unroll(heap_invert_inner_load_a);

    lcell += off;
    start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
    bn_unroll(heap_invert_inner_load_b);

    bn_mul_mont(&c, &a, &z);

    bn_unroll(heap_invert_inner_store_c);

    bn_mul_mont(&c, &b, &z);

    lcell -= off;
    start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
    bn_unroll(heap_invert_inner_store_c);

    lcell -= (off << 1);

    for (i = 0; i < (batch - 2); i++) {
        start =
                (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (hcell % ACCESS_STRIDE));
        bn_unroll(heap_invert_inner_load_z);

        start =
                (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
        bn_unroll(heap_invert_inner_load_a);

        lcell += off;
        start =
                (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
        bn_unroll(heap_invert_inner_load_b);

        bn_mul_mont(&c, &a, &z);

        bn_unroll(heap_invert_inner_store_c);

        bn_mul_mont(&c, &b, &z);

        lcell -= off;
        start =
                (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE));
        bn_unroll(heap_invert_inner_store_c);

        lcell -= (off << 1);
        hcell -= off;
    }
}

void hash_ec_point(uint *hash_out_u, uint *hash_out_c, const bignum *x, const bignum *y)
{
    uint hash1u[16], hash2u[16];
    uint hash1c[16], hash2c[16];
    //bignum hx, hy, yn;
    bn_word whu, wlu;
    bn_word whc, wlc;

    whu = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
    whc = 0x00000002; /* POINT_CONVERSION_COMPRESSED */

#define hash_ec_point_inner_x(i)        \
  wlu = whu;                            \
  wlc = whc;                            \
  whu = x->d[(BN_NWORDS - 1) - i];      \
  whc = whu;                            \
  hash1u[i] = (wlu << 24) | (whu >> 8); \
  hash1c[i] = (wlc << 24) | (whc >> 8);

    bn_unroll(hash_ec_point_inner_x);

#define hash_ec_point_inner_y(i)    \
  wlu = whu;                        \
  whu = y->d[(BN_NWORDS - 1) - i];  \
  hash1u[BN_NWORDS + i] = (wlu << 24) | (whu >> 8);

    bn_unroll(hash_ec_point_inner_y);

    if (y->d[0] & 1) {
        hash1c[0] |= 0x01000000; /* 0x03 for odd y */
    }

    hash1c[8] = whc << 24 | 0x800000;
    hash1c[9] = 0;
    hash1c[10] = 0;
    hash1c[11] = 0;
    hash1c[12] = 0;
    hash1c[13] = 0;
    hash1c[14] = 0;
    hash1c[15] = 33 * 8;

    sha2_256_init(hash2c);
    sha2_256_block(hash2c, hash1c);

    sha2_256_init(hash2u);
    sha2_256_block(hash2u, hash1u);

    hash1u[0] = whu << 24 | 0x800000;
    hash1u[1] = 0;
    hash1u[2] = 0;
    hash1u[3] = 0;
    hash1u[4] = 0;
    hash1u[5] = 0;
    hash1u[6] = 0;
    hash1u[7] = 0;
    hash1u[8] = 0;
    hash1u[9] = 0;
    hash1u[10] = 0;
    hash1u[11] = 0;
    hash1u[12] = 0;
    hash1u[13] = 0;
    hash1u[14] = 0;
    hash1u[15] = 65 * 8;
    sha2_256_block(hash2u, hash1u);

#define hash_ec_point2_inner_uc(i)      \
  hash2c[i] = bswap32(hash2c[i]);   \
  hash2u[i] = bswap32(hash2u[i]);

    hash256_unroll(hash_ec_point2_inner_uc);

    hash2u[8] = bswap32(0x80000000);
    hash2u[9] = 0;
    hash2u[10] = 0;
    hash2u[11] = 0;
    hash2u[12] = 0;
    hash2u[13] = 0;
    hash2u[14] = 32 * 8;
    hash2u[15] = 0;

    ripemd160_init(hash_out_u);
    ripemd160_block(hash_out_u, hash2u);

    hash2c[8] = bswap32(0x80000000);
    hash2c[9] = 0;
    hash2c[10] = 0;
    hash2c[11] = 0;
    hash2c[12] = 0;
    hash2c[13] = 0;
    hash2c[14] = 32 * 8;
    hash2c[15] = 0;

    ripemd160_init(hash_out_c);
    ripemd160_block(hash_out_c, hash2c);
}

void hash_ec_point_u(uint *hash_out_u, const bignum *x, const bignum *y)
{
    uint hash1u[16], hash2u[16];
    //uint hash1c[16], hash2c[16];
    //bignum hx, hy, yn;
    bn_word whu, wlu;
    //bn_word whc, wlc;

    whu = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
    //whc = 0x00000002; /* POINT_CONVERSION_COMPRESSED */

#define hash_ec_point_inner_x(i)        \
  wlu = whu;                            \
  whu = x->d[(BN_NWORDS - 1) - i];      \
  hash1u[i] = (wlu << 24) | (whu >> 8); \

    bn_unroll(hash_ec_point_inner_x);

#define hash_ec_point_inner_y(i)    \
  wlu = whu;                        \
  whu = y->d[(BN_NWORDS - 1) - i];  \
  hash1u[BN_NWORDS + i] = (wlu << 24) | (whu >> 8);

    bn_unroll(hash_ec_point_inner_y);

    sha2_256_init(hash2u);
    sha2_256_block(hash2u, hash1u);

    hash1u[0] = whu << 24 | 0x800000;
    hash1u[1] = 0;
    hash1u[2] = 0;
    hash1u[3] = 0;
    hash1u[4] = 0;
    hash1u[5] = 0;
    hash1u[6] = 0;
    hash1u[7] = 0;
    hash1u[8] = 0;
    hash1u[9] = 0;
    hash1u[10] = 0;
    hash1u[11] = 0;
    hash1u[12] = 0;
    hash1u[13] = 0;
    hash1u[14] = 0;
    hash1u[15] = 65 * 8;
    sha2_256_block(hash2u, hash1u);

#define hash_ec_point2_inner_uc(i)      \
  hash2u[i] = bswap32(hash2u[i]);

    hash256_unroll(hash_ec_point2_inner_uc);

    hash2u[8] = bswap32(0x80000000);
    hash2u[9] = 0;
    hash2u[10] = 0;
    hash2u[11] = 0;
    hash2u[12] = 0;
    hash2u[13] = 0;
    hash2u[14] = 32 * 8;
    hash2u[15] = 0;

    ripemd160_init(hash_out_u);
    ripemd160_block(hash_out_u, hash2u);
}

void hash_ec_point_c(uint *hash_out_c, const bignum *x, const bignum *y)
{
    //uint hash1u[16], hash2u[16];
    uint hash1c[16], hash2c[16];
    //bignum hx, hy, yn;
    //bn_word whu, wlu;
    bn_word whc, wlc;

    //whu = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
    whc = 0x00000002; /* POINT_CONVERSION_COMPRESSED */

#define hash_ec_point_inner_x(i)        \
  wlc = whc;                            \
  whc = x->d[(BN_NWORDS - 1) - i];      \
  hash1c[i] = (wlc << 24) | (whc >> 8);

    bn_unroll(hash_ec_point_inner_x);

    if (y->d[0] & 1) {
        hash1c[0] |= 0x01000000; /* 0x03 for odd y */
    }

    hash1c[8] = whc << 24 | 0x800000;
    hash1c[9] = 0;
    hash1c[10] = 0;
    hash1c[11] = 0;
    hash1c[12] = 0;
    hash1c[13] = 0;
    hash1c[14] = 0;
    hash1c[15] = 33 * 8;

    sha2_256_init(hash2c);
    sha2_256_block(hash2c, hash1c);

#define hash_ec_point2_inner_uc(i)      \
  hash2c[i] = bswap32(hash2c[i]);   \

    hash256_unroll(hash_ec_point2_inner_uc);

    hash2c[8] = bswap32(0x80000000);
    hash2c[9] = 0;
    hash2c[10] = 0;
    hash2c[11] = 0;
    hash2c[12] = 0;
    hash2c[13] = 0;
    hash2c[14] = 32 * 8;
    hash2c[15] = 0;

    ripemd160_init(hash_out_c);
    ripemd160_block(hash_out_c, hash2c);
}

int test_bit_set_bit(__global uchar *buf, uint bit, int set_bit)
{
    uint byte = bit >> 3;
    uchar c = buf[byte];        // expensive memory access
    uchar mask = 1 << (bit % 8);

    if (c & mask) {
        return 1;
    } else {
        if (set_bit) {
            buf[byte] = c | mask;
        }
        return 0;
    }
}

uint murmurhash2(const void *key, int len, const uint seed)
{
    const uint m = 0x5bd1e995;
    const int r = 24;

    uint h = seed ^ len;
    const uchar *data = (const uchar *)key;
    while (len >= 4) {
        uint k = *(uint *)data;
        k *= m;
        k ^= k >> r;
        k *= m;
        h *= m;
        h ^= k;
        data += 4;
        len -= 4;
    }
    switch (len) {
    case 3: h ^= data[2] << 16;
    case 2: h ^= data[1] << 8;
    case 1: h ^= data[0];
        h *= m;
    };

    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}

void check_hash_bloom(__global uint *foundu, __global uint *foundc,
                      uint *hashu, uint *hashc, __global uchar *bl_bloom,
                      uint cell, int bl_hashes, int bl_bits)
{
    int add = 0;
    uchar hits = 0;
    uint a = murmurhash2((uchar *)hashu, 20, 0x9747b28c);
    uint b = murmurhash2((uchar *)hashu, 20, a);
    uint x;
    uchar i;
    for (i = 0; i < bl_hashes; i++) {
        x = (a + b * i) % bl_bits;
        if (test_bit_set_bit(bl_bloom, x, add)) {
            hits++;
        } else if (!add) {
            //foundu[0] = 0xffffffff;
            goto CHECK_COMP;
        }
    }
    if (hits == bl_hashes) {
        foundu[0] = cell;
        foundu[1] = hashu[0];
        foundu[2] = hashu[1];
        foundu[3] = hashu[2];
        foundu[4] = hashu[3];
        foundu[5] = hashu[4];
        goto CHECK_COMP;
    }
    foundu[0] = 0xffffffff;

    ////////////////////////////////////////////////////////////
CHECK_COMP:
    add = 0;
    hits = 0;
    a = murmurhash2((uchar *)hashc, 20, 0x9747b28c);
    b = murmurhash2((uchar *)hashc, 20, a);
    for (i = 0; i < bl_hashes; i++) {
        x = (a + b * i) % bl_bits;
        if (test_bit_set_bit(bl_bloom, x, add)) {
            hits++;
        } else if (!add) {
            //foundc[0] = 0xffffffff;
            return;
        }
    }
    if (hits == bl_hashes) {
        foundc[0] = cell;
        foundc[1] = hashc[0];
        foundc[2] = hashc[1];
        foundc[3] = hashc[2];
        foundc[4] = hashc[3];
        foundc[5] = hashc[4];
        return;
    }
    foundc[0] = 0xffffffff;

    return;
}

void check_hash_bloom_s(__global uint *found, uint *hash,
                        __global uchar *bl_bloom, uint cell,
                        int bl_hashes, int bl_bits)
{
    int add = 0;
    uchar hits = 0;
    uint a = murmurhash2((uchar *)hash, 20, 0x9747b28c);
    uint b = murmurhash2((uchar *)hash, 20, a);
    uint x;
    uchar i;
    for (i = 0; i < bl_hashes; i++) {
        x = (a + b * i) % bl_bits;
        if (test_bit_set_bit(bl_bloom, x, add)) {
            hits++;
        } else if (!add) {
            //found[0] = 0xffffffff;
            return;
        }
    }
    if (hits == bl_hashes) {
        found[0] = cell;
        found[1] = hash[0];
        found[2] = hash[1];
        found[3] = hash[2];
        found[4] = hash[3];
        found[5] = hash[4];
        //printf("found\n");
        return;
    }
    found[0] = 0xffffffff;
    return;
}

__kernel void hash_and_check_bloom(__global uint *found, __global bn_word *xy,
                                   __global bn_word *z, __global uchar *bl_bloom,
                                   int bl_hashes, int bl_bits)
{
    uint hu[5];
    uint hc[5];
    int i, cell, start;
    bignum x, y, zi, zzi;

    cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));
    start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE));
    z += start;

    start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
             (cell % (ACCESS_STRIDE / 2)));
    xy += start;

#define processing_inner_z(i) zi.d[i] = z[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_z);

    bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */

#define processing_inner_x(i) x.d[i] = xy[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_x);
    bn_mul_mont(&x, &x, &zzi); /* X / Z^2 */
    bn_from_mont(&x, &x);

    bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */
#define processing_inner_y(i) \
  y.d[i] = xy[(ACCESS_STRIDE / 2) + i * ACCESS_STRIDE];
    bn_unroll(processing_inner_y);

    bn_mul_mont(&y, &y, &zzi); /* Y / Z^3 */
    bn_from_mont(&y, &y);

    /* Complete the coordinates and check hash */
    hash_ec_point(hu, hc, &x, &y);
    check_hash_bloom(found + 0, found + 6, hu, hc, bl_bloom, cell, bl_hashes, bl_bits);
}


__kernel void hash_and_check_bloom_u(__global uint *found, __global bn_word *xy,
                                   __global bn_word *z, __global uchar *bl_bloom,
                                   int bl_hashes, int bl_bits)
{
    uint hu[5];
    //uint hc[5];
    int i, cell, start;
    bignum x, y, zi, zzi;

    cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));
    start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE));
    z += start;

    start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
             (cell % (ACCESS_STRIDE / 2)));
    xy += start;

#define processing_inner_z(i) zi.d[i] = z[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_z);

    bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */

#define processing_inner_x(i) x.d[i] = xy[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_x);
    bn_mul_mont(&x, &x, &zzi); /* X / Z^2 */
    bn_from_mont(&x, &x);

    bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */
#define processing_inner_y(i) \
  y.d[i] = xy[(ACCESS_STRIDE / 2) + i * ACCESS_STRIDE];
    bn_unroll(processing_inner_y);

    bn_mul_mont(&y, &y, &zzi); /* Y / Z^3 */
    bn_from_mont(&y, &y);

    /* Complete the coordinates and check hash */
    hash_ec_point_u(hu, &x, &y);
    check_hash_bloom_s(found + 0, hu, bl_bloom, cell, bl_hashes, bl_bits);
}

__kernel void hash_and_check_bloom_c(__global uint *found, __global bn_word *xy,
                                     __global bn_word *z, __global uchar *bl_bloom,
                                     int bl_hashes, int bl_bits)
{
    //uint hu[5];
    uint hc[5];
    int i, cell, start;
    bignum x, y, zi, zzi;

    cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0));
    start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE));
    z += start;

    start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
             (cell % (ACCESS_STRIDE / 2)));
    xy += start;

#define processing_inner_z(i) zi.d[i] = z[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_z);

    bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */

#define processing_inner_x(i) x.d[i] = xy[i * ACCESS_STRIDE];
    bn_unroll(processing_inner_x);
    bn_mul_mont(&x, &x, &zzi); /* X / Z^2 */
    bn_from_mont(&x, &x);

    bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */
#define processing_inner_y(i) \
  y.d[i] = xy[(ACCESS_STRIDE / 2) + i * ACCESS_STRIDE];
    bn_unroll(processing_inner_y);

    bn_mul_mont(&y, &y, &zzi); /* Y / Z^3 */
    bn_from_mont(&y, &y);

    /* Complete the coordinates and check hash */
    hash_ec_point_c(hc, &x, &y);
    check_hash_bloom_s(found + 6, hc, bl_bloom, cell, bl_hashes, bl_bits);
}