diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index c59e84e..62d143b 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -1,572 +1,572 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -__constant__ uint64_t c_State[25]; -__constant__ uint32_t c_PaddedMessage[18]; - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -static 
const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -__constant__ uint64_t c_keccak_round_constants[24]; - -static __device__ __forceinline__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -__global__ void jackpot_keccak512_gpu_hash_88(int threads, uint32_t startNounce, uint64_t *g_hash) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - - int hashPosition = nounce - startNounce; - - // Nachricht kopieren - uint32_t message[18]; -#pragma unroll 18 - for(int i=0;i<18;i++) - message[i] = c_PaddedMessage[i]; - - // die individuelle Nounce einsetzen - 
message[1] = cuda_swab32(nounce); - - // State initialisieren - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i<25; i++) - keccak_gpu_state[i] = c_State[i]; - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, message, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // fertig - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - outpHash[i] = hash[i]; - } -} - -// Setup-Funktionen -__host__ void jackpot_keccak512_cpu_init(int thr_id, int threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); -} - -#define cKeccakB 1600 -#define cKeccakR 576 - -#define cKeccakR_SizeInBytes (cKeccakR / 8) -#define crypto_hash_BYTES 64 - -#if (cKeccakB == 1600) - typedef unsigned long long UINT64; - typedef UINT64 tKeccakLane; - #define cKeccakNumberOfRounds 24 -#endif - -#define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) - -#define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) -#if ((cKeccakB/25) == 8) - #define ROL_mult8(a, offset) ((tKeccakLane)a) -#else - #define ROL_mult8(a, offset) ROL(a, offset) -#endif -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); - -const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = -{ - (tKeccakLane)0x0000000000000001ULL, - (tKeccakLane)0x0000000000008082ULL, - (tKeccakLane)0x800000000000808aULL, - (tKeccakLane)0x8000000080008000ULL, - (tKeccakLane)0x000000000000808bULL, - (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008009ULL, - 
(tKeccakLane)0x000000000000008aULL, - (tKeccakLane)0x0000000000000088ULL, - (tKeccakLane)0x0000000080008009ULL, - (tKeccakLane)0x000000008000000aULL, - (tKeccakLane)0x000000008000808bULL, - (tKeccakLane)0x800000000000008bULL, - (tKeccakLane)0x8000000000008089ULL, - (tKeccakLane)0x8000000000008003ULL, - (tKeccakLane)0x8000000000008002ULL, - (tKeccakLane)0x8000000000000080ULL - #if (cKeccakB >= 400) - , (tKeccakLane)0x000000000000800aULL, - (tKeccakLane)0x800000008000000aULL - #if (cKeccakB >= 800) - , (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008080ULL - #if (cKeccakB == 1600) - , (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008008ULL - #endif - #endif - #endif -}; - -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) -{ - - { - while ( --laneCount >= 0 ) - { - state[laneCount] ^= in[laneCount]; - } - } - - { - tKeccakLane Aba, Abe, Abi, Abo, Abu; - tKeccakLane Aga, Age, Agi, Ago, Agu; - tKeccakLane Aka, Ake, Aki, Ako, Aku; - tKeccakLane Ama, Ame, Ami, Amo, Amu; - tKeccakLane Asa, Ase, Asi, Aso, Asu; - tKeccakLane BCa, BCe, BCi, BCo, BCu; - tKeccakLane Da, De, Di, Do, Du; - tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; - tKeccakLane Ega, Ege, Egi, Ego, Egu; - tKeccakLane Eka, Eke, Eki, Eko, Eku; - tKeccakLane Ema, Eme, Emi, Emo, Emu; - tKeccakLane Esa, Ese, Esi, Eso, Esu; - #define round laneCount - - //copyFromState(A, state) - Aba = state[ 0]; - Abe = state[ 1]; - Abi = state[ 2]; - Abo = state[ 3]; - Abu = state[ 4]; - Aga = state[ 5]; - Age = state[ 6]; - Agi = state[ 7]; - Ago = state[ 8]; - Agu = state[ 9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = 
Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL_mult8(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL_mult8(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = 
Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL_mult8(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL_mult8(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - //copyToState(state, A) - state[ 0] = Aba; - 
state[ 1] = Abe; - state[ 2] = Abi; - state[ 3] = Abo; - state[ 4] = Abu; - state[ 5] = Aga; - state[ 6] = Age; - state[ 7] = Agi; - state[ 8] = Ago; - state[ 9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - - #undef round - } -} - -__host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata) -{ - unsigned long long inlen = 88; - const unsigned char *in = (const unsigned char*)pdata; - - tKeccakLane state[5 * 5]; - unsigned char temp[cKeccakR_SizeInBytes]; - - memset( state, 0, sizeof(state) ); - - for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes ) - { - KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); - } - - // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) - // ins Constant Memory - cudaMemcpyToSymbol( c_State, - state, - sizeof(state), - 0, cudaMemcpyHostToDevice); - - // padding - memcpy( temp, in, (size_t)inlen ); - temp[inlen++] = 1; - memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen ); - temp[cKeccakR_SizeInBytes-1] |= 0x80; - - - // Kopiere den Rest der Message und das Padding ins Constant Memory - cudaMemcpyToSymbol( c_PaddedMessage, - temp, - cKeccakR_SizeInBytes, - 0, cudaMemcpyHostToDevice); -} - -__host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, 
grid.x, block.x, shared_size); - - jackpot_keccak512_gpu_hash_88<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__constant__ uint64_t c_State[25]; +__constant__ uint32_t c_PaddedMessage[18]; + +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return __byte_perm(x, 0, 0x0123); +} + +// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt +#if __CUDA_ARCH__ >= 350 +__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static const uint64_t 
host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +static __device__ __forceinline__ void +keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +__global__ void jackpot_keccak512_gpu_hash_88(int threads, uint32_t startNounce, uint64_t *g_hash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + int hashPosition = nounce - startNounce; + + // Nachricht kopieren + uint32_t message[18]; +#pragma unroll 18 + for(int i=0;i<18;i++) + message[i] = c_PaddedMessage[i]; + + // die individuelle Nounce einsetzen + 
message[1] = cuda_swab32(nounce); + + // State initialisieren + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // fertig + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + +#pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +// Setup-Funktionen +__host__ void jackpot_keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); +} + +#define cKeccakB 1600 +#define cKeccakR 576 + +#define cKeccakR_SizeInBytes (cKeccakR / 8) +#define crypto_hash_BYTES 64 + +#if (cKeccakB == 1600) + typedef unsigned long long UINT64; + typedef UINT64 tKeccakLane; + #define cKeccakNumberOfRounds 24 +#endif + +#define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) + +#define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) +#if ((cKeccakB/25) == 8) + #define ROL_mult8(a, offset) ((tKeccakLane)a) +#else + #define ROL_mult8(a, offset) ROL(a, offset) +#endif +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); + +const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = +{ + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + 
(tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL + #if (cKeccakB >= 400) + , (tKeccakLane)0x000000000000800aULL, + (tKeccakLane)0x800000008000000aULL + #if (cKeccakB >= 800) + , (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008080ULL + #if (cKeccakB == 1600) + , (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008008ULL + #endif + #endif + #endif +}; + +void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) +{ + + { + while ( --laneCount >= 0 ) + { + state[laneCount] ^= in[laneCount]; + } + } + + { + tKeccakLane Aba, Abe, Abi, Abo, Abu; + tKeccakLane Aga, Age, Agi, Ago, Agu; + tKeccakLane Aka, Ake, Aki, Ako, Aku; + tKeccakLane Ama, Ame, Ami, Amo, Amu; + tKeccakLane Asa, Ase, Asi, Aso, Asu; + tKeccakLane BCa, BCe, BCi, BCo, BCu; + tKeccakLane Da, De, Di, Do, Du; + tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; + tKeccakLane Ega, Ege, Egi, Ego, Egu; + tKeccakLane Eka, Eke, Eki, Eko, Eku; + tKeccakLane Ema, Eme, Emi, Emo, Emu; + tKeccakLane Esa, Ese, Esi, Eso, Esu; + #define round laneCount + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = 
Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = 
Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + 
state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + #undef round + } +} + +__host__ void jackpot_keccak512_cpu_setBlock_88(void *pdata) +{ + unsigned long long inlen = 88; + const unsigned char *in = (const unsigned char*)pdata; + + tKeccakLane state[5 * 5]; + unsigned char temp[cKeccakR_SizeInBytes]; + + memset( state, 0, sizeof(state) ); + + for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes ) + { + KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); + } + + // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) + // ins Constant Memory + cudaMemcpyToSymbol( c_State, + state, + sizeof(state), + 0, cudaMemcpyHostToDevice); + + // padding + memcpy( temp, in, (size_t)inlen ); + temp[inlen++] = 1; + memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen ); + temp[cKeccakR_SizeInBytes-1] |= 0x80; + + + // Kopiere den Rest der Message und das Padding ins Constant Memory + cudaMemcpyToSymbol( c_PaddedMessage, + temp, + cKeccakR_SizeInBytes, + 0, cudaMemcpyHostToDevice); +} + +__host__ void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) +{ + const int threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, 
grid.x, block.x, shared_size); + + jackpot_keccak512_gpu_hash_88<<>>(threads, startNounce, (uint64_t*)d_hash); + MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 0802eb2..a4242a8 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -1,173 +1,173 @@ - -extern "C" -{ -#include "sph/sph_keccak.h" -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -#include "sph/sph_jh.h" -#include "sph/sph_skein.h" -} - -#include "miner.h" -#include - -// aus cpu-miner.c -extern int device_map[8]; -extern bool opt_benchmark; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -extern void jackpot_keccak512_cpu_init(int thr_id, int threads); -extern void jackpot_keccak512_cpu_setBlock_88(void *pdata); -extern void jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); - -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); - -// Original jackpothash Funktion aus einem miner Quelltext -inline unsigned int jackpothash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - uint32_t hash[16]; - - sph_keccak512_init(&ctx_keccak); - sph_keccak512 (&ctx_keccak, input, 88); - sph_keccak512_close(&ctx_keccak, hash); - - unsigned int round_mask = ( - (unsigned int)(((unsigned char *)input)[84]) << 0 | - (unsigned int)(((unsigned char *)input)[85]) << 8 | - (unsigned int)(((unsigned char *)input)[86]) << 16 | - (unsigned int)(((unsigned char *)input)[87]) << 24 ); - unsigned int round_max = hash[0] & round_mask; - unsigned int round; - for (round = 0; round < round_max; 
round++) { - switch (hash[0] & 3) { - case 0: - sph_blake512_init(&ctx_blake); - sph_blake512 (&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, hash); - break; - case 1: - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, hash, 64); - sph_groestl512_close(&ctx_groestl, hash); - break; - case 2: - sph_jh512_init(&ctx_jh); - sph_jh512 (&ctx_jh, hash, 64); - sph_jh512_close(&ctx_jh, hash); - break; - case 3: - sph_skein512_init(&ctx_skein); - sph_skein512 (&ctx_skein, hash, 64); - sph_skein512_close(&ctx_skein, hash); - break; - } - } - memcpy(state, hash, 32); - - return round_max; -} - - -static int bit_population(uint32_t n){ - int c =0; - while(n){ - c += n&1; - n = n>>1; - } - return c; -} - -extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - - // TODO: entfernen für eine Release! Ist nur zum Testen! - if (opt_benchmark) { - ((uint32_t*)ptarget)[7] = 0x00000f; - ((uint32_t*)pdata)[21] = 0x07000000; // round_mask von 7 vorgeben - } - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*4096; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - jackpot_keccak512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - init[thr_id] = true; - } - - uint32_t endiandata[22]; - for (int k=0; k < 22; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - unsigned int round_mask = ( - (unsigned int)(((unsigned char *)endiandata)[84]) << 0 | - (unsigned int)(((unsigned char *)endiandata)[85]) << 8 | - (unsigned int)(((unsigned char *)endiandata)[86]) << 16 | - (unsigned int)(((unsigned char *)endiandata)[87]) << 24 ); - - // Zählen wie viele Bits in round_mask gesetzt sind - int bitcount = 
bit_population(round_mask); - - jackpot_keccak512_cpu_setBlock_88((void*)endiandata); - quark_check_cpu_setTarget(ptarget); - - do { - int order = 0; - - // erstes Blake512 Hash mit CUDA - jackpot_keccak512_cpu_hash_88(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // TODO: hier fehlen jetzt natürlich noch die anderen Hashrunden. - // bei round_mask=7 haben wir eine 1:8 Chance, dass das Hash dennoch - // die Kriterien erfüllt wenn hash[0] & round_mask zufällig 0 ist. - - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - - // diese jackpothash Funktion gibt die Zahl der zusätzlichen Runden zurück - unsigned int rounds = jackpothash(vhash64, endiandata); - - // wir akzeptieren nur solche Hashes wo ausschliesslich Keccak verwendet wurde - if (rounds == 0) { - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1) / (1 << bitcount); - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); - } - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = (pdata[19] - first_nonce + 1) / (1 << bitcount); - return 0; -} + +extern "C" +{ +#include "sph/sph_keccak.h" +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_jh.h" +#include "sph/sph_skein.h" +} + +#include "miner.h" +#include + +// aus cpu-miner.c +extern int device_map[8]; +extern bool opt_benchmark; + +// Speicher für Input/Output der verketteten Hashfunktionen +static uint32_t *d_hash[8]; + +extern void jackpot_keccak512_cpu_init(int thr_id, int threads); +extern void jackpot_keccak512_cpu_setBlock_88(void *pdata); +extern void 
jackpot_keccak512_cpu_hash_88(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void quark_check_cpu_init(int thr_id, int threads); +extern void quark_check_cpu_setTarget(const void *ptarget); +extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + +// Original jackpothash Funktion aus einem miner Quelltext +inline unsigned int jackpothash(void *state, const void *input) +{ + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + uint32_t hash[16]; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 88); + sph_keccak512_close(&ctx_keccak, hash); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)input)[84]) << 0 | + (unsigned int)(((unsigned char *)input)[85]) << 8 | + (unsigned int)(((unsigned char *)input)[86]) << 16 | + (unsigned int)(((unsigned char *)input)[87]) << 24 ); + unsigned int round_max = hash[0] & round_mask; + unsigned int round; + for (round = 0; round < round_max; round++) { + switch (hash[0] & 3) { + case 0: + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, hash); + break; + case 1: + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, hash, 64); + sph_groestl512_close(&ctx_groestl, hash); + break; + case 2: + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, hash, 64); + sph_jh512_close(&ctx_jh, hash); + break; + case 3: + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + break; + } + } + memcpy(state, hash, 32); + + return round_max; +} + + +static int bit_population(uint32_t n){ + int c =0; + while(n){ + c += n&1; + n = n>>1; + } + return c; +} + +extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, 
uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x00000f; + ((uint32_t*)pdata)[21] = 0x07000000; // round_mask von 7 vorgeben + } + + const uint32_t Htarg = ptarget[7]; + + const int throughput = 256*4096; // 100; + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + // Konstanten kopieren, Speicher belegen + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + jackpot_keccak512_cpu_init(thr_id, throughput); + quark_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[22]; + for (int k=0; k < 22; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + unsigned int round_mask = ( + (unsigned int)(((unsigned char *)endiandata)[84]) << 0 | + (unsigned int)(((unsigned char *)endiandata)[85]) << 8 | + (unsigned int)(((unsigned char *)endiandata)[86]) << 16 | + (unsigned int)(((unsigned char *)endiandata)[87]) << 24 ); + + // Zählen wie viele Bits in round_mask gesetzt sind + int bitcount = bit_population(round_mask); + + jackpot_keccak512_cpu_setBlock_88((void*)endiandata); + quark_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // erstes Blake512 Hash mit CUDA + jackpot_keccak512_cpu_hash_88(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + // TODO: hier fehlen jetzt natürlich noch die anderen Hashrunden. + // bei round_mask=7 haben wir eine 1:8 Chance, dass das Hash dennoch + // die Kriterien erfüllt wenn hash[0] & round_mask zufällig 0 ist. 
+ + // Scan nach Gewinner Hashes auf der GPU + uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + + // diese jackpothash Funktion gibt die Zahl der zusätzlichen Runden zurück + unsigned int rounds = jackpothash(vhash64, endiandata); + + // wir akzeptieren nur solche Hashes wo ausschliesslich Keccak verwendet wurde + if (rounds == 0) { + if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + + pdata[19] = foundNonce; + *hashes_done = (foundNonce - first_nonce + 1) / (1 << bitcount); + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + } + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = (pdata[19] - first_nonce + 1) / (1 << bitcount); + return 0; +} diff --git a/Makefile.am b/Makefile.am index 3f8f816..1a75f16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,37 +1,40 @@ - -if WANT_JANSSON -JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson -else -JANSSON_INCLUDES= -endif - -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ - cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ - compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in - -SUBDIRS = compat - -bin_PROGRAMS = ccminer - -ccminer_SOURCES = elist.h miner.h compat.h \ - compat/inttypes.h compat/stdbool.h compat/unistd.h \ - compat/sys/time.h compat/getopt/getopt.h \ - cpu-miner.c util.c blake.c groestl.c hefty1.c keccak.c scrypt.c sha2.c \ - sph_blake.h sph_groestl.h sph_keccak.h sph_types.h \ - heavy.cu \ - cuda_blake512.cu cuda_blake512.h \ - cuda_combine.cu cuda_combine.h \ - cuda_groestl512.cu cuda_groestl512.h \ - cuda_hefty1.cu cuda_hefty1.h \ - cuda_keccak512.cu cuda_keccak512.h \ - cuda_sha256.cu cuda_sha256.h \ - fuguecoin.cpp cuda_fugue256.cu fugue.c 
sph_fugue.h uint256.h \ - groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h - -ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ -ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ -ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME - -# we're now targeting all major compute architectures within one binary. -.cu.o: - $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + +if WANT_JANSSON +JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson +else +JANSSON_INCLUDES= +endif + +EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ + cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ + compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in + +SUBDIRS = compat + +bin_PROGRAMS = ccminer + +ccminer_SOURCES = elist.h miner.h compat.h \ + compat/inttypes.h compat/stdbool.h compat/unistd.h \ + compat/sys/time.h compat/getopt/getopt.h \ + cpu-miner.c util.c sph/blake.c sph/groestl.c sph/keccak.c hefty1.c scrypt.c sha2.c \ + sph/sph_blake.h sph/sph_groestl.h sph/sph_keccak.h sph/sph_types.h \ + heavy.cu \ + cuda_blake512.cu cuda_blake512.h \ + cuda_combine.cu cuda_combine.h \ + cuda_groestl512.cu cuda_groestl512.h \ + cuda_hefty1.cu cuda_hefty1.h \ + cuda_keccak512.cu cuda_keccak512.h \ + cuda_sha256.cu cuda_sha256.h \ + fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ + groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ + JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu sph/jh.c sph/skein.c \ + sph/sph_jh.h sph/sph_skein.h quark/cuda_quark_checkhash.cu \ + myriadgroestl.cpp cuda_myriadgroestl.cu + +ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ 
+ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ +ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME + +# we're now targeting all major compute architectures within one binary. +.cu.o: + $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< diff --git a/Makefile.in b/Makefile.in index 130d7ce..93fc56e 100644 --- a/Makefile.in +++ b/Makefile.in @@ -53,16 +53,21 @@ CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__installdirs = "$(DESTDIR)$(bindir)" PROGRAMS = $(bin_PROGRAMS) +am__dirstamp = $(am__leading_dot)dirstamp am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \ ccminer-util.$(OBJEXT) ccminer-blake.$(OBJEXT) \ - ccminer-groestl.$(OBJEXT) ccminer-hefty1.$(OBJEXT) \ - ccminer-keccak.$(OBJEXT) ccminer-scrypt.$(OBJEXT) \ + ccminer-groestl.$(OBJEXT) ccminer-keccak.$(OBJEXT) \ + ccminer-hefty1.$(OBJEXT) ccminer-scrypt.$(OBJEXT) \ ccminer-sha2.$(OBJEXT) heavy.$(OBJEXT) cuda_blake512.$(OBJEXT) \ cuda_combine.$(OBJEXT) cuda_groestl512.$(OBJEXT) \ cuda_hefty1.$(OBJEXT) cuda_keccak512.$(OBJEXT) \ cuda_sha256.$(OBJEXT) ccminer-fuguecoin.$(OBJEXT) \ cuda_fugue256.$(OBJEXT) ccminer-fugue.$(OBJEXT) \ - ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT) + ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT) \ + JHA/jackpotcoin.$(OBJEXT) JHA/cuda_jha_keccak512.$(OBJEXT) \ + ccminer-jh.$(OBJEXT) ccminer-skein.$(OBJEXT) \ + quark/cuda_quark_checkhash.$(OBJEXT) \ + ccminer-myriadgroestl.$(OBJEXT) cuda_myriadgroestl.$(OBJEXT) ccminer_OBJECTS = $(am_ccminer_OBJECTS) ccminer_DEPENDENCIES = ccminer_LINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(ccminer_LDFLAGS) \ @@ -267,8 +272,8 @@ 
SUBDIRS = compat ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - cpu-miner.c util.c blake.c groestl.c hefty1.c keccak.c scrypt.c sha2.c \ - sph_blake.h sph_groestl.h sph_keccak.h sph_types.h \ + cpu-miner.c util.c sph/blake.c sph/groestl.c sph/keccak.c hefty1.c scrypt.c sha2.c \ + sph/sph_blake.h sph/sph_groestl.h sph/sph_keccak.h sph/sph_types.h \ heavy.cu \ cuda_blake512.cu cuda_blake512.h \ cuda_combine.cu cuda_combine.h \ @@ -276,8 +281,11 @@ ccminer_SOURCES = elist.h miner.h compat.h \ cuda_hefty1.cu cuda_hefty1.h \ cuda_keccak512.cu cuda_keccak512.h \ cuda_sha256.cu cuda_sha256.h \ - fuguecoin.cpp cuda_fugue256.cu fugue.c sph_fugue.h uint256.h \ - groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h + fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ + groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ + JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu sph/jh.c sph/skein.c \ + sph/sph_jh.h sph/sph_skein.h quark/cuda_quark_checkhash.cu \ + myriadgroestl.cpp cuda_myriadgroestl.cu ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ @@ -373,12 +381,33 @@ uninstall-binPROGRAMS: clean-binPROGRAMS: -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) +JHA/$(am__dirstamp): + @$(MKDIR_P) JHA + @: > JHA/$(am__dirstamp) +JHA/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) JHA/$(DEPDIR) + @: > JHA/$(DEPDIR)/$(am__dirstamp) +JHA/jackpotcoin.$(OBJEXT): JHA/$(am__dirstamp) \ + JHA/$(DEPDIR)/$(am__dirstamp) +JHA/cuda_jha_keccak512.$(OBJEXT): JHA/$(am__dirstamp) \ + JHA/$(DEPDIR)/$(am__dirstamp) +quark/$(am__dirstamp): + @$(MKDIR_P) quark + @: > quark/$(am__dirstamp) +quark/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) quark/$(DEPDIR) + @: > quark/$(DEPDIR)/$(am__dirstamp) +quark/cuda_quark_checkhash.$(OBJEXT): quark/$(am__dirstamp) \ + quark/$(DEPDIR)/$(am__dirstamp) 
ccminer$(EXEEXT): $(ccminer_OBJECTS) $(ccminer_DEPENDENCIES) $(EXTRA_ccminer_DEPENDENCIES) @rm -f ccminer$(EXEEXT) $(ccminer_LINK) $(ccminer_OBJECTS) $(ccminer_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) + -rm -f JHA/cuda_jha_keccak512.$(OBJEXT) + -rm -f JHA/jackpotcoin.$(OBJEXT) + -rm -f quark/cuda_quark_checkhash.$(OBJEXT) distclean-compile: -rm -f *.tab.c @@ -390,9 +419,12 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestl.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestlcoin.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-hefty1.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-jh.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-keccak.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-myriadgroestl.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-scrypt.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-sha2.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-skein.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-util.Po@am__quote@ .c.o: @@ -437,33 +469,47 @@ ccminer-util.obj: util.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi` -ccminer-blake.o: blake.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'blake.c' || echo '$(srcdir)/'`blake.c +ccminer-blake.o: sph/blake.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) 
$(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='blake.c' object='ccminer-blake.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/blake.c' object='ccminer-blake.o' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.o `test -f 'blake.c' || echo '$(srcdir)/'`blake.c +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c -ccminer-blake.obj: blake.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.obj -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.obj `if test -f 'blake.c'; then $(CYGPATH_W) 'blake.c'; else $(CYGPATH_W) '$(srcdir)/blake.c'; fi` +ccminer-blake.obj: sph/blake.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.obj -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.obj `if test -f 'sph/blake.c'; then $(CYGPATH_W) 'sph/blake.c'; else $(CYGPATH_W) '$(srcdir)/sph/blake.c'; fi` @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='blake.c' object='ccminer-blake.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/blake.c' object='ccminer-blake.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) 
$(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.obj `if test -f 'blake.c'; then $(CYGPATH_W) 'blake.c'; else $(CYGPATH_W) '$(srcdir)/blake.c'; fi` +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.obj `if test -f 'sph/blake.c'; then $(CYGPATH_W) 'sph/blake.c'; else $(CYGPATH_W) '$(srcdir)/sph/blake.c'; fi` -ccminer-groestl.o: groestl.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.o -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.o `test -f 'groestl.c' || echo '$(srcdir)/'`groestl.c +ccminer-groestl.o: sph/groestl.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.o -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.o `test -f 'sph/groestl.c' || echo '$(srcdir)/'`sph/groestl.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='groestl.c' object='ccminer-groestl.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/groestl.c' object='ccminer-groestl.o' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.o `test -f 'groestl.c' || echo '$(srcdir)/'`groestl.c +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.o `test -f 'sph/groestl.c' || echo '$(srcdir)/'`sph/groestl.c -ccminer-groestl.obj: groestl.c -@am__fastdepCC_TRUE@ $(CC) 
$(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.obj -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.obj `if test -f 'groestl.c'; then $(CYGPATH_W) 'groestl.c'; else $(CYGPATH_W) '$(srcdir)/groestl.c'; fi` +ccminer-groestl.obj: sph/groestl.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.obj -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi` @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='groestl.c' object='ccminer-groestl.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/groestl.c' object='ccminer-groestl.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'groestl.c'; then $(CYGPATH_W) 'groestl.c'; else $(CYGPATH_W) '$(srcdir)/groestl.c'; fi` +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi` + +ccminer-keccak.o: sph/keccak.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ 
source='sph/keccak.c' object='ccminer-keccak.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c + +ccminer-keccak.obj: sph/keccak.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.obj -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/keccak.c' object='ccminer-keccak.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi` ccminer-hefty1.o: hefty1.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.o -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c @@ -479,20 +525,6 @@ ccminer-hefty1.obj: hefty1.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-hefty1.obj `if test -f 'hefty1.c'; then $(CYGPATH_W) 'hefty1.c'; else $(CYGPATH_W) '$(srcdir)/hefty1.c'; fi` -ccminer-keccak.o: keccak.c 
-@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'keccak.c' || echo '$(srcdir)/'`keccak.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='keccak.c' object='ccminer-keccak.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.o `test -f 'keccak.c' || echo '$(srcdir)/'`keccak.c - -ccminer-keccak.obj: keccak.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.obj -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.obj `if test -f 'keccak.c'; then $(CYGPATH_W) 'keccak.c'; else $(CYGPATH_W) '$(srcdir)/keccak.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='keccak.c' object='ccminer-keccak.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'keccak.c'; then $(CYGPATH_W) 'keccak.c'; else $(CYGPATH_W) '$(srcdir)/keccak.c'; fi` - ccminer-scrypt.o: scrypt.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-scrypt.o -MD -MP -MF $(DEPDIR)/ccminer-scrypt.Tpo -c -o ccminer-scrypt.o `test -f 'scrypt.c' || echo '$(srcdir)/'`scrypt.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-scrypt.Tpo $(DEPDIR)/ccminer-scrypt.Po 
@@ -521,19 +553,47 @@ ccminer-sha2.obj: sha2.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-sha2.obj `if test -f 'sha2.c'; then $(CYGPATH_W) 'sha2.c'; else $(CYGPATH_W) '$(srcdir)/sha2.c'; fi` -ccminer-fugue.o: fugue.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.o -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.o `test -f 'fugue.c' || echo '$(srcdir)/'`fugue.c +ccminer-fugue.o: sph/fugue.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.o -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.o `test -f 'sph/fugue.c' || echo '$(srcdir)/'`sph/fugue.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='fugue.c' object='ccminer-fugue.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/fugue.c' object='ccminer-fugue.o' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.o `test -f 'fugue.c' || echo '$(srcdir)/'`fugue.c +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.o `test -f 'sph/fugue.c' || echo '$(srcdir)/'`sph/fugue.c -ccminer-fugue.obj: fugue.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.obj -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.obj `if test -f 'fugue.c'; then 
$(CYGPATH_W) 'fugue.c'; else $(CYGPATH_W) '$(srcdir)/fugue.c'; fi` +ccminer-fugue.obj: sph/fugue.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.obj -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi` @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='fugue.c' object='ccminer-fugue.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/fugue.c' object='ccminer-fugue.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'fugue.c'; then $(CYGPATH_W) 'fugue.c'; else $(CYGPATH_W) '$(srcdir)/fugue.c'; fi` +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi` + +ccminer-jh.o: sph/jh.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.o -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o 
ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c + +ccminer-jh.obj: sph/jh.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.obj -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi` + +ccminer-skein.o: sph/skein.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.o -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c + +ccminer-skein.obj: sph/skein.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.obj -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; 
else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi` .cpp.o: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @@ -577,6 +637,20 @@ ccminer-groestlcoin.obj: groestlcoin.cpp @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-groestlcoin.obj `if test -f 'groestlcoin.cpp'; then $(CYGPATH_W) 'groestlcoin.cpp'; else $(CYGPATH_W) '$(srcdir)/groestlcoin.cpp'; fi` +ccminer-myriadgroestl.o: myriadgroestl.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-myriadgroestl.o -MD -MP -MF $(DEPDIR)/ccminer-myriadgroestl.Tpo -c -o ccminer-myriadgroestl.o `test -f 'myriadgroestl.cpp' || echo '$(srcdir)/'`myriadgroestl.cpp +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-myriadgroestl.Tpo $(DEPDIR)/ccminer-myriadgroestl.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='myriadgroestl.cpp' object='ccminer-myriadgroestl.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-myriadgroestl.o `test -f 'myriadgroestl.cpp' || echo 
'$(srcdir)/'`myriadgroestl.cpp + +ccminer-myriadgroestl.obj: myriadgroestl.cpp +@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-myriadgroestl.obj -MD -MP -MF $(DEPDIR)/ccminer-myriadgroestl.Tpo -c -o ccminer-myriadgroestl.obj `if test -f 'myriadgroestl.cpp'; then $(CYGPATH_W) 'myriadgroestl.cpp'; else $(CYGPATH_W) '$(srcdir)/myriadgroestl.cpp'; fi` +@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-myriadgroestl.Tpo $(DEPDIR)/ccminer-myriadgroestl.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='myriadgroestl.cpp' object='ccminer-myriadgroestl.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-myriadgroestl.obj `if test -f 'myriadgroestl.cpp'; then $(CYGPATH_W) 'myriadgroestl.cpp'; else $(CYGPATH_W) '$(srcdir)/myriadgroestl.cpp'; fi` + # This directory's subdirectories are mostly independent; you can cd # into them and run `make' without going through this Makefile. # To change the values of `make' variables: instead of editing Makefiles, @@ -936,6 +1010,10 @@ clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -rm -f JHA/$(DEPDIR)/$(am__dirstamp) + -rm -f JHA/$(am__dirstamp) + -rm -f quark/$(DEPDIR)/$(am__dirstamp) + -rm -f quark/$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @@ -1035,7 +1113,7 @@ uninstall-am: uninstall-binPROGRAMS # we're now targeting all major compute architectures within one binary. 
.cu.o: - $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/README.txt b/README.txt index 3c956e0..9f81260 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization" +ccMiner release 0.6 (April 27th 2014) - "Jackpot" ------------------------------------------------------------- *************************************************************** @@ -37,6 +37,8 @@ its command line interface and options. heavy use to mine Heavycoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin + myr-gr use to mine Myriad-Groestl + jackpot use to mine Jackpotcoin -d, --devices gives a comma separated list of CUDA device IDs to operate on. Device IDs start counting from 0! @@ -114,6 +116,11 @@ from your old clunkers. >>> RELEASE HISTORY <<< + April, 27 2014 this release adds Myriad-Groestl and Jackpot Coin. + we apply an optimization to Jackpot that turns this + into a Keccak-only CUDA coin ;) Jackpot is tested with + solo--mining only at the moment. + March, 27 2014 Heavycoin exchange rates soar, and as a result this coin gets some love: We greatly optimized the Hefty1 kernel for speed. Expect some hefty gains, especially on 750Ti's! 
diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5424f0a..c79e5ef 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -1,289 +1,316 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {36DC07F9-A4A6-4877-A146-1B960083CF6F} - ccminer - - - - Application - true - MultiByte - - - Application - true - MultiByte - - - Application - false - true - MultiByte - - - Application - false - true - MultiByte - - - - - - - - - - - - - - - - - - - - true - - - true - - - true - - - true - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Debug;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - Console - 
cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Debug;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - - - Level3 - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - true - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Release;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - - - Level3 - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - 
.;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - true - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Release;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - - - - - - - - - - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - - - - - - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {36DC07F9-A4A6-4877-A146-1B960083CF6F} + ccminer + + + + Application + true + MultiByte + + + Application + true + MultiByte + + + Application + false + true + MultiByte + + + Application + false + true + MultiByte + + + + + + + + + + + + + + + + + + + + true + + + true + + + true + + + true + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + 
.;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Debug;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 80 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Debug;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 80 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + MaxSpeed + true + true + 
WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + true + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Release;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 80 + + + true + true + compute_35,sm_35 + + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + + + true + true + true + Console + cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) + ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Release;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + true + + + 80 + + + true + true + compute_35,sm_35 
+ + + -Xptxas "-abi=no -v" %(AdditionalOptions) + + + + + + + + + + + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + + + + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + + + + + + + + + + + + + + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + /TP %(AdditionalOptions) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index a276918..d0e23e1 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -1,197 +1,290 @@ - - - - - {2450a9c7-a97a-49e1-ba19-c8dbc5a4e3e7} - - - {c53ce808-c5c5-4c6c-99a2-3947090c62f1} - - - {5a45c1bf-81d2-4bc6-97b5-714e34f51a82} - - - {431cec61-9376-4de9-aae9-04c4250652e7} - - - {cc8bb259-5332-4a45-ba81-f4840a55b604} - - - {89362bd8-4690-4f0c-a4f7-6b2fa67a1f34} - - - {6c3cd392-b6b8-424c-87d2-10e33dbd4b41} - - - {5a31b6f4-4943-4b22-b69a-230f3cc96269} - - - {a0f072d0-a831-4c23-8d64-7a026521df9c} - - - {fe39ded0-754b-415f-a284-038a15a0aa55} - - - {17b56151-79ec-4a32-bac3-9d94ae7f68fe} - - - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\getopt - - - Source Files\gettimeofday - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files\compat\sys - - - Header Files\compat - - - Header Files\compat - - - Header Files\compat\getopt - - - Header Files\compat - - - Header Files - - - Header Files - - - Header Files - - - Header Files 
- - - Header Files - - - Header Files - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files - - - Header Files - - - Header Files\CUDA - - - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - + + + + + {2450a9c7-a97a-49e1-ba19-c8dbc5a4e3e7} + + + {c53ce808-c5c5-4c6c-99a2-3947090c62f1} + + + {5a45c1bf-81d2-4bc6-97b5-714e34f51a82} + + + {431cec61-9376-4de9-aae9-04c4250652e7} + + + {cc8bb259-5332-4a45-ba81-f4840a55b604} + + + {89362bd8-4690-4f0c-a4f7-6b2fa67a1f34} + + + {6c3cd392-b6b8-424c-87d2-10e33dbd4b41} + + + {5a31b6f4-4943-4b22-b69a-230f3cc96269} + + + {a0f072d0-a831-4c23-8d64-7a026521df9c} + + + {fe39ded0-754b-415f-a284-038a15a0aa55} + + + {17b56151-79ec-4a32-bac3-9d94ae7f68fe} + + + {d8f2e173-a0a5-455b-8efc-42511b585156} + + + {dd0822bd-27cc-4d5c-8e2f-cf7d9b00feb4} + + + {0dc051db-f833-476f-b3f4-c69fd03b9348} + + + {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} + + + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\CUDA\jansson + + + Source Files\getopt + + + Source Files\gettimeofday + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files\compat\sys + + + Header Files\compat + + + Header Files\compat + + 
+ Header Files\compat\getopt + + + Header Files\compat + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files + + + Header Files + + + Header Files\CUDA + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\JHA + + + Source Files\CUDA\JHA + + + Source Files\CUDA\quark + + + Source Files\CUDA + + \ No newline at end of file diff --git a/compat.h b/compat.h index ac7b8b9..c7f201c 100644 --- a/compat.h +++ b/compat.h @@ -1,24 +1,24 @@ -#ifndef __COMPAT_H__ -#define __COMPAT_H__ - -#ifdef WIN32 - -#include - -static __inline void sleep(int secs) -{ - Sleep(secs * 1000); -} - -enum { - PRIO_PROCESS = 0, -}; - -static __inline int setpriority(int which, int who, int prio) -{ - return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE /*THREAD_PRIORITY_TIME_CRITICAL*/); -} - -#endif /* WIN32 */ - -#endif /* __COMPAT_H__ */ +#ifndef __COMPAT_H__ +#define __COMPAT_H__ + +#ifdef WIN32 + +#include + +static __inline void sleep(int secs) +{ + Sleep(secs * 1000); +} + +enum { + PRIO_PROCESS = 0, +}; + +static __inline int setpriority(int which, int who, int prio) +{ + return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE /*THREAD_PRIORITY_TIME_CRITICAL*/); +} + +#endif /* WIN32 */ + +#endif /* __COMPAT_H__ */ diff --git a/compat/Makefile.in 
b/compat/Makefile.in index d1d76d9..7b1f20d 100644 --- a/compat/Makefile.in +++ b/compat/Makefile.in @@ -1,8 +1,9 @@ -# Makefile.in generated by automake 1.13.3 from Makefile.am. +# Makefile.in generated by automake 1.11.3 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2013 Free Software Foundation, Inc. - +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software +# Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -14,51 +15,6 @@ @SET_MAKE@ VPATH = @srcdir@ -am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' -am__make_running_with_option = \ - case $${target_option-} in \ - ?) ;; \ - *) echo "am__make_running_with_option: internal error: invalid" \ - "target option '$${target_option-}' specified" >&2; \ - exit 1;; \ - esac; \ - has_opt=no; \ - sane_makeflags=$$MAKEFLAGS; \ - if $(am__is_gnu_make); then \ - sane_makeflags=$$MFLAGS; \ - else \ - case $$MAKEFLAGS in \ - *\\[\ \ ]*) \ - bs=\\; \ - sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ - | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ - esac; \ - fi; \ - skip_next=no; \ - strip_trailopt () \ - { \ - flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ - }; \ - for flg in $$sane_makeflags; do \ - test $$skip_next = yes && { skip_next=no; continue; }; \ - case $$flg in \ - *=*|--*) continue;; \ - -*I) strip_trailopt 'I'; skip_next=yes;; \ - -*I?*) strip_trailopt 'I';; \ - -*O) strip_trailopt 'O'; skip_next=yes;; \ - -*O?*) strip_trailopt 'O';; \ - -*l) strip_trailopt 'l'; skip_next=yes;; \ - -*l?*) strip_trailopt 'l';; \ - -[dEDm]) skip_next=yes;; \ - -[JT]) skip_next=yes;; \ - esac; \ - case $$flg in \ - *$$target_option*) has_opt=yes; break;; \ - esac; \ - done; \ - test $$has_opt = yes -am__make_dryrun = (target_option=n; 
$(am__make_running_with_option)) -am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ @@ -79,7 +35,7 @@ build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ subdir = compat -DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ @@ -88,58 +44,20 @@ mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/cpuminer-config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = -AM_V_P = $(am__v_P_@AM_V@) -am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) -am__v_P_0 = false -am__v_P_1 = : -AM_V_GEN = $(am__v_GEN_@AM_V@) -am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) -am__v_GEN_0 = @echo " GEN " $@; -am__v_GEN_1 = -AM_V_at = $(am__v_at_@AM_V@) -am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) -am__v_at_0 = @ -am__v_at_1 = SOURCES = DIST_SOURCES = -RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \ - ctags-recursive dvi-recursive html-recursive info-recursive \ - install-data-recursive install-dvi-recursive \ - install-exec-recursive install-html-recursive \ - install-info-recursive install-pdf-recursive \ - install-ps-recursive install-recursive installcheck-recursive \ - installdirs-recursive pdf-recursive ps-recursive \ - tags-recursive uninstall-recursive -am__can_run_installinfo = \ - case $$AM_UPDATE_INFO_DIR in \ - n|no|NO) false;; \ - *) (install-info --version) >/dev/null 2>&1;; \ - esac +RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ + html-recursive info-recursive install-data-recursive \ + install-dvi-recursive install-exec-recursive \ + install-html-recursive install-info-recursive \ + install-pdf-recursive install-ps-recursive install-recursive \ + installcheck-recursive 
installdirs-recursive pdf-recursive \ + ps-recursive uninstall-recursive RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ distclean-recursive maintainer-clean-recursive -am__recursive_targets = \ - $(RECURSIVE_TARGETS) \ - $(RECURSIVE_CLEAN_TARGETS) \ - $(am__extra_recursive_targets) -AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \ +AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ + $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ distdir -am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -# Read a list of newline-separated strings from the standard input, -# and print each of them once, without duplicates. Input order is -# *not* preserved. -am__uniquify_input = $(AWK) '\ - BEGIN { nonempty = 0; } \ - { items[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in items) print i; }; } \ -' -# Make sure the list of sources is unique. This is necessary because, -# e.g., the same source file might be shared among _SOURCES variables -# for different programs/libraries. -am__define_uniq_tagged_files = \ - list='$(am__tagged_files)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | $(am__uniquify_input)` ETAGS = etags CTAGS = ctags DIST_SUBDIRS = jansson @@ -172,7 +90,6 @@ am__relativize = \ ACLOCAL = @ACLOCAL@ ALLOCA = @ALLOCA@ AMTAR = @AMTAR@ -AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ @@ -327,25 +244,22 @@ $(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) $(am__aclocal_m4_deps): # This directory's subdirectories are mostly independent; you can cd -# into them and run 'make' without going through this Makefile. 
-# To change the values of 'make' variables: instead of editing Makefiles, -# (1) if the variable is set in 'config.status', edit 'config.status' -# (which will cause the Makefiles to be regenerated when you run 'make'); -# (2) otherwise, pass the desired values on the 'make' command line. -$(am__recursive_targets): - @fail=; \ - if $(am__make_keepgoing); then \ - failcom='fail=yes'; \ - else \ - failcom='exit 1'; \ - fi; \ +# into them and run `make' without going through this Makefile. +# To change the values of `make' variables: instead of editing Makefiles, +# (1) if the variable is set in `config.status', edit `config.status' +# (which will cause the Makefiles to be regenerated when you run `make'); +# (2) otherwise, pass the desired values on the `make' command line. +$(RECURSIVE_TARGETS): + @fail= failcom='exit 1'; \ + for f in x $$MAKEFLAGS; do \ + case $$f in \ + *=* | --[!k]*);; \ + *k*) failcom='fail=yes';; \ + esac; \ + done; \ dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - for subdir in $$list; do \ + list='$(SUBDIRS)'; for subdir in $$list; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ dot_seen=yes; \ @@ -360,12 +274,57 @@ $(am__recursive_targets): $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ fi; test -z "$$fail" -ID: $(am__tagged_files) - $(am__define_uniq_tagged_files); mkid -fID $$unique -tags: tags-recursive -TAGS: tags +$(RECURSIVE_CLEAN_TARGETS): + @fail= failcom='exit 1'; \ + for f in x $$MAKEFLAGS; do \ + case $$f in \ + *=* | --[!k]*);; \ + *k*) failcom='fail=yes';; \ + esac; \ + done; \ + dot_seen=no; \ + case "$@" in \ + distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ + *) list='$(SUBDIRS)' ;; \ + esac; \ + rev=''; for subdir in $$list; do \ + if test "$$subdir" = "."; then :; else \ + rev="$$subdir $$rev"; \ + fi; \ + done; \ + rev="$$rev ."; \ + target=`echo $@ 
| sed s/-recursive//`; \ + for subdir in $$rev; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || eval $$failcom; \ + done && test -z "$$fail" +tags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ + done +ctags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ + done -tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ @@ -381,7 +340,12 @@ tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ fi; \ done; \ - $(am__define_uniq_tagged_files); \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ @@ -393,11 +357,15 @@ tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) $$unique; \ fi; \ fi -ctags: ctags-recursive - -CTAGS: ctags -ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - $(am__define_uniq_tagged_files); \ +ctags: 
CTAGS +CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique @@ -406,21 +374,6 @@ GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" -cscopelist: cscopelist-recursive - -cscopelist-am: $(am__tagged_files) - list='$(am__tagged_files)'; \ - case "$(srcdir)" in \ - [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ - *) sdir=$(subdir)/$(srcdir) ;; \ - esac; \ - for i in $$list; do \ - if test -f "$$i"; then \ - echo "$(subdir)/$$i"; \ - else \ - echo "$$sdir/$$i"; \ - fi; \ - done >> $(top_builddir)/cscope.files distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags @@ -457,10 +410,13 @@ distdir: $(DISTFILES) done @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ - $(am__make_dryrun) \ - || test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ + test -d "$(distdir)/$$subdir" \ + || $(MKDIR_P) "$(distdir)/$$subdir" \ + || exit 1; \ + fi; \ + done + @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ $(am__relativize); \ new_distdir=$$reldir; \ @@ -581,19 +537,21 @@ ps-am: uninstall-am: -.MAKE: $(am__recursive_targets) install-am install-strip - -.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \ - check-am clean clean-generic cscopelist-am ctags ctags-am \ - distclean distclean-generic distclean-tags distdir dvi dvi-am \ - html html-am info info-am install install-am install-data \ - install-data-am install-dvi install-dvi-am 
install-exec \ - install-exec-am install-html install-html-am install-info \ - install-info-am install-man install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs installdirs-am maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ - pdf-am ps ps-am tags tags-am uninstall uninstall-am +.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ + install-am install-strip tags-recursive + +.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ + all all-am check check-am clean clean-generic ctags \ + ctags-recursive distclean distclean-generic distclean-tags \ + distdir dvi dvi-am html html-am info info-am install \ + install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + installdirs-am maintainer-clean maintainer-clean-generic \ + mostlyclean mostlyclean-generic pdf pdf-am ps ps-am tags \ + tags-recursive uninstall uninstall-am # Tell versions [3.59,3.63) of GNU make to not export all variables. diff --git a/compat/gettimeofday.c b/compat/gettimeofday.c index da17893..8512a05 100644 --- a/compat/gettimeofday.c +++ b/compat/gettimeofday.c @@ -1,83 +1,83 @@ -#include < time.h > -#include //I've ommited this line. 
-#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else - #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -struct timezone -{ - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ -}; - -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - if (NULL != tz) - { - if (!tzflag) - { - _tzset(); - tzflag++; - } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } - - return 0; -} - -void usleep(__int64 waitTime) -{ - if (waitTime > 0) - { - if (waitTime > 100) - { - // use a waitable timer for larger intervals > 0.1ms - - HANDLE timer; - LARGE_INTEGER ft; - - ft.QuadPart = -(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time - - timer = CreateWaitableTimer(NULL, TRUE, NULL); - SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); - WaitForSingleObject(timer, INFINITE); - CloseHandle(timer); - } - else - { - // use a polling loop for short intervals <= 100ms - - LARGE_INTEGER perfCnt, start, now; - __int64 elapsed; - - QueryPerformanceFrequency(&perfCnt); - QueryPerformanceCounter(&start); - do { - QueryPerformanceCounter((LARGE_INTEGER*) &now); - elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); - } while ( elapsed < waitTime ); - } - } -} +#include < time.h > +#include //I've ommited this line. 
+#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) + #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +#else + #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +struct timezone +{ + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + if (NULL != tz) + { + if (!tzflag) + { + _tzset(); + tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} + +void usleep(__int64 waitTime) +{ + if (waitTime > 0) + { + if (waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -(10*waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 100ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do { + QueryPerformanceCounter((LARGE_INTEGER*) &now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while ( elapsed < waitTime ); + } + } +} diff --git a/compat/inttypes.h b/compat/inttypes.h index f07d50f..dc7485e 100644 --- a/compat/inttypes.h +++ b/compat/inttypes.h @@ -1,2 +1,2 @@ -#pragma once -#include +#pragma once +#include 
diff --git a/compat/jansson/Makefile.in b/compat/jansson/Makefile.in index f5e30ac..010caa2 100644 --- a/compat/jansson/Makefile.in +++ b/compat/jansson/Makefile.in @@ -1,8 +1,9 @@ -# Makefile.in generated by automake 1.13.3 from Makefile.am. +# Makefile.in generated by automake 1.11.3 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2013 Free Software Foundation, Inc. - +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software +# Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -15,51 +16,6 @@ @SET_MAKE@ VPATH = @srcdir@ -am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' -am__make_running_with_option = \ - case $${target_option-} in \ - ?) ;; \ - *) echo "am__make_running_with_option: internal error: invalid" \ - "target option '$${target_option-}' specified" >&2; \ - exit 1;; \ - esac; \ - has_opt=no; \ - sane_makeflags=$$MAKEFLAGS; \ - if $(am__is_gnu_make); then \ - sane_makeflags=$$MFLAGS; \ - else \ - case $$MAKEFLAGS in \ - *\\[\ \ ]*) \ - bs=\\; \ - sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ - | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ - esac; \ - fi; \ - skip_next=no; \ - strip_trailopt () \ - { \ - flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ - }; \ - for flg in $$sane_makeflags; do \ - test $$skip_next = yes && { skip_next=no; continue; }; \ - case $$flg in \ - *=*|--*) continue;; \ - -*I) strip_trailopt 'I'; skip_next=yes;; \ - -*I?*) strip_trailopt 'I';; \ - -*O) strip_trailopt 'O'; skip_next=yes;; \ - -*O?*) strip_trailopt 'O';; \ - -*l) strip_trailopt 'l'; skip_next=yes;; \ - -*l?*) strip_trailopt 'l';; \ - -[dEDm]) skip_next=yes;; \ - -[JT]) skip_next=yes;; \ - esac; \ - case $$flg in \ - *$$target_option*) has_opt=yes; break;; \ - esac; \ - done; \ - test $$has_opt = 
yes -am__make_dryrun = (target_option=n; $(am__make_running_with_option)) -am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ @@ -80,8 +36,7 @@ build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ subdir = compat/jansson -DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ - $(top_srcdir)/depcomp +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ @@ -93,75 +48,28 @@ CONFIG_CLEAN_VPATH_FILES = LIBRARIES = $(noinst_LIBRARIES) AR = ar ARFLAGS = cru -AM_V_AR = $(am__v_AR_@AM_V@) -am__v_AR_ = $(am__v_AR_@AM_DEFAULT_V@) -am__v_AR_0 = @echo " AR " $@; -am__v_AR_1 = libjansson_a_AR = $(AR) $(ARFLAGS) libjansson_a_LIBADD = am_libjansson_a_OBJECTS = dump.$(OBJEXT) hashtable.$(OBJEXT) \ load.$(OBJEXT) strbuffer.$(OBJEXT) utf.$(OBJEXT) \ value.$(OBJEXT) libjansson_a_OBJECTS = $(am_libjansson_a_OBJECTS) -AM_V_P = $(am__v_P_@AM_V@) -am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) -am__v_P_0 = false -am__v_P_1 = : -AM_V_GEN = $(am__v_GEN_@AM_V@) -am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) -am__v_GEN_0 = @echo " GEN " $@; -am__v_GEN_1 = -AM_V_at = $(am__v_at_@AM_V@) -am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) -am__v_at_0 = @ -am__v_at_1 = DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles am__mv = mv -f COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -AM_V_CC = $(am__v_CC_@AM_V@) -am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) -am__v_CC_0 = @echo " CC " $@; -am__v_CC_1 = CCLD = $(CC) LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -AM_V_CCLD = $(am__v_CCLD_@AM_V@) -am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) -am__v_CCLD_0 = @echo " CCLD " $@; 
-am__v_CCLD_1 = SOURCES = $(libjansson_a_SOURCES) DIST_SOURCES = $(libjansson_a_SOURCES) -am__can_run_installinfo = \ - case $$AM_UPDATE_INFO_DIR in \ - n|no|NO) false;; \ - *) (install-info --version) >/dev/null 2>&1;; \ - esac -am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -# Read a list of newline-separated strings from the standard input, -# and print each of them once, without duplicates. Input order is -# *not* preserved. -am__uniquify_input = $(AWK) '\ - BEGIN { nonempty = 0; } \ - { items[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in items) print i; }; } \ -' -# Make sure the list of sources is unique. This is necessary because, -# e.g., the same source file might be shared among _SOURCES variables -# for different programs/libraries. -am__define_uniq_tagged_files = \ - list='$(am__tagged_files)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | $(am__uniquify_input)` ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ ALLOCA = @ALLOCA@ AMTAR = @AMTAR@ -AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ @@ -332,11 +240,10 @@ $(am__aclocal_m4_deps): clean-noinstLIBRARIES: -test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES) - libjansson.a: $(libjansson_a_OBJECTS) $(libjansson_a_DEPENDENCIES) $(EXTRA_libjansson_a_DEPENDENCIES) - $(AM_V_at)-rm -f libjansson.a - $(AM_V_AR)$(libjansson_a_AR) libjansson.a $(libjansson_a_OBJECTS) $(libjansson_a_LIBADD) - $(AM_V_at)$(RANLIB) libjansson.a + -rm -f libjansson.a + $(libjansson_a_AR) libjansson.a $(libjansson_a_OBJECTS) $(libjansson_a_LIBADD) + $(RANLIB) libjansson.a mostlyclean-compile: -rm -f *.$(OBJEXT) @@ -352,28 +259,39 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/value.Po@am__quote@ .c.o: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF 
$(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c $< +@am__fastdepCC_FALSE@ $(COMPILE) -c $< .c.obj: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c `$(CYGPATH_W) '$<'` - -ID: $(am__tagged_files) - $(am__define_uniq_tagged_files); mkid -fID $$unique -tags: tags-am -TAGS: tags - -tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) set x; \ 
here=`pwd`; \ - $(am__define_uniq_tagged_files); \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ @@ -385,11 +303,15 @@ tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) $$unique; \ fi; \ fi -ctags: ctags-am - -CTAGS: ctags -ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) - $(am__define_uniq_tagged_files); \ +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique @@ -398,21 +320,6 @@ GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" -cscopelist: cscopelist-am - -cscopelist-am: $(am__tagged_files) - list='$(am__tagged_files)'; \ - case "$(srcdir)" in \ - [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ - *) sdir=$(subdir)/$(srcdir) ;; \ - esac; \ - for i in $$list; do \ - if test -f "$$i"; then \ - echo "$(subdir)/$$i"; \ - else \ - echo "$$sdir/$$i"; \ - fi; \ - done >> $(top_builddir)/cscope.files distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags @@ -552,17 +459,17 @@ uninstall-am: .MAKE: install-am install-strip -.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ - clean-noinstLIBRARIES cscopelist-am ctags ctags-am distclean \ - distclean-compile distclean-generic distclean-tags distdir dvi \ - dvi-am html html-am info info-am install install-am \ - 
install-data install-data-am install-dvi install-dvi-am \ - install-exec install-exec-am install-html install-html-am \ - install-info install-info-am install-man install-pdf \ - install-pdf-am install-ps install-ps-am install-strip \ - installcheck installcheck-am installdirs maintainer-clean \ +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-noinstLIBRARIES ctags distclean distclean-compile \ + distclean-generic distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic pdf pdf-am ps ps-am tags tags-am uninstall \ + mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \ uninstall-am diff --git a/compat/stdbool.h b/compat/stdbool.h index 31d0456..3def25f 100644 --- a/compat/stdbool.h +++ b/compat/stdbool.h @@ -1,6 +1,6 @@ -#pragma once - -#define false 0 -#define true 1 - -#define bool int +#pragma once + +#define false 0 +#define true 1 + +#define bool int diff --git a/compat/unistd.h b/compat/unistd.h index 193da66..b0690f9 100644 --- a/compat/unistd.h +++ b/compat/unistd.h @@ -1,2 +1,2 @@ -#pragma once +#pragma once #include "getopt/getopt.h" \ No newline at end of file diff --git a/configure b/configure index d826992..741cf90 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27. +# Generated by GNU Autoconf 2.68 for ccminer 2014.04.27. # # # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -557,8 +557,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='ccminer' PACKAGE_TARNAME='ccminer' -PACKAGE_VERSION='2014.03.27' -PACKAGE_STRING='ccminer 2014.03.27' +PACKAGE_VERSION='2014.04.27' +PACKAGE_STRING='ccminer 2014.04.27' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems. +\`configure' configures ccminer 2014.04.27 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1368,7 +1368,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of ccminer 2014.03.27:";; + short | recursive ) echo "Configuration of ccminer 2014.04.27:";; esac cat <<\_ACEOF @@ -1469,7 +1469,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -ccminer configure 2014.03.27 +ccminer configure 2014.04.27 generated by GNU Autoconf 2.68 Copyright (C) 2010 Free Software Foundation, Inc. @@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by ccminer $as_me 2014.03.27, which was +It was created by ccminer $as_me 2014.04.27, which was generated by GNU Autoconf 2.68. Invocation command line was $ $0 $@ @@ -2901,7 +2901,7 @@ fi # Define the identity of the package. PACKAGE='ccminer' - VERSION='2014.03.27' + VERSION='2014.04.27' cat >>confdefs.h <<_ACEOF @@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by ccminer $as_me 2014.03.27, which was +This file was extended by ccminer $as_me 2014.04.27, which was generated by GNU Autoconf 2.68. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7184,7 +7184,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -ccminer config.status 2014.03.27 +ccminer config.status 2014.04.27 configured by $0, generated by GNU Autoconf 2.68, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index ce82fdc..216a561 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.03.27]) +AC_INIT([ccminer], [2014.04.27]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 370a257..583efd2 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1,1565 +1,1586 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -#include "cpuminer-config.h" -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef WIN32 -#include -#else -#include -#include -#include -#if HAVE_SYS_SYSCTL_H -#include -#if HAVE_SYS_PARAM_H -#include -#endif -#include -#endif -#endif -#include -#include -#include -#include "compat.h" -#include "miner.h" - -#ifdef WIN32 -#include -#pragma comment(lib, "winmm.lib") -#endif - -#define PROGRAM_NAME "minerd" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 - -// from heavy.cu -#ifdef __cplusplus -extern "C" -{ -#endif -int cuda_num_devices(); -int cuda_finddevice(char *name); -#ifdef __cplusplus -} -#endif - - -#ifdef __linux /* Linux specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ - struct sched_param param; - param.sched_priority = 0; - -#ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) -#endif -#ifdef SCHED_BATCH - sched_setscheduler(0, SCHED_BATCH, ¶m); -#endif -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpu_set_t set; - - CPU_ZERO(&set); - CPU_SET(cpu, &set); - sched_setaffinity(0, sizeof(&set), &set); -} -#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpuset_t set; - CPU_ZERO(&set); - CPU_SET(cpu, &set); - cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); -} -#else -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ -} -#endif - -enum workio_commands { - WC_GET_WORK, - WC_SUBMIT_WORK, -}; - -struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; - } u; -}; - -typedef enum { - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, -} sha256_algos; - -static const char *algo_names[] = { - "heavy", - "fugue256", 
- "groestl" -}; - -bool opt_debug = false; -bool opt_protocol = false; -static bool opt_benchmark = false; -bool want_longpoll = true; -bool have_longpoll = false; -bool want_stratum = true; -bool have_stratum = false; -static bool submit_old = false; -bool use_syslog = false; -static bool opt_background = false; -static bool opt_quiet = false; -static int opt_retries = -1; -static int opt_fail_pause = 30; -int opt_timeout = 270; -static int opt_scantime = 5; -static json_t *opt_config; -static const bool opt_time = true; -static sha256_algos opt_algo = ALGO_HEAVY; -static int opt_n_threads = 0; -bool opt_trust_pool = false; -uint16_t opt_vote = 9999; -static int num_processors; -int device_map[8] = {0,1,2,3,4,5,6,7}; // CB -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_user, *rpc_pass; -char *opt_cert; -char *opt_proxy; -long opt_proxy_type; -struct thr_info *thr_info; -static int work_thr_id; -int longpoll_thr_id = -1; -int stratum_thr_id = -1; -struct work_restart *work_restart = NULL; -static struct stratum_ctx stratum; - -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; - -static unsigned long accepted_count = 0L; -static unsigned long rejected_count = 0L; -static double *thr_hashrates; - -#ifdef HAVE_GETOPT_LONG -#include -#else -struct option { - const char *name; - int has_arg; - int *flag; - int val; -}; -#endif - -static char const usage[] = "\ -Usage: " PROGRAM_NAME " [OPTIONS]\n\ -Options:\n\ - -a, --algo=ALGO specify the algorithm to use\n\ - fugue256 Fuguecoin hash\n\ - heavy Heavycoin hash\n\ - -d, --devices takes a comma separated list of CUDA devices to use.\n\ - Device IDs start counting from 0! 
Alternatively takes\n\ - string names of your cards like gtx780ti or gt640#2\n\ - (matching 2nd gt640 in the PC)\n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ - -o, --url=URL URL of mining server\n\ - -O, --userpass=U:P username:password pair for mining server\n\ - -u, --user=USERNAME username for mining server\n\ - -p, --pass=PASSWORD password for mining server\n\ - --cert=FILE certificate for mining server using SSL\n\ - -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ - -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ - -r, --retries=N number of times to retry if a network call fails\n\ - (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ - -T, --timeout=N network timeout, in seconds (default: 270)\n\ - -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 5)\n\ - --no-longpoll disable X-Long-Polling support\n\ - --no-stratum disable X-Stratum support\n\ - -q, --quiet disable per-thread hashmeter output\n\ - -D, --debug enable debug output\n\ - -P, --protocol-dump verbose dump of protocol-level activities\n" -#ifdef HAVE_SYSLOG_H -"\ - -S, --syslog use system log for output messages\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ - --benchmark run in offline benchmark mode\n\ - -c, --config=FILE load a JSON-format configuration file\n\ - -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; - -static char const short_options[] = -#ifndef WIN32 - "B" -#endif -#ifdef HAVE_SYSLOG_H - "S" -#endif - "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:"; - -static struct option const options[] = { - { "algo", 1, NULL, 'a' }, -#ifndef WIN32 - { "background", 0, NULL, 'B' }, -#endif - { "benchmark", 0, NULL, 1005 }, - { 
"cert", 1, NULL, 1001 }, - { "config", 1, NULL, 'c' }, - { "debug", 0, NULL, 'D' }, - { "help", 0, NULL, 'h' }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-stratum", 0, NULL, 1007 }, - { "pass", 1, NULL, 'p' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "scantime", 1, NULL, 's' }, -#ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, -#endif - { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - { "devices", 1, NULL, 'd' }, - { 0, 0, 0, 0 } -}; - -struct work { - uint32_t data[32]; - uint32_t target[8]; - uint32_t maxvote; - - char job_id[128]; - size_t xnonce2_len; - unsigned char xnonce2[32]; -}; - -static struct work g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; - -static bool jobj_binary(const json_t *obj, const char *key, - void *buf, size_t buflen) -{ - const char *hexstr; - json_t *tmp; - - tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { - applog(LOG_ERR, "JSON key '%s' not found", key); - return false; - } - hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { - applog(LOG_ERR, "JSON key '%s' is not a string", key); - return false; - } - if (!hex2bin((unsigned char*)buf, hexstr, buflen)) - return false; - - return true; -} - -static bool work_decode(const json_t *val, struct work *work) -{ - int i; - - if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { - applog(LOG_ERR, "JSON inval data"); - goto err_out; - } - if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { - applog(LOG_ERR, "JSON inval target"); - goto err_out; - } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 
1024; - } - } else work->maxvote = 0; - - for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = le32dec(work->data + i); - for (i = 0; i < ARRAY_SIZE(work->target); i++) - work->target[i] = le32dec(work->target + i); - - return true; - -err_out: - return false; -} - -static void share_result(int result, const char *reason) -{ - char s[345]; - double hashrate; - int i; - - hashrate = 0.; - pthread_mutex_lock(&stats_lock); - for (i = 0; i < opt_n_threads; i++) - hashrate += thr_hashrates[i]; - result ? accepted_count++ : rejected_count++; - pthread_mutex_unlock(&stats_lock); - - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - result ? "(yay!!!)" : "(booooo)"); - - if (opt_debug && reason) - applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); -} - -static bool submit_upstream_work(CURL *curl, struct work *work) -{ - char *str = NULL; - json_t *val, *res, *reason; - char s[345]; - int i; - bool rc = false; - - /* pass if the previous hash is not the current previous hash */ - if (memcmp(work->data + 1, g_work.data + 1, 32)) { - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); - return true; - } - - if (have_stratum) { - uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; - - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); - be16enc(&nvote, *((uint16_t*)&work->data[20])); - - ntimestr = bin2hex((const unsigned char *)(&ntime), 4); - noncestr = bin2hex((const unsigned char *)(&nonce), 4); - xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - nvotestr = bin2hex((const unsigned char *)(&nvote), 2); - if (opt_algo == ALGO_HEAVY) { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, 
work->job_id, xnonce2str, ntimestr, noncestr, nvotestr); - } else { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); - } - free(ntimestr); - free(noncestr); - free(xnonce2str); - free(nvotestr); - - if (unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); - goto out; - } - } else { - - /* build hex string */ - - if (opt_algo != ALGO_HEAVY) { - for (i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((unsigned char *)work->data, sizeof(work->data)); - if (unlikely(!str)) { - applog(LOG_ERR, "submit_upstream_work OOM"); - goto out; - } - - /* build JSON-RPC request */ - sprintf(s, - "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", - str); - - /* issue JSON-RPC request */ - val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - goto out; - } - - res = json_object_get(val, "result"); - reason = json_object_get(val, "reject-reason"); - share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); - - json_decref(val); - } - - rc = true; - -out: - free(str); - return rc; -} - -static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; - -static bool get_upstream_work(CURL *curl, struct work *work) -{ - json_t *val; - bool rc; - struct timeval tv_start, tv_end, diff; - - gettimeofday(&tv_start, NULL); - val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); - gettimeofday(&tv_end, NULL); - - if (have_stratum) { - if (val) - json_decref(val); - return true; - } - - if (!val) - return false; - - rc = work_decode(json_object_get(val, "result"), work); - - if (opt_debug && rc) { - timeval_subtract(&diff, &tv_end, &tv_start); - applog(LOG_DEBUG, "DEBUG: got new work in %d ms", - diff.tv_sec * 1000 + diff.tv_usec / 1000); - } - - json_decref(val); - - return rc; -} - -static void workio_cmd_free(struct workio_cmd *wc) -{ - if (!wc) - return; - - switch (wc->cmd) { - case WC_SUBMIT_WORK: - free(wc->u.work); - break; - default: /* do nothing */ - break; - } - - memset(wc, 0, sizeof(*wc)); /* poison */ - free(wc); -} - -static bool workio_get_work(struct workio_cmd *wc, CURL *curl) -{ - struct work *ret_work; - int failures = 0; - - ret_work = (struct work*)calloc(1, sizeof(*ret_work)); - if (!ret_work) - return false; - - /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - free(ret_work); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) - free(ret_work); - - return true; -} - -static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) -{ - int failures = 0; - - /* submit 
solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "...terminating workio thread"); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "...retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - return true; -} - -static void *workio_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info*)userdata; - CURL *curl; - bool ok = true; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - return NULL; - } - - while (ok) { - struct workio_cmd *wc; - - /* wait for workio_cmd sent to us, on our queue */ - wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { - ok = false; - break; - } - - /* process workio_cmd */ - switch (wc->cmd) { - case WC_GET_WORK: - ok = workio_get_work(wc, curl); - break; - case WC_SUBMIT_WORK: - ok = workio_submit_work(wc, curl); - break; - - default: /* should never happen */ - ok = false; - break; - } - - workio_cmd_free(wc); - } - - tq_freeze(mythr->q); - curl_easy_cleanup(curl); - - return NULL; -} - -static bool get_work(struct thr_info *thr, struct work *work) -{ - struct workio_cmd *wc; - struct work *work_heap; - - if (opt_benchmark) { - memset(work->data, 0x55, 76); - work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, sizeof(work->target)); - return true; - } - - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->cmd = WC_GET_WORK; - wc->thr = thr; - - /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) { - workio_cmd_free(wc); - return false; - } - - /* wait for response, a unit of work */ - work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) - 
return false; - - /* copy returned work into storage provided by caller */ - memcpy(work, work_heap, sizeof(*work)); - free(work_heap); - - return true; -} - -static bool submit_work(struct thr_info *thr, const struct work *work_in) -{ - struct workio_cmd *wc; - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->u.work = (struct work *)malloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; - - wc->cmd = WC_SUBMIT_WORK; - wc->thr = thr; - memcpy(wc->u.work, work_in, sizeof(*work_in)); - - /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - goto err_out; - - return true; - -err_out: - workio_cmd_free(wc); - return false; -} - -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) -{ - unsigned char merkle_root[64]; - int i; - - pthread_mutex_lock(&sctx->work_lock); - - strcpy(work->job_id, sctx->job.job_id); - work->xnonce2_len = sctx->xnonce2_size; - memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - - /* Generate merkle root */ - if (opt_algo == ALGO_HEAVY) - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - else - if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) - SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root); - else - sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - - for (i = 0; i < sctx->job.merkle_count; i++) { - memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY) - heavycoin_hash(merkle_root, merkle_root, 64); - else - sha256d(merkle_root, merkle_root, 64); - } - - /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); - - /* Assemble block header */ - memset(work->data, 0, 128); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 
0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - - // HeavyCoin - if (opt_algo == ALGO_HEAVY) { - uint16_t *ext; - work->maxvote = 1024; - ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - // - - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); - applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", - work->job_id, xnonce2str, swab32(work->data[17])); - free(xnonce2str); - } - - if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) - diff_to_target(work->target, sctx->job.diff / 256.0); - else - diff_to_target(work->target, sctx->job.diff); -} - -static void *miner_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - int thr_id = mythr->id; - struct work work; - uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - unsigned char *scratchbuf = NULL; - char s[16]; - int i; - - memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. 
No need for this to be an - * error if it fails */ - if (!opt_benchmark) { - setpriority(PRIO_PROCESS, 0, 19); - drop_policy(); - } - - /* Cpu affinity only makes sense if the number of threads is a multiple - * of the number of CPUs */ - if (num_processors > 1 && opt_n_threads % num_processors == 0) { - if (!opt_quiet) - applog(LOG_INFO, "Binding thread %d to cpu %d", - thr_id, thr_id % num_processors); - affine_to_cpu(thr_id, thr_id % num_processors); - } - - while (1) { - unsigned long hashes_done; - struct timeval tv_start, tv_end, diff; - int64_t max64; - int rc; - - if (have_stratum) { - while (time(NULL) >= g_work_time + 120) - sleep(1); - pthread_mutex_lock(&g_work_lock); - if (work.data[19] >= end_nonce) - stratum_gen_work(&stratum, &g_work); - } else { - /* obtain new work from internal workio thread */ - pthread_mutex_lock(&g_work_lock); - if (!have_stratum && (!have_longpoll || - time(NULL) >= g_work_time + LP_SCANTIME*3/4 || - work.data[19] >= end_nonce)) { - if (unlikely(!get_work(mythr, &g_work))) { - applog(LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", mythr->id); - pthread_mutex_unlock(&g_work_lock); - goto out; - } - g_work_time = have_stratum ? 0 : time(NULL); - } - if (have_stratum) { - pthread_mutex_unlock(&g_work_lock); - continue; - } - } - if (memcmp(work.data, g_work.data, 76)) { - memcpy(&work, &g_work, sizeof(struct work)); - work.data[19] = 0xffffffffU / opt_n_threads * thr_id; - } else - work.data[19]++; - pthread_mutex_unlock(&g_work_lock); - work_restart[thr_id].restart = 0; - - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; - else - max64 = g_work_time + (have_longpoll ? 
LP_SCANTIME : opt_scantime) - - time(NULL); - max64 *= (int64_t)thr_hashrates[thr_id]; - if (max64 <= 0) - max64 = 0x1fffffLL; - if ((int64_t)work.data[19] + max64 > end_nonce) - max_nonce = end_nonce; - else - max_nonce = (uint32_t)(work.data[19] + max64); - - hashes_done = 0; - gettimeofday(&tv_start, NULL); - - /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote); - break; - - case ALGO_FUGUE256: - rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - case ALGO_GROESTL: - rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - default: - /* should never happen */ - goto out; - } - - /* record scanhash elapsed time */ - gettimeofday(&tv_end, NULL); - timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_usec || diff.tv_sec) { - pthread_mutex_lock(&stats_lock); - thr_hashrates[thr_id] = - hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); - pthread_mutex_unlock(&stats_lock); - } - if (!opt_quiet) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", - thr_id, hashes_done, s); - } - if (opt_benchmark && thr_id == opt_n_threads - 1) { - double hashrate = 0.; - for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) - hashrate += thr_hashrates[i]; - if (i == opt_n_threads) { - sprintf(s, hashrate >= 1e6 ? 
"%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_INFO, "Total: %s khash/s", s); - } - } - - /* if nonce found, submit work */ - if (rc && !opt_benchmark && !submit_work(mythr, &work)) - break; - } - -out: - tq_freeze(mythr->q); - - return NULL; -} - -static void restart_threads(void) -{ - int i; - - for (i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; -} - -static void *longpoll_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - CURL *curl = NULL; - char *copy_start, *hdr_path = NULL, *lp_url = NULL; - bool need_slash = false; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - goto out; - } - -start: - hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) - goto out; - - /* full URL */ - if (strstr(hdr_path, "://")) { - lp_url = hdr_path; - hdr_path = NULL; - } - - /* absolute path, on current server */ - else { - copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') - need_slash = true; - - lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; - - sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); - } - - applog(LOG_INFO, "Long-polling activated for %s", lp_url); - - while (1) { - json_t *val, *soval; - int err; - - val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) - json_decref(val); - goto out; - } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); - soval = json_object_get(json_object_get(val, "result"), "submitold"); - submit_old = soval ? 
json_is_true(soval) : false; - pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: got new work"); - time(&g_work_time); - restart_threads(); - } - pthread_mutex_unlock(&g_work_lock); - json_decref(val); - } else { - pthread_mutex_lock(&g_work_lock); - g_work_time -= LP_SCANTIME; - pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) { - restart_threads(); - } else { - have_longpoll = false; - restart_threads(); - free(hdr_path); - free(lp_url); - lp_url = NULL; - sleep(opt_fail_pause); - goto start; - } - } - } - -out: - free(hdr_path); - free(lp_url); - tq_freeze(mythr->q); - if (curl) - curl_easy_cleanup(curl); - - return NULL; -} - -static bool stratum_handle_response(char *buf) -{ - json_t *val, *err_val, *res_val, *id_val; - json_error_t err; - bool ret = false; - - val = JSON_LOADS(buf, &err); - if (!val) { - applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - id_val = json_object_get(val, "id"); - - if (!id_val || json_is_null(id_val) || !res_val) - goto out; - - share_result(json_is_true(res_val), - err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); - - ret = true; -out: - if (val) - json_decref(val); - - return ret; -} - -static void *stratum_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - char *s; - - stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) - goto out; - applog(LOG_INFO, "Starting Stratum on %s", stratum.url); - - while (1) { - int failures = 0; - - while (!stratum.curl) { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - } - - if (stratum.job.job_id && - (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) { - pthread_mutex_lock(&g_work_lock); - stratum_gen_work(&stratum, &g_work); - time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); - if (stratum.job.clean) { - if (!opt_quiet) applog(LOG_INFO, "Stratum detected new block"); - restart_threads(); - } - } - - if (!stratum_socket_full(&stratum, 120)) { - applog(LOG_ERR, "Stratum connection timed out"); - s = NULL; - } else - s = stratum_recv_line(&stratum); - if (!s) { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } - -out: - return NULL; -} - -static void show_version_and_exit(void) -{ - printf("%s\n%s\n", PACKAGE_STRING, curl_version()); - exit(0); -} - -static void show_usage_and_exit(int status) -{ - if (status) - fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); - else - 
printf(usage); - exit(status); -} - -static void parse_arg (int key, char *arg) -{ - char *p; - int v, i; - - switch(key) { - case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { - opt_algo = (sha256_algos)i; - break; - } - } - if (i == ARRAY_SIZE(algo_names)) - show_usage_and_exit(1); - break; - case 'B': - opt_background = true; - break; - case 'c': { - json_error_t err; - if (opt_config) - json_decref(opt_config); -#if JANSSON_VERSION_HEX >= 0x020000 - opt_config = json_load_file(arg, 0, &err); -#else - opt_config = json_load_file(arg, &err); -#endif - if (!json_is_object(opt_config)) { - applog(LOG_ERR, "JSON decode of %s failed", arg); - exit(1); - } - break; - } - case 'q': - opt_quiet = true; - break; - case 'D': - opt_debug = true; - break; - case 'p': - free(rpc_pass); - rpc_pass = strdup(arg); - break; - case 'P': - opt_protocol = true; - break; - case 'r': - v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_retries = v; - break; - case 'R': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_fail_pause = v; - break; - case 's': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_scantime = v; - break; - case 'T': - v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ - show_usage_and_exit(1); - opt_timeout = v; - break; - case 't': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_n_threads = v; - break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 1024) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; - case 'u': - free(rpc_user); - rpc_user = strdup(arg); - break; - case 'o': /* --url */ - p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - 
show_usage_and_exit(1); - free(rpc_url); - rpc_url = strdup(arg); - } else { - if (!strlen(arg) || *arg == '/') - show_usage_and_exit(1); - free(rpc_url); - rpc_url = (char*)malloc(strlen(arg) + 8); - sprintf(rpc_url, "http://%s", arg); - } - p = strrchr(rpc_url, '@'); - if (p) { - char *sp, *ap; - *p = '\0'; - ap = strstr(rpc_url, "://") + 3; - sp = strchr(ap, ':'); - if (sp) { - free(rpc_userpass); - rpc_userpass = strdup(ap); - free(rpc_user); - rpc_user = (char*)calloc(sp - ap + 1, 1); - strncpy(rpc_user, ap, sp - ap); - free(rpc_pass); - rpc_pass = strdup(sp + 1); - } else { - free(rpc_user); - rpc_user = strdup(ap); - } - memmove(ap, p + 1, strlen(p + 1) + 1); - } - have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); - break; - case 'O': /* --userpass */ - p = strchr(arg, ':'); - if (!p) - show_usage_and_exit(1); - free(rpc_userpass); - rpc_userpass = strdup(arg); - free(rpc_user); - rpc_user = (char*)calloc(p - arg + 1, 1); - strncpy(rpc_user, arg, p - arg); - free(rpc_pass); - rpc_pass = strdup(p + 1); - break; - case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) - opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) - opt_proxy_type = CURLPROXY_SOCKS5; -#if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) - opt_proxy_type = CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) - opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; -#endif - else - opt_proxy_type = CURLPROXY_HTTP; - free(opt_proxy); - opt_proxy = strdup(arg); - break; - case 1001: - free(opt_cert); - opt_cert = strdup(arg); - break; - case 1005: - opt_benchmark = true; - want_longpoll = false; - want_stratum = false; - have_stratum = false; - break; - case 1003: - want_longpoll = false; - break; - case 1007: - want_stratum = false; - break; - case 'S': - use_syslog = true; - break; - case 'd': // CB - { - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= 
'0' && pch[0] <= '9' && pch[1] == '\0') - { - if (atoi(pch) < num_processors) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < num_processors) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - exit(1); - } - } - pch = strtok (NULL, ","); - } - } - break; - case 'V': - show_version_and_exit(); - case 'h': - show_usage_and_exit(0); - default: - show_usage_and_exit(1); - } -} - -static void parse_config(void) -{ - int i; - json_t *val; - - if (!json_is_object(opt_config)) - return; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (!options[i].name) - break; - if (!strcmp(options[i].name, "config")) - continue; - - val = json_object_get(opt_config, options[i].name); - if (!val) - continue; - - if (options[i].has_arg && json_is_string(val)) { - char *s = strdup(json_string_value(val)); - if (!s) - break; - parse_arg(options[i].val, s); - free(s); - } else if (!options[i].has_arg && json_is_true(val)) - parse_arg(options[i].val, ""); - else - applog(LOG_ERR, "JSON option %s invalid", - options[i].name); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); - show_usage_and_exit(1); - } -} - -static void parse_cmdline(int argc, char *argv[]) -{ - int key; - - while (1) { -#if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); -#else - key = getopt(argc, argv, short_options); -#endif - if (key < 0) - break; - - parse_arg(key, optarg); - } - if (optind < argc) { - fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward 
vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } - - parse_config(); -} - -#ifndef WIN32 -static void signal_handler(int sig) -{ - switch (sig) { - case SIGHUP: - applog(LOG_INFO, "SIGHUP received"); - break; - case SIGINT: - applog(LOG_INFO, "SIGINT received, exiting"); - exit(0); - break; - case SIGTERM: - applog(LOG_INFO, "SIGTERM received, exiting"); - exit(0); - break; - } -} -#endif - -#define PROGRAM_VERSION "0.5" -int main(int argc, char *argv[]) -{ - struct thr_info *thr; - long flags; - int i; - -#ifdef WIN32 - SYSTEM_INFO sysinfo; -#endif - - printf(" *** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); - printf("\t This is version "PROGRAM_VERSION" (beta)\n"); - printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); - printf("\t based on pooler-cpuminer extension for HVC from\n\t https://github.com/heavycoin/cpuminer-heavycoin\n"); - printf("\t\t\tand\n\t http://hvc.1gh.com/\n"); - printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); - printf("\t LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm\n"); - printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); - printf("\t YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4\n"); - - rpc_user = strdup(""); - rpc_pass = strdup(""); - - pthread_mutex_init(&applog_lock, NULL); - num_processors = cuda_num_devices(); - - /* parse command line */ - parse_cmdline(argc, argv); - - if (!opt_benchmark && !rpc_url) { - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } - - if (!rpc_userpass) { - rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; - sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); - } - - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); - pthread_mutex_init(&stratum.sock_lock, NULL); - pthread_mutex_init(&stratum.work_lock, NULL); - - flags = !opt_benchmark && 
strncmp(rpc_url, "https:", 6) - ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { - applog(LOG_ERR, "CURL initialization failed"); - return 1; - } - -#ifndef WIN32 - if (opt_background) { - i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); - i = setsid(); - if (i < 0) - applog(LOG_ERR, "setsid() failed (errno = %d)", errno); - i = chdir("/"); - if (i < 0) - applog(LOG_ERR, "chdir() failed (errno = %d)", errno); - signal(SIGHUP, signal_handler); - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - } -#endif - - if (num_processors == 0) - { - applog(LOG_ERR, "No CUDA devices found! terminating."); - exit(1); - } - if (!opt_n_threads) - opt_n_threads = num_processors; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) - openlog("cpuminer", LOG_PID, LOG_USER); -#endif - - work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; - - thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) - return 1; - - /* init workio thread info */ - work_thr_id = opt_n_threads; - thr = &thr_info[work_thr_id]; - thr->id = work_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { - applog(LOG_ERR, "workio thread create failed"); - return 1; - } - - if (want_longpoll && !have_stratum) { - /* init longpoll thread info */ - longpoll_thr_id = opt_n_threads + 1; - thr = &thr_info[longpoll_thr_id]; - thr->id = longpoll_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { - applog(LOG_ERR, "longpoll thread create failed"); - return 1; - } - } - if (want_stratum) { - /* init stratum thread info */ - stratum_thr_id = opt_n_threads + 2; - thr = 
&thr_info[stratum_thr_id]; - thr->id = stratum_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { - applog(LOG_ERR, "stratum thread create failed"); - return 1; - } - - if (have_stratum) - tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); - } - - /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { - thr = &thr_info[i]; - - thr->id = i; - thr->q = tq_new(); - if (!thr->q) - return 1; - - if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { - applog(LOG_ERR, "thread %d create failed", i); - return 1; - } - } - - applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); - -#ifdef WIN32 - timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) -#endif - - /* main loop - simply wait for workio thread to exit */ - pthread_join(thr_info[work_thr_id].pth, NULL); - -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - - applog(LOG_INFO, "workio thread dead, exiting."); - - return 0; -} +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ + +#include "cpuminer-config.h" +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef WIN32 +#include +#else +#include +#include +#include +#if HAVE_SYS_SYSCTL_H +#include +#if HAVE_SYS_PARAM_H +#include +#endif +#include +#endif +#endif +#include +#include +#include +#include "compat.h" +#include "miner.h" + +#ifdef WIN32 +#include +#pragma comment(lib, "winmm.lib") +#endif + +#define PROGRAM_NAME "minerd" +#define LP_SCANTIME 60 +#define HEAVYCOIN_BLKHDR_SZ 84 + +// from heavy.cu +#ifdef __cplusplus +extern "C" +{ +#endif +int cuda_num_devices(); +int cuda_finddevice(char *name); +#ifdef __cplusplus +} +#endif + + +#ifdef __linux /* Linux specific policy and affinity management */ +#include +static inline void drop_policy(void) +{ + struct sched_param param; + param.sched_priority = 0; + +#ifdef SCHED_IDLE + if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) +#endif +#ifdef SCHED_BATCH + sched_setscheduler(0, SCHED_BATCH, ¶m); +#endif +} + +static inline void affine_to_cpu(int id, int cpu) +{ + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(&set), &set); +} +#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ +#include +static inline void drop_policy(void) +{ +} + +static inline void affine_to_cpu(int id, int cpu) +{ + cpuset_t set; + CPU_ZERO(&set); + CPU_SET(cpu, &set); + cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); +} +#else +static inline void drop_policy(void) +{ +} + +static inline void affine_to_cpu(int id, int cpu) +{ +} +#endif + +enum workio_commands { + WC_GET_WORK, + WC_SUBMIT_WORK, +}; + +struct workio_cmd { + enum workio_commands cmd; + struct thr_info *thr; + union { + struct work *work; + } u; +}; + +typedef enum { + ALGO_HEAVY, /* Heavycoin hash */ + ALGO_FUGUE256, /* Fugue256 */ + ALGO_GROESTL, + ALGO_MYR_GR, + ALGO_JACKPOT +} sha256_algos; + +static const char *algo_names[] 
= { + "heavy", + "fugue256", + "groestl", + "myr-gr", + "jackpot" +}; + +bool opt_debug = false; +bool opt_protocol = false; +bool opt_benchmark = false; +bool want_longpoll = true; +bool have_longpoll = false; +bool want_stratum = true; +bool have_stratum = false; +static bool submit_old = false; +bool use_syslog = false; +static bool opt_background = false; +static bool opt_quiet = false; +static int opt_retries = -1; +static int opt_fail_pause = 30; +int opt_timeout = 270; +static int opt_scantime = 5; +static json_t *opt_config; +static const bool opt_time = true; +static sha256_algos opt_algo = ALGO_HEAVY; +static int opt_n_threads = 0; +bool opt_trust_pool = false; +uint16_t opt_vote = 9999; +static int num_processors; +int device_map[8] = {0,1,2,3,4,5,6,7}; // CB +static char *rpc_url; +static char *rpc_userpass; +static char *rpc_user, *rpc_pass; +char *opt_cert; +char *opt_proxy; +long opt_proxy_type; +struct thr_info *thr_info; +static int work_thr_id; +int longpoll_thr_id = -1; +int stratum_thr_id = -1; +struct work_restart *work_restart = NULL; +static struct stratum_ctx stratum; + +pthread_mutex_t applog_lock; +static pthread_mutex_t stats_lock; + +static unsigned long accepted_count = 0L; +static unsigned long rejected_count = 0L; +static double *thr_hashrates; + +#ifdef HAVE_GETOPT_LONG +#include +#else +struct option { + const char *name; + int has_arg; + int *flag; + int val; +}; +#endif + +static char const usage[] = "\ +Usage: " PROGRAM_NAME " [OPTIONS]\n\ +Options:\n\ + -a, --algo=ALGO specify the algorithm to use\n\ + fugue256 Fuguecoin hash\n\ + heavy Heavycoin hash\n\ + groestl Groestlcoin hash\n\ + myr-gr Myriad-Groestl hash\n\ + jackpot Jackpot hash\n\ + -d, --devices takes a comma separated list of CUDA devices to use.\n\ + Device IDs start counting from 0! 
Alternatively takes\n\ + string names of your cards like gtx780ti or gt640#2\n\ + (matching 2nd gt640 in the PC)\n\ + -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ + -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ + -o, --url=URL URL of mining server\n\ + -O, --userpass=U:P username:password pair for mining server\n\ + -u, --user=USERNAME username for mining server\n\ + -p, --pass=PASSWORD password for mining server\n\ + --cert=FILE certificate for mining server using SSL\n\ + -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ + -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ + -r, --retries=N number of times to retry if a network call fails\n\ + (default: retry indefinitely)\n\ + -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + -T, --timeout=N network timeout, in seconds (default: 270)\n\ + -s, --scantime=N upper bound on time spent scanning current work when\n\ + long polling is unavailable, in seconds (default: 5)\n\ + --no-longpoll disable X-Long-Polling support\n\ + --no-stratum disable X-Stratum support\n\ + -q, --quiet disable per-thread hashmeter output\n\ + -D, --debug enable debug output\n\ + -P, --protocol-dump verbose dump of protocol-level activities\n" +#ifdef HAVE_SYSLOG_H +"\ + -S, --syslog use system log for output messages\n" +#endif +#ifndef WIN32 +"\ + -B, --background run the miner in the background\n" +#endif +"\ + --benchmark run in offline benchmark mode\n\ + -c, --config=FILE load a JSON-format configuration file\n\ + -V, --version display version information and exit\n\ + -h, --help display this help text and exit\n\ +"; + +static char const short_options[] = +#ifndef WIN32 + "B" +#endif +#ifdef HAVE_SYSLOG_H + "S" +#endif + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:"; + +static struct option const options[] = { + { "algo", 1, NULL, 'a' }, +#ifndef WIN32 + { "background", 0, NULL, 'B' }, +#endif + { "benchmark", 0, NULL, 1005 }, + { 
"cert", 1, NULL, 1001 }, + { "config", 1, NULL, 'c' }, + { "debug", 0, NULL, 'D' }, + { "help", 0, NULL, 'h' }, + { "no-longpoll", 0, NULL, 1003 }, + { "no-stratum", 0, NULL, 1007 }, + { "pass", 1, NULL, 'p' }, + { "protocol-dump", 0, NULL, 'P' }, + { "proxy", 1, NULL, 'x' }, + { "quiet", 0, NULL, 'q' }, + { "retries", 1, NULL, 'r' }, + { "retry-pause", 1, NULL, 'R' }, + { "scantime", 1, NULL, 's' }, +#ifdef HAVE_SYSLOG_H + { "syslog", 0, NULL, 'S' }, +#endif + { "threads", 1, NULL, 't' }, + { "vote", 1, NULL, 'v' }, + { "trust-pool", 0, NULL, 'm' }, + { "timeout", 1, NULL, 'T' }, + { "url", 1, NULL, 'o' }, + { "user", 1, NULL, 'u' }, + { "userpass", 1, NULL, 'O' }, + { "version", 0, NULL, 'V' }, + { "devices", 1, NULL, 'd' }, + { 0, 0, 0, 0 } +}; + +struct work { + uint32_t data[32]; + uint32_t target[8]; + uint32_t maxvote; + + char job_id[128]; + size_t xnonce2_len; + unsigned char xnonce2[32]; +}; + +static struct work g_work; +static time_t g_work_time; +static pthread_mutex_t g_work_lock; + +static bool jobj_binary(const json_t *obj, const char *key, + void *buf, size_t buflen) +{ + const char *hexstr; + json_t *tmp; + + tmp = json_object_get(obj, key); + if (unlikely(!tmp)) { + applog(LOG_ERR, "JSON key '%s' not found", key); + return false; + } + hexstr = json_string_value(tmp); + if (unlikely(!hexstr)) { + applog(LOG_ERR, "JSON key '%s' is not a string", key); + return false; + } + if (!hex2bin((unsigned char*)buf, hexstr, buflen)) + return false; + + return true; +} + +static bool work_decode(const json_t *val, struct work *work) +{ + int i; + + if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { + applog(LOG_ERR, "JSON inval data"); + goto err_out; + } + if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { + applog(LOG_ERR, "JSON inval target"); + goto err_out; + } + if (opt_algo == ALGO_HEAVY) { + if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { + work->maxvote = 
1024; + } + } else work->maxvote = 0; + + for (i = 0; i < ARRAY_SIZE(work->data); i++) + work->data[i] = le32dec(work->data + i); + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[i] = le32dec(work->target + i); + + return true; + +err_out: + return false; +} + +static void share_result(int result, const char *reason) +{ + char s[345]; + double hashrate; + int i; + + hashrate = 0.; + pthread_mutex_lock(&stats_lock); + for (i = 0; i < opt_n_threads; i++) + hashrate += thr_hashrates[i]; + result ? accepted_count++ : rejected_count++; + pthread_mutex_unlock(&stats_lock); + + sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", + accepted_count, + accepted_count + rejected_count, + 100. * accepted_count / (accepted_count + rejected_count), + s, + result ? "(yay!!!)" : "(booooo)"); + + if (opt_debug && reason) + applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); +} + +static bool submit_upstream_work(CURL *curl, struct work *work) +{ + char *str = NULL; + json_t *val, *res, *reason; + char s[345]; + int i; + bool rc = false; + + /* pass if the previous hash is not the current previous hash */ + if (memcmp(work->data + 1, g_work.data + 1, 32)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); + return true; + } + + if (have_stratum) { + uint32_t ntime, nonce; + uint16_t nvote; + char *ntimestr, *noncestr, *xnonce2str, *nvotestr; + + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + be16enc(&nvote, *((uint16_t*)&work->data[20])); + + ntimestr = bin2hex((const unsigned char *)(&ntime), 4); + noncestr = bin2hex((const unsigned char *)(&nonce), 4); + xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); + nvotestr = bin2hex((const unsigned char *)(&nvote), 2); + if (opt_algo == ALGO_HEAVY) { + sprintf(s, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, 
work->job_id, xnonce2str, ntimestr, noncestr, nvotestr); + } else { + sprintf(s, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); + } + free(ntimestr); + free(noncestr); + free(xnonce2str); + free(nvotestr); + + if (unlikely(!stratum_send_line(&stratum, s))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + goto out; + } + } else { + + /* build hex string */ + + if (opt_algo != ALGO_HEAVY) { + for (i = 0; i < ARRAY_SIZE(work->data); i++) + le32enc(work->data + i, work->data[i]); + } + str = bin2hex((unsigned char *)work->data, sizeof(work->data)); + if (unlikely(!str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + goto out; + } + + /* build JSON-RPC request */ + sprintf(s, + "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", + str); + + /* issue JSON-RPC request */ + val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + goto out; + } + + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); + + json_decref(val); + } + + rc = true; + +out: + free(str); + return rc; +} + +static const char *rpc_req = + "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; + +static bool get_upstream_work(CURL *curl, struct work *work) +{ + json_t *val; + bool rc; + struct timeval tv_start, tv_end, diff; + + gettimeofday(&tv_start, NULL); + val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, + want_longpoll, false, NULL); + gettimeofday(&tv_end, NULL); + + if (have_stratum) { + if (val) + json_decref(val); + return true; + } + + if (!val) + return false; + + rc = work_decode(json_object_get(val, "result"), work); + + if (opt_debug && rc) { + timeval_subtract(&diff, &tv_end, &tv_start); + applog(LOG_DEBUG, "DEBUG: got new work in %d ms", + diff.tv_sec * 1000 + diff.tv_usec / 1000); + } + + json_decref(val); + + return rc; +} + +static void workio_cmd_free(struct workio_cmd *wc) +{ + if (!wc) + return; + + switch (wc->cmd) { + case WC_SUBMIT_WORK: + free(wc->u.work); + break; + default: /* do nothing */ + break; + } + + memset(wc, 0, sizeof(*wc)); /* poison */ + free(wc); +} + +static bool workio_get_work(struct workio_cmd *wc, CURL *curl) +{ + struct work *ret_work; + int failures = 0; + + ret_work = (struct work*)calloc(1, sizeof(*ret_work)); + if (!ret_work) + return false; + + /* obtain new work from bitcoin via JSON-RPC */ + while (!get_upstream_work(curl, ret_work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); + free(ret_work); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + /* send work to requesting thread */ + if (!tq_push(wc->thr->q, ret_work)) + free(ret_work); + + return true; +} + +static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) +{ + int failures = 0; + + /* submit 
solution to bitcoin via JSON-RPC */ + while (!submit_upstream_work(curl, wc->u.work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "...terminating workio thread"); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "...retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + return true; +} + +static void *workio_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info*)userdata; + CURL *curl; + bool ok = true; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + return NULL; + } + + while (ok) { + struct workio_cmd *wc; + + /* wait for workio_cmd sent to us, on our queue */ + wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); + if (!wc) { + ok = false; + break; + } + + /* process workio_cmd */ + switch (wc->cmd) { + case WC_GET_WORK: + ok = workio_get_work(wc, curl); + break; + case WC_SUBMIT_WORK: + ok = workio_submit_work(wc, curl); + break; + + default: /* should never happen */ + ok = false; + break; + } + + workio_cmd_free(wc); + } + + tq_freeze(mythr->q); + curl_easy_cleanup(curl); + + return NULL; +} + +static bool get_work(struct thr_info *thr, struct work *work) +{ + struct workio_cmd *wc; + struct work *work_heap; + + if (opt_benchmark) { + memset(work->data, 0x55, 76); + work->data[17] = swab32((uint32_t)time(NULL)); + memset(work->data + 19, 0x00, 52); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + memset(work->target, 0x00, sizeof(work->target)); + return true; + } + + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->cmd = WC_GET_WORK; + wc->thr = thr; + + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + return false; + } + + /* wait for response, a unit of work */ + work_heap = (struct work *)tq_pop(thr->q, NULL); + if (!work_heap) + 
return false; + + /* copy returned work into storage provided by caller */ + memcpy(work, work_heap, sizeof(*work)); + free(work_heap); + + return true; +} + +static bool submit_work(struct thr_info *thr, const struct work *work_in) +{ + struct workio_cmd *wc; + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->u.work = (struct work *)malloc(sizeof(*work_in)); + if (!wc->u.work) + goto err_out; + + wc->cmd = WC_SUBMIT_WORK; + wc->thr = thr; + memcpy(wc->u.work, work_in, sizeof(*work_in)); + + /* send solution to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) + goto err_out; + + return true; + +err_out: + workio_cmd_free(wc); + return false; +} + +static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +{ + unsigned char merkle_root[64]; + int i; + + pthread_mutex_lock(&sctx->work_lock); + + strcpy(work->job_id, sctx->job.job_id); + work->xnonce2_len = sctx->xnonce2_size; + memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); + + /* Generate merkle root */ + if (opt_algo == ALGO_HEAVY) + heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + else + if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) + SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root); + else + sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + + for (i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + if (opt_algo == ALGO_HEAVY) + heavycoin_hash(merkle_root, merkle_root, 64); + else + sha256d(merkle_root, merkle_root, 64); + } + + /* Increment extranonce2 */ + for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + + /* Assemble block header */ + memset(work->data, 0, 128); + work->data[0] = le32dec(sctx->job.version); + for (i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + for (i = 
0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + + // HeavyCoin + if (opt_algo == ALGO_HEAVY) { + uint16_t *ext; + work->maxvote = 1024; + ext = (uint16_t*)(&work->data[20]); + ext[0] = opt_vote; + ext[1] = be16dec(sctx->job.nreward); + + for (i = 0; i < 20; i++) + work->data[i] = be32dec((uint32_t *)&work->data[i]); + } + // + + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug) { + char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); + applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", + work->job_id, xnonce2str, swab32(work->data[17])); + free(xnonce2str); + } + + if (opt_algo == ALGO_JACKPOT) + diff_to_target(work->target, sctx->job.diff / 65536.0); + else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) + diff_to_target(work->target, sctx->job.diff / 256.0); + else + diff_to_target(work->target, sctx->job.diff); +} + +static void *miner_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + int thr_id = mythr->id; + struct work work; + uint32_t max_nonce; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; + unsigned char *scratchbuf = NULL; + char s[16]; + int i; + + memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized + + /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE + * and if that fails, then SCHED_BATCH. 
No need for this to be an + * error if it fails */ + if (!opt_benchmark) { + setpriority(PRIO_PROCESS, 0, 19); + drop_policy(); + } + + /* Cpu affinity only makes sense if the number of threads is a multiple + * of the number of CPUs */ + if (num_processors > 1 && opt_n_threads % num_processors == 0) { + if (!opt_quiet) + applog(LOG_INFO, "Binding thread %d to cpu %d", + thr_id, thr_id % num_processors); + affine_to_cpu(thr_id, thr_id % num_processors); + } + + while (1) { + unsigned long hashes_done; + struct timeval tv_start, tv_end, diff; + int64_t max64; + int rc; + + if (have_stratum) { + while (time(NULL) >= g_work_time + 120) + sleep(1); + pthread_mutex_lock(&g_work_lock); + if (work.data[19] >= end_nonce) + stratum_gen_work(&stratum, &g_work); + } else { + /* obtain new work from internal workio thread */ + pthread_mutex_lock(&g_work_lock); + if (!have_stratum && (!have_longpoll || + time(NULL) >= g_work_time + LP_SCANTIME*3/4 || + work.data[19] >= end_nonce)) { + if (unlikely(!get_work(mythr, &g_work))) { + applog(LOG_ERR, "work retrieval failed, exiting " + "mining thread %d", mythr->id); + pthread_mutex_unlock(&g_work_lock); + goto out; + } + g_work_time = have_stratum ? 0 : time(NULL); + } + if (have_stratum) { + pthread_mutex_unlock(&g_work_lock); + continue; + } + } + if (memcmp(work.data, g_work.data, 76)) { + memcpy(&work, &g_work, sizeof(struct work)); + work.data[19] = 0xffffffffU / opt_n_threads * thr_id; + } else + work.data[19]++; + pthread_mutex_unlock(&g_work_lock); + work_restart[thr_id].restart = 0; + + /* adjust max_nonce to meet target scan time */ + if (have_stratum) + max64 = LP_SCANTIME; + else + max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) + - time(NULL); + max64 *= (int64_t)thr_hashrates[thr_id]; + if (max64 <= 0) + max64 = (opt_algo == ALGO_JACKPOT) ? 
0x1fffLL : 0xfffffLL; + if ((int64_t)work.data[19] + max64 > end_nonce) + max_nonce = end_nonce; + else + max_nonce = (uint32_t)(work.data[19] + max64); + + hashes_done = 0; + gettimeofday(&tv_start, NULL); + + /* scan nonces for a proof-of-work hash */ + switch (opt_algo) { + + case ALGO_HEAVY: + rc = scanhash_heavy(thr_id, work.data, work.target, + max_nonce, &hashes_done, work.maxvote); + break; + + case ALGO_FUGUE256: + rc = scanhash_fugue256(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_GROESTL: + rc = scanhash_groestlcoin(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_MYR_GR: + rc = scanhash_myriad(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_JACKPOT: + rc = scanhash_jackpot(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + default: + /* should never happen */ + goto out; + } + + /* record scanhash elapsed time */ + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + if (diff.tv_usec || diff.tv_sec) { + pthread_mutex_lock(&stats_lock); + thr_hashrates[thr_id] = + hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + pthread_mutex_unlock(&stats_lock); + } + if (!opt_quiet) { + sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", + 1e-3 * thr_hashrates[thr_id]); + applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", + thr_id, hashes_done, s); + } + if (opt_benchmark && thr_id == opt_n_threads - 1) { + double hashrate = 0.; + for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) + hashrate += thr_hashrates[i]; + if (i == opt_n_threads) { + sprintf(s, hashrate >= 1e6 ? 
"%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "Total: %s khash/s", s); + } + } + + /* if nonce found, submit work */ + if (rc && !opt_benchmark && !submit_work(mythr, &work)) + break; + } + +out: + tq_freeze(mythr->q); + + return NULL; +} + +static void restart_threads(void) +{ + int i; + + for (i = 0; i < opt_n_threads; i++) + work_restart[i].restart = 1; +} + +static void *longpoll_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + CURL *curl = NULL; + char *copy_start, *hdr_path = NULL, *lp_url = NULL; + bool need_slash = false; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + goto out; + } + +start: + hdr_path = (char*)tq_pop(mythr->q, NULL); + if (!hdr_path) + goto out; + + /* full URL */ + if (strstr(hdr_path, "://")) { + lp_url = hdr_path; + hdr_path = NULL; + } + + /* absolute path, on current server */ + else { + copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; + if (rpc_url[strlen(rpc_url) - 1] != '/') + need_slash = true; + + lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); + if (!lp_url) + goto out; + + sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); + } + + applog(LOG_INFO, "Long-polling activated for %s", lp_url); + + while (1) { + json_t *val, *soval; + int err; + + val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, + false, true, &err); + if (have_stratum) { + if (val) + json_decref(val); + goto out; + } + if (likely(val)) { + if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); + soval = json_object_get(json_object_get(val, "result"), "submitold"); + submit_old = soval ? 
json_is_true(soval) : false; + pthread_mutex_lock(&g_work_lock); + if (work_decode(json_object_get(val, "result"), &g_work)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: got new work"); + time(&g_work_time); + restart_threads(); + } + pthread_mutex_unlock(&g_work_lock); + json_decref(val); + } else { + pthread_mutex_lock(&g_work_lock); + g_work_time -= LP_SCANTIME; + pthread_mutex_unlock(&g_work_lock); + if (err == CURLE_OPERATION_TIMEDOUT) { + restart_threads(); + } else { + have_longpoll = false; + restart_threads(); + free(hdr_path); + free(lp_url); + lp_url = NULL; + sleep(opt_fail_pause); + goto start; + } + } + } + +out: + free(hdr_path); + free(lp_url); + tq_freeze(mythr->q); + if (curl) + curl_easy_cleanup(curl); + + return NULL; +} + +static bool stratum_handle_response(char *buf) +{ + json_t *val, *err_val, *res_val, *id_val; + json_error_t err; + bool ret = false; + + val = JSON_LOADS(buf, &err); + if (!val) { + applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + id_val = json_object_get(val, "id"); + + if (!id_val || json_is_null(id_val) || !res_val) + goto out; + + share_result(json_is_true(res_val), + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); + + ret = true; +out: + if (val) + json_decref(val); + + return ret; +} + +static void *stratum_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + char *s; + + stratum.url = (char*)tq_pop(mythr->q, NULL); + if (!stratum.url) + goto out; + applog(LOG_INFO, "Starting Stratum on %s", stratum.url); + + while (1) { + int failures = 0; + + while (!stratum.curl) { + pthread_mutex_lock(&g_work_lock); + g_work_time = 0; + pthread_mutex_unlock(&g_work_lock); + restart_threads(); + + if (!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + stratum_disconnect(&stratum); + if (opt_retries >= 0 && ++failures > opt_retries) { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + if (stratum.job.job_id && + (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) { + pthread_mutex_lock(&g_work_lock); + stratum_gen_work(&stratum, &g_work); + time(&g_work_time); + pthread_mutex_unlock(&g_work_lock); + if (stratum.job.clean) { + if (!opt_quiet) applog(LOG_INFO, "Stratum detected new block"); + restart_threads(); + } + } + + if (!stratum_socket_full(&stratum, 120)) { + applog(LOG_ERR, "Stratum connection timed out"); + s = NULL; + } else + s = stratum_recv_line(&stratum); + if (!s) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } + +out: + return NULL; +} + +static void show_version_and_exit(void) +{ + printf("%s\n%s\n", PACKAGE_STRING, curl_version()); + exit(0); +} + +static void show_usage_and_exit(int status) +{ + if (status) + fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); + else + 
printf(usage); + exit(status); +} + +static void parse_arg (int key, char *arg) +{ + char *p; + int v, i; + + switch(key) { + case 'a': + for (i = 0; i < ARRAY_SIZE(algo_names); i++) { + if (algo_names[i] && + !strcmp(arg, algo_names[i])) { + opt_algo = (sha256_algos)i; + break; + } + } + if (i == ARRAY_SIZE(algo_names)) + show_usage_and_exit(1); + break; + case 'B': + opt_background = true; + break; + case 'c': { + json_error_t err; + if (opt_config) + json_decref(opt_config); +#if JANSSON_VERSION_HEX >= 0x020000 + opt_config = json_load_file(arg, 0, &err); +#else + opt_config = json_load_file(arg, &err); +#endif + if (!json_is_object(opt_config)) { + applog(LOG_ERR, "JSON decode of %s failed", arg); + exit(1); + } + break; + } + case 'q': + opt_quiet = true; + break; + case 'D': + opt_debug = true; + break; + case 'p': + free(rpc_pass); + rpc_pass = strdup(arg); + break; + case 'P': + opt_protocol = true; + break; + case 'r': + v = atoi(arg); + if (v < -1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_retries = v; + break; + case 'R': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_fail_pause = v; + break; + case 's': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_scantime = v; + break; + case 'T': + v = atoi(arg); + if (v < 1 || v > 99999) /* sanity check */ + show_usage_and_exit(1); + opt_timeout = v; + break; + case 't': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_n_threads = v; + break; + case 'v': + v = atoi(arg); + if (v < 0 || v > 1024) /* sanity check */ + show_usage_and_exit(1); + opt_vote = (uint16_t)v; + break; + case 'm': + opt_trust_pool = true; + break; + case 'u': + free(rpc_user); + rpc_user = strdup(arg); + break; + case 'o': /* --url */ + p = strstr(arg, "://"); + if (p) { + if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + 
show_usage_and_exit(1); + free(rpc_url); + rpc_url = strdup(arg); + } else { + if (!strlen(arg) || *arg == '/') + show_usage_and_exit(1); + free(rpc_url); + rpc_url = (char*)malloc(strlen(arg) + 8); + sprintf(rpc_url, "http://%s", arg); + } + p = strrchr(rpc_url, '@'); + if (p) { + char *sp, *ap; + *p = '\0'; + ap = strstr(rpc_url, "://") + 3; + sp = strchr(ap, ':'); + if (sp) { + free(rpc_userpass); + rpc_userpass = strdup(ap); + free(rpc_user); + rpc_user = (char*)calloc(sp - ap + 1, 1); + strncpy(rpc_user, ap, sp - ap); + free(rpc_pass); + rpc_pass = strdup(sp + 1); + } else { + free(rpc_user); + rpc_user = strdup(ap); + } + memmove(ap, p + 1, strlen(p + 1) + 1); + } + have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); + break; + case 'O': /* --userpass */ + p = strchr(arg, ':'); + if (!p) + show_usage_and_exit(1); + free(rpc_userpass); + rpc_userpass = strdup(arg); + free(rpc_user); + rpc_user = (char*)calloc(p - arg + 1, 1); + strncpy(rpc_user, arg, p - arg); + free(rpc_pass); + rpc_pass = strdup(p + 1); + break; + case 'x': /* --proxy */ + if (!strncasecmp(arg, "socks4://", 9)) + opt_proxy_type = CURLPROXY_SOCKS4; + else if (!strncasecmp(arg, "socks5://", 9)) + opt_proxy_type = CURLPROXY_SOCKS5; +#if LIBCURL_VERSION_NUM >= 0x071200 + else if (!strncasecmp(arg, "socks4a://", 10)) + opt_proxy_type = CURLPROXY_SOCKS4A; + else if (!strncasecmp(arg, "socks5h://", 10)) + opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; +#endif + else + opt_proxy_type = CURLPROXY_HTTP; + free(opt_proxy); + opt_proxy = strdup(arg); + break; + case 1001: + free(opt_cert); + opt_cert = strdup(arg); + break; + case 1005: + opt_benchmark = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1003: + want_longpoll = false; + break; + case 1007: + want_stratum = false; + break; + case 'S': + use_syslog = true; + break; + case 'd': // CB + { + char * pch = strtok (arg,","); + opt_n_threads = 0; + while (pch != NULL) { + if (pch[0] >= 
'0' && pch[0] <= '9' && pch[1] == '\0') + { + if (atoi(pch) < num_processors) + device_map[opt_n_threads++] = atoi(pch); + else { + applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); + exit(1); + } + } else { + int device = cuda_finddevice(pch); + if (device >= 0 && device < num_processors) + device_map[opt_n_threads++] = device; + else { + applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); + exit(1); + } + } + pch = strtok (NULL, ","); + } + } + break; + case 'V': + show_version_and_exit(); + case 'h': + show_usage_and_exit(0); + default: + show_usage_and_exit(1); + } +} + +static void parse_config(void) +{ + int i; + json_t *val; + + if (!json_is_object(opt_config)) + return; + + for (i = 0; i < ARRAY_SIZE(options); i++) { + if (!options[i].name) + break; + if (!strcmp(options[i].name, "config")) + continue; + + val = json_object_get(opt_config, options[i].name); + if (!val) + continue; + + if (options[i].has_arg && json_is_string(val)) { + char *s = strdup(json_string_value(val)); + if (!s) + break; + parse_arg(options[i].val, s); + free(s); + } else if (!options[i].has_arg && json_is_true(val)) + parse_arg(options[i].val, ""); + else + applog(LOG_ERR, "JSON option %s invalid", + options[i].name); + } + + if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { + fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); + show_usage_and_exit(1); + } +} + +static void parse_cmdline(int argc, char *argv[]) +{ + int key; + + while (1) { +#if HAVE_GETOPT_LONG + key = getopt_long(argc, argv, short_options, options, NULL); +#else + key = getopt(argc, argv, short_options); +#endif + if (key < 0) + break; + + parse_arg(key, optarg); + } + if (optind < argc) { + fprintf(stderr, "%s: unsupported non-option argument '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); + } + + if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { + fprintf(stderr, "%s: Heavycoin hash requires block reward 
vote parameter (see --vote)\n", + argv[0]); + show_usage_and_exit(1); + } + + parse_config(); +} + +#ifndef WIN32 +static void signal_handler(int sig) +{ + switch (sig) { + case SIGHUP: + applog(LOG_INFO, "SIGHUP received"); + break; + case SIGINT: + applog(LOG_INFO, "SIGINT received, exiting"); + exit(0); + break; + case SIGTERM: + applog(LOG_INFO, "SIGTERM received, exiting"); + exit(0); + break; + } +} +#endif + +#define PROGRAM_VERSION "0.6" +int main(int argc, char *argv[]) +{ + struct thr_info *thr; + long flags; + int i; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; +#endif + + printf(" *** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); + printf("\t This is version "PROGRAM_VERSION" (beta)\n"); + printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); + printf("\t based on pooler-cpuminer extension for HVC from\n\t https://github.com/heavycoin/cpuminer-heavycoin\n"); + printf("\t\t\tand\n\t http://hvc.1gh.com/\n"); + printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); + printf("\t LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm\n"); + printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); + printf("\t YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4\n"); + + rpc_user = strdup(""); + rpc_pass = strdup(""); + + pthread_mutex_init(&applog_lock, NULL); + num_processors = cuda_num_devices(); + + /* parse command line */ + parse_cmdline(argc, argv); + + if (!opt_benchmark && !rpc_url) { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + + if (!rpc_userpass) { + rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); + if (!rpc_userpass) + return 1; + sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); + } + + pthread_mutex_init(&stats_lock, NULL); + pthread_mutex_init(&g_work_lock, NULL); + pthread_mutex_init(&stratum.sock_lock, NULL); + pthread_mutex_init(&stratum.work_lock, NULL); + + flags = !opt_benchmark && 
strncmp(rpc_url, "https:", 6) + ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) + : CURL_GLOBAL_ALL; + if (curl_global_init(flags)) { + applog(LOG_ERR, "CURL initialization failed"); + return 1; + } + +#ifndef WIN32 + if (opt_background) { + i = fork(); + if (i < 0) exit(1); + if (i > 0) exit(0); + i = setsid(); + if (i < 0) + applog(LOG_ERR, "setsid() failed (errno = %d)", errno); + i = chdir("/"); + if (i < 0) + applog(LOG_ERR, "chdir() failed (errno = %d)", errno); + signal(SIGHUP, signal_handler); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + } +#endif + + if (num_processors == 0) + { + applog(LOG_ERR, "No CUDA devices found! terminating."); + exit(1); + } + if (!opt_n_threads) + opt_n_threads = num_processors; + +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); +#endif + + work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); + if (!work_restart) + return 1; + + thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); + if (!thr_info) + return 1; + + thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); + if (!thr_hashrates) + return 1; + + /* init workio thread info */ + work_thr_id = opt_n_threads; + thr = &thr_info[work_thr_id]; + thr->id = work_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start work I/O thread */ + if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + applog(LOG_ERR, "workio thread create failed"); + return 1; + } + + if (want_longpoll && !have_stratum) { + /* init longpoll thread info */ + longpoll_thr_id = opt_n_threads + 1; + thr = &thr_info[longpoll_thr_id]; + thr->id = longpoll_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start longpoll thread */ + if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + applog(LOG_ERR, "longpoll thread create failed"); + return 1; + } + } + if (want_stratum) { + /* init stratum thread info */ + stratum_thr_id = opt_n_threads + 2; + thr = 
&thr_info[stratum_thr_id]; + thr->id = stratum_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start stratum thread */ + if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + applog(LOG_ERR, "stratum thread create failed"); + return 1; + } + + if (have_stratum) + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + } + + /* start mining threads */ + for (i = 0; i < opt_n_threads; i++) { + thr = &thr_info[i]; + + thr->id = i; + thr->q = tq_new(); + if (!thr->q) + return 1; + + if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + applog(LOG_ERR, "thread %d create failed", i); + return 1; + } + } + + applog(LOG_INFO, "%d miner threads started, " + "using '%s' algorithm.", + opt_n_threads, + algo_names[opt_algo]); + +#ifdef WIN32 + timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) +#endif + + /* main loop - simply wait for workio thread to exit */ + pthread_join(thr_info[work_thr_id].pth, NULL); + +#ifdef WIN32 + timeEndPeriod(1); // be nice and forego high timer precision +#endif + + applog(LOG_INFO, "workio thread dead, exiting."); + + return 0; +} diff --git a/cpuminer-config.h b/cpuminer-config.h index 8b63e2e..0057c14 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -1,190 +1,190 @@ -/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. This function is required for `alloca.c' support on those systems. - */ -#undef CRAY_STACKSEG_END - -/* Define to 1 if using `alloca.c'. */ -#undef C_ALLOCA - -/* Define to 1 if you have `alloca', as a function or macro. */ -#undef HAVE_ALLOCA - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#undef HAVE_ALLOCA_H - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32DEC - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. 
*/ -#undef HAVE_DECL_BE32ENC - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32DEC - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32ENC - -/* Define to 1 if you have the `getopt_long' function. */ -#define HAVE_GETOPT_LONG 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have a functional curl library. */ -#undef HAVE_LIBCURL - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYSLOG_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_ENDIAN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_PARAM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SYSCTL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_UNISTD_H - -/* Defined if libcurl supports AsynchDNS */ -#undef LIBCURL_FEATURE_ASYNCHDNS - -/* Defined if libcurl supports IDN */ -#undef LIBCURL_FEATURE_IDN - -/* Defined if libcurl supports IPv6 */ -#undef LIBCURL_FEATURE_IPV6 - -/* Defined if libcurl supports KRB4 */ -#undef LIBCURL_FEATURE_KRB4 - -/* Defined if libcurl supports libz */ -#undef LIBCURL_FEATURE_LIBZ - -/* Defined if libcurl supports NTLM */ -#undef LIBCURL_FEATURE_NTLM - -/* Defined if libcurl supports SSL */ -#undef LIBCURL_FEATURE_SSL - -/* Defined if libcurl supports SSPI */ -#undef LIBCURL_FEATURE_SSPI - -/* Defined if libcurl supports DICT */ -#undef LIBCURL_PROTOCOL_DICT - -/* Defined if libcurl supports FILE */ -#undef LIBCURL_PROTOCOL_FILE - -/* Defined if libcurl supports FTP */ -#undef LIBCURL_PROTOCOL_FTP - -/* Defined if libcurl supports FTPS */ -#undef LIBCURL_PROTOCOL_FTPS - -/* Defined if libcurl supports HTTP */ -#undef LIBCURL_PROTOCOL_HTTP - -/* Defined if libcurl supports HTTPS */ -#undef LIBCURL_PROTOCOL_HTTPS - -/* Defined if libcurl supports IMAP */ -#undef LIBCURL_PROTOCOL_IMAP - -/* Defined if libcurl supports LDAP */ -#undef LIBCURL_PROTOCOL_LDAP - -/* Defined if libcurl supports POP3 */ -#undef LIBCURL_PROTOCOL_POP3 - -/* Defined if libcurl supports RTSP */ -#undef LIBCURL_PROTOCOL_RTSP - -/* Defined if libcurl supports SMTP */ -#undef LIBCURL_PROTOCOL_SMTP - -/* Defined if libcurl supports TELNET */ -#undef LIBCURL_PROTOCOL_TELNET - -/* Defined if libcurl supports TFTP */ -#undef LIBCURL_PROTOCOL_TFTP - -/* Define to 1 if your C compiler doesn't accept -c and -o together. */ -#undef NO_MINUS_C_MINUS_O - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" - -/* Define to the full name and version of this package. 
*/ -#define PACKAGE_STRING "ccminer 2014.03.27" - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.03.27" - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -#undef STACK_DIRECTION - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to 1 if AVX assembly is available. */ -#undef USE_AVX - -/* Define to 1 if XOP assembly is available. */ -#undef USE_XOP - -/* Version number of package */ -#undef VERSION - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -#undef curl_free - -/* Define to `unsigned int' if does not define. */ -#undef size_t +/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +#undef CRAY_STACKSEG_END + +/* Define to 1 if using `alloca.c'. */ +#undef C_ALLOCA + +/* Define to 1 if you have `alloca', as a function or macro. */ +#undef HAVE_ALLOCA + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#undef HAVE_ALLOCA_H + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32DEC + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_BE32ENC + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. 
*/ +#undef HAVE_DECL_LE32DEC + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#undef HAVE_DECL_LE32ENC + +/* Define to 1 if you have the `getopt_long' function. */ +#define HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have a functional curl library. */ +#undef HAVE_LIBCURL + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYSLOG_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_ENDIAN_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_SYSCTL_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_UNISTD_H + +/* Defined if libcurl supports AsynchDNS */ +#undef LIBCURL_FEATURE_ASYNCHDNS + +/* Defined if libcurl supports IDN */ +#undef LIBCURL_FEATURE_IDN + +/* Defined if libcurl supports IPv6 */ +#undef LIBCURL_FEATURE_IPV6 + +/* Defined if libcurl supports KRB4 */ +#undef LIBCURL_FEATURE_KRB4 + +/* Defined if libcurl supports libz */ +#undef LIBCURL_FEATURE_LIBZ + +/* Defined if libcurl supports NTLM */ +#undef LIBCURL_FEATURE_NTLM + +/* Defined if libcurl supports SSL */ +#undef LIBCURL_FEATURE_SSL + +/* Defined if libcurl supports SSPI */ +#undef LIBCURL_FEATURE_SSPI + +/* Defined if libcurl supports DICT */ +#undef LIBCURL_PROTOCOL_DICT + +/* Defined if libcurl supports FILE */ +#undef LIBCURL_PROTOCOL_FILE + +/* Defined if libcurl supports FTP */ +#undef LIBCURL_PROTOCOL_FTP + +/* Defined if libcurl supports FTPS */ +#undef LIBCURL_PROTOCOL_FTPS + +/* Defined if libcurl supports HTTP */ +#undef LIBCURL_PROTOCOL_HTTP + +/* Defined if libcurl supports HTTPS */ +#undef LIBCURL_PROTOCOL_HTTPS + +/* Defined if libcurl supports IMAP */ +#undef LIBCURL_PROTOCOL_IMAP + +/* Defined if libcurl supports LDAP */ +#undef LIBCURL_PROTOCOL_LDAP + +/* Defined if libcurl supports POP3 */ +#undef LIBCURL_PROTOCOL_POP3 + +/* Defined if libcurl supports RTSP */ +#undef LIBCURL_PROTOCOL_RTSP + +/* Defined if libcurl supports SMTP */ +#undef LIBCURL_PROTOCOL_SMTP + +/* Defined if libcurl supports TELNET */ +#undef LIBCURL_PROTOCOL_TELNET + +/* Defined if libcurl supports TFTP */ +#undef LIBCURL_PROTOCOL_TFTP + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +#undef NO_MINUS_C_MINUS_O + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "ccminer" + +/* Define to the full name and version of this package. 
*/ +#define PACKAGE_STRING "ccminer 2014.04.27" + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2014.04.27" + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +#undef STACK_DIRECTION + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if AVX assembly is available. */ +#undef USE_AVX + +/* Define to 1 if XOP assembly is available. */ +#undef USE_XOP + +/* Version number of package */ +#undef VERSION + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +#undef curl_free + +/* Define to `unsigned int' if does not define. 
*/ +#undef size_t diff --git a/cuda_blake512.cu b/cuda_blake512.cu index 3602b23..013b7e1 100644 --- a/cuda_blake512.cu +++ b/cuda_blake512.cu @@ -1,307 +1,307 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash5output[8]; - -// die Message (116 Bytes) mit Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding) - -// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ - -__constant__ uint8_t c_sigma[16][16]; - -const uint8_t host_sigma[16][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 
0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -#define SWAP64(x) \ - ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) - -__constant__ uint64_t c_SecondRound[16]; - -const uint64_t host_SecondRound[16] = -{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0) -}; - -__constant__ uint64_t c_u512[16]; - -const uint64_t host_u512[16] = -{ - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, - 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, - 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, - 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, - 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - - -#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n))) - -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR( v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],11); - -__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) -{ - uint64_t v[16], m[16], i; - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) m[i] = SWAP64(block[i]); - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - /* don't xor t when the block 
is only padding */ - if ( !nullt ) { - v[12] ^= 928; - v[13] ^= 928; - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; -} - -// Endian Drehung für 32 Bit Typen -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} - -// Endian Drehung für 64 Bit Typen -static __device__ uint64_t cuda_swab64(uint64_t x) { - uint32_t h = (x >> 32); - uint32_t l = (x & 0xFFFFFFFFULL); - return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h)); -} - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} - -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); -} - -__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den 
aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // State vorbereiten - uint64_t h[8]; - h[0] = 0x6a09e667f3bcc908ULL; - h[1] = 0xbb67ae8584caa73bULL; - h[2] = 0x3c6ef372fe94f82bULL; - h[3] = 0xa54ff53a5f1d36f1ULL; - h[4] = 0x510e527fade682d1ULL; - h[5] = 0x9b05688c2b3e6c1fULL; - h[6] = 0x1f83d9abfb41bd6bULL; - h[7] = 0x5be0cd19137e2179ULL; - - // 128 Byte für die Message - uint64_t buf[16]; - - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; - - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], nounce); - - // den thread-spezifischen Hefty1 hash einsetzen - uint32_t *hefty = heftyHashes + 8 * hashPosition; - buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); - buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); - buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); - buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); - buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); - buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); - buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); - buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); - - // erste Runde - blake512_compress( h, buf, 0, c_sigma, c_u512 ); - - // zweite Runde -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i]; - blake512_compress( h, buf, 1, c_sigma, c_u512 ); - - // Hash rauslassen -#if 0 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind - uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition; -#pragma unroll 8 - for (int i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); - } -#else - // in dieser Version passieren auch ein paar 64 Bit Shifts - uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; -#pragma unroll 8 - 
for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); -#endif - } -} - - -// ---------------------------- END CUDA blake512 functions ------------------------------------ - -// Setup-Funktionen -__host__ void blake512_cpu_init(int thr_id, int threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_sigma, - host_sigma, - sizeof(host_sigma), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_u512, - host_u512, - sizeof(host_u512), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_SecondRound, - host_SecondRound, - sizeof(host_SecondRound), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -__host__ void blake512_cpu_setBlock(void *pdata) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - // Message mit Padding für erste Runde bereitstellen - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 84); - memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+116, 0, 12); - PaddedMessage[116] = 0x80; - - // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} - - -__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - blake512_gpu_hash<<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende 
Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash5output[8]; + +// die Message (116 Bytes) mit Padding zur Berechnung auf der GPU +__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding) + +// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ + +__constant__ uint8_t c_sigma[16][16]; + +const uint8_t host_sigma[16][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +#define SWAP32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +#define SWAP64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + 
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) + +__constant__ uint64_t c_SecondRound[16]; + +const uint64_t host_SecondRound[16] = +{ + 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0) +}; + +__constant__ uint64_t c_u512[16]; + +const uint64_t host_u512[16] = +{ + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + + +#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n))) + +#define G(a,b,c,d,e) \ + v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ + v[d] = ROTR( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c],25); \ + v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ + v[d] = ROTR( v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c],11); + +__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) +{ + uint64_t v[16], m[16], i; + +#pragma unroll 16 + for( i = 0; i < 16; ++i ) m[i] = SWAP64(block[i]); + +#pragma unroll 8 + for( i = 0; i < 8; ++i ) v[i] = h[i]; + + v[ 8] = u512[0]; + v[ 9] = u512[1]; + v[10] = u512[2]; + v[11] = u512[3]; + v[12] = u512[4]; + v[13] = u512[5]; + v[14] = u512[6]; + v[15] = u512[7]; + + /* don't xor t when the block is only padding */ + if ( !nullt ) { + v[12] ^= 928; + v[13] ^= 928; + } + +#pragma unroll 16 + for( i = 0; i < 16; ++i ) + { + /* column step */ + G( 0, 4, 8, 12, 0 ); + G( 1, 5, 9, 13, 2 ); + G( 2, 6, 10, 14, 4 ); + G( 3, 7, 11, 15, 6 ); + /* diagonal step */ + G( 0, 5, 
10, 15, 8 ); + G( 1, 6, 11, 12, 10 ); + G( 2, 7, 8, 13, 12 ); + G( 3, 4, 9, 14, 14 ); + } + +#pragma unroll 16 + for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; +} + +// Endian Drehung für 32 Bit Typen +static __device__ uint32_t cuda_swab32(uint32_t x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +// Endian Drehung für 64 Bit Typen +static __device__ uint64_t cuda_swab64(uint64_t x) { + uint32_t h = (x >> 32); + uint32_t l = (x & 0xFFFFFFFFULL); + return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h)); +} + +// das Hi Word aus einem 64 Bit Typen extrahieren +static __device__ uint32_t HIWORD(const uint64_t &x) { +#if __CUDA_ARCH__ >= 130 + return (uint32_t)__double2hiint(__longlong_as_double(x)); +#else + return (uint32_t)(x >> 32); +#endif +} + +// das Hi Word in einem 64 Bit Typen ersetzen +static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); +} + +// das Lo Word aus einem 64 Bit Typen extrahieren +static __device__ uint32_t LOWORD(const uint64_t &x) { +#if __CUDA_ARCH__ >= 130 + return (uint32_t)__double2loint(__longlong_as_double(x)); +#else + return (uint32_t)(x & 0xFFFFFFFFULL); +#endif +} + +// das Lo Word in einem 64 Bit Typen ersetzen +static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +} + +__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + //uint32_t nounce = startNounce + thread; + uint32_t nounce = nonceVector[thread]; + + // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) + uint32_t hashPosition = nounce - startNounce; + + // State vorbereiten + uint64_t 
h[8]; + h[0] = 0x6a09e667f3bcc908ULL; + h[1] = 0xbb67ae8584caa73bULL; + h[2] = 0x3c6ef372fe94f82bULL; + h[3] = 0xa54ff53a5f1d36f1ULL; + h[4] = 0x510e527fade682d1ULL; + h[5] = 0x9b05688c2b3e6c1fULL; + h[6] = 0x1f83d9abfb41bd6bULL; + h[7] = 0x5be0cd19137e2179ULL; + + // 128 Byte für die Message + uint64_t buf[16]; + + // Message für die erste Runde in Register holen +#pragma unroll 16 + for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; + + // die Nounce durch die thread-spezifische ersetzen + buf[9] = REPLACE_HIWORD(buf[9], nounce); + + // den thread-spezifischen Hefty1 hash einsetzen + uint32_t *hefty = heftyHashes + 8 * hashPosition; + buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); + buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); + buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); + buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); + buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); + buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); + buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); + buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); + + // erste Runde + blake512_compress( h, buf, 0, c_sigma, c_u512 ); + + // zweite Runde +#pragma unroll 16 + for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i]; + blake512_compress( h, buf, 1, c_sigma, c_u512 ); + + // Hash rauslassen +#if 0 + // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind + uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition; +#pragma unroll 8 + for (int i=0; i < 8; ++i) { + outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); + outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); + } +#else + // in dieser Version passieren auch ein paar 64 Bit Shifts + uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; +#pragma unroll 8 + for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); +#endif + } +} + + +// ---------------------------- END CUDA blake512 functions ------------------------------------ + +// Setup-Funktionen +__host__ void blake512_cpu_init(int thr_id, int threads) +{ + // 
Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_sigma, + host_sigma, + sizeof(host_sigma), + 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol( c_u512, + host_u512, + sizeof(host_u512), + 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol( c_SecondRound, + host_SecondRound, + sizeof(host_SecondRound), + 0, cudaMemcpyHostToDevice); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); +} + +__host__ void blake512_cpu_setBlock(void *pdata) + // data muss 84-Byte haben! + // heftyHash hat 32-Byte +{ + // Message mit Padding für erste Runde bereitstellen + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 84); + memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen + memset(PaddedMessage+116, 0, 12); + PaddedMessage[116] = 0x80; + + // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU + cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + + +__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +{ + const int threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + blake512_gpu_hash<<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_blake512.h b/cuda_blake512.h index b0cf201..48bd3ff 100644 --- a/cuda_blake512.h +++ b/cuda_blake512.h @@ -1,8 +1,8 @@ -#ifndef _CUDA_BLAKE512_H -#define _CUDA_BLAKE512_H - -void blake512_cpu_init(int thr_id, int threads); -void blake512_cpu_setBlock(void *pdata); -void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif +#ifndef 
_CUDA_BLAKE512_H +#define _CUDA_BLAKE512_H + +void blake512_cpu_init(int thr_id, int threads); +void blake512_cpu_setBlock(void *pdata); +void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); + +#endif diff --git a/cuda_combine.cu b/cuda_combine.cu index fb1033c..c9036f3 100644 --- a/cuda_combine.cu +++ b/cuda_combine.cu @@ -1,150 +1,150 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hashoutput[8]; - -extern uint32_t *d_hash2output[8]; -extern uint32_t *d_hash3output[8]; -extern uint32_t *d_hash4output[8]; -extern uint32_t *d_hash5output[8]; -extern uint32_t *d_nonceVector[8]; - -/* Combines top 64-bits from each hash into a single hash */ -static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) -{ - uint32_t lout[8]; // Combining in Registern machen - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - lout[i] = 0; - - // das Makro setzt jeweils 4 Bits aus vier verschiedenen Hashes zu einem Nibble zusammen -#define MIX(bits, mask, i) \ - lout[(255 - (bits+3))/32] <<= 4; \ - if ((hash1[i] & mask) != 0) lout[(255 - (bits+0))/32] |= 8; \ - if ((hash2[i] & mask) != 0) lout[(255 - (bits+1))/32] |= 4; \ - if ((hash3[i] & mask) != 0) lout[(255 - (bits+2))/32] |= 2; \ - if ((hash4[i] & mask) != 0) lout[(255 - (bits+3))/32] |= 1; \ - - /* Transpose first 64 bits of each hash into out */ - MIX( 0, 0x80000000, 7); - MIX( 4, 0x40000000, 7); - MIX( 8, 0x20000000, 7); - MIX( 12, 0x10000000, 7); - MIX( 16, 0x08000000, 7); - MIX( 20, 0x04000000, 7); - MIX( 24, 0x02000000, 7); - MIX( 28, 0x01000000, 7); - MIX( 32, 0x00800000, 7); - MIX( 36, 0x00400000, 7); - MIX( 40, 0x00200000, 7); - MIX( 44, 0x00100000, 7); - MIX( 48, 0x00080000, 7); - MIX( 52, 0x00040000, 7); - MIX( 56, 0x00020000, 7); - MIX( 60, 0x00010000, 7); - MIX( 64, 
0x00008000, 7); - MIX( 68, 0x00004000, 7); - MIX( 72, 0x00002000, 7); - MIX( 76, 0x00001000, 7); - MIX( 80, 0x00000800, 7); - MIX( 84, 0x00000400, 7); - MIX( 88, 0x00000200, 7); - MIX( 92, 0x00000100, 7); - MIX( 96, 0x00000080, 7); - MIX(100, 0x00000040, 7); - MIX(104, 0x00000020, 7); - MIX(108, 0x00000010, 7); - MIX(112, 0x00000008, 7); - MIX(116, 0x00000004, 7); - MIX(120, 0x00000002, 7); - MIX(124, 0x00000001, 7); - - MIX(128, 0x80000000, 6); - MIX(132, 0x40000000, 6); - MIX(136, 0x20000000, 6); - MIX(140, 0x10000000, 6); - MIX(144, 0x08000000, 6); - MIX(148, 0x04000000, 6); - MIX(152, 0x02000000, 6); - MIX(156, 0x01000000, 6); - MIX(160, 0x00800000, 6); - MIX(164, 0x00400000, 6); - MIX(168, 0x00200000, 6); - MIX(172, 0x00100000, 6); - MIX(176, 0x00080000, 6); - MIX(180, 0x00040000, 6); - MIX(184, 0x00020000, 6); - MIX(188, 0x00010000, 6); - MIX(192, 0x00008000, 6); - MIX(196, 0x00004000, 6); - MIX(200, 0x00002000, 6); - MIX(204, 0x00001000, 6); - MIX(208, 0x00000800, 6); - MIX(212, 0x00000400, 6); - MIX(216, 0x00000200, 6); - MIX(220, 0x00000100, 6); - MIX(224, 0x00000080, 6); - MIX(228, 0x00000040, 6); - MIX(232, 0x00000020, 6); - MIX(236, 0x00000010, 6); - MIX(240, 0x00000008, 6); - MIX(244, 0x00000004, 6); - MIX(248, 0x00000002, 6); - MIX(252, 0x00000001, 6); - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - out[i] = lout[i]; -} - -__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = nonceVector[thread]; - uint32_t hashPosition = nounce - startNounce; - // Die Aufgabe der combine-funktion besteht aus zwei Teilen. - // 1) Komprimiere die hashes zu einem kleinen Array - // 2) Errechne dort den combines-value - - // Die Kompression wird dadurch verwirklicht, dass im out-array weiterhin mit "thread" indiziert - // wird. 
Die anderen Werte werden mit der nonce indiziert - - combine_hashes(&out[8 * thread], &hash2[8 * hashPosition], &hash3[16 * hashPosition], &hash4[16 * hashPosition], &hash5[16 * hashPosition]); - } -} - -// Setup-Funktionen -__host__ void combine_cpu_init(int thr_id, int threads) -{ - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); -} - -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash) -{ - // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten - - const int threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - combine_gpu_hash<<>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]); - - // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden - cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +// Folgende Definitionen später durch header ersetzen +typedef unsigned int uint32_t; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hashoutput[8]; + +extern uint32_t *d_hash2output[8]; +extern uint32_t *d_hash3output[8]; +extern uint32_t *d_hash4output[8]; +extern uint32_t *d_hash5output[8]; +extern uint32_t *d_nonceVector[8]; + +/* Combines top 64-bits from each hash into a single hash */ +static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) +{ + uint32_t lout[8]; // 
Combining in Registern machen + +#pragma unroll 8 + for (int i=0; i < 8; ++i) + lout[i] = 0; + + // das Makro setzt jeweils 4 Bits aus vier verschiedenen Hashes zu einem Nibble zusammen +#define MIX(bits, mask, i) \ + lout[(255 - (bits+3))/32] <<= 4; \ + if ((hash1[i] & mask) != 0) lout[(255 - (bits+0))/32] |= 8; \ + if ((hash2[i] & mask) != 0) lout[(255 - (bits+1))/32] |= 4; \ + if ((hash3[i] & mask) != 0) lout[(255 - (bits+2))/32] |= 2; \ + if ((hash4[i] & mask) != 0) lout[(255 - (bits+3))/32] |= 1; \ + + /* Transpose first 64 bits of each hash into out */ + MIX( 0, 0x80000000, 7); + MIX( 4, 0x40000000, 7); + MIX( 8, 0x20000000, 7); + MIX( 12, 0x10000000, 7); + MIX( 16, 0x08000000, 7); + MIX( 20, 0x04000000, 7); + MIX( 24, 0x02000000, 7); + MIX( 28, 0x01000000, 7); + MIX( 32, 0x00800000, 7); + MIX( 36, 0x00400000, 7); + MIX( 40, 0x00200000, 7); + MIX( 44, 0x00100000, 7); + MIX( 48, 0x00080000, 7); + MIX( 52, 0x00040000, 7); + MIX( 56, 0x00020000, 7); + MIX( 60, 0x00010000, 7); + MIX( 64, 0x00008000, 7); + MIX( 68, 0x00004000, 7); + MIX( 72, 0x00002000, 7); + MIX( 76, 0x00001000, 7); + MIX( 80, 0x00000800, 7); + MIX( 84, 0x00000400, 7); + MIX( 88, 0x00000200, 7); + MIX( 92, 0x00000100, 7); + MIX( 96, 0x00000080, 7); + MIX(100, 0x00000040, 7); + MIX(104, 0x00000020, 7); + MIX(108, 0x00000010, 7); + MIX(112, 0x00000008, 7); + MIX(116, 0x00000004, 7); + MIX(120, 0x00000002, 7); + MIX(124, 0x00000001, 7); + + MIX(128, 0x80000000, 6); + MIX(132, 0x40000000, 6); + MIX(136, 0x20000000, 6); + MIX(140, 0x10000000, 6); + MIX(144, 0x08000000, 6); + MIX(148, 0x04000000, 6); + MIX(152, 0x02000000, 6); + MIX(156, 0x01000000, 6); + MIX(160, 0x00800000, 6); + MIX(164, 0x00400000, 6); + MIX(168, 0x00200000, 6); + MIX(172, 0x00100000, 6); + MIX(176, 0x00080000, 6); + MIX(180, 0x00040000, 6); + MIX(184, 0x00020000, 6); + MIX(188, 0x00010000, 6); + MIX(192, 0x00008000, 6); + MIX(196, 0x00004000, 6); + MIX(200, 0x00002000, 6); + MIX(204, 0x00001000, 6); + MIX(208, 0x00000800, 6); + 
MIX(212, 0x00000400, 6); + MIX(216, 0x00000200, 6); + MIX(220, 0x00000100, 6); + MIX(224, 0x00000080, 6); + MIX(228, 0x00000040, 6); + MIX(232, 0x00000020, 6); + MIX(236, 0x00000010, 6); + MIX(240, 0x00000008, 6); + MIX(244, 0x00000004, 6); + MIX(248, 0x00000002, 6); + MIX(252, 0x00000001, 6); + +#pragma unroll 8 + for (int i=0; i < 8; ++i) + out[i] = lout[i]; +} + +__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = nonceVector[thread]; + uint32_t hashPosition = nounce - startNounce; + // Die Aufgabe der combine-funktion besteht aus zwei Teilen. + // 1) Komprimiere die hashes zu einem kleinen Array + // 2) Errechne dort den combines-value + + // Die Kompression wird dadurch verwirklicht, dass im out-array weiterhin mit "thread" indiziert + // wird. Die anderen Werte werden mit der nonce indiziert + + combine_hashes(&out[8 * thread], &hash2[8 * hashPosition], &hash3[16 * hashPosition], &hash4[16 * hashPosition], &hash5[16 * hashPosition]); + } +} + +// Setup-Funktionen +__host__ void combine_cpu_init(int thr_id, int threads) +{ + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); +} + +void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash) +{ + // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten + + const int threadsperblock = 128; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + combine_gpu_hash<<>>(threads, startNounce, 
d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]); + + // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden + cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost); +} diff --git a/cuda_combine.h b/cuda_combine.h index ada3a21..5bb5832 100644 --- a/cuda_combine.h +++ b/cuda_combine.h @@ -1,7 +1,7 @@ -#ifndef _CUDA_COMBINE_H -#define _CUDA_COMBINE_H - -void combine_cpu_init(int thr_id, int threads); -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); - -#endif +#ifndef _CUDA_COMBINE_H +#define _CUDA_COMBINE_H + +void combine_cpu_init(int thr_id, int threads); +void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); + +#endif diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu index 7f09099..f5ddd4f 100644 --- a/cuda_fugue256.cu +++ b/cuda_fugue256.cu @@ -1,793 +1,793 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -#include "sph_fugue.h" - -#define USE_SHARED 1 - -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// schon in sph_fugue.h definiert -//#define SPH_C32(x) ((uint32_t)(x ## U)) - -uint32_t *d_fugue256_hashoutput[8]; -uint32_t *d_resultNonce[8]; - -__constant__ uint32_t GPUstate[30]; // Single GPU -__constant__ uint32_t pTarget[8]; // Single GPU - -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; - -#if USE_SHARED -#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) -#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define mixtab2(x) (*((uint32_t*)mixtabs 
+ (512+(x)))) -#define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) -#else -#define mixtab0(x) tex1Dfetch(mixTab0Tex, x) -#define mixtab1(x) tex1Dfetch(mixTab1Tex, x) -#define mixtab2(x) tex1Dfetch(mixTab2Tex, x) -#define mixtab3(x) tex1Dfetch(mixTab3Tex, x) -#endif - -/* TABELLEN */ -static const uint32_t mixtab0_cpu[] = { - SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), - SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), - SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), - SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac), - SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a), - SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e), - SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5), - SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed), - SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be), - SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6), - SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea), - SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298), - SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1), - SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2), - SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf), - SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54), - SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c), - SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e), - SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c), - SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5), - SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), - SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), - SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), - SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), - SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), - SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), 
SPH_C32(0xe3e39fa1), - SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), - SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), - SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), - SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), - SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), - SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), - SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), - SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), - SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), - SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), - SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), - SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), SPH_C32(0xa3a344ba), - SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), - SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), - SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), - SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), - SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), - SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), - SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), - SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), - SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), - SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), - SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), - SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), - SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), - SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), - SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), - SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), - SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), - SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), - SPH_C32(0xc2c27825), 
SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), - SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), - SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), - SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), SPH_C32(0x6d6d18af), - SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), - SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), - SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), - SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), - SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), - SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), - SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), - SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), - SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), - SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), - SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), - SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), - SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), - SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), - SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), SPH_C32(0xe1e191a9), - SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), SPH_C32(0x11117744), - SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), - SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), - SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), - SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), - SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), - SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), - SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), - SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), - SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), - SPH_C32(0x16166258) -}; - -static const uint32_t mixtab1_cpu[] = { - SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), - 
SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), - SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), - SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), - SPH_C32(0xd5fefecc), SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), - SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), - SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), - SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), - SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), - SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), - SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), - SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), - SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), - SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), - SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), - SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), - SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), - SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), - SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), - SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), - SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), - SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), - SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), - SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), - SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), - SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), - SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), - SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), - SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), - SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), - SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), - SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), 
SPH_C32(0x11cfcf5b), - SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), - SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), - SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), - SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), - SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), - SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), - SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), - SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), - SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), - SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), - SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), - SPH_C32(0x300c0c24), SPH_C32(0x4c131379), SPH_C32(0x9dececb2), - SPH_C32(0x675f5f86), SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), - SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), - SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), - SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), - SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), - SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), - SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), - SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), - SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), - SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), - SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), - SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), - SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), - SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), - SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), - SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), - SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), SPH_C32(0x234e4ef1), - SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), - SPH_C32(0xfdf4f4fa), 
SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), - SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), - SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), - SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), SPH_C32(0xaea6a65f), - SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), - SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), - SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), - SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), - SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), - SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), - SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), - SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), - SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), - SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), - SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), - SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), - SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), - SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), - SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), - SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), - SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), - SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), - SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), - SPH_C32(0x58161662) -}; - -static const uint32_t mixtab2_cpu[] = { - SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), - SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), - SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), - SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), - SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), - SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), - SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), - 
SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), - SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), - SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), - SPH_C32(0x45d37272), SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), - SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), - SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), - SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), - SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), - SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), - SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), - SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), - SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), - SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), - SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), - SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), - SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), - SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), - SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), - SPH_C32(0x34fab3b3), SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), - SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), - SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), - SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), - SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), - SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), - SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), - SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), - SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), - SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), - SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), - SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), - SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), 
SPH_C32(0x44baa3a3), - SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), - SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), - SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), - SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), - SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), - SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), - SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), - SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), - SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), - SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), - SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), - SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), - SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), - SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), - SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), - SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), - SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), - SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), - SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), - SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), - SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), - SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), - SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), - SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), - SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), - SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), - SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), - SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), - SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), SPH_C32(0xae8de8e8), - SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), SPH_C32(0x5d7c1f1f), - SPH_C32(0xea374b4b), 
SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), - SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), - SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), - SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), SPH_C32(0x2a380e0e), - SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), - SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), - SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), - SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), - SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), - SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), - SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), - SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), - SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), - SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), - SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), - SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), SPH_C32(0x2d3c0f0f), - SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), - SPH_C32(0x62581616) -}; - -static const uint32_t mixtab3_cpu[] = { - SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), - SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), - SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), - SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), - SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), - SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), - SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), - SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), - SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), - SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), - SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), - SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), - SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), - 
SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), - SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), - SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), - SPH_C32(0x041c1004), SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), - SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), - SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), - SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), - SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), - SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), - SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), - SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), - SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), - SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), - SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), - SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), - SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), - SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), - SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), - SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), - SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), - SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), - SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), - SPH_C32(0xf9d9c9f9), SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), - SPH_C32(0x50ab5b50), SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), - SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), - SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), - SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), - SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), - SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), - SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), - SPH_C32(0x0c24300c), SPH_C32(0x13794c13), 
SPH_C32(0xecb29dec), - SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), - SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), - SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), - SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), - SPH_C32(0x603b9b60), SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), - SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), - SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), - SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), - SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), - SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), - SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), - SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), - SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), - SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), - SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), - SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), - SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), - SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), - SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), - SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), - SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), - SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), - SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), - SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), - SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), - SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), - SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), - SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), - SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), - SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), - SPH_C32(0x1d53741d), 
SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), - SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), - SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), - SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), SPH_C32(0x1e5a781e), - SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), - SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), - SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), - SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), - SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), - SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), - SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), - SPH_C32(0x16625816) -}; - -#define TIX2(q, x00, x01, x08, x10, x24) { \ - x10 ^= x00; \ - x00 = (q); \ - x08 ^= x00; \ - x01 ^= x24; \ - } - -#define TIX3(q, x00, x01, x04, x08, x16, x27, x30) { \ - x16 ^= x00; \ - x00 = (q); \ - x08 ^= x00; \ - x01 ^= x27; \ - x04 ^= x30; \ - } - -#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ - x22 ^= x00; \ - x00 = (q); \ - x08 ^= x00; \ - x01 ^= x24; \ - x04 ^= x27; \ - x07 ^= x30; \ - } - -#define CMIX30(x00, x01, x02, x04, x05, x06, x15, x16, x17) { \ - x00 ^= x04; \ - x01 ^= x05; \ - x02 ^= x06; \ - x15 ^= x04; \ - x16 ^= x05; \ - x17 ^= x06; \ - } - -#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ - x00 ^= x04; \ - x01 ^= x05; \ - x02 ^= x06; \ - x18 ^= x04; \ - x19 ^= x05; \ - x20 ^= x06; \ - } - -#define SMIX(x0, x1, x2, x3) { \ - uint32_t c0 = 0; \ - uint32_t c1 = 0; \ - uint32_t c2 = 0; \ - uint32_t c3 = 0; \ - uint32_t r0 = 0; \ - uint32_t r1 = 0; \ - uint32_t r2 = 0; \ - uint32_t r3 = 0; \ - uint32_t tmp; \ - tmp = mixtab0(x0 >> 24); \ - c0 ^= tmp; \ - tmp = mixtab1((x0 >> 16) & 0xFF); \ - c0 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x0 >> 8) & 0xFF); \ - c0 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x0 & 0xFF); \ - c0 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x1 >> 24); \ - c1 ^= tmp; \ - r0 ^= tmp; \ - 
tmp = mixtab1((x1 >> 16) & 0xFF); \ - c1 ^= tmp; \ - tmp = mixtab2((x1 >> 8) & 0xFF); \ - c1 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x1 & 0xFF); \ - c1 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x2 >> 24); \ - c2 ^= tmp; \ - r0 ^= tmp; \ - tmp = mixtab1((x2 >> 16) & 0xFF); \ - c2 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x2 >> 8) & 0xFF); \ - c2 ^= tmp; \ - tmp = mixtab3(x2 & 0xFF); \ - c2 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x3 >> 24); \ - c3 ^= tmp; \ - r0 ^= tmp; \ - tmp = mixtab1((x3 >> 16) & 0xFF); \ - c3 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x3 >> 8) & 0xFF); \ - c3 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x3 & 0xFF); \ - c3 ^= tmp; \ - x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ - | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ - | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ - | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ - x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ - | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ - | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ - | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ - x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ - | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ - | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ - | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ - x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ - | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ - | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ - | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ - /* */ \ - } - -#define S00 (sc[ 0]) -#define S01 (sc[ 1]) -#define S02 (sc[ 2]) -#define S03 (sc[ 3]) -#define S04 (sc[ 4]) -#define S05 (sc[ 5]) -#define S06 (sc[ 6]) -#define S07 (sc[ 7]) -#define S08 (sc[ 8]) -#define S09 (sc[ 9]) -#define S10 (sc[10]) -#define S11 (sc[11]) -#define S12 (sc[12]) -#define S13 (sc[13]) -#define S14 (sc[14]) -#define S15 (sc[15]) -#define S16 (sc[16]) -#define S17 (sc[17]) -#define S18 (sc[18]) -#define S19 (sc[19]) -#define S20 (sc[20]) -#define S21 (sc[21]) -#define S22 (sc[22]) -#define S23 (sc[23]) -#define S24 (sc[24]) -#define S25 (sc[25]) -#define 
S26 (sc[26]) -#define S27 (sc[27]) -#define S28 (sc[28]) -#define S29 (sc[29]) -#define S30 (sc[30]) -#define S31 (sc[31]) -#define S32 (sc[32]) -#define S33 (sc[33]) -#define S34 (sc[34]) -#define S35 (sc[35]) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) -/* GPU - FUNKTIONEN */ - -#if USE_SHARED -__global__ void __launch_bounds__(256) -#else -__global__ void -#endif -fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) -{ -#if USE_SHARED - extern __shared__ char mixtabs[]; - - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); - - __syncthreads(); -#endif - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - /* Nimm den State und verarbeite das letztenByte (die Nounce) */ - uint32_t sc[30]; - - #pragma unroll 30 - for(int i=0;i<30;i++) - sc[i] = GPUstate[i]; - - uint32_t nounce = startNounce + thread; // muss noch ermittelt werden - uint32_t q; - - - // Bei Byte 80 laufen die Teilrunden: 4-0-1 (hier fest) - - // Teilrunde 4 - q = nounce; - TIX2(q, S06, S07, S14, S16, S00); - CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20); - SMIX(S03, S04, S05, S06); - CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17); - SMIX(S00, S01, S02, S03); - - // Teilrunde 0 - q = 0; - TIX2(q, S00, S01, S08, S10, S24); - CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14); - SMIX(S27, S28, S29, S00); - CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11); - SMIX(S24, S25, S26, S27); - - // Teilrunde 1 - q = 0x280; // hoffentlich richtig rum... 
- TIX2(q, S24, S25, S02, S04, S18); - CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08); - SMIX(S21, S22, S23, S24); - CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05); - SMIX(S18, S19, S20, S21); - - // Rundenende - // rms = 12, d.h. 30 - 12 = 18 - - #pragma unroll 10 - for(int i=0;i<10;i++) - { - //ROR(3, 30); - uint32_t tmp[3]; - #pragma unroll 3 - for(int k=0;k<3;k++) - tmp[k] = sc[27+k]; - #pragma unroll 27 - for(int k=26;k>=0;k--) - sc[k+3] = sc[k]; - #pragma unroll 3 - for(int k=0;k<3;k++) - sc[k] = tmp[k]; - - - CMIX30(sc[18], sc[19], sc[20], sc[22], sc[23], sc[24], sc[3], sc[4], sc[5]); - SMIX(sc[18], sc[19], sc[20], sc[21]); - } - - #pragma unroll 13 - for(int i=0;i<13;i++) - { - sc[22] ^= sc[18]; - sc[3] ^= sc[18]; - - // ROR(15, 30); BEGIN - uint32_t tmp1[15]; - #pragma unroll 15 - for(int k=0;k<15;k++) - tmp1[k] = sc[15+k]; - #pragma unroll 15 - for(int k=14;k>=0;k--) - sc[k+15] = sc[k]; - #pragma unroll 15 - for(int k=0;k<15;k++) - sc[k] = tmp1[k]; - // ROR(15, 30); END - - SMIX(sc[18], sc[19], sc[20], sc[21]); - sc[22] ^= sc[18]; - sc[4] ^= sc[18]; - - // ROR(14, 30); BEGIN - uint32_t tmp2[14]; - #pragma unroll 14 - for(int k=0;k<14;k++) - tmp2[k] = sc[16+k]; - #pragma unroll 16 - for(int k=15;k>=0;k--) - sc[k+14] = sc[k]; - #pragma unroll 14 - for(int k=0;k<14;k++) - sc[k] = tmp2[k]; - // ROR(14, 30); END - - SMIX(sc[18], sc[19], sc[20], sc[21]); - } - - sc[22] ^= sc[18]; - sc[3] ^= sc[18]; - - /* - // SWAP32 und Daten ausgeben - #pragma unroll 4 - for(int i=0;i<4;i++) - ((uint32_t*)outputHash)[8*thread+i] = SWAB32(sc[19+i]); - - #pragma unroll 4 - for(int i=0;i<4;i++) - ((uint32_t*)outputHash)[8*thread+i+4] = SWAB32(sc[3+i]); - */ - uint32_t hash[8]; - #pragma unroll 4 - for(int i=0;i<4;i++) - ((uint32_t*)hash)[i] = SWAB32(sc[19+i]); - - #pragma unroll 4 - for(int i=0;i<4;i++) - ((uint32_t*)hash)[i+4] = SWAB32(sc[3+i]); - - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > pTarget[i]) { - rc = false; - break; - } - if 
(hash[i] < pTarget[i]) { - rc = true; - break; - } - } - - if(rc == true) - { - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } - - -void fugue256_cpu_init(int thr_id, int threads) -{ - cudaSetDevice(device_map[thr_id]); - - // Kopiere die Hash-Tabellen in den GPU-Speicher - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); -} - -__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) -{ - // CPU-Vorbereitungen treffen - sph_fugue256_context ctx_fugue_const; - sph_fugue256_init(&ctx_fugue_const); - sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern - - cudaMemcpyToSymbol( GPUstate, - ctx_fugue_const.S, - sizeof(uint32_t) * 30 ); - - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); - - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); -} - -__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce) -{ -#if USE_SHARED - const int threadsperblock = 256; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN -#else - const int threadsperblock = 512; // so einstellen wie gewünscht ;-) -#endif - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 4 * 256 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif - fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); - - //cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +#include "sph/sph_fugue.h" + +#define USE_SHARED 1 + +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// schon in sph_fugue.h definiert +//#define SPH_C32(x) ((uint32_t)(x ## U)) + +uint32_t *d_fugue256_hashoutput[8]; +uint32_t *d_resultNonce[8]; + +__constant__ uint32_t GPUstate[30]; // Single GPU +__constant__ uint32_t pTarget[8]; // Single GPU + +texture mixTab0Tex; +texture mixTab1Tex; +texture mixTab2Tex; +texture mixTab3Tex; + +#if USE_SHARED +#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) +#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) +#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) +#define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) +#else +#define mixtab0(x) tex1Dfetch(mixTab0Tex, x) +#define mixtab1(x) tex1Dfetch(mixTab1Tex, x) +#define mixtab2(x) tex1Dfetch(mixTab2Tex, x) 
+#define mixtab3(x) tex1Dfetch(mixTab3Tex, x) +#endif + +/* TABELLEN */ +static const uint32_t mixtab0_cpu[] = { + SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), + SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), + SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), + SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac), + SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a), + SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e), + SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5), + SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed), + SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be), + SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6), + SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea), + SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298), + SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1), + SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2), + SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf), + SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54), + SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c), + SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e), + SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c), + SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5), + SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), + SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), + SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), + SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), + SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), + SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), SPH_C32(0xe3e39fa1), + SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), + SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), + SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), + 
SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), + SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), + SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), + SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), + SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), + SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), + SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), + SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), + SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), SPH_C32(0xa3a344ba), + SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), + SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), + SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), + SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), + SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), + SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), + SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), + SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), + SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), + SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), + SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), + SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), + SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), + SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), + SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), + SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), + SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), + SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), + SPH_C32(0xc2c27825), SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), + SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), + SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), + SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), 
SPH_C32(0x6d6d18af), + SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), + SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), + SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), + SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), + SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), + SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), + SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), + SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), + SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), + SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), + SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), + SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), + SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), + SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), + SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), SPH_C32(0xe1e191a9), + SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), SPH_C32(0x11117744), + SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), + SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), + SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), + SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), + SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), + SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), + SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), + SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), + SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), + SPH_C32(0x16166258) +}; + +static const uint32_t mixtab1_cpu[] = { + SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), + SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), + SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), + SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), + SPH_C32(0xd5fefecc), 
SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), + SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), + SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), + SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), + SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), + SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), + SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), + SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), + SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), + SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), + SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), + SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), + SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), + SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), + SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), + SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), + SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), + SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), + SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), + SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), + SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), + SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), + SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), + SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), + SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), + SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), + SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), + SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), SPH_C32(0x11cfcf5b), + SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), + SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), + SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), + 
SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), + SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), + SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), + SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), + SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), + SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), + SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), + SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), + SPH_C32(0x300c0c24), SPH_C32(0x4c131379), SPH_C32(0x9dececb2), + SPH_C32(0x675f5f86), SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), + SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), + SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), + SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), + SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), + SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), + SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), + SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), + SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), + SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), + SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), + SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), + SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), + SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), + SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), + SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), + SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), SPH_C32(0x234e4ef1), + SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), + SPH_C32(0xfdf4f4fa), SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), + SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), + SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), + SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), 
SPH_C32(0xaea6a65f), + SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), + SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), + SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), + SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), + SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), + SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), + SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), + SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), + SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), + SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), + SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), + SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), + SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), + SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), + SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), + SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), + SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), + SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), + SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), + SPH_C32(0x58161662) +}; + +static const uint32_t mixtab2_cpu[] = { + SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), + SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), + SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), + SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), + SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), + SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), + SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), + SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), + SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), + SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), + SPH_C32(0x45d37272), 
SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), + SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), + SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), + SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), + SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), + SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), + SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), + SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), + SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), + SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), + SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), + SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), + SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), + SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), + SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), + SPH_C32(0x34fab3b3), SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), + SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), + SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), + SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), + SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), + SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), + SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), + SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), + SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), + SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), + SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), + SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), + SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), SPH_C32(0x44baa3a3), + SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), + SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), + SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), + 
SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), + SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), + SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), + SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), + SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), + SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), + SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), + SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), + SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), + SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), + SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), + SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), + SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), + SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), + SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), + SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), + SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), + SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), + SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), + SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), + SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), + SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), + SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), + SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), + SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), + SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), SPH_C32(0xae8de8e8), + SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), SPH_C32(0x5d7c1f1f), + SPH_C32(0xea374b4b), SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), + SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), + SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), + SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), 
SPH_C32(0x2a380e0e), + SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), + SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), + SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), + SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), + SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), + SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), + SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), + SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), + SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), + SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), + SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), + SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), SPH_C32(0x2d3c0f0f), + SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), + SPH_C32(0x62581616) +}; + +static const uint32_t mixtab3_cpu[] = { + SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), + SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), + SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), + SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), + SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), + SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), + SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), + SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), + SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), + SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), + SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), + SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), + SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), + SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), + SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), + SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), + SPH_C32(0x041c1004), 
SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), + SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), + SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), + SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), + SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), + SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), + SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), + SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), + SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), + SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), + SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), + SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), + SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), + SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), + SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), + SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), + SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), + SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), + SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), + SPH_C32(0xf9d9c9f9), SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), + SPH_C32(0x50ab5b50), SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), + SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), + SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), + SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), + SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), + SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), + SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), + SPH_C32(0x0c24300c), SPH_C32(0x13794c13), SPH_C32(0xecb29dec), + SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), + SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), + SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), + 
SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), + SPH_C32(0x603b9b60), SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), + SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), + SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), + SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), + SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), + SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), + SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), + SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), + SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), + SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), + SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), + SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), + SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), + SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), + SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), + SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), + SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), + SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), + SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), + SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), + SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), + SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), + SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), + SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), + SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), + SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), + SPH_C32(0x1d53741d), SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), + SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), + SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), + SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), 
SPH_C32(0x1e5a781e), + SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), + SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), + SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), + SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), + SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), + SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), + SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), + SPH_C32(0x16625816) +}; + +#define TIX2(q, x00, x01, x08, x10, x24) { \ + x10 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + } + +#define TIX3(q, x00, x01, x04, x08, x16, x27, x30) { \ + x16 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x27; \ + x04 ^= x30; \ + } + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } + +#define CMIX30(x00, x01, x02, x04, x05, x06, x15, x16, x17) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x15 ^= x04; \ + x16 ^= x05; \ + x17 ^= x06; \ + } + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } + +#define SMIX(x0, x1, x2, x3) { \ + uint32_t c0 = 0; \ + uint32_t c1 = 0; \ + uint32_t c2 = 0; \ + uint32_t c3 = 0; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t tmp; \ + tmp = mixtab0(x0 >> 24); \ + c0 ^= tmp; \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + c1 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + c2 ^= 
tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + c3 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ + | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ + | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ + | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ + x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ + | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ + | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ + | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ + x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ + | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ + | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ + | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ + x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ + | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ + | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ + | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ + /* */ \ + } + +#define S00 (sc[ 0]) +#define S01 (sc[ 1]) +#define S02 (sc[ 2]) +#define S03 (sc[ 3]) +#define S04 (sc[ 4]) +#define S05 (sc[ 5]) +#define S06 (sc[ 6]) +#define S07 (sc[ 7]) +#define S08 (sc[ 8]) +#define S09 (sc[ 9]) +#define S10 (sc[10]) +#define S11 (sc[11]) +#define S12 (sc[12]) +#define S13 (sc[13]) +#define S14 (sc[14]) +#define S15 (sc[15]) +#define S16 (sc[16]) +#define S17 (sc[17]) +#define S18 (sc[18]) +#define S19 (sc[19]) +#define S20 (sc[20]) +#define S21 (sc[21]) +#define S22 (sc[22]) +#define S23 (sc[23]) +#define S24 (sc[24]) +#define S25 (sc[25]) +#define S26 (sc[26]) +#define S27 (sc[27]) +#define S28 (sc[28]) +#define S29 (sc[29]) +#define S30 (sc[30]) +#define S31 (sc[31]) +#define S32 (sc[32]) +#define S33 (sc[33]) +#define S34 (sc[34]) +#define S35 (sc[35]) + 
+#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +/* GPU - FUNKTIONEN */ + +#if USE_SHARED +__global__ void __launch_bounds__(256) +#else +__global__ void +#endif +fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +{ +#if USE_SHARED + extern __shared__ char mixtabs[]; + + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); + *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); + *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); + *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + /* Nimm den State und verarbeite das letztenByte (die Nounce) */ + uint32_t sc[30]; + + #pragma unroll 30 + for(int i=0;i<30;i++) + sc[i] = GPUstate[i]; + + uint32_t nounce = startNounce + thread; // muss noch ermittelt werden + uint32_t q; + + + // Bei Byte 80 laufen die Teilrunden: 4-0-1 (hier fest) + + // Teilrunde 4 + q = nounce; + TIX2(q, S06, S07, S14, S16, S00); + CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20); + SMIX(S03, S04, S05, S06); + CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17); + SMIX(S00, S01, S02, S03); + + // Teilrunde 0 + q = 0; + TIX2(q, S00, S01, S08, S10, S24); + CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14); + SMIX(S27, S28, S29, S00); + CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11); + SMIX(S24, S25, S26, S27); + + // Teilrunde 1 + q = 0x280; // hoffentlich richtig rum... + TIX2(q, S24, S25, S02, S04, S18); + CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08); + SMIX(S21, S22, S23, S24); + CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05); + SMIX(S18, S19, S20, S21); + + // Rundenende + // rms = 12, d.h. 
30 - 12 = 18 + + #pragma unroll 10 + for(int i=0;i<10;i++) + { + //ROR(3, 30); + uint32_t tmp[3]; + #pragma unroll 3 + for(int k=0;k<3;k++) + tmp[k] = sc[27+k]; + #pragma unroll 27 + for(int k=26;k>=0;k--) + sc[k+3] = sc[k]; + #pragma unroll 3 + for(int k=0;k<3;k++) + sc[k] = tmp[k]; + + + CMIX30(sc[18], sc[19], sc[20], sc[22], sc[23], sc[24], sc[3], sc[4], sc[5]); + SMIX(sc[18], sc[19], sc[20], sc[21]); + } + + #pragma unroll 13 + for(int i=0;i<13;i++) + { + sc[22] ^= sc[18]; + sc[3] ^= sc[18]; + + // ROR(15, 30); BEGIN + uint32_t tmp1[15]; + #pragma unroll 15 + for(int k=0;k<15;k++) + tmp1[k] = sc[15+k]; + #pragma unroll 15 + for(int k=14;k>=0;k--) + sc[k+15] = sc[k]; + #pragma unroll 15 + for(int k=0;k<15;k++) + sc[k] = tmp1[k]; + // ROR(15, 30); END + + SMIX(sc[18], sc[19], sc[20], sc[21]); + sc[22] ^= sc[18]; + sc[4] ^= sc[18]; + + // ROR(14, 30); BEGIN + uint32_t tmp2[14]; + #pragma unroll 14 + for(int k=0;k<14;k++) + tmp2[k] = sc[16+k]; + #pragma unroll 16 + for(int k=15;k>=0;k--) + sc[k+14] = sc[k]; + #pragma unroll 14 + for(int k=0;k<14;k++) + sc[k] = tmp2[k]; + // ROR(14, 30); END + + SMIX(sc[18], sc[19], sc[20], sc[21]); + } + + sc[22] ^= sc[18]; + sc[3] ^= sc[18]; + + /* + // SWAP32 und Daten ausgeben + #pragma unroll 4 + for(int i=0;i<4;i++) + ((uint32_t*)outputHash)[8*thread+i] = SWAB32(sc[19+i]); + + #pragma unroll 4 + for(int i=0;i<4;i++) + ((uint32_t*)outputHash)[8*thread+i+4] = SWAB32(sc[3+i]); + */ + uint32_t hash[8]; + #pragma unroll 4 + for(int i=0;i<4;i++) + ((uint32_t*)hash)[i] = SWAB32(sc[19+i]); + + #pragma unroll 4 + for(int i=0;i<4;i++) + ((uint32_t*)hash)[i+4] = SWAB32(sc[3+i]); + + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > pTarget[i]) { + rc = false; + break; + } + if (hash[i] < pTarget[i]) { + rc = true; + break; + } + } + + if(rc == true) + { + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + 
cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } + + +void fugue256_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + // Kopiere die Hash-Tabellen in den GPU-Speicher + texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); + texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); + texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); + texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); + cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); +} + +__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +{ + // CPU-Vorbereitungen treffen + sph_fugue256_context ctx_fugue_const; + sph_fugue256_init(&ctx_fugue_const); + sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern + + cudaMemcpyToSymbol( GPUstate, + ctx_fugue_const.S, + sizeof(uint32_t) * 30 ); + + cudaMemcpyToSymbol( pTarget, + pTargetIn, + sizeof(uint32_t) * 8 ); + + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); +} + +__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce) +{ +#if USE_SHARED + const int threadsperblock = 256; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN +#else + const int threadsperblock = 512; // so einstellen wie gewünscht ;-) +#endif + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs +#if USE_SHARED + size_t shared_size = 4 * 256 * sizeof(uint32_t); +#else + size_t shared_size = 0; +#endif + fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, 0, thr_id); + + //cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); +} diff --git a/cuda_fugue256.h b/cuda_fugue256.h index bb864fb..c9fac91 100644 --- a/cuda_fugue256.h +++ b/cuda_fugue256.h @@ -1,8 +1,8 @@ -#ifndef _CUDA_FUGUE512_H -#define _CUDA_FUGUE512_H - -void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce); -void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void fugue256_cpu_init(int thr_id, int threads); - -#endif +#ifndef _CUDA_FUGUE512_H +#define _CUDA_FUGUE512_H + +void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce); +void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn); +void fugue256_cpu_init(int thr_id, int threads); + +#endif diff --git a/cuda_groestl512.cu b/cuda_groestl512.cu index 1aebcf3..6875404 100644 --- a/cuda_groestl512.cu +++ b/cuda_groestl512.cu @@ -1,822 +1,822 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// globaler Speicher für alle HeftyHashes aller Threads 
-extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash4output[8]; - -__constant__ uint32_t groestl_gpu_state[32]; -__constant__ uint32_t groestl_gpu_msg[32]; - -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) ((x) & 0xFF) -#define B32_1(x) (((x) >> 8) & 0xFF) -#define B32_2(x) (((x) >> 16) & 0xFF) -#define B32_3(x) ((x) >> 24) - -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) - -#define T0up(x) tex1Dfetch(t0up, x) -#define T0dn(x) tex1Dfetch(t0dn, x) -#define T1up(x) tex1Dfetch(t1up, x) -#define T1dn(x) tex1Dfetch(t1dn, x) -#define T2up(x) tex1Dfetch(t2up, x) -#define T2dn(x) tex1Dfetch(t2dn, x) -#define T3up(x) tex1Dfetch(t3up, x) -#define T3dn(x) tex1Dfetch(t3dn, x) - -texture t0up; -texture t0dn; -texture t1up; -texture t1dn; -texture t2up; -texture t2dn; -texture t3up; -texture t3dn; - -uint32_t T0up_cpu[] = { - C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), - C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), - C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), - C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), - C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), - C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), - C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), - C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), - C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), - C32e(0x6c82ee5a), C32e(0x7ebdc341), 
C32e(0xf5f30602), C32e(0x8352d14f), - C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), - C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), - C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), - C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), - C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), - C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), - C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), - C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), - C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), - C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), - C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), - C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), - C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), - C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), - C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), - C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), - C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), - C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), - C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), - C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), - C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), - C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), - C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), - C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), - C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), - C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), - C32e(0xc03bfba0), C32e(0x19aab398), 
C32e(0x9ef668d1), C32e(0xa322817f), - C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), - C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), - C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), - C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), - C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), - C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), - C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), - C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), - C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), - C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), - C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), - C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), - C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), - C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), - C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), - C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), - C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), - C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), - C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), - C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), - C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), - C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), - C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), - C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), - C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), - C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), - C32e(0x7b3d46cb), C32e(0xa8b71ffc), 
C32e(0x6d0c61d6), C32e(0x2c624e3a) -}; - -uint32_t T0dn_cpu[] = { - C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), - C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), - C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), - C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), - C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), - C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), - C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), - C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), - C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), - C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), - C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), - C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), - C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), - C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), - C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), - C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), - C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), - C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), - C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), - C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), - C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), - C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), - C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), - C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), - C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), - C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), - 
C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), - C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), - C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), - C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), - C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), - C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), - C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), - C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), - C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), - C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), - C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), - C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), - C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), - C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), - C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), - C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), - C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), - C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), - C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), - C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), - C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), - C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), - C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), - C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), - C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), - C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), - C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), - 
C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), - C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), - C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), - C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), - C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), - C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), - C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), - C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), - C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), - C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), - C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) -}; - -uint32_t T1up_cpu[] = { - C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), - C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), - C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), - C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), - C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), - C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), - C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), - C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), - C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), - C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), - C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), - C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), - C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), - C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), - C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), - C32e(0xcdcda76a), C32e(0x4e4ef5bb), 
C32e(0x7f7f334c), C32e(0xeaea50ba), - C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), - C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), - C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), - C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), - C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), - C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), - C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), - C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), - C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), - C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), - C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), - C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), - C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), - C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), - C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), - C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), - C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), - C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), - C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), - C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), - C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), - C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), - C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), - C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), - C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), - C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), - C32e(0x9f9f78e7), C32e(0xbdbd0fb2), 
C32e(0x4343692a), C32e(0xc4c435f1), - C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), - C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), - C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), - C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), - C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), - C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), - C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), - C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), - C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), - C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), - C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), - C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), - C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), - C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), - C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), - C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), - C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), - C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), - C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), - C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), - C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) -}; - -uint32_t T1dn_cpu[] = { - C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), - C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), - C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), - C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), - C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), - 
C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), - C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), - C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), - C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), - C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), - C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), - C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), - C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), - C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), - C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), - C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), - C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), - C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), - C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), - C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), - C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), - C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), - C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), - C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), - C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), - C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), - C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), - C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), - C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), - C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), - C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), - C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), - 
C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), - C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), - C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), - C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), - C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), - C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), - C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), - C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), - C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), - C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), - C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), - C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), - C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), - C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), - C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), - C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), - C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), - C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), - C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), - C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), - C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), - C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), - C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), - C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), - C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), - C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), - C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), - 
C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), - C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), - C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), - C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), - C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) -}; - -uint32_t T2up_cpu[] = { - C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), - C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), - C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), - C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), - C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), - C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), - C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), - C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), - C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), - C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), - C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), - C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), - C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), - C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), - C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), - C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), - C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), - C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), - C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), - C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), - C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), - C32e(0x604040e0), C32e(0x1fe3e3c2), 
C32e(0xc879793a), C32e(0xedb6b69a), - C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), - C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), - C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), - C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), - C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), - C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), - C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), - C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), - C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), - C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), - C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), - C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), - C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), - C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), - C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), - C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), - C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), - C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), - C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), - C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), - C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), - C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), - C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), - C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), - C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), - C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), - C32e(0xd56f6f0b), C32e(0x88f0f073), 
C32e(0x6f4a4afb), C32e(0x725c5cca), - C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), - C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), - C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), - C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), - C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), - C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), - C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), - C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), - C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), - C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), - C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), - C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), - C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), - C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), - C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) -}; - -uint32_t T2dn_cpu[] = { - C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), - C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), - C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), - C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), - C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), - C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), - C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), - C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), - C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), - C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), - C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), - 
C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), - C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), - C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), - C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), - C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), - C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), - C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), - C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), - C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), - C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), - C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), - C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), - C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), - C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), - C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), - C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), - C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), - C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), - C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), - C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), - C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), - C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), - C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), - C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), - C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), - C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), - C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), - 
C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), - C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), - C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), - C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), - C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), - C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), - C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), - C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), - C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), - C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), - C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), - C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), - C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), - C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), - C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), - C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), - C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), - C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), - C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), - C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), - C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), - C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), - C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), - C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), - C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), - C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) -}; - -uint32_t T3up_cpu[] = { - C32e(0x97a5c6c6), C32e(0xeb84f8f8), 
C32e(0xc799eeee), C32e(0xf78df6f6), - C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), - C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), - C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), - C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), - C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), - C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), - C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), - C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), - C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), - C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), - C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), - C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), - C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), - C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), - C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), - C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), - C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), - C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), - C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), - C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), - C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), - C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), - C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), - C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), - C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), - C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), - C32e(0x5bf0a0a0), C32e(0xf0447878), 
C32e(0x4aba2525), C32e(0x96e34b4b), - C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), - C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), - C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), - C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), - C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), - C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), - C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), - C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), - C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), - C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), - C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), - C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), - C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), - C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), - C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), - C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), - C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), - C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), - C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), - C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), - C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), - C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), - C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), - C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), - C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), - C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), - C32e(0x9fa3c2c2), C32e(0xd45f6a6a), 
C32e(0x47f9aeae), C32e(0xd2d06969), - C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), - C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), - C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), - C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), - C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), - C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), - C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), - C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), - C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) -}; - -uint32_t T3dn_cpu[] = { - C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), - C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), - C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), - C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), - C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), - C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), - C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), - C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), - C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), - C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), - C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), - C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), - C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), - C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), - C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), - C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), - C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), - 
C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), - C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), - C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), - C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), - C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), - C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), - C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), - C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), - C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), - C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), - C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), - C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), - C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), - C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), - C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), - C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), - C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), - C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), - C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), - C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), - C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), - C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), - C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), - C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), - C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), - C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), - C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), - 
C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), - C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), - C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), - C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), - C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), - C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), - C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), - C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), - C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), - C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), - C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), - C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), - C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), - C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), - C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), - C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), - C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), - C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), - C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), - C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) -}; - -__device__ void groestl512_perm_P(uint32_t *a) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= PC32up(k * 0x10, r); - //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ - T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1dn( B32_1(a[(k 
+ 11) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 23) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 23) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__device__ void groestl512_perm_Q(uint32_t *a) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= QC32up(k * 0x10, r); - a[(k*2)+1] ^= QC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 13) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 13) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t message[32]; - uint32_t state[32]; - - // lese message ein & verknüpfe diese mit dem hash1 von hefty1 - // lese den state ein - -#pragma unroll 32 - for(int k=0;k<32;k++) - { - state[k] = groestl_gpu_state[k]; - message[k] = groestl_gpu_msg[k]; - } - - uint32_t nounce = nonceVector[thread]; - // 
nounce setzen - //message[19] = startNounce + thread; - message[19] = nounce; - - uint32_t hashPosition = nounce - startNounce; - - // den richtigen Hefty1 Hash holen -// memcpy(&message[21], &heftyHashes[8 * hashPosition], sizeof(uint32_t) * 8); - uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; -#pragma unroll 8 - for (int k=0; k<8; ++k) - message[21+k] = heftyHash[k]; - - uint32_t g[32]; -#pragma unroll 32 - for(int u=0;u<32;u++) - g[u] = message[u] ^ state[u]; - - // Perm - groestl512_perm_P(g); - groestl512_perm_Q(message); - -#pragma unroll 32 - for(int u=0;u<32;u++) - { - state[u] ^= g[u] ^ message[u]; - g[u] = state[u]; - } - - groestl512_perm_P(g); - -#pragma unroll 32 - for(int u=0;u<32;u++) - state[u] ^= g[u]; - - // kopiere Ergebnis -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = state[k + 16]; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - -// Setup-Funktionen -__host__ void groestl512_cpu_init(int thr_id, int threads) -{ - // Texturen mit obigem Makro initialisieren - texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash4output[thr_id], 16 * 
sizeof(uint32_t) * threads); -} - -__host__ void groestl512_cpu_setBlock(void *data) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 116 Byte - msgBlock[29] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // setze register - uint32_t groestl_state_init[32]; - memset(groestl_state_init, 0, sizeof(uint32_t) * 32); - groestl_state_init[31] = 0x20000; - - // state speichern - cudaMemcpyToSymbol( groestl_gpu_state, - groestl_state_init, - 128); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestl_gpu_msg, - msgBlock, - 128); -} - -__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren (eigentlich nur zum debuggen) - if (copy) - cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); -} - -__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) -{ - const int threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - groestl512_gpu_hash<<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; 
+typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash4output[8]; + +__constant__ uint32_t groestl_gpu_state[32]; +__constant__ uint32_t groestl_gpu_msg[32]; + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) + +#define B32_0(x) ((x) & 0xFF) +#define B32_1(x) (((x) >> 8) & 0xFF) +#define B32_2(x) (((x) >> 16) & 0xFF) +#define B32_3(x) ((x) >> 24) + +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define T0up(x) tex1Dfetch(t0up, x) +#define T0dn(x) tex1Dfetch(t0dn, x) +#define T1up(x) tex1Dfetch(t1up, x) +#define T1dn(x) tex1Dfetch(t1dn, x) +#define T2up(x) tex1Dfetch(t2up, x) +#define T2dn(x) tex1Dfetch(t2dn, x) +#define T3up(x) tex1Dfetch(t3up, x) +#define T3dn(x) tex1Dfetch(t3dn, x) + +texture t0up; +texture t0dn; +texture t1up; +texture t1dn; +texture t2up; +texture t2dn; +texture t3up; +texture t3dn; + +uint32_t T0up_cpu[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), 
C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), 
C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), 
C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +uint32_t T0dn_cpu[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), 
C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), 
C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +uint32_t T1up_cpu[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), 
C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), 
C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +uint32_t T1dn_cpu[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), 
C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), 
C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), 
C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +uint32_t T2up_cpu[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), 
C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), 
C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +uint32_t T2dn_cpu[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), 
C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), 
C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), 
C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +uint32_t T3up_cpu[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), 
C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), 
C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +uint32_t T3dn_cpu[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), 
C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), 
C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +__device__ void groestl512_perm_P(uint32_t *a) +{ + uint32_t t[32]; + +//#pragma unroll 14 + for(int r=0;r<14;r++) + { +#pragma unroll 16 + for(int k=0;k<16;k++) + { + a[(k*2)+0] ^= PC32up(k * 0x10, r); + //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ + T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ 
+ T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 23) & 0x1f]) ); + + t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 23) & 0x1f]) ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__device__ void groestl512_perm_Q(uint32_t *a) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + +#pragma unroll 16 + for(int k=0;k<16;k++) + { + a[(k*2)+0] ^= QC32up(k * 0x10, r); + a[(k*2)+1] ^= QC32dn(k * 0x10, r); + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 13) & 0x1f]) ); + + t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 13) & 0x1f]) ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t message[32]; + uint32_t state[32]; + + // lese message ein & verknüpfe diese mit dem hash1 von hefty1 + // lese den state ein + +#pragma unroll 32 + for(int 
k=0;k<32;k++) + { + state[k] = groestl_gpu_state[k]; + message[k] = groestl_gpu_msg[k]; + } + + uint32_t nounce = nonceVector[thread]; + // nounce setzen + //message[19] = startNounce + thread; + message[19] = nounce; + + uint32_t hashPosition = nounce - startNounce; + + // den richtigen Hefty1 Hash holen +// memcpy(&message[21], &heftyHashes[8 * hashPosition], sizeof(uint32_t) * 8); + uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; +#pragma unroll 8 + for (int k=0; k<8; ++k) + message[21+k] = heftyHash[k]; + + uint32_t g[32]; +#pragma unroll 32 + for(int u=0;u<32;u++) + g[u] = message[u] ^ state[u]; + + // Perm + groestl512_perm_P(g); + groestl512_perm_Q(message); + +#pragma unroll 32 + for(int u=0;u<32;u++) + { + state[u] ^= g[u] ^ message[u]; + g[u] = state[u]; + } + + groestl512_perm_P(g); + +#pragma unroll 32 + for(int u=0;u<32;u++) + state[u] ^= g[u]; + + // kopiere Ergebnis +#pragma unroll 16 + for(int k=0;k<16;k++) + ((uint32_t*)outputHash)[16*hashPosition+k] = state[k + 16]; + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void groestl512_cpu_init(int thr_id, int threads) +{ + // Texturen mit obigem Makro initialisieren + texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(t3dn, 
d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); +} + +__host__ void groestl512_cpu_setBlock(void *data) + // data muss 84-Byte haben! + // heftyHash hat 32-Byte +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + + // Erweitere die Nachricht auf den Nachrichtenblock (padding) + // Unsere Nachricht hat 116 Byte + msgBlock[29] = 0x80; + msgBlock[31] = 0x01000000; + + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) + + // setze register + uint32_t groestl_state_init[32]; + memset(groestl_state_init, 0, sizeof(uint32_t) * 32); + groestl_state_init[31] = 0x20000; + + // state speichern + cudaMemcpyToSymbol( groestl_gpu_state, + groestl_state_init, + 128); + + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( groestl_gpu_msg, + msgBlock, + 128); +} + +__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // Hefty1 Hashes kopieren (eigentlich nur zum debuggen) + if (copy) + cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); +} + +__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +{ + const int threadsperblock = 128; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + groestl512_gpu_hash<<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_groestl512.h b/cuda_groestl512.h index 
bbeee40..0e77f2f 100644 --- a/cuda_groestl512.h +++ b/cuda_groestl512.h @@ -1,9 +1,9 @@ -#ifndef _CUDA_GROESTL512_H -#define _CUDA_GROESTL512_H - -void groestl512_cpu_init(int thr_id, int threads); -void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data); -void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - +#ifndef _CUDA_GROESTL512_H +#define _CUDA_GROESTL512_H + +void groestl512_cpu_init(int thr_id, int threads); +void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +void groestl512_cpu_setBlock(void *data); +void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); + #endif \ No newline at end of file diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 2fcdbe1..12de56d 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -1,475 +1,477 @@ -// Auf Groestlcoin spezialisierte Version von Groestl - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// IMPORTANT: leave this enabled! 
-#define USE_SHARED 1 - -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// aus driver.c -extern "C" void set_device(int device); - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; - -// globaler Speicher für alle HeftyHashes aller Threads -__constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[8]; - -__constant__ uint32_t groestlcoin_gpu_msg[32]; - -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) __byte_perm(x, 0, 0x4440) -//((x) & 0xFF) -#define B32_1(x) __byte_perm(x, 0, 0x4441) -//(((x) >> 8) & 0xFF) -#define B32_2(x) __byte_perm(x, 0, 0x4442) -//(((x) >> 16) & 0xFF) -#define B32_3(x) __byte_perm(x, 0, 0x4443) -//((x) >> 24) - -#if 0 -#if USE_SHARED -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) -#else -#define T0up(x) tex1Dfetch(t0up1, x) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) tex1Dfetch(t1dn1, x) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) tex1Dfetch(t2dn1, x) -#define T3up(x) tex1Dfetch(t3up1, x) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif -#endif - -// a 
healthy mix between shared and textured access provides the highest speed! -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) tex1Dfetch(t3dn1, x) - -texture t0up1; -texture t0dn1; -texture t1up1; -texture t1dn1; -texture t2up1; -texture t2dn1; -texture t3up1; -texture t3dn1; - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - - -__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - switch(r) - { - case 0: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; - case 1: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; - case 2: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; - case 3: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; - case 4: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; - case 5: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; - case 6: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; - case 7: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; - case 8: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; - case 9: -#pragma unroll 16 - for(int 
k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; - case 10: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; - case 11: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; - case 12: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; - case 13: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); - uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); - uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); - uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); - - t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ - T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); - - t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ - T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - - switch(r) - { - case 0: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; - case 1: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; - case 2: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; - case 3: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; - case 4: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} 
break; - case 5: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; - case 6: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; - case 7: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; - case 8: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; - case 9: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; - case 10: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; - case 11: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; - case 12: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; - case 13: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); - uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); - uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); - uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); - - t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ - T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); - - t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ - T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} -#if USE_SHARED -__global__ void /* 
__launch_bounds__(256) */ -#else -__global__ void -#endif - - groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) -{ -#if USE_SHARED - extern __shared__ char mixtabs[]; - - if (threadIdx.x < 256) - { - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); - *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); - *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); - } - - __syncthreads(); -#endif - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // GROESTL - uint32_t message[32]; - uint32_t state[32]; - -#pragma unroll 32 - for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k]; - - uint32_t nounce = startNounce + thread; - message[19] = SWAB32(nounce); - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - groestlcoin_perm_P(state, mixtabs); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, mixtabs); -#else - groestlcoin_perm_P(state, NULL); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, NULL); -#endif -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - groestlcoin_perm_P(message, mixtabs); -#else - groestlcoin_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - - //// - //// 2. 
Runde groestl - //// -#pragma unroll 16 - for(int k=0;k<16;k++) message[k] = state[k + 16]; -#pragma unroll 14 - for(int k=1;k<15;k++) - message[k+16] = 0; - - message[16] = 0x80; - message[31] = 0x01000000; - -#pragma unroll 32 - for(int u=0;u<32;u++) - state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - groestlcoin_perm_P(state, mixtabs); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, mixtabs); -#else - groestlcoin_perm_P(state, NULL); - state[31] ^= 0x20000; - groestlcoin_perm_Q(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - groestlcoin_perm_P(message, mixtabs); -#else - groestlcoin_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - - // kopiere Ergebnis - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (state[i+16] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (state[i+16] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - -// Setup-Funktionen -__host__ void groestlcoin_cpu_init(int thr_id, int threads) -{ - cudaSetDevice(device_map[thr_id]); - - cudaGetDeviceProperties(&props, device_map[thr_id]); - - cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); -// Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, 
T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); -} - -__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); -} - -__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) -{ - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 
768 : 512; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); - - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); -} +// Auf Groestlcoin spezialisierte Version von Groestl + +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// it's unfortunate that this is a compile time constant. 
+#define MAXWELL_OR_FERMI 0 + +// from cpu-miner.c +extern int device_map[8]; + +// from heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// from driver.c +extern "C" void set_device(int device); + +// TODO: replace the following definitions with a header later +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// this structure is filled in by the init function +static cudaDeviceProp props; + +// hash target (compared against the final state in the kernel) and per-thr_id result nonce buffers +__constant__ uint32_t pTarget[8]; // Single GPU +extern uint32_t *d_resultNonce[8]; + +__constant__ uint32_t groestlcoin_gpu_msg[32]; + +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) + +#define B32_0(x) __byte_perm(x, 0, 0x4440) +//((x) & 0xFF) +#define B32_1(x) __byte_perm(x, 0, 0x4441) +//(((x) >> 8) & 0xFF) +#define B32_2(x) __byte_perm(x, 0, 0x4442) +//(((x) >> 16) & 0xFF) +#define B32_3(x) __byte_perm(x, 0, 0x4443) +//((x) >> 24) + +#if MAXWELL_OR_FERMI +#define USE_SHARED 1 +// Maxwell and Fermi cards get the best speed with SHARED access it seems. 
+#if USE_SHARED +#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) +#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) +#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) +#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) +#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) +#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) +#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) +#else +#define T0up(x) tex1Dfetch(t0up1, x) +#define T0dn(x) tex1Dfetch(t0dn1, x) +#define T1up(x) tex1Dfetch(t1up1, x) +#define T1dn(x) tex1Dfetch(t1dn1, x) +#define T2up(x) tex1Dfetch(t2up1, x) +#define T2dn(x) tex1Dfetch(t2dn1, x) +#define T3up(x) tex1Dfetch(t3up1, x) +#define T3dn(x) tex1Dfetch(t3dn1, x) +#endif +#else +#define USE_SHARED 1 +// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! +#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) +#define T0dn(x) tex1Dfetch(t0dn1, x) +#define T1up(x) tex1Dfetch(t1up1, x) +#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define T2up(x) tex1Dfetch(t2up1, x) +#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) +#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) +#define T3dn(x) tex1Dfetch(t3dn1, x) +#endif + +texture t0up1; +texture t0dn1; +texture t1up1; +texture t1dn1; +texture t2up1; +texture t2dn1; +texture t3up1; +texture t3dn1; + +extern uint32_t T0up_cpu[]; +extern uint32_t T0dn_cpu[]; +extern uint32_t T1up_cpu[]; +extern uint32_t T1dn_cpu[]; +extern uint32_t T2up_cpu[]; +extern uint32_t T2dn_cpu[]; +extern uint32_t T3up_cpu[]; +extern uint32_t T3dn_cpu[]; + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + + +__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) +{ + uint32_t t[32]; + +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + switch(r) + { + case 0: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= 
PC32up(k * 0x10, 0); break; + case 1: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; + case 2: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; + case 3: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; + case 4: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; + case 5: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; + case 6: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; + case 7: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; + case 8: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; + case 9: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; + case 10: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; + case 11: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; + case 12: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; + case 13: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ + T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + + t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ + T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + 
+__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + + switch(r) + { + case 0: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; + case 1: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; + case 2: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; + case 3: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; + case 4: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; + case 5: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; + case 6: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; + case 7: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; + case 8: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; + case 9: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; + case 10: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; + case 11: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; + case 12: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; + case 13: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k 
* 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ + T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + + t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ + T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} +#if USE_SHARED +__global__ void /* __launch_bounds__(256) */ +#else +__global__ void +#endif + + groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) +{ +#if USE_SHARED + extern __shared__ char mixtabs[]; + + if (threadIdx.x < 256) + { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); + *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[32]; + uint32_t state[32]; + +#pragma unroll 32 + for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k]; + + uint32_t nounce = startNounce + thread; + message[19] = 
SWAB32(nounce); + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] = message[u]; + state[31] ^= 0x20000; + + // Perm +#if USE_SHARED + groestlcoin_perm_P(state, mixtabs); + state[31] ^= 0x20000; + groestlcoin_perm_Q(message, mixtabs); +#else + groestlcoin_perm_P(state, NULL); + state[31] ^= 0x20000; + groestlcoin_perm_Q(message, NULL); +#endif +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; + +#if USE_SHARED + groestlcoin_perm_P(message, mixtabs); +#else + groestlcoin_perm_P(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + + //// + //// 2nd round of groestl + //// +#pragma unroll 16 + for(int k=0;k<16;k++) message[k] = state[k + 16]; +#pragma unroll 14 + for(int k=1;k<15;k++) + message[k+16] = 0; + + message[16] = 0x80; + message[31] = 0x01000000; + +#pragma unroll 32 + for(int u=0;u<32;u++) + state[u] = message[u]; + state[31] ^= 0x20000; + + // Perm +#if USE_SHARED + groestlcoin_perm_P(state, mixtabs); + state[31] ^= 0x20000; + groestlcoin_perm_Q(message, mixtabs); +#else + groestlcoin_perm_P(state, NULL); + state[31] ^= 0x20000; + groestlcoin_perm_Q(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; + +#if USE_SHARED + groestlcoin_perm_P(message, mixtabs); +#else + groestlcoin_perm_P(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + + // compare state[16..23] against pTarget, highest index first; the first differing word decides rc + int i, position = -1; + bool rc = true; + +#pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (state[i+16] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (state[i+16] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) // cheap pre-check; the value only ever decreases, so a stale read is harmless + atomicMin(&resNounce[0], nounce); // atomic so concurrent winners cannot overwrite a smaller nonce + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int 
*texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void groestlcoin_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + cudaGetDeviceProperties(&props, device_map[thr_id]); + + // Texturen mit obigem Makro initialisieren + texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + + // Speicher für Gewinner-Nonce belegen + cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); +} + +__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 80); + + // Erweitere die Nachricht auf den Nachrichtenblock (padding) + // Unsere Nachricht hat 80 Byte + msgBlock[20] = 0x80; + msgBlock[31] = 0x01000000; + + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) + + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( groestlcoin_gpu_msg, + msgBlock, + 128); + + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemcpyToSymbol( pTarget, + pTargetIn, + sizeof(uint32_t) * 8 ); +} + +__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t 
startNounce, void *outputHashes, uint32_t *nounce) +{ + // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, + // alle anderen mit 512 Threads. + int threadsperblock = (props.major >= 3) ? 768 : 512; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs +#if USE_SHARED + size_t shared_size = 8 * 256 * sizeof(uint32_t); +#else + size_t shared_size = 0; +#endif + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + //fprintf(stderr, "ThrID: %d\n", thr_id); + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + groestlcoin_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, 0, thr_id); + + cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); +} diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h index 97c2e7f..8ad7dab 100644 --- a/cuda_groestlcoin.h +++ b/cuda_groestlcoin.h @@ -1,8 +1,8 @@ -#ifndef _CUDA_GROESTLCOIN_H -#define _CUDA_GROESTLCOIN_H - -void groestlcoin_cpu_init(int thr_id, int threads); -void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); - +#ifndef _CUDA_GROESTLCOIN_H +#define _CUDA_GROESTLCOIN_H + +void groestlcoin_cpu_init(int thr_id, int threads); +void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); +void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); + #endif \ No newline at end of file diff --git a/cuda_hefty1.cu b/cuda_hefty1.cu index 2f72c0f..239752f 100644 --- a/cuda_hefty1.cu +++ b/cuda_hefty1.cu @@ -1,414 +1,414 @@ -#include -#include "cuda_runtime.h" -#include 
"device_launch_parameters.h" - -// aus cpu-miner.c -extern int device_map[8]; - -#include -#include - -#define USE_SHARED 1 - -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; - -// globaler Speicher für alle HeftyHashes aller Threads -uint32_t *d_heftyHashes[8]; - -/* Hash-Tabellen */ -__constant__ uint32_t hefty_gpu_constantTable[64]; -#if USE_SHARED -#define heftyLookUp(x) (*((uint32_t*)heftytab + (x))) -#else -#define heftyLookUp(x) hefty_gpu_constantTable[x] -#endif - -// muss expandiert werden -__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t hefty_gpu_register[8]; -__constant__ uint32_t hefty_gpu_sponge[4]; - -uint32_t hefty_cpu_hashTable[] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL }; - -uint32_t hefty_cpu_constantTable[] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; 
- -//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -static __host__ __device__ uint32_t S(uint32_t x, int n) -{ - return (((x) >> (n)) | ((x) << (32 - (n)))); -} -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// uint8_t -#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) -__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) -{ - uint16_t w = (x >> 16) ^ (x & 0xffff); - uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); - return 24 - (((n >> 2) ^ (n & 0x03)) << 3); -} -// 4 auf einmal -#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) -#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) - -__host__ __forceinline__ __device__ void Mangle(uint32_t *inp) -{ - uint32_t r = smoosh4Quad(inp[0]); - uint32_t inp0org; - uint32_t tmp0Mask, tmp1Mask; - uint32_t in1, in2, isAddition; - uint32_t tmp; - uint8_t b; - - inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); - - r += 0x01010101; - tmp = smoosh2(inp[1]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 - tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 - - in1 = (inp[2] & ~inp0org) | - (tmp1Mask & ~inp[2] & inp0org) | - (~tmp0Mask & ~inp[2] & inp0org); - in2 = inp[2] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[2] = isAddition ? 
in2 : in1; - - r += 0x01010101; - tmp = smoosh2(inp[1] ^ inp[2]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 - tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 - - in1 = (inp[3] & ~inp0org) | - (tmp1Mask & ~inp[3] & inp0org) | - (~tmp0Mask & ~inp[3] & inp0org); - in2 = inp[3] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[3] = isAddition ? in2 : in1; - - inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; -} - -__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) -{ - inp[0] ^= x; - Mangle(inp); -} - -__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) -{ - uint32_t y = inp[0]; - Mangle(inp); - return y; -} - -__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) -{ - uint32_t r = Squeeze(sponge); - uint32_t t = ((r >> 8) & 0x1F); - uint32_t y = 1 << t; - - uint32_t a = (((r>>1) & 0x01) << t) & y; - uint32_t b = ((r & 0x01) << t) & y; - uint32_t c = x & y; - - uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c); - return retVal; -} - -__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + 
regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -// Die Hash-Funktion -__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) -{ - #if USE_SHARED - extern __shared__ char heftytab[]; - if(threadIdx.x < 64) - { - *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; - } - - __syncthreads(); -#endif - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - // reduktion von 256 byte auf 128 byte - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - -#pragma unroll 4 - for(int k=0; k < 4; k++) - sponge[k] = hefty_gpu_sponge[k]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = hefty_gpu_register[k]; - hash[k] = regs[k]; - } - - //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = hefty_gpu_blockHeader[k]; - W1[3] = SWAB32(nounce); - - // 2. Runde -#pragma unroll 16 - for(int j=0;j<16;j++) - Absorb(sponge, W1[j] ^ heftyLookUp(j)); - -// Progress W1 (Bytes 0...63) -#pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge); - } - -// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... 
- -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - #pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge); - } - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); - } -} - -// Setup-Funktionen -__host__ void hefty_cpu_init(int thr_id, int threads) -{ - cudaSetDevice(device_map[thr_id]); - - cudaGetDeviceProperties(&props, device_map[thr_id]); - - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( hefty_gpu_constantTable, - hefty_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Hefty1 hashes belegen - cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); -} - -__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) - // data muss 84-Byte haben! -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - msgBlock[21] |= 0x80; - msgBlock[31] = 672; // bitlen - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. 
Der Hash wird dann an die Threads - // übergeben - - // Erstelle expandierten Block W - uint32_t W[64]; - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - - // pre - memset(sponge, 0, sizeof(uint32_t) * 4); - for (int k=0; k < 8; k++) - { - regs[k] = hefty_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<16;j++) - Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); - - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int j=16;j<64;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // sponge speichern - - cudaMemcpyToSymbol( hefty_gpu_sponge, - sponge, - sizeof(uint32_t) * 4 ); - // hash speichern - cudaMemcpyToSymbol( hefty_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - - // Blockheader setzen (korrekte Nonce fehlt da drin noch) - cudaMemcpyToSymbol( hefty_gpu_blockHeader, - &msgBlock[16], - 64); -} - -__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) -{ - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 
768 : 512; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - #if USE_SHARED - size_t shared_size = 8 * 64 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +// aus cpu-miner.c +extern int device_map[8]; + +#include +#include + +#define USE_SHARED 1 + +// Folgende Definitionen später durch header ersetzen +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; + +// diese Struktur wird in der Init Funktion angefordert +static cudaDeviceProp props; + +// globaler Speicher für alle HeftyHashes aller Threads +uint32_t *d_heftyHashes[8]; + +/* Hash-Tabellen */ +__constant__ uint32_t hefty_gpu_constantTable[64]; +#if USE_SHARED +#define heftyLookUp(x) (*((uint32_t*)heftytab + (x))) +#else +#define heftyLookUp(x) hefty_gpu_constantTable[x] +#endif + +// muss expandiert werden +__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message +__constant__ uint32_t hefty_gpu_register[8]; +__constant__ uint32_t hefty_gpu_sponge[4]; + +uint32_t hefty_cpu_hashTable[] = { + 0x6a09e667UL, + 0xbb67ae85UL, + 0x3c6ef372UL, + 0xa54ff53aUL, + 0x510e527fUL, + 0x9b05688cUL, + 0x1f83d9abUL, + 0x5be0cd19UL }; + +uint32_t hefty_cpu_constantTable[] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + +//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static __host__ __device__ uint32_t S(uint32_t x, int n) +{ + return (((x) >> (n)) | ((x) << (32 - (n)))); +} +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +// uint8_t +#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) +__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) +{ + uint16_t w = (x >> 16) ^ (x & 0xffff); + uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); + return 24 - (((n >> 2) ^ (n & 0x03)) << 3); +} +// 4 auf einmal +#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) +#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) + +__host__ __forceinline__ __device__ void Mangle(uint32_t *inp) +{ + uint32_t r = smoosh4Quad(inp[0]); + uint32_t inp0org; + uint32_t tmp0Mask, tmp1Mask; + uint32_t in1, in2, isAddition; + uint32_t tmp; + uint8_t b; + + inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); + + r += 0x01010101; + tmp = smoosh2(inp[1]); + b = getByte(r,tmp); + inp0org = S(inp[0], b); + tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 
0 + tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 + + in1 = (inp[2] & ~inp0org) | + (tmp1Mask & ~inp[2] & inp0org) | + (~tmp0Mask & ~inp[2] & inp0org); + in2 = inp[2] += ~inp0org; + isAddition = ~tmp0Mask & tmp1Mask; + inp[2] = isAddition ? in2 : in1; + + r += 0x01010101; + tmp = smoosh2(inp[1] ^ inp[2]); + b = getByte(r,tmp); + inp0org = S(inp[0], b); + tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 + tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 + + in1 = (inp[3] & ~inp0org) | + (tmp1Mask & ~inp[3] & inp0org) | + (~tmp0Mask & ~inp[3] & inp0org); + in2 = inp[3] += ~inp0org; + isAddition = ~tmp0Mask & tmp1Mask; + inp[3] = isAddition ? in2 : in1; + + inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; +} + +__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) +{ + inp[0] ^= x; + Mangle(inp); +} + +__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) +{ + uint32_t y = inp[0]; + Mangle(inp); + return y; +} + +__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) +{ + uint32_t r = Squeeze(sponge); + uint32_t t = ((r >> 8) & 0x1F); + uint32_t y = 1 << t; + + uint32_t a = (((r>>1) & 0x01) << t) & y; + uint32_t b = ((r & 0x01) << t) & y; + uint32_t c = x & y; + + uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c); + return retVal; +} + +__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +{ + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] 
+= tmpBr; +} + +__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +{ + uint32_t tmpBr; + + uint32_t brG = Br(sponge, regs[6]); + uint32_t brF = Br(sponge, regs[5]); + uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; + uint32_t brE = Br(sponge, regs[4]); + uint32_t tmp2 = tmp1 + S1(brE); + uint32_t brC = Br(sponge, regs[2]); + uint32_t brB = Br(sponge, regs[1]); + uint32_t brA = Br(sponge, regs[0]); + uint32_t tmp3 = Maj(brA, brB, brC); + tmpBr = Br(sponge, regs[0]); + uint32_t tmp4 = tmp3 + S0(tmpBr); + tmpBr = Br(sponge, tmp2); + + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = tmp2 + tmp4; + regs[4] += tmpBr; +} + +// Die Hash-Funktion +__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) +{ + #if USE_SHARED + extern __shared__ char heftytab[]; + if(threadIdx.x < 64) + { + *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; + } + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = startNounce + thread; + + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory + // reduktion von 256 byte auf 128 byte + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + +#pragma unroll 4 + for(int k=0; k < 4; k++) + sponge[k] = hefty_gpu_sponge[k]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = hefty_gpu_register[k]; + hash[k] = regs[k]; + } + + //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = hefty_gpu_blockHeader[k]; + W1[3] = SWAB32(nounce); + + // 2. 
Runde +#pragma unroll 16 + for(int j=0;j<16;j++) + Absorb(sponge, W1[j] ^ heftyLookUp(j)); + +// Progress W1 (Bytes 0...63) +#pragma unroll 16 + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge); + } + +// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... + +#pragma unroll 3 + for(int k=0;k<3;k++) + { + #pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + #pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + #pragma unroll 16 + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge); + } + #pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + +#pragma unroll 8 + for(int k=0;k<8;k++) + ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + } +} + +// Setup-Funktionen +__host__ void hefty_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + cudaGetDeviceProperties(&props, device_map[thr_id]); + + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( hefty_gpu_constantTable, + hefty_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // Speicher für alle Hefty1 hashes belegen + cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); +} + +__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) + // data muss 84-Byte haben! 
+{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + msgBlock[21] |= 0x80; + msgBlock[31] = 672; // bitlen + + for(int i=0;i<31;i++) // Byteorder drehen + msgBlock[i] = SWAB32(msgBlock[i]); + + // die erste Runde wird auf der CPU durchgeführt, da diese für + // alle Threads gleich ist. Der Hash wird dann an die Threads + // übergeben + + // Erstelle expandierten Block W + uint32_t W[64]; + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + uint32_t sponge[4]; + + // pre + memset(sponge, 0, sizeof(uint32_t) * 4); + for (int k=0; k < 8; k++) + { + regs[k] = hefty_cpu_hashTable[k]; + hash[k] = regs[k]; + } + + // 1. Runde + for(int j=0;j<16;j++) + Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); + + for(int j=0;j<16;j++) + { + Absorb(sponge, regs[3] ^ regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int j=16;j<64;j++) + { + Absorb(sponge, regs[3] + regs[7]); + hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); + } + + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + // sponge speichern + + cudaMemcpyToSymbol( hefty_gpu_sponge, + sponge, + sizeof(uint32_t) * 4 ); + // hash speichern + cudaMemcpyToSymbol( hefty_gpu_register, + hash, + sizeof(uint32_t) * 8 ); + + // Blockheader setzen (korrekte Nonce fehlt da drin noch) + cudaMemcpyToSymbol( hefty_gpu_blockHeader, + &msgBlock[16], + 64); +} + +__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) +{ + // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, + // alle anderen mit 512 Threads. + int threadsperblock = (props.major >= 3) ? 
768 : 512; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + #if USE_SHARED + size_t shared_size = 8 * 64 * sizeof(uint32_t); +#else + size_t shared_size = 0; +#endif + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); +} diff --git a/cuda_hefty1.h b/cuda_hefty1.h index 08b1844..9e72d3d 100644 --- a/cuda_hefty1.h +++ b/cuda_hefty1.h @@ -1,8 +1,8 @@ -#ifndef _CUDA_HEFTY1_H -#define _CUDA_HEFTY1_H - -void hefty_cpu_hash(int thr_id, int threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, int threads, void *data); -void hefty_cpu_init(int thr_id, int threads); - +#ifndef _CUDA_HEFTY1_H +#define _CUDA_HEFTY1_H + +void hefty_cpu_hash(int thr_id, int threads, int startNounce); +void hefty_cpu_setBlock(int thr_id, int threads, void *data); +void hefty_cpu_init(int thr_id, int threads); + #endif \ No newline at end of file diff --git a/cuda_keccak512.cu b/cuda_keccak512.cu index c9b0a6c..13e5255 100644 --- a/cuda_keccak512.cu +++ b/cuda_keccak512.cu @@ -1,273 +1,273 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash3output[8]; - -// der Keccak512 State nach der ersten Runde (72 Bytes) -__constant__ uint64_t c_State[25]; - -// die Message (72 Bytes) für die zweite Runde auf der GPU -__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at 
offset 4) plus padding - -// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ - -#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { -#pragma unroll 18 - for (int k=0; k < 18; ++k) d[k] = s[k]; -} - -static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { -#pragma unroll 8 - for (int k=0; k < 8; ++k) d[k] = s[k]; -} - -typedef struct keccak_hash_state_t { - uint64_t state[25]; // 25*2 - uint32_t buffer[72/4]; // 72 -} keccak_hash_state; - -__device__ void statecopy(uint64_t *d, uint64_t *s) -{ -#pragma unroll 25 - for (int i=0; i < 25; ++i) - d[i] = s[i]; -} - - -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -__constant__ uint64_t c_keccak_round_constants[24]; - -__host__ __device__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & 
s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -// Die Hash-Funktion -__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // erstmal den State der ersten Runde holen - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i < 25; ++i) - keccak_gpu_state[i] = c_State[i]; - - // Message2 in den Puffer holen - uint32_t msgBlock[18]; - mycpy72(msgBlock, c_PaddedMessage2); - - // die individuelle Nonce einsetzen - msgBlock[1] = nounce; - - // den individuellen Hefty1 Hash einsetzen - mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]); - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - - // und ins Global Memory rausschreiben -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = hash[k]; - } -} - -// ---------------------------- END CUDA keccak512 functions ------------------------------------ - -// Setup-Funktionen -__host__ void keccak512_cpu_init(int 
thr_id, int threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -// ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- - -#define SCRYPT_HASH_DIGEST_SIZE 64 -#define SCRYPT_KECCAK_F 1600 -#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ -#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ -#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* 72 */ - -// --------------- END keccak512 CPU version from scrypt-jane code -------------------- - -__host__ void keccak512_cpu_setBlock(void *data) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - // CH - // state init - uint64_t keccak_cpu_state[25]; - memset(keccak_cpu_state, 0, 200); - - // keccak hat 72-Byte blöcke, d.h. 
in unserem Fall zwei Blöcke - // zu jeweils - uint32_t msgBlock[18]; - memset(msgBlock, 0, 18 * sizeof(uint32_t)); - - // kopiere die Daten rein (aber nur alles nach Bit 72) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); - - // Nachricht abschließen - msgBlock[11] = 0x01; - msgBlock[17] = 0x80000000; - - // erste Runde - keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); - - // Message 2 ins Constant Memory kopieren (die variable Nonce und - // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) - cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); - - // state kopieren - cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} - -__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); - //else cudaThreadSynchronize(); -} - -__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) -{ - const int threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - keccak512_gpu_hash<<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t 
*d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash3output[8]; + +// der Keccak512 State nach der ersten Runde (72 Bytes) +__constant__ uint64_t c_State[25]; + +// die Message (72 Bytes) für die zweite Runde auf der GPU +__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding + +// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ + +#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { +#pragma unroll 18 + for (int k=0; k < 18; ++k) d[k] = s[k]; +} + +static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; ++k) d[k] = s[k]; +} + +typedef struct keccak_hash_state_t { + uint64_t state[25]; // 25*2 + uint32_t buffer[72/4]; // 72 +} keccak_hash_state; + +__device__ void statecopy(uint64_t *d, uint64_t *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d[i] = s[i]; +} + + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; + +__host__ __device__ void +keccak_block(uint64_t *s, const 
uint32_t *in, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ +#pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +// Die Hash-Funktion +__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + //uint32_t nounce = startNounce + thread; + uint32_t nounce = nonceVector[thread]; + + // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und 
outputHash) + uint32_t hashPosition = nounce - startNounce; + + // erstmal den State der ersten Runde holen + uint64_t keccak_gpu_state[25]; +#pragma unroll 25 + for (int i=0; i < 25; ++i) + keccak_gpu_state[i] = c_State[i]; + + // Message2 in den Puffer holen + uint32_t msgBlock[18]; + mycpy72(msgBlock, c_PaddedMessage2); + + // die individuelle Nonce einsetzen + msgBlock[1] = nounce; + + // den individuellen Hefty1 Hash einsetzen + mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]); + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); + + // das Hash erzeugen + uint32_t hash[16]; + +#pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + + // und ins Global Memory rausschreiben +#pragma unroll 16 + for(int k=0;k<16;k++) + ((uint32_t*)outputHash)[16*hashPosition+k] = hash[k]; + } +} + +// ---------------------------- END CUDA keccak512 functions ------------------------------------ + +// Setup-Funktionen +__host__ void keccak512_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); +} + +// ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- + +#define SCRYPT_HASH_DIGEST_SIZE 64 +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* 72 */ + +// --------------- END keccak512 CPU version from scrypt-jane code -------------------- + +__host__ void keccak512_cpu_setBlock(void *data) + // data muss 84-Byte haben! 
+ // heftyHash hat 32-Byte +{ + // CH + // state init + uint64_t keccak_cpu_state[25]; + memset(keccak_cpu_state, 0, 200); + + // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke + // zu jeweils + uint32_t msgBlock[18]; + memset(msgBlock, 0, 18 * sizeof(uint32_t)); + + // kopiere die Daten rein (aber nur alles nach Bit 72) + memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); + + // Nachricht abschließen + msgBlock[11] = 0x01; + msgBlock[17] = 0x80000000; + + // erste Runde + keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); + + // Message 2 ins Constant Memory kopieren (die variable Nonce und + // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) + cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); + + // state kopieren + cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // Hefty1 Hashes kopieren + if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + //else cudaThreadSynchronize(); +} + +__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +{ + const int threadsperblock = 128; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + keccak512_gpu_hash<<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_keccak512.h b/cuda_keccak512.h index abd4741..003f40f 100644 --- a/cuda_keccak512.h +++ b/cuda_keccak512.h @@ -1,9 +1,9 @@ -#ifndef _CUDA_KECCAK512_H 
-#define _CUDA_KECCAK512_H - -void keccak512_cpu_init(int thr_id, int threads); -void keccak512_cpu_setBlock(void *data); -void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif +#ifndef _CUDA_KECCAK512_H +#define _CUDA_KECCAK512_H + +void keccak512_cpu_init(int thr_id, int threads); +void keccak512_cpu_setBlock(void *data); +void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); + +#endif diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index fb85a24..8e45688 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -1,622 +1,622 @@ -// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// it's unfortunate that this is a compile time constant. 
-#define MAXWELL_OR_FERMI 0 - -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; - -// globaler Speicher für alle HeftyHashes aller Threads -__constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[8]; - -__constant__ uint32_t myriadgroestl_gpu_msg[32]; - -// muss expandiert werden -__constant__ uint32_t myr_sha256_gpu_constantTable[64]; -__constant__ uint32_t myr_sha256_gpu_hashTable[8]; - -uint32_t myr_sha256_cpu_hashTable[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -uint32_t myr_sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) -#endif -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) 
((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) -#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) -#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -__device__ void myriadgroestl_gpu_sha256(uint32_t *message) -{ - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = myr_sha256_gpu_hashTable[k]; - hash[k] = regs[k]; - } - -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = SWAB32(message[k]); - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { -#pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; -#pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - -#pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma 
unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - ///// - ///// Zweite Runde (wegen Msg-Padding) - ///// -#pragma unroll 8 - for(int k=0;k<8;k++) - regs[k] = hash[k]; - - W1[0] = SWAB32(0x80); -#pragma unroll 14 - for(int k=1;k<15;k++) - W1[k] = 0; - W1[15] = 512; - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { -#pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; -#pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - -#pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - //// FERTIG - -#pragma unroll 8 - for(int k=0;k<8;k++) - message[k] = SWAB32(hash[k]); -} - -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) - -#define B32_0(x) 
__byte_perm(x, 0, 0x4440) -//((x) & 0xFF) -#define B32_1(x) __byte_perm(x, 0, 0x4441) -//(((x) >> 8) & 0xFF) -#define B32_2(x) __byte_perm(x, 0, 0x4442) -//(((x) >> 16) & 0xFF) -#define B32_3(x) __byte_perm(x, 0, 0x4443) -//((x) >> 24) - -#if MAXWELL_OR_FEMRI -#define USE_SHARED 1 -// Maxwell and Fermi cards get the best speed with SHARED access it seems. -#if USE_SHARED -#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) -#else -#define T0up(x) tex1Dfetch(t0up1, x) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) tex1Dfetch(t1dn1, x) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) tex1Dfetch(t2dn1, x) -#define T3up(x) tex1Dfetch(t3up1, x) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif -#else -#define USE_SHARED 1 -// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! 
-#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) -#define T0dn(x) tex1Dfetch(t0dn1, x) -#define T1up(x) tex1Dfetch(t1up1, x) -#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) -#define T2up(x) tex1Dfetch(t2up1, x) -#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) -#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) -#define T3dn(x) tex1Dfetch(t3dn1, x) -#endif - -texture t0up1; -texture t0dn1; -texture t1up1; -texture t1dn1; -texture t2up1; -texture t2dn1; -texture t3up1; -texture t3dn1; - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - - -__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - switch(r) - { - case 0: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; - case 1: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; - case 2: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; - case 3: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; - case 4: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; - case 5: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; - case 6: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; - case 7: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; - case 8: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; - case 9: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; - case 10: 
-#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; - case 11: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; - case 12: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; - case 13: -#pragma unroll 16 - for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); - uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); - uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); - uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); - - t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ - T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); - - t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ - T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - - switch(r) - { - case 0: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; - case 1: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; - case 2: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; - case 3: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; - case 4: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; - case 5: - #pragma unroll 16 - for(int k=0;k<16;k++) { 
a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; - case 6: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; - case 7: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; - case 8: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; - case 9: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; - case 10: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; - case 11: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; - case 12: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; - case 13: - #pragma unroll 16 - for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); - uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); - uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); - uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); - - t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ - T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); - - t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ - T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__global__ void -myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) -{ -#if 
USE_SHARED - extern __shared__ char mixtabs[]; - - if (threadIdx.x < 256) - { - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); - *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); - *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); - *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); - } - - __syncthreads(); -#endif - - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // GROESTL - uint32_t message[32]; - uint32_t state[32]; - -#pragma unroll 32 - for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k]; - - uint32_t nounce = startNounce + thread; - message[19] = SWAB32(nounce); - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] = message[u]; - state[31] ^= 0x20000; - - // Perm -#if USE_SHARED - myriadgroestl_perm_P(state, mixtabs); - state[31] ^= 0x20000; - myriadgroestl_perm_Q(message, mixtabs); -#else - myriadgroestl_perm_P(state, NULL); - state[31] ^= 0x20000; - myriadgroestl_perm_Q(message, NULL); -#endif -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - -#pragma unroll 32 - for(int u=0;u<32;u++) message[u] = state[u]; - -#if USE_SHARED - myriadgroestl_perm_P(message, mixtabs); -#else - myriadgroestl_perm_P(message, NULL); -#endif - -#pragma unroll 32 - for(int u=0;u<32;u++) state[u] ^= message[u]; - - uint32_t out_state[16]; -#pragma unroll 16 - for(int u=0;u<16;u++) out_state[u] = state[u+16]; - myriadgroestl_gpu_sha256(out_state); - - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (out_state[i] > pTarget[i]) { - if(position < i) { - position = 
i; - rc = false; - } - } - if (out_state[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - -// Setup-Funktionen -__host__ void myriadgroestl_cpu_init(int thr_id, int threads) -{ - cudaSetDevice(device_map[thr_id]); - - cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, - myr_sha256_cpu_hashTable, - sizeof(uint32_t) * 8 ); - - cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, - myr_sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - cudaGetDeviceProperties(&props, device_map[thr_id]); - - // Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); -} - -__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - 
msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( myriadgroestl_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); -} - -__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) -{ - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 768 : 512; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs -#if USE_SHARED - size_t shared_size = 8 * 256 * sizeof(uint32_t); -#else - size_t shared_size = 0; -#endif - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - //fprintf(stderr, "ThrID: %d\n", thr_id); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - myriadgroestl_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); - - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); -} +// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice + +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// it's unfortunate that this is a compile time constant. 
+#define MAXWELL_OR_FERMI 0 + +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; + +// diese Struktur wird in der Init Funktion angefordert +static cudaDeviceProp props; + +// globaler Speicher für alle HeftyHashes aller Threads +__constant__ uint32_t pTarget[8]; // Single GPU +extern uint32_t *d_resultNonce[8]; + +__constant__ uint32_t myriadgroestl_gpu_msg[32]; + +// muss expandiert werden +__constant__ uint32_t myr_sha256_gpu_constantTable[64]; +__constant__ uint32_t myr_sha256_gpu_hashTable[8]; + +uint32_t myr_sha256_cpu_hashTable[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; +uint32_t myr_sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else + // Kepler (Compute 3.5) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) 
((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +__device__ void myriadgroestl_gpu_sha256(uint32_t *message) +{ + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = myr_sha256_gpu_hashTable[k]; + hash[k] = regs[k]; + } + +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = SWAB32(message[k]); + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma 
unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + ///// + ///// Zweite Runde (wegen Msg-Padding) + ///// +#pragma unroll 8 + for(int k=0;k<8;k++) + regs[k] = hash[k]; + + W1[0] = SWAB32(0x80); +#pragma unroll 14 + for(int k=1;k<15;k++) + W1[k] = 0; + W1[15] = 512; + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { +#pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; +#pragma unroll 5 + for(int j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + +#pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + //// FERTIG + +#pragma unroll 8 + for(int k=0;k<8;k++) + message[k] = SWAB32(hash[k]); +} + +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) + +#define B32_0(x) 
__byte_perm(x, 0, 0x4440) +//((x) & 0xFF) +#define B32_1(x) __byte_perm(x, 0, 0x4441) +//(((x) >> 8) & 0xFF) +#define B32_2(x) __byte_perm(x, 0, 0x4442) +//(((x) >> 16) & 0xFF) +#define B32_3(x) __byte_perm(x, 0, 0x4443) +//((x) >> 24) + +#if MAXWELL_OR_FEMRI +#define USE_SHARED 1 +// Maxwell and Fermi cards get the best speed with SHARED access it seems. +#if USE_SHARED +#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) +#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) +#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) +#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) +#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) +#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) +#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) +#else +#define T0up(x) tex1Dfetch(t0up1, x) +#define T0dn(x) tex1Dfetch(t0dn1, x) +#define T1up(x) tex1Dfetch(t1up1, x) +#define T1dn(x) tex1Dfetch(t1dn1, x) +#define T2up(x) tex1Dfetch(t2up1, x) +#define T2dn(x) tex1Dfetch(t2dn1, x) +#define T3up(x) tex1Dfetch(t3up1, x) +#define T3dn(x) tex1Dfetch(t3dn1, x) +#endif +#else +#define USE_SHARED 1 +// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! 
+#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) +#define T0dn(x) tex1Dfetch(t0dn1, x) +#define T1up(x) tex1Dfetch(t1up1, x) +#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define T2up(x) tex1Dfetch(t2up1, x) +#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) +#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) +#define T3dn(x) tex1Dfetch(t3dn1, x) +#endif + +texture t0up1; +texture t0dn1; +texture t1up1; +texture t1dn1; +texture t2up1; +texture t2dn1; +texture t3up1; +texture t3dn1; + +extern uint32_t T0up_cpu[]; +extern uint32_t T0dn_cpu[]; +extern uint32_t T1up_cpu[]; +extern uint32_t T1dn_cpu[]; +extern uint32_t T2up_cpu[]; +extern uint32_t T2dn_cpu[]; +extern uint32_t T3up_cpu[]; +extern uint32_t T3dn_cpu[]; + +#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + + +__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs) +{ + uint32_t t[32]; + +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + switch(r) + { + case 0: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; + case 1: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break; + case 2: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break; + case 3: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break; + case 4: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break; + case 5: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break; + case 6: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break; + case 7: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break; + case 8: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break; + case 9: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break; + case 10: 
+#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break; + case 11: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break; + case 12: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break; + case 13: +#pragma unroll 16 + for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ + T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + + t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ + T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) +{ +//#pragma unroll 14 + for(int r=0;r<14;r++) + { + uint32_t t[32]; + + switch(r) + { + case 0: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; + case 1: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; + case 2: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; + case 3: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break; + case 4: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break; + case 5: + #pragma unroll 16 + for(int k=0;k<16;k++) { 
a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break; + case 6: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break; + case 7: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break; + case 8: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break; + case 9: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break; + case 10: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break; + case 11: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break; + case 12: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break; + case 13: + #pragma unroll 16 + for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break; + } + + // RBTT +#pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ + T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + + t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ + T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + } +#pragma unroll 32 + for(int k=0;k<32;k++) + a[k] = t[k]; + } +} + +__global__ void +myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) +{ +#if 
USE_SHARED + extern __shared__ char mixtabs[]; + + if (threadIdx.x < 256) + { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); + *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + + __syncthreads(); +#endif + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[32]; + uint32_t state[32]; + +#pragma unroll 32 + for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k]; + + uint32_t nounce = startNounce + thread; + message[19] = SWAB32(nounce); + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] = message[u]; + state[31] ^= 0x20000; + + // Perm +#if USE_SHARED + myriadgroestl_perm_P(state, mixtabs); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, mixtabs); +#else + myriadgroestl_perm_P(state, NULL); + state[31] ^= 0x20000; + myriadgroestl_perm_Q(message, NULL); +#endif +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + +#pragma unroll 32 + for(int u=0;u<32;u++) message[u] = state[u]; + +#if USE_SHARED + myriadgroestl_perm_P(message, mixtabs); +#else + myriadgroestl_perm_P(message, NULL); +#endif + +#pragma unroll 32 + for(int u=0;u<32;u++) state[u] ^= message[u]; + + uint32_t out_state[16]; +#pragma unroll 16 + for(int u=0;u<16;u++) out_state[u] = state[u+16]; + myriadgroestl_gpu_sha256(out_state); + + int i, position = -1; + bool rc = true; + +#pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (out_state[i] > pTarget[i]) { + if(position < i) { + position = 
i; + rc = false; + } + } + if (out_state[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +#define texDef(texname, texmem, texsource, texsize) \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + +// Setup-Funktionen +__host__ void myriadgroestl_cpu_init(int thr_id, int threads) +{ + cudaSetDevice(device_map[thr_id]); + + cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, + myr_sha256_cpu_hashTable, + sizeof(uint32_t) * 8 ); + + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, + myr_sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + cudaGetDeviceProperties(&props, device_map[thr_id]); + + // Texturen mit obigem Makro initialisieren + texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + + // Speicher für Gewinner-Nonce belegen + cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); +} + +__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 80); + + // Erweitere die Nachricht auf den Nachrichtenblock (padding) + // Unsere Nachricht hat 80 Byte + 
msgBlock[20] = 0x80; + msgBlock[31] = 0x01000000; + + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) + + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( myriadgroestl_gpu_msg, + msgBlock, + 128); + + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemcpyToSymbol( pTarget, + pTargetIn, + sizeof(uint32_t) * 8 ); +} + +__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +{ + // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, + // alle anderen mit 512 Threads. + int threadsperblock = (props.major >= 3) ? 768 : 512; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs +#if USE_SHARED + size_t shared_size = 8 * 256 * sizeof(uint32_t); +#else + size_t shared_size = 0; +#endif + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + //fprintf(stderr, "ThrID: %d\n", thr_id); + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + myriadgroestl_gpu_hash<<>>(threads, startNounce, d_resultNonce[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, 0, thr_id); + + cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); +} diff --git a/cuda_sha256.cu b/cuda_sha256.cu index 050f4b4..b26021d 100644 --- a/cuda_sha256.cu +++ b/cuda_sha256.cu @@ -1,268 +1,268 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; - -// globaler 
Speicher für unsere Ergebnisse -uint32_t *d_hash2output[8]; - - -/* Hash-Tabellen */ -__constant__ uint32_t sha256_gpu_constantTable[64]; - -// muss expandiert werden -__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t sha256_gpu_register[8]; - -uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -uint32_t sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// Die Hash-Funktion -__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // 
bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - nonceVector[thread] = nounce; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = sha256_gpu_register[k]; - hash[k] = regs[k]; - } - - // 2. Runde - //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke - //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = sha256_gpu_blockHeader[k]; - - uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); -#pragma unroll 8 - for(int k=0;k<8;k++) - W1[5+k] = heftyHashes[offset + k]; - - -#pragma unroll 8 - for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) - W1[3] = SWAB32(nounce); - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion - #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + 
sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -/* - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - -#pragma unroll 64 - for(int j=0;j<64;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } -*/ -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); - } -} - -// Setup-Funktionen -__host__ void sha256_cpu_init(int thr_id, int threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( sha256_gpu_constantTable, - sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); -} - -__host__ void sha256_cpu_setBlock(void *data) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[29] |= 0x80; - msgBlock[31] = 928; // bitlen - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. 
Der Hash wird dann an die Threads - // übergeben - uint32_t W[64]; - - // Erstelle expandierten Block W - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre - for (int k=0; k < 8; k++) - { - regs[k] = sha256_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<64;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - //#pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - // sollte mal noch durch memmov ersetzt werden! -// memcpy(®s[1], ®s[0], sizeof(uint32_t) * 7); - regs[0] = T1 + T2; - regs[4] += T1; - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // hash speichern - cudaMemcpyToSymbol( sha256_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( sha256_gpu_blockHeader, - &msgBlock[16], - 64); -} - -__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); - //else cudaThreadSynchronize(); -} - -__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - sha256_gpu_hash<<>>(threads, startNounce, d_hash2output[thr_id], 
d_heftyHashes[thr_id], d_nonceVector[thr_id]); -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned int uint32_t; + +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *d_heftyHashes[8]; +extern uint32_t *d_nonceVector[8]; + +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash2output[8]; + + +/* Hash-Tabellen */ +__constant__ uint32_t sha256_gpu_constantTable[64]; + +// muss expandiert werden +__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message +__constant__ uint32_t sha256_gpu_register[8]; + +uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; +uint32_t sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + 
+#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) + +// Die Hash-Funktion +__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = startNounce + thread; + nonceVector[thread] = nounce; + + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory + uint32_t W1[16]; + uint32_t W2[16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre +#pragma unroll 8 + for (int k=0; k < 8; k++) + { + regs[k] = sha256_gpu_register[k]; + hash[k] = regs[k]; + } + + // 2. Runde + //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke + //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen +#pragma unroll 16 + for(int k=0;k<16;k++) + W1[k] = sha256_gpu_blockHeader[k]; + + uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); +#pragma unroll 8 + for(int k=0;k<8;k++) + W1[5+k] = heftyHashes[offset + k]; + + +#pragma unroll 8 + for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) + W1[3] = SWAB32(nounce); + +// Progress W1 +#pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +// Progress W2...W3 +#pragma unroll 3 + for(int k=0;k<3;k++) + { + #pragma unroll 2 + for(int j=0;j<2;j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + #pragma unroll 5 + for(int 
j=2;j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 8 + for(int j=7;j<15;j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Rundenfunktion + #pragma unroll 16 + for(int j=0;j<16;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + #pragma unroll 16 + for(int j=0;j<16;j++) + W1[j] = W2[j]; + } + +/* + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + +#pragma unroll 64 + for(int j=0;j<64;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } +*/ +#pragma unroll 8 + for(int k=0;k<8;k++) + hash[k] += regs[k]; + +#pragma unroll 8 + for(int k=0;k<8;k++) + ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + } +} + +// Setup-Funktionen +__host__ void sha256_cpu_init(int thr_id, int threads) +{ + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( sha256_gpu_constantTable, + sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); +} + +__host__ void sha256_cpu_setBlock(void *data) + // data muss 84-Byte haben! 
+ // heftyHash hat 32-Byte +{ + // Nachricht expandieren und setzen + uint32_t msgBlock[32]; + + memset(msgBlock, 0, sizeof(uint32_t) * 32); + memcpy(&msgBlock[0], data, 84); + memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + msgBlock[29] |= 0x80; + msgBlock[31] = 928; // bitlen + + for(int i=0;i<31;i++) // Byteorder drehen + msgBlock[i] = SWAB32(msgBlock[i]); + + // die erste Runde wird auf der CPU durchgeführt, da diese für + // alle Threads gleich ist. Der Hash wird dann an die Threads + // übergeben + uint32_t W[64]; + + // Erstelle expandierten Block W + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + for(int j=16;j<64;j++) + W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; + + // Initialisiere die register a bis h mit der Hash-Tabelle + uint32_t regs[8]; + uint32_t hash[8]; + + // pre + for (int k=0; k < 8; k++) + { + regs[k] = sha256_cpu_hashTable[k]; + hash[k] = regs[k]; + } + + // 1. Runde + for(int j=0;j<64;j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + //#pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + // sollte mal noch durch memmov ersetzt werden! 
+// memcpy(®s[1], ®s[0], sizeof(uint32_t) * 7); + regs[0] = T1 + T2; + regs[4] += T1; + } + + for(int k=0;k<8;k++) + hash[k] += regs[k]; + + // hash speichern + cudaMemcpyToSymbol( sha256_gpu_register, + hash, + sizeof(uint32_t) * 8 ); + + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol( sha256_gpu_blockHeader, + &msgBlock[16], + 64); +} + +__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +{ + // Hefty1 Hashes kopieren + if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + //else cudaThreadSynchronize(); +} + +__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) +{ + const int threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + sha256_gpu_hash<<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); +} diff --git a/cuda_sha256.h b/cuda_sha256.h index ff03bf5..9efd170 100644 --- a/cuda_sha256.h +++ b/cuda_sha256.h @@ -1,8 +1,8 @@ -#ifndef _CUDA_SHA256_H -#define _CUDA_SHA256_H - -void sha256_cpu_init(int thr_id, int threads); -void sha256_cpu_setBlock(void *data); -void sha256_cpu_hash(int thr_id, int threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -#endif +#ifndef _CUDA_SHA256_H +#define _CUDA_SHA256_H + +void sha256_cpu_init(int thr_id, int threads); +void sha256_cpu_setBlock(void *data); +void sha256_cpu_hash(int thr_id, int threads, int startNounce); +void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); +#endif diff --git a/elist.h 
b/elist.h index 431472f..5dcdda5 100644 --- a/elist.h +++ b/elist.h @@ -1,251 +1,251 @@ -#ifndef _LINUX_LIST_H -#define _LINUX_LIST_H - -/* - * Simple doubly linked list implementation. - * - * Some of the internal functions ("__xxx") are useful when - * manipulating whole lists rather than single entries, as - * sometimes we already know the next/prev entries and we can - * generate better code by using them directly rather than - * using the generic single-entry routines. - */ - -struct list_head { - struct list_head *next, *prev; -}; - -#define LIST_HEAD_INIT(name) { &(name), &(name) } - -#define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) - -#define INIT_LIST_HEAD(ptr) do { \ - (ptr)->next = (ptr); (ptr)->prev = (ptr); \ -} while (0) - -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static __inline void __list_add(struct list_head *lnew, - struct list_head *prev, - struct list_head *next) -{ - next->prev = lnew; - lnew->next = next; - lnew->prev = prev; - prev->next = lnew; -} - -/** - * list_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. - */ -static __inline void list_add(struct list_head *lnew, struct list_head *head) -{ - __list_add(lnew, head, head->next); -} - -/** - * list_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static __inline void list_add_tail(struct list_head *lnew, struct list_head *head) -{ - __list_add(lnew, head->prev, head); -} - -/* - * Delete a list entry by making the prev/next entries - * point to each other. 
- * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static __inline void __list_del(struct list_head *prev, struct list_head *next) -{ - next->prev = prev; - prev->next = next; -} - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is in an undefined state. - */ -static __inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - entry->next = (struct list_head *) 0; - entry->prev = (struct list_head *) 0; -} - -/** - * list_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. - */ -static __inline void list_del_init(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} - -/** - * list_move - delete from one list and add as another's head - * @list: the entry to move - * @head: the head that will precede our entry - */ -static __inline void list_move(struct list_head *list, struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add(list, head); -} - -/** - * list_move_tail - delete from one list and add as another's tail - * @list: the entry to move - * @head: the head that will follow our entry - */ -static __inline void list_move_tail(struct list_head *list, - struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add_tail(list, head); -} - -/** - * list_empty - tests whether a list is empty - * @head: the list to test. 
- */ -static __inline int list_empty(struct list_head *head) -{ - return head->next == head; -} - -static __inline void __list_splice(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; -} - -/** - * list_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static __inline void list_splice(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head); -} - -/** - * list_splice_init - join two lists and reinitialise the emptied list. - * @list: the new list to add. - * @head: the place to add it in the first list. - * - * The list at @list is reinitialised - */ -static __inline void list_splice_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head); - INIT_LIST_HEAD(list); - } -} - -/** - * list_entry - get the struct for this entry - * @ptr: the &struct list_head pointer. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_struct within the struct. - */ -#define list_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -/** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. - */ -#define list_for_each(pos, head) \ - for (pos = (head)->next; pos != (head); \ - pos = pos->next) -/** - * list_for_each_prev - iterate over a list backwards - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. 
- */ -#define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; pos != (head); \ - pos = pos->prev) - -/** - * list_for_each_safe - iterate over a list safe against removal of list entry - * @pos: the &struct list_head to use as a loop counter. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. - */ -#define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) - -/** - * list_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop counter. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry(pos, head, member, tpos) \ - for (pos = list_entry((head)->next, tpos, member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, tpos, member)) - -/** - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop counter. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_safe(pos, n, head, member, tpos, tn) \ - for (pos = list_entry((head)->next, tpos, member), \ - n = list_entry(pos->member.next, tpos, member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, tn, member)) - -/** - * list_for_each_entry_continue - iterate over list of given type - * continuing after existing point - * @pos: the type * to use as a loop counter. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. 
- */ -#define list_for_each_entry_continue(pos, head, member, tpos) \ - for (pos = list_entry(pos->member.next, tpos, member), \ - prefetch(pos->member.next); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, tpos, member), \ - prefetch(pos->member.next)) - -#endif +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_add(struct list_head *lnew, + struct list_head *prev, + struct list_head *next) +{ + next->prev = lnew; + lnew->next = next; + lnew->prev = prev; + prev->next = lnew; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static __inline void list_add(struct list_head *lnew, struct list_head *head) +{ + __list_add(lnew, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. 
+ */ +static __inline void list_add_tail(struct list_head *lnew, struct list_head *head) +{ + __list_add(lnew, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static __inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = (struct list_head *) 0; + entry->prev = (struct list_head *) 0; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static __inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static __inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static __inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. 
+ */ +static __inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static __inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static __inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static __inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); \ + pos = pos->next) +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member, tpos) \ + for (pos = list_entry((head)->next, tpos, member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, tpos, member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member, tpos, tn) \ + for (pos = list_entry((head)->next, tpos, member), \ + n = list_entry(pos->member.next, tpos, member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, tn, member)) + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_continue(pos, head, member, tpos) \ + for (pos = list_entry(pos->member.next, tpos, member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, tpos, member), \ + prefetch(pos->member.next)) + +#endif diff --git a/files.txt b/files.txt index 2b7db9d..5391bba 100644 --- a/files.txt +++ b/files.txt @@ -1,30 +1,30 @@ -blake512.cu -blake.c -combine.cu -compat.h -cpu-miner.c -cpuminer-config.h -cuda_blake512.h -cuda_combine.h -cuda_groestl512.h -cuda_hefty1.h -cuda_keccak512.h -cuda_sha256.h -elist.h -groestl512.cu -groestl.c -heavy.c -hefty1.c -hefty1.cu -hefty1.h -keccak512.cu -keccak.c -miner.h -scrypt.c -sha256.cu -sha2.c -sph_blake.h -sph_groestl.h -sph_keccak.h -sph_types.h -util.c +blake512.cu +blake.c +combine.cu +compat.h +cpu-miner.c +cpuminer-config.h +cuda_blake512.h +cuda_combine.h +cuda_groestl512.h +cuda_hefty1.h +cuda_keccak512.h +cuda_sha256.h +elist.h +groestl512.cu +groestl.c +heavy.c +hefty1.c +hefty1.cu +hefty1.h +keccak512.cu +keccak.c +miner.h +scrypt.c +sha256.cu +sha2.c +sph_blake.h +sph_groestl.h +sph_keccak.h +sph_types.h +util.c diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 2430c86..9d58a55 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,86 +1,86 @@ -#include "uint256.h" -#include "sph_fugue.h" - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include - -extern "C" void my_fugue256_init(void *cc); -extern "C" void my_fugue256(void *cc, const void *data, size_t len); -extern "C" void my_fugue256_close(void *cc, void *dst); -extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); - -// vorbereitete Kontexte nach den ersten 80 Bytes -sph_fugue256_context ctx_fugue_const[8]; - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const 
uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t start_nonce = pdata[19]++; - const uint32_t Htarg = ptarget[7]; - const uint32_t throughPut = 4096 * 128; - - // init - static bool init[8] = { false, false, false, false, false, false, false, false }; - if(!init[thr_id]) - { - fugue256_cpu_init(thr_id, throughPut); - init[thr_id] = true; - } - - // Endian Drehung ist notwendig - uint32_t endiandata[20]; - for (int kk=0; kk < 20; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - fugue256_cpu_hash(thr_id, throughPut, pdata[19], NULL, &foundNounce); - - if(foundNounce < 0xffffffff) - { - uint32_t hash[8]; - endiandata[19] = SWAP32(foundNounce); - sph_fugue256_context ctx_fugue; - sph_fugue256_init(&ctx_fugue); - sph_fugue256 (&ctx_fugue, endiandata, 80); - sph_fugue256_close(&ctx_fugue, &hash); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) - { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); - } - } - - if (pdata[19] + throughPut < pdata[19]) - pdata[19] = max_nonce; - else pdata[19] += throughPut; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - start_nonce; - return 0; -} - -void fugue256_hash(unsigned char* output, const unsigned char* input, int len) -{ - sph_fugue256_context ctx; - sph_fugue256_init(&ctx); - sph_fugue256(&ctx, input, len); - sph_fugue256_close(&ctx, (void *)output); -} +#include "uint256.h" +#include "sph/sph_fugue.h" + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +extern "C" void my_fugue256_init(void *cc); +extern "C" void my_fugue256(void *cc, const void *data, size_t 
len); +extern "C" void my_fugue256_close(void *cc, void *dst); +extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +// vorbereitete Kontexte nach den ersten 80 Bytes +sph_fugue256_context ctx_fugue_const[8]; + +#define SWAP32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t start_nonce = pdata[19]++; + const uint32_t Htarg = ptarget[7]; + const uint32_t throughPut = 4096 * 128; + + // init + static bool init[8] = { false, false, false, false, false, false, false, false }; + if(!init[thr_id]) + { + fugue256_cpu_init(thr_id, throughPut); + init[thr_id] = true; + } + + // Endian Drehung ist notwendig + uint32_t endiandata[20]; + for (int kk=0; kk < 20; kk++) + be32enc(&endiandata[kk], pdata[kk]); + + // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) + fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + // GPU + uint32_t foundNounce = 0xFFFFFFFF; + fugue256_cpu_hash(thr_id, throughPut, pdata[19], NULL, &foundNounce); + + if(foundNounce < 0xffffffff) + { + uint32_t hash[8]; + endiandata[19] = SWAP32(foundNounce); + sph_fugue256_context ctx_fugue; + sph_fugue256_init(&ctx_fugue); + sph_fugue256 (&ctx_fugue, endiandata, 80); + sph_fugue256_close(&ctx_fugue, &hash); + + if (hash[7] <= Htarg && fulltest(hash, ptarget)) + { + pdata[19] = foundNounce; + *hashes_done = foundNounce - start_nonce; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + } + } + + if (pdata[19] + throughPut < pdata[19]) + pdata[19] = max_nonce; + else pdata[19] += throughPut; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; + return 0; 
+} + +void fugue256_hash(unsigned char* output, const unsigned char* input, int len) +{ + sph_fugue256_context ctx; + sph_fugue256_init(&ctx); + sph_fugue256(&ctx, input, len); + sph_fugue256_close(&ctx, (void *)output); +} diff --git a/groestlcoin.cpp b/groestlcoin.cpp index b117ee5..c8b7850 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,177 +1,177 @@ -#include "uint256.h" -#include "sph_groestl.h" - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include "cuda_groestlcoin.h" -#include - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -void sha256func(unsigned char *hash, const unsigned char *data, int len) -{ - uint32_t S[16], T[16]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - sha256_transform(S, T, 0); - } - /* - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(T); - sha256_transform(T, S, 0); - */ - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, T[i]); -} - -static void groestlhash(void *state, const void *input) -{ - // Tryout GPU-groestl - - sph_groestl512_context ctx_groestl[2]; - static unsigned char pblank[1]; - int ii; - uint32_t mask = 8; - uint32_t zero = 0; - - - //these uint512 in the c++ source of the client are backed by an array of uint32 - uint32_t hashA[16], hashB[16]; - - - sph_groestl512_init(&ctx_groestl[0]); - sph_groestl512 (&ctx_groestl[0], input, 80); //6 - sph_groestl512_close(&ctx_groestl[0], hashA); //7 - - sph_groestl512_init(&ctx_groestl[1]); - sph_groestl512 (&ctx_groestl[1], hashA, 64); //6 - sph_groestl512_close(&ctx_groestl[1], hashB); //7 - - memcpy(state, hashB, 32); -} - - - -extern "C" int scanhash_groestlcoin(int thr_id, 
uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t start_nonce = pdata[19]++; - const uint32_t Htarg = ptarget[7]; - const uint32_t throughPut = 4096 * 128; - //const uint32_t throughPut = 1; - int i; - uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); - - // init - static bool init[8] = { false, false, false, false, false, false, false, false }; - if(!init[thr_id]) - { - groestlcoin_cpu_init(thr_id, throughPut); - init[thr_id] = true; - } - - // Endian Drehung ist notwendig - //char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"}; - //pdata = (uint32_t*)testdata; - uint32_t endiandata[32]; - for (int kk=0; kk < 32; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - - groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); - - /* - { - for(i=0;i +#include +#include "cuda_groestlcoin.h" +#include + +#define SWAP32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +void sha256func(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + /* + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + */ + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); +} + +static void groestlhash(void *state, const void *input) +{ + // Tryout GPU-groestl + + sph_groestl512_context ctx_groestl[2]; + static unsigned char pblank[1]; + int ii; + uint32_t mask = 8; + uint32_t zero = 0; + + + //these uint512 in the c++ source of the client are backed by an array of uint32 + uint32_t hashA[16], hashB[16]; + + + sph_groestl512_init(&ctx_groestl[0]); + sph_groestl512 (&ctx_groestl[0], input, 80); //6 + sph_groestl512_close(&ctx_groestl[0], hashA); //7 + + sph_groestl512_init(&ctx_groestl[1]); + sph_groestl512 (&ctx_groestl[1], hashA, 64); //6 + sph_groestl512_close(&ctx_groestl[1], hashB); //7 + + memcpy(state, hashB, 32); +} + + + +extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t start_nonce = pdata[19]++; + const uint32_t Htarg = ptarget[7]; + const uint32_t throughPut = 4096 * 128; + //const uint32_t throughPut = 1; + int i; + uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); + + // init + static bool init[8] = { false, false, false, false, false, false, false, false }; + if(!init[thr_id]) + { + groestlcoin_cpu_init(thr_id, throughPut); + init[thr_id] = true; + } + + // Endian Drehung ist notwendig + //char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"}; + //pdata = 
(uint32_t*)testdata; + uint32_t endiandata[32]; + for (int kk=0; kk < 32; kk++) + be32enc(&endiandata[kk], pdata[kk]); + + // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) + groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + // GPU + uint32_t foundNounce = 0xFFFFFFFF; + + groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); + + /* + { + for(i=0;i -#include -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include - -#ifndef _WIN32 -#include -#endif - -// include thrust -#include -#include -#include -#include - -#include "miner.h" - -#include "hefty1.h" -#include "sph_keccak.h" -#include "sph_blake.h" -#include "sph_groestl.h" - -#include "cuda_hefty1.h" -#include "cuda_sha256.h" -#include "cuda_keccak512.h" -#include "cuda_groestl512.h" -#include "cuda_blake512.h" -#include "cuda_combine.h" - -extern uint32_t *d_hash2output[8]; -extern uint32_t *d_hash3output[8]; -extern uint32_t *d_hash4output[8]; -extern uint32_t *d_hash5output[8]; - -#define HEAVYCOIN_BLKHDR_SZ 84 - -// nonce-array für die threads -uint32_t *d_nonceVector[8]; - -/* Combines top 64-bits from each hash into a single hash */ -static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) -{ - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } -} - -#ifdef _MSC_VER -#include -static uint32_t __inline bitsset( uint32_t x ) -{ - DWORD r = 0; - _BitScanReverse(&r, x); - return r; -} -#else -static uint32_t bitsset( uint32_t x ) -{ - return 
31-__builtin_clz(x); -} -#endif - -// Finde das high bit in einem Multiword-Integer. -static int findhighbit(const uint32_t *ptarget, int words) -{ - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; -} - -// Generiere ein Multiword-Integer das die Zahl -// (2 << highbit) - 1 repräsentiert. -static void genmask(uint32_t *ptarget, int words, int highbit) -{ - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = 0xffffffff; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } -} - -struct check_nonce_for_remove -{ - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. - return (hashValue & m_target) != hashValue; - } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; -}; - -// Zahl der CUDA Devices im System bestimmen -extern "C" int cuda_num_devices() -{ - int version; - cudaError_t err = cudaDriverGetVersion(&version); - if (err != cudaSuccess) - { - applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?"); - exit(1); - } - - int maj = version / 1000, min = version % 100; // same as in deviceQuery sample - if (maj < 5 || (maj == 5 && min < 5)) - { - applog(LOG_ERR, "Driver does not support CUDA %d.%d API! 
Update your nVidia driver!", 5, 5); - exit(1); - } - - int GPU_N; - err = cudaGetDeviceCount(&GPU_N); - if (err != cudaSuccess) - { - applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); - exit(1); - } - return GPU_N; -} - -static bool substringsearch(const char *haystack, const char *needle, int &match) -{ - int hlen = strlen(haystack); - int nlen = strlen(needle); - for (int i=0; i < hlen; ++i) - { - if (haystack[i] == ' ') continue; - int j=0, x = 0; - while(j < nlen) - { - if (haystack[i+x] == ' ') {++x; continue;} - if (needle[j] == ' ') {++j; continue;} - if (needle[j] == '#') return ++match == needle[j+1]-'0'; - if (tolower(haystack[i+x]) != tolower(needle[j])) break; - ++j; ++x; - } - if (j == nlen) return true; - } - return false; -} - -// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1) -extern "C" int cuda_finddevice(char *name) -{ - int num = cuda_num_devices(); - int match = 0; - for (int i=0; i < num; ++i) - { - cudaDeviceProp props; - if (cudaGetDeviceProperties(&props, i) == cudaSuccess) - if (substringsearch(props.name, name, match)) return i; - } - return -1; -} - -// Zeitsynchronisations-Routine von cudaminer mit CPU sleep -typedef struct { double value[8]; } tsumarray; -cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) -{ - cudaError_t result = cudaSuccess; - if (situation >= 0) - { - static std::map tsum; - - double a = 0.95, b = 0.05; - if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence - - double tsync = 0.0; - double tsleep = 0.95 * tsum[situation].value[thr_id]; - if (cudaStreamQuery(stream) == cudaErrorNotReady) - { - usleep((useconds_t)(1e6*tsleep)); - struct timeval tv_start, tv_end; - gettimeofday(&tv_start, NULL); - result = cudaStreamSynchronize(stream); - gettimeofday(&tv_end, NULL); - tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec); - } - if (tsync >= 0) 
tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync); - } - else - result = cudaStreamSynchronize(stream); - return result; -} - -int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote); - -extern "C" -int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote) -{ - return scanhash_heavy_cpp(thr_id, pdata, - ptarget, max_nonce, hashes_done, maxvote); -} - -int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote) -{ - // CUDA will process thousands of threads. - const int throughput = 4096 * 128; - - int rc = 0; - uint32_t *hash = NULL; - cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); - uint32_t *cpu_nonceVector = NULL; - cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - uint32_t start_nonce = pdata[19]; - uint16_t *ext = (uint16_t *)&pdata[20]; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. 
- int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - init[thr_id] = true; - cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); - } - - - if (opt_vote > maxvote) { - printf("Warning: Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).\n", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata); - sha256_cpu_setBlock(pdata); - keccak512_cpu_setBlock(pdata); - groestl512_cpu_setBlock(pdata); - blake512_cpu_setBlock(pdata); - - do { - int i; - - ////// Compaction init - thrust::device_ptr devNoncePtr(d_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((d_nonceVector[thr_id]) + throughput); - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - //cudaThreadSynchronize(); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - //cudaThreadSynchronize(); - - // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
- MyStreamSynchronize(NULL, 0, thr_id); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost); 
- - for (i=0; i +#include +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include + +#ifndef _WIN32 +#include +#endif + +// include thrust +#include +#include +#include +#include + +#include "miner.h" + +#include "hefty1.h" +#include "sph/sph_keccak.h" +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" + +#include "cuda_hefty1.h" +#include "cuda_sha256.h" +#include "cuda_keccak512.h" +#include "cuda_groestl512.h" +#include "cuda_blake512.h" +#include "cuda_combine.h" + +extern uint32_t *d_hash2output[8]; +extern uint32_t *d_hash3output[8]; +extern uint32_t *d_hash4output[8]; +extern uint32_t *d_hash5output[8]; + +#define HEAVYCOIN_BLKHDR_SZ 84 + +// nonce-array für die threads +uint32_t *d_nonceVector[8]; + +/* Combines top 64-bits from each hash into a single hash */ +static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) +{ + const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; + int bits; + unsigned int i; + uint32_t mask; + unsigned int k; + + /* Transpose first 64 bits of each hash into out */ + memset(out, 0, 32); + bits = 0; + for (i = 7; i >= 6; i--) { + for (mask = 0x80000000; mask; mask >>= 1) { + for (k = 0; k < 4; k++) { + out[(255 - bits)/32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits)/32] |= 1; + bits++; + } + } + } +} + +#ifdef _MSC_VER +#include +static uint32_t __inline bitsset( uint32_t x ) +{ + DWORD r = 0; + _BitScanReverse(&r, x); + return r; +} +#else +static uint32_t bitsset( uint32_t x ) +{ + return 31-__builtin_clz(x); +} +#endif + +// Finde das high bit in einem Multiword-Integer. +static int findhighbit(const uint32_t *ptarget, int words) +{ + int i; + int highbit = 0; + for (i=words-1; i >= 0; --i) + { + if (ptarget[i] != 0) { + highbit = i*32 + bitsset(ptarget[i])+1; + break; + } + } + return highbit; +} + +// Generiere ein Multiword-Integer das die Zahl +// (2 << highbit) - 1 repräsentiert. 
+static void genmask(uint32_t *ptarget, int words, int highbit) +{ + int i; + for (i=words-1; i >= 0; --i) + { + if ((i+1)*32 <= highbit) + ptarget[i] = 0xffffffff; + else if (i*32 > highbit) + ptarget[i] = 0x00000000; + else + ptarget[i] = (1 << (highbit-i*32)) - 1; + } +} + +struct check_nonce_for_remove +{ + check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : + m_target(target), + m_hashes(hashes), + m_hashlen(hashlen), + m_startNonce(startNonce) { } + + __device__ + bool operator()(const uint32_t x) + { + // Position im Hash Buffer + uint32_t hashIndex = x - m_startNonce; + // Wert des Hashes (als uint64_t) auslesen. + // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) + uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); + // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. + return (hashValue & m_target) != hashValue; + } + + uint64_t m_target; + uint32_t *m_hashes; + uint32_t m_hashlen; + uint32_t m_startNonce; +}; + +// Zahl der CUDA Devices im System bestimmen +extern "C" int cuda_num_devices() +{ + int version; + cudaError_t err = cudaDriverGetVersion(&version); + if (err != cudaSuccess) + { + applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?"); + exit(1); + } + + int maj = version / 1000, min = version % 100; // same as in deviceQuery sample + if (maj < 5 || (maj == 5 && min < 5)) + { + applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5); + exit(1); + } + + int GPU_N; + err = cudaGetDeviceCount(&GPU_N); + if (err != cudaSuccess) + { + applog(LOG_ERR, "Unable to query number of CUDA devices! 
Is an nVidia driver installed?"); + exit(1); + } + return GPU_N; +} + +static bool substringsearch(const char *haystack, const char *needle, int &match) +{ + int hlen = strlen(haystack); + int nlen = strlen(needle); + for (int i=0; i < hlen; ++i) + { + if (haystack[i] == ' ') continue; + int j=0, x = 0; + while(j < nlen) + { + if (haystack[i+x] == ' ') {++x; continue;} + if (needle[j] == ' ') {++j; continue;} + if (needle[j] == '#') return ++match == needle[j+1]-'0'; + if (tolower(haystack[i+x]) != tolower(needle[j])) break; + ++j; ++x; + } + if (j == nlen) return true; + } + return false; +} + +// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1) +extern "C" int cuda_finddevice(char *name) +{ + int num = cuda_num_devices(); + int match = 0; + for (int i=0; i < num; ++i) + { + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, i) == cudaSuccess) + if (substringsearch(props.name, name, match)) return i; + } + return -1; +} + +// Zeitsynchronisations-Routine von cudaminer mit CPU sleep +typedef struct { double value[8]; } tsumarray; +cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) +{ + cudaError_t result = cudaSuccess; + if (situation >= 0) + { + static std::map tsum; + + double a = 0.95, b = 0.05; + if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence + + double tsync = 0.0; + double tsleep = 0.95 * tsum[situation].value[thr_id]; + if (cudaStreamQuery(stream) == cudaErrorNotReady) + { + usleep((useconds_t)(1e6*tsleep)); + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + result = cudaStreamSynchronize(stream); + gettimeofday(&tv_end, NULL); + tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec); + } + if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync); + } + else + result = cudaStreamSynchronize(stream); + return result; +} + +int scanhash_heavy_cpp(int thr_id, uint32_t 
*pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote); + +extern "C" +int scanhash_heavy(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote) +{ + return scanhash_heavy_cpp(thr_id, pdata, + ptarget, max_nonce, hashes_done, maxvote); +} + +int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote) +{ + // CUDA will process thousands of threads. + const int throughput = 4096 * 128; + + int rc = 0; + uint32_t *hash = NULL; + cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); + uint32_t *cpu_nonceVector = NULL; + cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)); + + int nrmCalls[6]; + memset(nrmCalls, 0, sizeof(int) * 6); + + uint32_t start_nonce = pdata[19]; + uint16_t *ext = (uint16_t *)&pdata[20]; + + // für jeden Hash ein individuelles Target erstellen basierend + // auf dem höchsten Bit, das in ptarget gesetzt ist. 
+ int highbit = findhighbit(ptarget, 8); + uint32_t target2[2], target3[2], target4[2], target5[2]; + genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 + genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 + genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 + genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + hefty_cpu_init(thr_id, throughput); + sha256_cpu_init(thr_id, throughput); + keccak512_cpu_init(thr_id, throughput); + groestl512_cpu_init(thr_id, throughput); + blake512_cpu_init(thr_id, throughput); + combine_cpu_init(thr_id, throughput); + init[thr_id] = true; + cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); + } + + + if (opt_vote > maxvote) { + printf("Warning: Your block reward vote (%hu) exceeds " + "the maxvote reported by the pool (%hu).\n", + opt_vote, maxvote); + } + + if (opt_trust_pool && opt_vote > maxvote) { + printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; + + // Setze die Blockdaten + hefty_cpu_setBlock(thr_id, throughput, pdata); + sha256_cpu_setBlock(pdata); + keccak512_cpu_setBlock(pdata); + groestl512_cpu_setBlock(pdata); + blake512_cpu_setBlock(pdata); + + do { + int i; + + ////// Compaction init + thrust::device_ptr devNoncePtr(d_nonceVector[thr_id]); + thrust::device_ptr devNoncePtrEnd((d_nonceVector[thr_id]) + throughput); + uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; + + hefty_cpu_hash(thr_id, throughput, pdata[19]); + //cudaThreadSynchronize(); + sha256_cpu_hash(thr_id, throughput, pdata[19]); + //cudaThreadSynchronize(); + + // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
+ MyStreamSynchronize(NULL, 0, thr_id); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + //cudaThreadSynchronize(); + + ////// Compaction + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + // combine + combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); + + // Ergebnisse kopieren + if(actualNumberOfValuesInNonceVectorGPU > 0) + { + cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost); 
+ + for (i=0; i - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. - */ - -#include -#include - -#include "hefty1.h" - -#ifdef WIN32 -#define inline __inline -#endif - -#define Min(A, B) (A <= B ? A : B) - -#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ - { \ - /* To thwart parallelism, Br modifies itself each time it's \ - * called. This also means that calling it in different \ - * orders yeilds different results. 
In C the order of \ - * evaluation of function arguments and + operands are \ - * unspecified (and depends on the compiler), so we must make \ - * the order of Br calls explicit. \ - */ \ - uint32_t brG = Br(ctx, G); \ - uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ - uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ - uint32_t brC = Br(ctx, C); \ - uint32_t brB = Br(ctx, B); \ - uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ - uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ - H = G; \ - G = F; \ - F = E; \ - E = D + Br(ctx, tmp2); \ - D = C; \ - C = B; \ - B = A; \ - A = tmp2 + tmp4; \ - } \ - -/* Nothing up my sleeve constants */ -const static uint32_t K[64] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -/* Initial hash values */ -const static uint32_t H[HEFTY1_STATE_WORDS] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL -}; - -static inline uint32_t Rr(uint32_t X, uint8_t n) -{ - return (X >> n) | (X << (32 - n)); -} - -static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) -{ - return (E & F) ^ (~E & 
G); -} - -static inline uint32_t Sigma1(uint32_t E) -{ - return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); -} - -static inline uint32_t sigma1(uint32_t X) -{ - return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); -} - -static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) -{ - return (A & B) ^ (A & C) ^ (B & C); -} - -static inline uint32_t Sigma0(uint32_t A) -{ - return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); -} - -static inline uint32_t sigma0(uint32_t X) -{ - return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); -} - -static inline uint32_t Reverse32(uint32_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; - #else - return n; - #endif -} - -static inline uint64_t Reverse64(uint64_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - uint32_t a = n >> 32; - uint32_t b = (n << 32) >> 32; - - return (uint64_t)Reverse32(b) << 32 | Reverse32(a); - #else - return n; - #endif -} - -/* Smoosh byte into nibble */ -static inline uint8_t Smoosh4(uint8_t X) -{ - return (X >> 4) ^ (X & 0xf); -} - -/* Smoosh 32-bit word into 2-bits */ -static inline uint8_t Smoosh2(uint32_t X) -{ - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); -} -#include -static void Mangle(uint32_t *S) -{ - uint8_t r0 = Smoosh4(S[0] >> 24); - uint8_t r1 = Smoosh4(S[0] >> 16); - uint8_t r2 = Smoosh4(S[0] >> 8); - uint8_t r3 = Smoosh4(S[0] & 0xff); - - /* Diffuse */ - S[1] ^= Rr(S[0], r0); - switch (Smoosh2(S[1])) { - case 0: S[2] ^= Rr(S[0], 1 + r0); break; - case 1: S[2] += Rr(~S[0], 1 + r1); break; - case 2: S[2] &= Rr(~S[0], 1 + r2); break; - case 3: S[2] ^= Rr(S[0], 1 + r3); break; - } - switch (Smoosh2(S[1] ^ S[2])) { - case 0: S[3] ^= Rr(S[0], 2 + r0); break; - case 1: S[3] += Rr(~S[0], 2 + r1); break; - case 2: S[3] &= Rr(~S[0], 2 + r2); break; - case 3: S[3] ^= Rr(S[0], 2 + r3); break; - } - - /* Compress */ - S[0] ^= (S[1] ^ S[2]) + S[3]; -} - -static void Absorb(uint32_t *S, uint32_t X) -{ - uint32_t *R 
= S; - R[0] ^= X; - Mangle(S); -} - -static uint32_t Squeeze(uint32_t *S) -{ - uint32_t Y = S[0]; - Mangle(S); - return Y; -} - -/* Branch, compress and serialize function */ -static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) -{ - uint32_t R = Squeeze(ctx->sponge); - - uint8_t r0 = R >> 8; - uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); - - switch (r1 % 4) - { - case 0: - /* Do nothing */ - break; - case 1: - return X & ~Y; - case 2: - return X | Y; - case 3: - return X ^ Y; - } - - return X; -} - -static void HashBlock(HEFTY1_CTX *ctx) -{ - uint32_t A, B, C, D, E, F, G, H; - uint32_t W[HEFTY1_BLOCK_BYTES]; - int t; - - assert(ctx); - - A = ctx->h[0]; - B = ctx->h[1]; - C = ctx->h[2]; - D = ctx->h[3]; - E = ctx->h[4]; - F = ctx->h[5]; - G = ctx->h[6]; - H = ctx->h[7]; - - t = 0; - for (; t < 16; t++) { - W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ - Absorb(ctx->sponge, W[t] ^ K[t]); - } - - for (t = 0; t < 16; t++) { - Absorb(ctx->sponge, D ^ H); - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - for (t = 16; t < 64; t++) { - Absorb(ctx->sponge, H + D); - W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - - ctx->h[0] += A; - ctx->h[1] += B; - ctx->h[2] += C; - ctx->h[3] += D; - ctx->h[4] += E; - ctx->h[5] += F; - ctx->h[6] += G; - ctx->h[7] += H; - - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); -} - -/* Public interface */ - -void HEFTY1_Init(HEFTY1_CTX *ctx) -{ - assert(ctx); - - memcpy(ctx->h, H, sizeof(ctx->h)); - memset(ctx->block, 0, sizeof(ctx->block)); - ctx->written = 0; - memset(ctx->sponge, 0, sizeof(ctx->sponge)); -} - -void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) -{ - uint64_t read; - assert(ctx); - - read = 0; - while (len) { - uint64_t end = ctx->written % HEFTY1_BLOCK_BYTES; - uint64_t count = Min(len, HEFTY1_BLOCK_BYTES - end); - 
memcpy(&ctx->block[end], &((unsigned char *)buf)[read], (size_t)count); - len -= (size_t)count; - read += count; - ctx->written += count; - if (!(ctx->written % HEFTY1_BLOCK_BYTES)) - HashBlock(ctx); - } -} - -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) -{ - uint64_t used; - uint64_t *len; - int i; - assert(digest); - assert(ctx); - - /* Pad message (FIPS 180 Section 5.1.1) */ - used = ctx->written % HEFTY1_BLOCK_BYTES; - ctx->block[used++] = 0x80; /* Append 1 to end of message */ - if (used > HEFTY1_BLOCK_BYTES - 8) { - /* We have already written into the last 64bits, so - * we must continue into the next block. */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - (size_t)used); - HashBlock(ctx); - used = 0; /* Create a new block (below) */ - } - - /* All remaining bits to zero */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - (size_t)used); - - /* The last 64bits encode the length (in network byte order) */ - len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; - *len = Reverse64(ctx->written*8); - - HashBlock(ctx); - - /* Convert back to network byte order */ - i = 0; - for (; i < HEFTY1_STATE_WORDS; i++) - ctx->h[i] = Reverse32(ctx->h[i]); - - memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); -} - -unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) -{ - HEFTY1_CTX ctx; - static unsigned char m[HEFTY1_DIGEST_BYTES]; - - if (!digest) - digest = m; - - HEFTY1_Init(&ctx); - HEFTY1_Update(&ctx, buf, len); - HEFTY1_Final(digest, &ctx); - - return digest; -} +/* + * HEFTY1 CPU-only cryptographic hash function + * + * Copyright (c) 2014, dbcc14 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the FreeBSD Project. + */ + +#include +#include + +#include "hefty1.h" + +#ifdef WIN32 +#define inline __inline +#endif + +#define Min(A, B) (A <= B ? A : B) + +#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ + { \ + /* To thwart parallelism, Br modifies itself each time it's \ + * called. This also means that calling it in different \ + * orders yeilds different results. In C the order of \ + * evaluation of function arguments and + operands are \ + * unspecified (and depends on the compiler), so we must make \ + * the order of Br calls explicit. 
\ + */ \ + uint32_t brG = Br(ctx, G); \ + uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ + uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ + uint32_t brC = Br(ctx, C); \ + uint32_t brB = Br(ctx, B); \ + uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ + uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ + H = G; \ + G = F; \ + F = E; \ + E = D + Br(ctx, tmp2); \ + D = C; \ + C = B; \ + B = A; \ + A = tmp2 + tmp4; \ + } \ + +/* Nothing up my sleeve constants */ +const static uint32_t K[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + +/* Initial hash values */ +const static uint32_t H[HEFTY1_STATE_WORDS] = { + 0x6a09e667UL, + 0xbb67ae85UL, + 0x3c6ef372UL, + 0xa54ff53aUL, + 0x510e527fUL, + 0x9b05688cUL, + 0x1f83d9abUL, + 0x5be0cd19UL +}; + +static inline uint32_t Rr(uint32_t X, uint8_t n) +{ + return (X >> n) | (X << (32 - n)); +} + +static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) +{ + return (E & F) ^ (~E & G); +} + +static inline uint32_t Sigma1(uint32_t E) +{ + return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); +} + +static inline uint32_t sigma1(uint32_t X) +{ + return Rr(X, 17) ^ Rr(X, 19) 
^ (X >> 10); +} + +static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) +{ + return (A & B) ^ (A & C) ^ (B & C); +} + +static inline uint32_t Sigma0(uint32_t A) +{ + return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); +} + +static inline uint32_t sigma0(uint32_t X) +{ + return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); +} + +static inline uint32_t Reverse32(uint32_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; + #else + return n; + #endif +} + +static inline uint64_t Reverse64(uint64_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + uint32_t a = n >> 32; + uint32_t b = (n << 32) >> 32; + + return (uint64_t)Reverse32(b) << 32 | Reverse32(a); + #else + return n; + #endif +} + +/* Smoosh byte into nibble */ +static inline uint8_t Smoosh4(uint8_t X) +{ + return (X >> 4) ^ (X & 0xf); +} + +/* Smoosh 32-bit word into 2-bits */ +static inline uint8_t Smoosh2(uint32_t X) +{ + uint16_t w = (X >> 16) ^ (X & 0xffff); + uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); + return (n >> 2) ^ (n & 0x3); +} +#include +static void Mangle(uint32_t *S) +{ + uint8_t r0 = Smoosh4(S[0] >> 24); + uint8_t r1 = Smoosh4(S[0] >> 16); + uint8_t r2 = Smoosh4(S[0] >> 8); + uint8_t r3 = Smoosh4(S[0] & 0xff); + + /* Diffuse */ + S[1] ^= Rr(S[0], r0); + switch (Smoosh2(S[1])) { + case 0: S[2] ^= Rr(S[0], 1 + r0); break; + case 1: S[2] += Rr(~S[0], 1 + r1); break; + case 2: S[2] &= Rr(~S[0], 1 + r2); break; + case 3: S[2] ^= Rr(S[0], 1 + r3); break; + } + switch (Smoosh2(S[1] ^ S[2])) { + case 0: S[3] ^= Rr(S[0], 2 + r0); break; + case 1: S[3] += Rr(~S[0], 2 + r1); break; + case 2: S[3] &= Rr(~S[0], 2 + r2); break; + case 3: S[3] ^= Rr(S[0], 2 + r3); break; + } + + /* Compress */ + S[0] ^= (S[1] ^ S[2]) + S[3]; +} + +static void Absorb(uint32_t *S, uint32_t X) +{ + uint32_t *R = S; + R[0] ^= X; + Mangle(S); +} + +static uint32_t Squeeze(uint32_t *S) +{ + uint32_t Y = S[0]; + Mangle(S); + return Y; +} + +/* Branch, compress and serialize function */ 
+static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) +{ + uint32_t R = Squeeze(ctx->sponge); + + uint8_t r0 = R >> 8; + uint8_t r1 = R & 0xff; + + uint32_t Y = 1 << (r0 % 32); + + switch (r1 % 4) + { + case 0: + /* Do nothing */ + break; + case 1: + return X & ~Y; + case 2: + return X | Y; + case 3: + return X ^ Y; + } + + return X; +} + +static void HashBlock(HEFTY1_CTX *ctx) +{ + uint32_t A, B, C, D, E, F, G, H; + uint32_t W[HEFTY1_BLOCK_BYTES]; + int t; + + assert(ctx); + + A = ctx->h[0]; + B = ctx->h[1]; + C = ctx->h[2]; + D = ctx->h[3]; + E = ctx->h[4]; + F = ctx->h[5]; + G = ctx->h[6]; + H = ctx->h[7]; + + t = 0; + for (; t < 16; t++) { + W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ + Absorb(ctx->sponge, W[t] ^ K[t]); + } + + for (t = 0; t < 16; t++) { + Absorb(ctx->sponge, D ^ H); + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + for (t = 16; t < 64; t++) { + Absorb(ctx->sponge, H + D); + W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + + ctx->h[0] += A; + ctx->h[1] += B; + ctx->h[2] += C; + ctx->h[3] += D; + ctx->h[4] += E; + ctx->h[5] += F; + ctx->h[6] += G; + ctx->h[7] += H; + + A = 0; + B = 0; + C = 0; + D = 0; + E = 0; + F = 0; + G = 0; + H = 0; + + memset(W, 0, sizeof(W)); +} + +/* Public interface */ + +void HEFTY1_Init(HEFTY1_CTX *ctx) +{ + assert(ctx); + + memcpy(ctx->h, H, sizeof(ctx->h)); + memset(ctx->block, 0, sizeof(ctx->block)); + ctx->written = 0; + memset(ctx->sponge, 0, sizeof(ctx->sponge)); +} + +void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) +{ + uint64_t read; + assert(ctx); + + read = 0; + while (len) { + uint64_t end = ctx->written % HEFTY1_BLOCK_BYTES; + uint64_t count = Min(len, HEFTY1_BLOCK_BYTES - end); + memcpy(&ctx->block[end], &((unsigned char *)buf)[read], (size_t)count); + len -= (size_t)count; + read += count; + ctx->written += count; + if (!(ctx->written % HEFTY1_BLOCK_BYTES)) + 
HashBlock(ctx); + } +} + +void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) +{ + uint64_t used; + uint64_t *len; + int i; + assert(digest); + assert(ctx); + + /* Pad message (FIPS 180 Section 5.1.1) */ + used = ctx->written % HEFTY1_BLOCK_BYTES; + ctx->block[used++] = 0x80; /* Append 1 to end of message */ + if (used > HEFTY1_BLOCK_BYTES - 8) { + /* We have already written into the last 64bits, so + * we must continue into the next block. */ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - (size_t)used); + HashBlock(ctx); + used = 0; /* Create a new block (below) */ + } + + /* All remaining bits to zero */ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - (size_t)used); + + /* The last 64bits encode the length (in network byte order) */ + len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; + *len = Reverse64(ctx->written*8); + + HashBlock(ctx); + + /* Convert back to network byte order */ + i = 0; + for (; i < HEFTY1_STATE_WORDS; i++) + ctx->h[i] = Reverse32(ctx->h[i]); + + memcpy(digest, ctx->h, sizeof(ctx->h)); + memset(ctx, 0, sizeof(HEFTY1_CTX)); +} + +unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) +{ + HEFTY1_CTX ctx; + static unsigned char m[HEFTY1_DIGEST_BYTES]; + + if (!digest) + digest = m; + + HEFTY1_Init(&ctx); + HEFTY1_Update(&ctx, buf, len); + HEFTY1_Final(digest, &ctx); + + return digest; +} diff --git a/hefty1.h b/hefty1.h index 29939e8..6610fbd 100644 --- a/hefty1.h +++ b/hefty1.h @@ -1,66 +1,66 @@ -/* - * HEFTY1 CPU-only cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. 
- */ - -#ifndef __HEFTY1_H__ -#define __HEFTY1_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef WIN32 -#include -#endif - -#include - -#define HEFTY1_DIGEST_BYTES 32 -#define HEFTY1_BLOCK_BYTES 64 -#define HEFTY1_STATE_WORDS 8 -#define HEFTY1_SPONGE_WORDS 4 - -typedef struct HEFTY1_CTX { - uint32_t h[HEFTY1_STATE_WORDS]; - uint8_t block[HEFTY1_BLOCK_BYTES]; - uint64_t written; - uint32_t sponge[HEFTY1_SPONGE_WORDS]; -} HEFTY1_CTX; - -void HEFTY1_Init(HEFTY1_CTX *cxt); -void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); -unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); - -#ifdef __cplusplus -} -#endif - -#endif /* __HEFTY1_H__ */ +/* + * HEFTY1 CPU-only cryptographic hash function + * + * Copyright (c) 2014, dbcc14 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the FreeBSD Project. + */ + +#ifndef __HEFTY1_H__ +#define __HEFTY1_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WIN32 +#include +#endif + +#include + +#define HEFTY1_DIGEST_BYTES 32 +#define HEFTY1_BLOCK_BYTES 64 +#define HEFTY1_STATE_WORDS 8 +#define HEFTY1_SPONGE_WORDS 4 + +typedef struct HEFTY1_CTX { + uint32_t h[HEFTY1_STATE_WORDS]; + uint8_t block[HEFTY1_BLOCK_BYTES]; + uint64_t written; + uint32_t sponge[HEFTY1_SPONGE_WORDS]; +} HEFTY1_CTX; + +void HEFTY1_Init(HEFTY1_CTX *cxt); +void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); +void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); +unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); + +#ifdef __cplusplus +} +#endif + +#endif /* __HEFTY1_H__ */ diff --git a/miner.h b/miner.h index 6c20d80..ffea67c 100644 --- a/miner.h +++ b/miner.h @@ -1,321 +1,329 @@ -#ifndef __MINER_H__ -#define __MINER_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include - -#ifdef WIN32 -#define snprintf(...) 
_snprintf(__VA_ARGS__) -#define strdup(x) _strdup(x) -#define strncasecmp(x,y,z) _strnicmp(x,y,z) -#define strcasecmp(x,y) _stricmp(x,y) -typedef int ssize_t; -#endif - -#ifdef STDC_HEADERS -# include -# include -#else -# ifdef HAVE_STDLIB_H -# include -# endif -#endif -#ifdef HAVE_ALLOCA_H -# include -#elif !defined alloca -# ifdef __GNUC__ -# define alloca __builtin_alloca -# elif defined _AIX -# define alloca __alloca -# elif defined _MSC_VER -# include -# define alloca _alloca -# elif !defined HAVE_ALLOCA -# ifdef __cplusplus -extern "C" -# endif -void *alloca (size_t); -# endif -#endif - -#ifdef HAVE_SYSLOG_H -#include -#else -enum { - LOG_ERR, - LOG_WARNING, - LOG_NOTICE, - LOG_INFO, - LOG_DEBUG, -}; -#endif - -#undef unlikely -#undef likely -#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) -#define unlikely(expr) (__builtin_expect(!!(expr), 0)) -#define likely(expr) (__builtin_expect(!!(expr), 1)) -#else -#define unlikely(expr) (expr) -#define likely(expr) (expr) -#endif - -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) -#endif - -#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) -#define WANT_BUILTIN_BSWAP -#else -#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ - | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) -#endif - -static inline uint32_t swab32(uint32_t v) -{ -#ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); -#else - return bswap_32(v); -#endif -} - -#ifdef HAVE_SYS_ENDIAN_H -#include -#endif - -#if !HAVE_DECL_BE32DEC -static inline uint32_t be32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); -} -#endif - -#if !HAVE_DECL_LE32DEC -static inline uint32_t le32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + 
((uint32_t)(p[3]) << 24)); -} -#endif - -#if !HAVE_DECL_BE32ENC -static inline void be32enc(void *pp, uint32_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; -} -#endif - -#if !HAVE_DECL_LE32ENC -static inline void le32enc(void *pp, uint32_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} -#endif - -#if !HAVE_DECL_BE16DEC -static inline uint16_t be16dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint16_t)(p[1]) + ((uint16_t)(p[0]) << 8)); -} -#endif - -#if !HAVE_DECL_BE16ENC -static inline void be16enc(void *pp, uint16_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[1] = x & 0xff; - p[0] = (x >> 8) & 0xff; -} -#endif - -#if !HAVE_DECL_LE16DEC -static inline uint16_t le16dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8)); -} -#endif - -#if !HAVE_DECL_LE16ENC -static inline void le16enc(void *pp, uint16_t x) -{ - uint8_t *p = (uint8_t *)pp; - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; -} -#endif - -#if JANSSON_MAJOR_VERSION >= 2 -#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr)) -#else -#define JSON_LOADS(str, err_ptr) json_loads((str), (err_ptr)) -#endif - -#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION - -void sha256_init(uint32_t *state); -void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); - -#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) -#define HAVE_SHA256_4WAY 0 -int sha256_use_4way(); -void sha256_init_4way(uint32_t *state); -void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); -#endif - -#if defined(__x86_64__) && defined(USE_AVX2) -#define HAVE_SHA256_8WAY 0 -int sha256_use_8way(); -void sha256_init_8way(uint32_t *state); -void 
sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); -#endif - -extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); - -extern unsigned char *scrypt_buffer_alloc(); - -extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); - -extern int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote); - -extern int scanhash_fugue256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); - -extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); - -extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); -extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len); - -struct thr_info { - int id; - pthread_t pth; - struct thread_q *q; -}; - -struct work_restart { - volatile unsigned long restart; - char padding[128 - sizeof(unsigned long)]; -}; - -extern bool opt_debug; -extern bool opt_protocol; -extern int opt_timeout; -extern bool want_longpoll; -extern bool have_longpoll; -extern bool want_stratum; -extern bool have_stratum; -extern char *opt_cert; -extern char *opt_proxy; -extern long opt_proxy_type; -extern bool use_syslog; -extern pthread_mutex_t applog_lock; -extern struct thr_info *thr_info; -extern int longpoll_thr_id; -extern int stratum_thr_id; -extern struct work_restart *work_restart; -extern bool opt_trust_pool; -extern uint16_t opt_vote; - -extern void applog(int prio, const char *fmt, ...); -extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, - const char *rpc_req, bool, 
bool, int *); -extern char *bin2hex(const unsigned char *p, size_t len); -extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); -extern int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y); -extern bool fulltest(const uint32_t *hash, const uint32_t *target); -extern void diff_to_target(uint32_t *target, double diff); - -struct stratum_job { - char *job_id; - unsigned char prevhash[32]; - size_t coinbase_size; - unsigned char *coinbase; - unsigned char *xnonce2; - int merkle_count; - unsigned char **merkle; - unsigned char version[4]; - unsigned char nbits[4]; - unsigned char ntime[4]; - bool clean; - unsigned char nreward[2]; - double diff; -}; - -struct stratum_ctx { - char *url; - - CURL *curl; - char *curl_url; - char curl_err_str[CURL_ERROR_SIZE]; - curl_socket_t sock; - size_t sockbuf_size; - char *sockbuf; - pthread_mutex_t sock_lock; - - double next_diff; - - char *session_id; - size_t xnonce1_size; - unsigned char *xnonce1; - size_t xnonce2_size; - struct stratum_job job; - pthread_mutex_t work_lock; -}; - -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); -bool stratum_send_line(struct stratum_ctx *sctx, char *s); -char *stratum_recv_line(struct stratum_ctx *sctx); -bool stratum_connect(struct stratum_ctx *sctx, const char *url); -void stratum_disconnect(struct stratum_ctx *sctx); -bool stratum_subscribe(struct stratum_ctx *sctx); -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); -bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); - -struct thread_q; - -extern struct thread_q *tq_new(void); -extern void tq_free(struct thread_q *tq); -extern bool tq_push(struct thread_q *tq, void *data); -extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); -extern void tq_freeze(struct thread_q *tq); -extern void tq_thaw(struct thread_q *tq); - -#ifdef __cplusplus -} -#endif - -#endif /* __MINER_H__ */ +#ifndef __MINER_H__ +#define 
__MINER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cpuminer-config.h" + +#include +#include +#include +#include +#include +#include + +#ifdef WIN32 +#define snprintf(...) _snprintf(__VA_ARGS__) +#define strdup(x) _strdup(x) +#define strncasecmp(x,y,z) _strnicmp(x,y,z) +#define strcasecmp(x,y) _stricmp(x,y) +typedef int ssize_t; +#endif + +#ifdef STDC_HEADERS +# include +# include +#else +# ifdef HAVE_STDLIB_H +# include +# endif +#endif +#ifdef HAVE_ALLOCA_H +# include +#elif !defined alloca +# ifdef __GNUC__ +# define alloca __builtin_alloca +# elif defined _AIX +# define alloca __alloca +# elif defined _MSC_VER +# include +# define alloca _alloca +# elif !defined HAVE_ALLOCA +# ifdef __cplusplus +extern "C" +# endif +void *alloca (size_t); +# endif +#endif + +#ifdef HAVE_SYSLOG_H +#include +#else +enum { + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, +}; +#endif + +#undef unlikely +#undef likely +#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) +#define unlikely(expr) (__builtin_expect(!!(expr), 0)) +#define likely(expr) (__builtin_expect(!!(expr), 1)) +#else +#define unlikely(expr) (expr) +#define likely(expr) (expr) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +#define WANT_BUILTIN_BSWAP +#else +#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + +static inline uint32_t swab32(uint32_t v) +{ +#ifdef WANT_BUILTIN_BSWAP + return __builtin_bswap32(v); +#else + return bswap_32(v); +#endif +} + +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +#if !HAVE_DECL_BE32DEC +static inline uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32DEC 
+static inline uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static inline void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_LE32ENC +static inline void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE16DEC +static inline uint16_t be16dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint16_t)(p[1]) + ((uint16_t)(p[0]) << 8)); +} +#endif + +#if !HAVE_DECL_BE16ENC +static inline void be16enc(void *pp, uint16_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[1] = x & 0xff; + p[0] = (x >> 8) & 0xff; +} +#endif + +#if !HAVE_DECL_LE16DEC +static inline uint16_t le16dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint16_t)(p[0]) + ((uint16_t)(p[1]) << 8)); +} +#endif + +#if !HAVE_DECL_LE16ENC +static inline void le16enc(void *pp, uint16_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; +} +#endif + +#if JANSSON_MAJOR_VERSION >= 2 +#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr)) +#else +#define JSON_LOADS(str, err_ptr) json_loads((str), (err_ptr)) +#endif + +#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION + +void sha256_init(uint32_t *state); +void sha256_transform(uint32_t *state, const uint32_t *block, int swap); +void sha256d(unsigned char *hash, const unsigned char *data, int len); + +#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) +#define HAVE_SHA256_4WAY 0 +int sha256_use_4way(); +void sha256_init_4way(uint32_t *state); +void sha256_transform_4way(uint32_t *state, const uint32_t 
*block, int swap); +#endif + +#if defined(__x86_64__) && defined(USE_AVX2) +#define HAVE_SHA256_8WAY 0 +int sha256_use_8way(); +void sha256_init_8way(uint32_t *state); +void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); +#endif + +extern int scanhash_sha256d(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern unsigned char *scrypt_buffer_alloc(); + +extern int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_heavy(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote); + +extern int scanhash_fugue256(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern int scanhash_myriad(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern int scanhash_jackpot(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len); +extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); +extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len); + +struct thr_info { + int id; + pthread_t pth; + struct thread_q *q; +}; + +struct work_restart { + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; +}; + +extern bool opt_debug; +extern bool opt_protocol; +extern int opt_timeout; +extern bool want_longpoll; +extern bool have_longpoll; +extern bool want_stratum; +extern bool have_stratum; +extern char *opt_cert; +extern char 
*opt_proxy; +extern long opt_proxy_type; +extern bool use_syslog; +extern pthread_mutex_t applog_lock; +extern struct thr_info *thr_info; +extern int longpoll_thr_id; +extern int stratum_thr_id; +extern struct work_restart *work_restart; +extern bool opt_trust_pool; +extern uint16_t opt_vote; + +extern void applog(int prio, const char *fmt, ...); +extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, + const char *rpc_req, bool, bool, int *); +extern char *bin2hex(const unsigned char *p, size_t len); +extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +extern int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y); +extern bool fulltest(const uint32_t *hash, const uint32_t *target); +extern void diff_to_target(uint32_t *target, double diff); + +struct stratum_job { + char *job_id; + unsigned char prevhash[32]; + size_t coinbase_size; + unsigned char *coinbase; + unsigned char *xnonce2; + int merkle_count; + unsigned char **merkle; + unsigned char version[4]; + unsigned char nbits[4]; + unsigned char ntime[4]; + bool clean; + unsigned char nreward[2]; + double diff; +}; + +struct stratum_ctx { + char *url; + + CURL *curl; + char *curl_url; + char curl_err_str[CURL_ERROR_SIZE]; + curl_socket_t sock; + size_t sockbuf_size; + char *sockbuf; + pthread_mutex_t sock_lock; + + double next_diff; + + char *session_id; + size_t xnonce1_size; + unsigned char *xnonce1; + size_t xnonce2_size; + struct stratum_job job; + pthread_mutex_t work_lock; +}; + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); +bool stratum_send_line(struct stratum_ctx *sctx, char *s); +char *stratum_recv_line(struct stratum_ctx *sctx); +bool stratum_connect(struct stratum_ctx *sctx, const char *url); +void stratum_disconnect(struct stratum_ctx *sctx); +bool stratum_subscribe(struct stratum_ctx *sctx); +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool 
stratum_handle_method(struct stratum_ctx *sctx, const char *s); + +struct thread_q; + +extern struct thread_q *tq_new(void); +extern void tq_free(struct thread_q *tq); +extern bool tq_push(struct thread_q *tq, void *data); +extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); +extern void tq_freeze(struct thread_q *tq); +extern void tq_thaw(struct thread_q *tq); + +#ifdef __cplusplus +} +#endif + +#endif /* __MINER_H__ */ diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 4b2a231..6ccecce 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,106 +1,106 @@ -#include "uint256.h" -#include "sph/sph_groestl.h" - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include - -extern bool opt_benchmark; - -void myriadgroestl_cpu_init(int thr_id, int threads); -void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -static void myriadhash(void *state, const void *input) -{ - sph_groestl512_context ctx_groestl; - - uint32_t hashA[16], hashB[16]; - - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, input, 80); - sph_groestl512_close(&ctx_groestl, hashA); - - SHA256_CTX sha256; - SHA256_Init(&sha256); - SHA256_Update(&sha256,(unsigned char *)hashA, 64); - SHA256_Final((unsigned char *)hashB, &sha256); - memcpy(state, hashB, 32); -} - - - -extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t start_nonce = pdata[19]++; - const uint32_t throughPut = 128 * 1024; -// const uint32_t throughPut = 1; - uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); - - // TODO: entfernen für eine Release! Ist nur zum Testen! 
- if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; - - // init - static bool init[8] = { false, false, false, false, false, false, false, false }; - if(!init[thr_id]) - { -#if BIG_DEBUG -#else - myriadgroestl_cpu_init(thr_id, throughPut); -#endif - init[thr_id] = true; - } - - uint32_t endiandata[32]; - for (int kk=0; kk < 32; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - - myriadgroestl_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); - - if(foundNounce < 0xffffffff) - { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(foundNounce); - myriadhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && - fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce; - free(outputHash); - return true; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); - } - - foundNounce = 0xffffffff; - } - - if (pdata[19] + throughPut < pdata[19]) - pdata[19] = max_nonce; - else pdata[19] += throughPut; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - start_nonce; - free(outputHash); - return 0; -} - +#include "uint256.h" +#include "sph/sph_groestl.h" + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +extern bool opt_benchmark; + +void myriadgroestl_cpu_init(int thr_id, int threads); +void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); +void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); + +#define SWAP32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +static void 
myriadhash(void *state, const void *input) +{ + sph_groestl512_context ctx_groestl; + + uint32_t hashA[16], hashB[16]; + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, input, 80); + sph_groestl512_close(&ctx_groestl, hashA); + + SHA256_CTX sha256; + SHA256_Init(&sha256); + SHA256_Update(&sha256,(unsigned char *)hashA, 64); + SHA256_Final((unsigned char *)hashB, &sha256); + memcpy(state, hashB, 32); +} + + + +extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t start_nonce = pdata[19]++; + const uint32_t throughPut = 128 * 1024; +// const uint32_t throughPut = 1; + uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); + + // TODO: entfernen für eine Release! Ist nur zum Testen! + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + const uint32_t Htarg = ptarget[7]; + + // init + static bool init[8] = { false, false, false, false, false, false, false, false }; + if(!init[thr_id]) + { +#if BIG_DEBUG +#else + myriadgroestl_cpu_init(thr_id, throughPut); +#endif + init[thr_id] = true; + } + + uint32_t endiandata[32]; + for (int kk=0; kk < 32; kk++) + be32enc(&endiandata[kk], pdata[kk]); + + // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) + myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + // GPU + uint32_t foundNounce = 0xFFFFFFFF; + + myriadgroestl_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); + + if(foundNounce < 0xffffffff) + { + uint32_t tmpHash[8]; + endiandata[19] = SWAP32(foundNounce); + myriadhash(tmpHash, endiandata); + if (tmpHash[7] <= Htarg && + fulltest(tmpHash, ptarget)) { + pdata[19] = foundNounce; + *hashes_done = foundNounce - start_nonce; + free(outputHash); + return true; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + } + + foundNounce = 0xffffffff; + } 
+ + if (pdata[19] + throughPut < pdata[19]) + pdata[19] = max_nonce; + else pdata[19] += throughPut; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; + free(outputHash); + return 0; +} + diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu index b80da04..c4052f2 100644 --- a/quark/cuda_quark_checkhash.cu +++ b/quark/cuda_quark_checkhash.cu @@ -1,107 +1,107 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// das Hash Target gegen das wir testen sollen -__constant__ uint32_t pTarget[8]; - -uint32_t *d_resNounce[8]; -uint32_t *h_resNounce[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -__global__ void quark_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[16 * hashPosition]; - - uint32_t hash[8]; -#pragma unroll 8 - for (int i=0; i < 8; i++) - hash[i] = inpHash[i]; - - // kopiere Ergebnis - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (hash[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (hash[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -// Setup-Funktionen -__host__ void quark_check_cpu_init(int thr_id, int threads) -{ - cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)); - cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)); -} - -// Target Difficulty setzen -__host__ void quark_check_cpu_setTarget(const void *ptarget) -{ - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); -} - -__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) -{ - uint32_t result = 0xffffffff; - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); - - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - quark_check_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); - - // Ergebnis zum Host kopieren (in page locked memory, damits schneller geht) - 
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - // cudaMemcpy() ist asynchron! - cudaThreadSynchronize(); - result = *h_resNounce[thr_id]; - - return result; -} +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include +#include + +// Folgende Definitionen später durch header ersetzen +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +// das Hash Target gegen das wir testen sollen +__constant__ uint32_t pTarget[8]; + +uint32_t *d_resNounce[8]; +uint32_t *h_resNounce[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +__global__ void quark_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // bestimme den aktuellen Zähler + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *inpHash = &g_hash[16 * hashPosition]; + + uint32_t hash[8]; +#pragma unroll 8 + for (int i=0; i < 8; i++) + hash[i] = inpHash[i]; + + // kopiere Ergebnis + int i, position = -1; + bool rc = true; + +#pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (hash[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (hash[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc == true) + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +// Setup-Funktionen +__host__ void quark_check_cpu_init(int thr_id, int threads) +{ + cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)); + cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)); +} + +// Target Difficulty setzen +__host__ void quark_check_cpu_setTarget(const void *ptarget) +{ + // die Message zur Berechnung auf der GPU + cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} + +__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +{ + uint32_t result = 0xffffffff; + cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + + const int threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; + +// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + quark_check_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, order, thr_id); + + // Ergebnis zum Host kopieren (in page locked memory, damits schneller geht) + 
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + // cudaMemcpy() ist asynchron! + cudaThreadSynchronize(); + result = *h_resNounce[thr_id]; + + return result; +} diff --git a/scrypt.c b/scrypt.c index 5efd0e2..2ff2902 100644 --- a/scrypt.c +++ b/scrypt.c @@ -1,756 +1,756 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. 
- */ - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - 
sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#if HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t 
finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] 
__attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#if HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t 
ostate2[8 * 8] __attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef 
SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 21 -#define HAVE_SCRYPT_6WAY 0 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -static void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#endif - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. 
*/ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] 
__attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - 
PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif 
/* HAVE_SCRYPT_3WAY */ - -#if HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int 
scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - int throughput = scrypt_best_throughput(); - int i; - -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include +#include + +static const uint32_t keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32_t innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32_t outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32_t finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + sha256_transform(tstate, pad, 0); + memcpy(ihash, tstate, 32); + + sha256_init(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform(ostate, pad, 0); + + sha256_init(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transform(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 32); + sha256_transform(istate, salt, 0); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + sha256_transform(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 32); + sha256_transform(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, + const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[16]; + int i; + + sha256_transform(tstate, salt, 1); + 
sha256_transform(tstate, salt + 16, 1); + sha256_transform(tstate, finalblk, 0); + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform(ostate, buf, 0); + for (i = 0; i < 8; i++) + output[i] = swab32(ostate[i]); +} + + +#if HAVE_SHA256_4WAY + +static const uint32_t keypad_4way[4 * 12] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000280, 0x00000280, 0x00000280, 0x00000280 +}; +static const uint32_t innerpad_4way[4 * 11] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 +}; +static const uint32_t outerpad_4way[4 * 8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000300, 0x00000300, 0x00000300, 0x00000300 +}; +static const uint32_t 
finalblk_4way[4 * 16] __attribute__((aligned(16))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[4 * 8] __attribute__((aligned(16))); + uint32_t pad[4 * 16] __attribute__((aligned(16))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 4 * 16, 4 * 16); + memcpy(pad + 4 * 4, keypad_4way, 4 * 48); + sha256_transform_4way(tstate, pad, 0); + memcpy(ihash, tstate, 4 * 32); + + sha256_init_4way(ostate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 4 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_4way(ostate, pad, 0); + + sha256_init_4way(tstate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 4 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_4way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[4 * 8] __attribute__((aligned(16))); + uint32_t ostate2[4 * 8] __attribute__((aligned(16))); + uint32_t ibuf[4 * 16] __attribute__((aligned(16))); + uint32_t obuf[4 * 16] 
__attribute__((aligned(16))); + int i, j; + + memcpy(istate, tstate, 4 * 32); + sha256_transform_4way(istate, salt, 0); + + memcpy(ibuf, salt + 4 * 16, 4 * 16); + memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); + memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4 * 32); + ibuf[4 * 4 + 0] = i + 1; + ibuf[4 * 4 + 1] = i + 1; + ibuf[4 * 4 + 2] = i + 1; + ibuf[4 * 4 + 3] = i + 1; + sha256_transform_4way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 4 * 32); + sha256_transform_4way(ostate2, obuf, 0); + for (j = 0; j < 4 * 8; j++) + output[4 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[4 * 16] __attribute__((aligned(16))); + int i; + + sha256_transform_4way(tstate, salt, 1); + sha256_transform_4way(tstate, salt + 4 * 16, 1); + sha256_transform_4way(tstate, finalblk_4way, 0); + memcpy(buf, tstate, 4 * 32); + memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); + + sha256_transform_4way(ostate, buf, 0); + for (i = 0; i < 4 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_4WAY */ + + +#if HAVE_SHA256_8WAY + +static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8 * 8] __attribute__((aligned(32))); + uint32_t pad[8 * 16] __attribute__((aligned(32))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + pad[8 * 4 + i] = 0x80000000; + memset(pad + 8 * 5, 0x00, 8 * 40); + for (i = 0; i < 8; i++) + pad[8 * 15 + i] = 0x00000280; + sha256_transform_8way(tstate, pad, 0); + memcpy(ihash, tstate, 8 * 32); + + sha256_init_8way(ostate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 8 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_8way(ostate, pad, 0); + + sha256_init_8way(tstate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 8 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_8way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8 * 8] __attribute__((aligned(32))); + uint32_t 
ostate2[8 * 8] __attribute__((aligned(32))); + uint32_t ibuf[8 * 16] __attribute__((aligned(32))); + uint32_t obuf[8 * 16] __attribute__((aligned(32))); + int i, j; + + memcpy(istate, tstate, 8 * 32); + sha256_transform_8way(istate, salt, 0); + + memcpy(ibuf, salt + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + ibuf[8 * 5 + i] = 0x80000000; + memset(ibuf + 8 * 6, 0x00, 8 * 36); + for (i = 0; i < 8; i++) + ibuf[8 * 15 + i] = 0x000004a0; + + for (i = 0; i < 8; i++) + obuf[8 * 8 + i] = 0x80000000; + memset(obuf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + obuf[8 * 15 + i] = 0x00000300; + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 8 * 32); + ibuf[8 * 4 + 0] = i + 1; + ibuf[8 * 4 + 1] = i + 1; + ibuf[8 * 4 + 2] = i + 1; + ibuf[8 * 4 + 3] = i + 1; + ibuf[8 * 4 + 4] = i + 1; + ibuf[8 * 4 + 5] = i + 1; + ibuf[8 * 4 + 6] = i + 1; + ibuf[8 * 4 + 7] = i + 1; + sha256_transform_8way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 8 * 32); + sha256_transform_8way(ostate2, obuf, 0); + for (j = 0; j < 8 * 8; j++) + output[8 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[8 * 16] __attribute__((aligned(32))); + int i; + + sha256_transform_8way(tstate, salt, 1); + sha256_transform_8way(tstate, salt + 8 * 16, 1); + sha256_transform_8way(tstate, finalblk_8way, 0); + + memcpy(buf, tstate, 8 * 32); + for (i = 0; i < 8; i++) + buf[8 * 8 + i] = 0x80000000; + memset(buf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + buf[8 * 15 + i] = 0x00000300; + sha256_transform_8way(ostate, buf, 0); + + for (i = 0; i < 8 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_8WAY */ + + +#if defined(__x86_64__) + +#define SCRYPT_MAX_WAYS 1 +#define HAVE_SCRYPT_3WAY 0 +#define scrypt_best_throughput() 1 +static void scrypt_core(uint32_t *X, uint32_t *V); +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#if defined(USE_AVX2) +#undef 
SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 21 +#define HAVE_SCRYPT_6WAY 0 +void scrypt_core_6way(uint32_t *X, uint32_t *V); +#endif + +#elif defined(__i386__) + +#define SCRYPT_MAX_WAYS 1 +#define scrypt_best_throughput() 1 +static void scrypt_core(uint32_t *X, uint32_t *V); + +#elif defined(__arm__) && defined(__APCS_32__) + +static void scrypt_core(uint32_t *X, uint32_t *V); +#if defined(__ARM_NEON__) +#undef HAVE_SHA256_4WAY +#define SCRYPT_MAX_WAYS 1 +#define HAVE_SCRYPT_3WAY 0 +#define scrypt_best_throughput() 1 +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#endif + +#endif + +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + int i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); + x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); + x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); + x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); + x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. 
*/ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); + x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); + x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); + x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); + x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +static inline void scrypt_core(uint32_t *X, uint32_t *V) +{ + uint32_t i, j, k; + + for (i = 0; i < 1024; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i++) { + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + +#ifndef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 1 +#define scrypt_best_throughput() 1 +#endif + +#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) + +unsigned char *scrypt_buffer_alloc() +{ + return malloc(SCRYPT_BUFFER_SIZE); +} + +static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[8], ostate[8]; + uint32_t X[32]; + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(input, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, input, X); + + scrypt_core(X, V); + + PBKDF2_SHA256_128_32(tstate, ostate, X, output); +} + +#if HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_4way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[4 * 8] __attribute__((aligned(128))); + uint32_t ostate[4 * 8] 
__attribute__((aligned(128))); + uint32_t W[4 * 32] __attribute__((aligned(128))); + uint32_t X[4 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = input[k * 20 + i]; + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[k * 32 + i] = W[4 * i + k]; + scrypt_core(X + 0 * 32, V); + scrypt_core(X + 1 * 32, V); + scrypt_core(X + 2 * 32, V); + scrypt_core(X + 3 * 32, V); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = X[k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[k * 8 + i] = W[4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#if HAVE_SCRYPT_3WAY + +static void scrypt_1024_1_1_256_3way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[3 * 8], ostate[3 * 8]; + uint32_t X[3 * 32] __attribute__((aligned(64))); + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate + 0, midstate, 32); + memcpy(tstate + 8, midstate, 32); + memcpy(tstate + 16, midstate, 32); + HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); + HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); + PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); + PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); + PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); + + scrypt_core_3way(X, V); + + PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); + PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); + 
PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); +} + +#if HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_12way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[12 * 8] __attribute__((aligned(128))); + uint32_t ostate[12 * 8] __attribute__((aligned(128))); + uint32_t W[12 * 32] __attribute__((aligned(128))); + uint32_t X[12 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[32 * j + 4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); + HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); + PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; + scrypt_core_3way(X + 0 * 96, V); + scrypt_core_3way(X + 1 * 96, V); + scrypt_core_3way(X + 2 * 96, V); + scrypt_core_3way(X + 3 * 96, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#endif 
/* HAVE_SCRYPT_3WAY */ + +#if HAVE_SCRYPT_6WAY +static void scrypt_1024_1_1_256_24way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[24 * 8] __attribute__((aligned(128))); + uint32_t ostate[24 * 8] __attribute__((aligned(128))); + uint32_t W[24 * 32] __attribute__((aligned(128))); + uint32_t X[24 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); + HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); + PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; + scrypt_core_6way(X + 0 * 32, V); + scrypt_core_6way(X + 6 * 32, V); + scrypt_core_6way(X + 12 * 32, V); + scrypt_core_6way(X + 18 * 32, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; +} +#endif /* HAVE_SCRYPT_6WAY */ + +int 
scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; + const uint32_t Htarg = ptarget[7]; + int throughput = scrypt_best_throughput(); + int i; + +#if HAVE_SHA256_4WAY + if (sha256_use_4way()) + throughput *= 4; +#endif + + for (i = 0; i < throughput; i++) + memcpy(data + i * 20, pdata, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); + + do { + for (i = 0; i < throughput; i++) + data[i * 20 + 19] = ++n; + +#if defined(HAVE_SHA256_4WAY) + if (throughput == 4) + scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) + if (throughput == 12) + scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_6WAY) + if (throughput == 24) + scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) + if (throughput == 3) + scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); + else +#endif + scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); + + for (i = 0; i < throughput; i++) { + if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdata[19] = data[i * 20 + 19]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} diff --git a/sha2.c b/sha2.c index 4bfdcc7..8c5df1e 100644 --- a/sha2.c +++ b/sha2.c @@ -1,630 +1,630 @@ -/* - * Copyright 2011 ArtForz - * Copyright 2011-2013 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * 
any later version. See COPYING for more details. - */ - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include - -#if defined(__arm__) && defined(__APCS_32__) -#define EXTERN_SHA256 -#endif - -static const uint32_t sha256_h[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - -static const uint32_t sha256_k[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -void sha256_init(uint32_t *state) -{ - memcpy(state, sha256_h, 32); -} - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - do { \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; \ - } while (0) - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) 
% 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + sha256_k[i]) - -#ifndef EXTERN_SHA256 - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -void sha256_transform(uint32_t *state, const uint32_t *block, int swap) -{ - uint32_t W[64]; - uint32_t S[8]; - uint32_t t0, t1; - int i; - - /* 1. Prepare message schedule W. */ - if (swap) { - for (i = 0; i < 16; i++) - W[i] = swab32(block[i]); - } else - memcpy(W, block, 64); - for (i = 16; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; - } - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. */ - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); - - /* 4. 
Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -} - -#endif /* EXTERN_SHA256 */ - - -static const uint32_t sha256d_hash1[16] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x80000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000100 -}; - -static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) -{ - uint32_t S[16]; - int i; - - sha256_init(S); - sha256_transform(S, data, 0); - sha256_transform(S, data + 16, 0); - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(hash); - sha256_transform(hash, S, 0); - for (i = 0; i < 8; i++) - hash[i] = swab32(hash[i]); -} - -void sha256d(unsigned char *hash, const unsigned char *data, int len) -{ - uint32_t S[16], T[16]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - sha256_transform(S, T, 0); - } - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(T); - sha256_transform(T, S, 0); - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, T[i]); -} - -static inline void sha256d_preextend(uint32_t *W) -{ - W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; - W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; - W[18] = s1(W[16]) + W[11] + W[ 2]; - W[19] = s1(W[17]) + W[12] + s0(W[ 4]); - W[20] = W[13] + s0(W[ 5]) + W[ 4]; - W[21] = W[14] + s0(W[ 6]) + W[ 5]; - W[22] = W[15] + s0(W[ 7]) + W[ 6]; - W[23] = W[16] + s0(W[ 8]) + W[ 7]; - W[24] = W[17] + s0(W[ 9]) + W[ 8]; - W[25] = s0(W[10]) + W[ 9]; - W[26] = s0(W[11]) + W[10]; - W[27] = s0(W[12]) + W[11]; - W[28] = s0(W[13]) + W[12]; - W[29] = s0(W[14]) + W[13]; - W[30] = s0(W[15]) + W[14]; - W[31] = s0(W[16]) + W[15]; -} - -static inline void sha256d_prehash(uint32_t *S, const 
uint32_t *W) -{ - uint32_t t0, t1; - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); -} - -#ifdef EXTERN_SHA256 - -void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash); - -#else - -static inline void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash) -{ - uint32_t S[64]; - uint32_t t0, t1; - int i; - - S[18] = W[18]; - S[19] = W[19]; - S[20] = W[20]; - S[22] = W[22]; - S[23] = W[23]; - S[24] = W[24]; - S[30] = W[30]; - S[31] = W[31]; - - W[18] += s0(W[3]); - W[19] += W[3]; - W[20] += s1(W[18]); - W[21] = s1(W[19]); - W[22] += s1(W[20]); - W[23] += s1(W[21]); - W[24] += s1(W[22]); - W[25] = s1(W[23]) + W[18]; - W[26] = s1(W[24]) + W[19]; - W[27] = s1(W[25]) + W[20]; - W[28] = s1(W[26]) + W[21]; - W[29] = s1(W[27]) + W[22]; - W[30] += s1(W[28]) + W[23]; - W[31] += s1(W[29]) + W[24]; - for (i = 32; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; - } - - memcpy(S, prehash, 32); - - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 
54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); - - for (i = 0; i < 8; i++) - S[i] += midstate[i]; - - W[18] = S[18]; - W[19] = S[19]; - W[20] = S[20]; - W[22] = S[22]; - W[23] = S[23]; - W[24] = S[24]; - W[30] = S[30]; - W[31] = S[31]; - - memcpy(S + 8, sha256d_hash1 + 8, 32); - S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; - S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; - S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; - S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; - S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; - S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; - S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; - S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; - S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; - S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; - S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; - S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; - S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; - S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; - S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; - S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; - for (i = 32; i < 60; i += 2) { - S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; - S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; - } - S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; - - sha256_init(hash); - - RNDr(hash, S, 0); - RNDr(hash, S, 1); - RNDr(hash, S, 2); - RNDr(hash, S, 3); - RNDr(hash, S, 4); - RNDr(hash, S, 5); - RNDr(hash, S, 6); - RNDr(hash, S, 7); - RNDr(hash, S, 8); - RNDr(hash, S, 9); - RNDr(hash, S, 10); - RNDr(hash, S, 11); - RNDr(hash, S, 12); - RNDr(hash, S, 
13); - RNDr(hash, S, 14); - RNDr(hash, S, 15); - RNDr(hash, S, 16); - RNDr(hash, S, 17); - RNDr(hash, S, 18); - RNDr(hash, S, 19); - RNDr(hash, S, 20); - RNDr(hash, S, 21); - RNDr(hash, S, 22); - RNDr(hash, S, 23); - RNDr(hash, S, 24); - RNDr(hash, S, 25); - RNDr(hash, S, 26); - RNDr(hash, S, 27); - RNDr(hash, S, 28); - RNDr(hash, S, 29); - RNDr(hash, S, 30); - RNDr(hash, S, 31); - RNDr(hash, S, 32); - RNDr(hash, S, 33); - RNDr(hash, S, 34); - RNDr(hash, S, 35); - RNDr(hash, S, 36); - RNDr(hash, S, 37); - RNDr(hash, S, 38); - RNDr(hash, S, 39); - RNDr(hash, S, 40); - RNDr(hash, S, 41); - RNDr(hash, S, 42); - RNDr(hash, S, 43); - RNDr(hash, S, 44); - RNDr(hash, S, 45); - RNDr(hash, S, 46); - RNDr(hash, S, 47); - RNDr(hash, S, 48); - RNDr(hash, S, 49); - RNDr(hash, S, 50); - RNDr(hash, S, 51); - RNDr(hash, S, 52); - RNDr(hash, S, 53); - RNDr(hash, S, 54); - RNDr(hash, S, 55); - RNDr(hash, S, 56); - - hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) - + S[57] + sha256_k[57]; - hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) - + S[58] + sha256_k[58]; - hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) - + S[59] + sha256_k[59]; - hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) - + S[60] + sha256_k[60] - + sha256_h[7]; -} - -#endif /* EXTERN_SHA256 */ - -#if HAVE_SHA256_4WAY - -void sha256d_ms_4way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[4 * 64] __attribute__((aligned(128))); - uint32_t hash[4 * 8] __attribute__((aligned(32))); - uint32_t midstate[4 * 8] __attribute__((aligned(32))); - uint32_t prehash[4 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - 
sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 4; j++) - data[i * 4 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 4; j++) { - midstate[i * 4 + j] = midstate[i]; - prehash[i * 4 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 4; i++) - data[4 * 3 + i] = ++n; - - sha256d_ms_4way(hash, data, midstate, prehash); - - for (i = 0; i < 4; i++) { - if (swab32(hash[4 * 7 + i]) <= Htarg) { - pdata[19] = data[4 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SHA256_8WAY - -void sha256d_ms_8way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[8 * 64] __attribute__((aligned(128))); - uint32_t hash[8 * 8] __attribute__((aligned(32))); - uint32_t midstate[8 * 8] __attribute__((aligned(32))); - uint32_t prehash[8 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 8; j++) - data[i * 8 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 8; j++) { - midstate[i * 8 + j] = midstate[i]; - prehash[i * 8 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 8; i++) - data[8 * 3 + i] = ++n; 
- - sha256d_ms_8way(hash, data, midstate, prehash); - - for (i = 0; i < 8; i++) { - if (swab32(hash[8 * 7 + i]) <= Htarg) { - pdata[19] = data[8 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_8WAY */ - -int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[64] /* __attribute__((aligned(128))) */; - uint32_t hash[8] /* __attribute__((aligned(32))) */; - uint32_t midstate[8] /* __attribute__((aligned(32))) */; - uint32_t prehash[8] /* __attribute__((aligned(32))) */; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - -#if HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - - do { - data[3] = ++n; - sha256d_ms(hash, data, midstate, prehash); - if (swab32(hash[7]) <= Htarg) { - pdata[19] = data[3]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} +/* + * Copyright 2011 ArtForz + * Copyright 2011-2013 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + 
* Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include + +#if defined(__arm__) && defined(__APCS_32__) +#define EXTERN_SHA256 +#endif + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_init(uint32_t *state) +{ + memcpy(state, sha256_h, 32); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ 
+#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +#ifndef EXTERN_SHA256 + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +void sha256_transform(uint32_t *state, const uint32_t *block, int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32(block[i]); + } else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 
61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +#endif /* EXTERN_SHA256 */ + + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) +{ + uint32_t S[16]; + int i; + + sha256_init(S); + sha256_transform(S, data, 0); + sha256_transform(S, data + 16, 0); + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(hash); + sha256_transform(hash, S, 0); + for (i = 0; i < 8; i++) + hash[i] = swab32(hash[i]); +} + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); +} + +static inline void sha256d_preextend(uint32_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) +{ + uint32_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +#ifdef EXTERN_SHA256 + +void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash); + +#else + +static inline void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + 
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, prehash, 32); + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 
8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + 
S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +#endif /* EXTERN_SHA256 */ + +#if HAVE_SHA256_4WAY + +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[4 * 64] __attribute__((aligned(128))); + uint32_t hash[4 * 8] __attribute__((aligned(32))); + uint32_t midstate[4 * 8] __attribute__((aligned(32))); + uint32_t prehash[4 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 4; j++) + data[i * 4 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 4; j++) { + midstate[i * 4 + j] = midstate[i]; + prehash[i * 4 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 4; i++) + data[4 * 3 + i] = ++n; + + sha256d_ms_4way(hash, data, midstate, prehash); + + for (i = 0; i < 4; i++) { + if (swab32(hash[4 * 7 + i]) <= Htarg) { + pdata[19] = data[4 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_4WAY */ + +#if HAVE_SHA256_8WAY + +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const 
uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[8 * 64] __attribute__((aligned(128))); + uint32_t hash[8 * 8] __attribute__((aligned(32))); + uint32_t midstate[8 * 8] __attribute__((aligned(32))); + uint32_t prehash[8 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 8; j++) + data[i * 8 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 8; j++) { + midstate[i * 8 + j] = midstate[i]; + prehash[i * 8 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 8; i++) + data[8 * 3 + i] = ++n; + + sha256d_ms_8way(hash, data, midstate, prehash); + + for (i = 0; i < 8; i++) { + if (swab32(hash[8 * 7 + i]) <= Htarg) { + pdata[19] = data[8 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_8WAY */ + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[64] /* __attribute__((aligned(128))) */; + uint32_t hash[8] /* __attribute__((aligned(32))) */; + uint32_t midstate[8] /* __attribute__((aligned(32))) */; + uint32_t prehash[8] /* __attribute__((aligned(32))) */; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + +#if HAVE_SHA256_8WAY + if (sha256_use_8way()) + return 
scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#if HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + + do { + data[3] = ++n; + sha256d_ms(hash, data, midstate, prehash); + if (swab32(hash[7]) <= Htarg) { + pdata[19] = data[3]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/sph/blake.c b/sph/blake.c index a9043e9..0650b9c 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -1,1120 +1,1120 @@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKE implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include - -#include "sph_blake.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE -#define SPH_SMALL_FOOTPRINT_BLAKE 1 -#endif - -#if SPH_SMALL_FOOTPRINT_BLAKE -#define SPH_COMPACT_BLAKE_32 1 -#endif - -#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE) -#define SPH_COMPACT_BLAKE_64 1 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u32 IV224[8] = { - SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), - SPH_C32(0x3070DD17), SPH_C32(0xF70E5939), - SPH_C32(0xFFC00B31), SPH_C32(0x68581511), - SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) -}; - -static const sph_u32 IV256[8] = { - SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), - SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), - SPH_C32(0x510E527F), SPH_C32(0x9B05688C), - SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) -}; - -#if SPH_64 - -static const sph_u64 IV384[8] = { - SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507), - SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939), - SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511), - SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4) -}; - -static const sph_u64 IV512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - -#endif - -#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 - -static const unsigned sigma[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -/* - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - 14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3 - 11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4 - 7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8 - 9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13 - 2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9 - 12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11 - 13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10 - 6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5 - 10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0 -*/ -#endif - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 
-#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 -#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) 
-#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - -#if SPH_COMPACT_BLAKE_32 - -static const sph_u32 CS[16] = { - SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), SPH_C32(0x03707344), - SPH_C32(0xA4093822), SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) -}; - -#endif - -#if SPH_64 - -#define CBx(r, i) CBx_(Z ## r ## i) -#define CBx_(n) CBx__(n) -#define CBx__(n) CB ## n - -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define CB5 SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) - -#if SPH_COMPACT_BLAKE_64 - -static const sph_u64 CB[16] = { - SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), - SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), - SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), - SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), - SPH_C64(0x9216D5D98979FB1B), 
SPH_C64(0xD1310BA698DFB5AC), - SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), - SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), - SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) -}; - -#endif - -#endif - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#if SPH_COMPACT_BLAKE_32 - -#define ROUND_S(r) do { \ - GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \ - CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ - GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \ - CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ - GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \ - CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ - GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \ - CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ - GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \ - CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ - GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \ - CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ - GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \ - CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ - GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \ - CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ - } while (0) - -#else - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while 
(0) - -#endif - -#if SPH_64 - -#define GB(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T64(a + b + (m0 ^ c1)); \ - d = SPH_ROTR64(d ^ a, 32); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 25); \ - a = SPH_T64(a + b + (m1 ^ c0)); \ - d = SPH_ROTR64(d ^ a, 16); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 11); \ - } while (0) - -#if SPH_COMPACT_BLAKE_64 - -#define ROUND_B(r) do { \ - GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \ - CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ - GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \ - CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ - GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \ - CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ - GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \ - CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ - GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \ - CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ - GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \ - CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ - GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \ - CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ - GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \ - CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ - } while (0) - -#else - -#define ROUND_B(r) do { \ - GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ - GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ - GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ - GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ - GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ - GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ - GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ - GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) - -#endif - -#endif - -#define DECL_STATE32 \ - sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ - sph_u32 S0, S1, S2, S3, T0, T1; - -#define READ_STATE32(state) do { \ - H0 = (state)->H[0]; \ - H1 = 
(state)->H[1]; \ - H2 = (state)->H[2]; \ - H3 = (state)->H[3]; \ - H4 = (state)->H[4]; \ - H5 = (state)->H[5]; \ - H6 = (state)->H[6]; \ - H7 = (state)->H[7]; \ - S0 = (state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ - } while (0) - -#define WRITE_STATE32(state) do { \ - (state)->H[0] = H0; \ - (state)->H[1] = H1; \ - (state)->H[2] = H2; \ - (state)->H[3] = H3; \ - (state)->H[4] = H4; \ - (state)->H[5] = H5; \ - (state)->H[6] = H6; \ - (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ - } while (0) - -#if SPH_COMPACT_BLAKE_32 - -#define COMPRESS32 do { \ - sph_u32 M[16]; \ - sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CS0; \ - V9 = S1 ^ CS1; \ - VA = S2 ^ CS2; \ - VB = S3 ^ CS3; \ - VC = T0 ^ CS4; \ - VD = T0 ^ CS5; \ - VE = T1 ^ CS6; \ - VF = T1 ^ CS7; \ - M[0x0] = sph_dec32be_aligned(buf + 0); \ - M[0x1] = sph_dec32be_aligned(buf + 4); \ - M[0x2] = sph_dec32be_aligned(buf + 8); \ - M[0x3] = sph_dec32be_aligned(buf + 12); \ - M[0x4] = sph_dec32be_aligned(buf + 16); \ - M[0x5] = sph_dec32be_aligned(buf + 20); \ - M[0x6] = sph_dec32be_aligned(buf + 24); \ - M[0x7] = sph_dec32be_aligned(buf + 28); \ - M[0x8] = sph_dec32be_aligned(buf + 32); \ - M[0x9] = sph_dec32be_aligned(buf + 36); \ - M[0xA] = sph_dec32be_aligned(buf + 40); \ - M[0xB] = sph_dec32be_aligned(buf + 44); \ - M[0xC] = sph_dec32be_aligned(buf + 48); \ - M[0xD] = sph_dec32be_aligned(buf + 52); \ - M[0xE] = sph_dec32be_aligned(buf + 56); \ - M[0xF] = sph_dec32be_aligned(buf + 60); \ - for (r = 0; r < 14; r ++) \ - ROUND_S(r); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ 
VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - -#else - -#define COMPRESS32 do { \ - sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CS0; \ - V9 = S1 ^ CS1; \ - VA = S2 ^ CS2; \ - VB = S3 ^ CS3; \ - VC = T0 ^ CS4; \ - VD = T0 ^ CS5; \ - VE = T1 ^ CS6; \ - VF = T1 ^ CS7; \ - M0 = sph_dec32be_aligned(buf + 0); \ - M1 = sph_dec32be_aligned(buf + 4); \ - M2 = sph_dec32be_aligned(buf + 8); \ - M3 = sph_dec32be_aligned(buf + 12); \ - M4 = sph_dec32be_aligned(buf + 16); \ - M5 = sph_dec32be_aligned(buf + 20); \ - M6 = sph_dec32be_aligned(buf + 24); \ - M7 = sph_dec32be_aligned(buf + 28); \ - M8 = sph_dec32be_aligned(buf + 32); \ - M9 = sph_dec32be_aligned(buf + 36); \ - MA = sph_dec32be_aligned(buf + 40); \ - MB = sph_dec32be_aligned(buf + 44); \ - MC = sph_dec32be_aligned(buf + 48); \ - MD = sph_dec32be_aligned(buf + 52); \ - ME = sph_dec32be_aligned(buf + 56); \ - MF = sph_dec32be_aligned(buf + 60); \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - ROUND_S(4); \ - ROUND_S(5); \ - ROUND_S(6); \ - ROUND_S(7); \ - ROUND_S(8); \ - ROUND_S(9); \ - ROUND_S(0); \ - ROUND_S(1); \ - ROUND_S(2); \ - ROUND_S(3); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - -#endif - -#if SPH_64 - -#define DECL_STATE64 \ - sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \ - sph_u64 S0, S1, S2, S3, T0, T1; - -#define READ_STATE64(state) do { \ - H0 = (state)->H[0]; \ - H1 = (state)->H[1]; \ - H2 = (state)->H[2]; \ - H3 = (state)->H[3]; \ - H4 = (state)->H[4]; \ - H5 = (state)->H[5]; \ - H6 = (state)->H[6]; \ - H7 = (state)->H[7]; \ - S0 = 
(state)->S[0]; \ - S1 = (state)->S[1]; \ - S2 = (state)->S[2]; \ - S3 = (state)->S[3]; \ - T0 = (state)->T0; \ - T1 = (state)->T1; \ - } while (0) - -#define WRITE_STATE64(state) do { \ - (state)->H[0] = H0; \ - (state)->H[1] = H1; \ - (state)->H[2] = H2; \ - (state)->H[3] = H3; \ - (state)->H[4] = H4; \ - (state)->H[5] = H5; \ - (state)->H[6] = H6; \ - (state)->H[7] = H7; \ - (state)->S[0] = S0; \ - (state)->S[1] = S1; \ - (state)->S[2] = S2; \ - (state)->S[3] = S3; \ - (state)->T0 = T0; \ - (state)->T1 = T1; \ - } while (0) - -#if SPH_COMPACT_BLAKE_64 - -#define COMPRESS64 do { \ - sph_u64 M[16]; \ - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CB0; \ - V9 = S1 ^ CB1; \ - VA = S2 ^ CB2; \ - VB = S3 ^ CB3; \ - VC = T0 ^ CB4; \ - VD = T0 ^ CB5; \ - VE = T1 ^ CB6; \ - VF = T1 ^ CB7; \ - M[0x0] = sph_dec64be_aligned(buf + 0); \ - M[0x1] = sph_dec64be_aligned(buf + 8); \ - M[0x2] = sph_dec64be_aligned(buf + 16); \ - M[0x3] = sph_dec64be_aligned(buf + 24); \ - M[0x4] = sph_dec64be_aligned(buf + 32); \ - M[0x5] = sph_dec64be_aligned(buf + 40); \ - M[0x6] = sph_dec64be_aligned(buf + 48); \ - M[0x7] = sph_dec64be_aligned(buf + 56); \ - M[0x8] = sph_dec64be_aligned(buf + 64); \ - M[0x9] = sph_dec64be_aligned(buf + 72); \ - M[0xA] = sph_dec64be_aligned(buf + 80); \ - M[0xB] = sph_dec64be_aligned(buf + 88); \ - M[0xC] = sph_dec64be_aligned(buf + 96); \ - M[0xD] = sph_dec64be_aligned(buf + 104); \ - M[0xE] = sph_dec64be_aligned(buf + 112); \ - M[0xF] = sph_dec64be_aligned(buf + 120); \ - for (r = 0; r < 16; r ++) \ - ROUND_B(r); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - -#else - -#define COMPRESS64 do { \ - sph_u64 M0, M1, M2, M3, M4, M5, 
M6, M7; \ - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = H4; \ - V5 = H5; \ - V6 = H6; \ - V7 = H7; \ - V8 = S0 ^ CB0; \ - V9 = S1 ^ CB1; \ - VA = S2 ^ CB2; \ - VB = S3 ^ CB3; \ - VC = T0 ^ CB4; \ - VD = T0 ^ CB5; \ - VE = T1 ^ CB6; \ - VF = T1 ^ CB7; \ - M0 = sph_dec64be_aligned(buf + 0); \ - M1 = sph_dec64be_aligned(buf + 8); \ - M2 = sph_dec64be_aligned(buf + 16); \ - M3 = sph_dec64be_aligned(buf + 24); \ - M4 = sph_dec64be_aligned(buf + 32); \ - M5 = sph_dec64be_aligned(buf + 40); \ - M6 = sph_dec64be_aligned(buf + 48); \ - M7 = sph_dec64be_aligned(buf + 56); \ - M8 = sph_dec64be_aligned(buf + 64); \ - M9 = sph_dec64be_aligned(buf + 72); \ - MA = sph_dec64be_aligned(buf + 80); \ - MB = sph_dec64be_aligned(buf + 88); \ - MC = sph_dec64be_aligned(buf + 96); \ - MD = sph_dec64be_aligned(buf + 104); \ - ME = sph_dec64be_aligned(buf + 112); \ - MF = sph_dec64be_aligned(buf + 120); \ - ROUND_B(0); \ - ROUND_B(1); \ - ROUND_B(2); \ - ROUND_B(3); \ - ROUND_B(4); \ - ROUND_B(5); \ - ROUND_B(6); \ - ROUND_B(7); \ - ROUND_B(8); \ - ROUND_B(9); \ - ROUND_B(0); \ - ROUND_B(1); \ - ROUND_B(2); \ - ROUND_B(3); \ - ROUND_B(4); \ - ROUND_B(5); \ - H0 ^= S0 ^ V0 ^ V8; \ - H1 ^= S1 ^ V1 ^ V9; \ - H2 ^= S2 ^ V2 ^ VA; \ - H3 ^= S3 ^ V3 ^ VB; \ - H4 ^= S0 ^ V4 ^ VC; \ - H5 ^= S1 ^ V5 ^ VD; \ - H6 ^= S2 ^ V6 ^ VE; \ - H7 ^= S3 ^ V7 ^ VF; \ - } while (0) - -#endif - -#endif - -static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; - -static void -blake32_init(sph_blake_small_context *sc, - const sph_u32 *iv, const sph_u32 *salt) -{ - memcpy(sc->H, iv, 8 * sizeof(sph_u32)); - memcpy(sc->S, salt, 4 * sizeof(sph_u32)); - sc->T0 = sc->T1 = 0; - sc->ptr = 0; -} - -static void -blake32(sph_blake_small_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE32 - - buf = sc->buf; - ptr = sc->ptr; - if (len < 
(sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE32(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - if ((T0 = SPH_T32(T0 + 512)) < 512) - T1 = SPH_T32(T1 + 1); - COMPRESS32; - ptr = 0; - } - } - WRITE_STATE32(sc); - sc->ptr = ptr; -} - -static void -blake32_close(sph_blake_small_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_size_w32) -{ - union { - unsigned char buf[64]; - sph_u32 dummy; - } u; - size_t ptr, k; - unsigned bit_len; - unsigned z; - sph_u32 th, tl; - unsigned char *out; - - ptr = sc->ptr; - bit_len = ((unsigned)ptr << 3) + n; - z = 0x80 >> n; - u.buf[ptr] = ((ub & -z) | z) & 0xFF; - tl = sc->T0 + bit_len; - th = sc->T1; - if (ptr == 0 && n == 0) { - sc->T0 = SPH_C32(0xFFFFFE00); - sc->T1 = SPH_C32(0xFFFFFFFF); - } else if (sc->T0 == 0) { - sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; - sc->T1 = SPH_T32(sc->T1 - 1); - } else { - sc->T0 -= 512 - bit_len; - } - if (bit_len <= 446) { - memset(u.buf + ptr + 1, 0, 55 - ptr); - if (out_size_w32 == 8) - u.buf[55] |= 1; - sph_enc32be_aligned(u.buf + 56, th); - sph_enc32be_aligned(u.buf + 60, tl); - blake32(sc, u.buf + ptr, 64 - ptr); - } else { - memset(u.buf + ptr + 1, 0, 63 - ptr); - blake32(sc, u.buf + ptr, 64 - ptr); - sc->T0 = SPH_C32(0xFFFFFE00); - sc->T1 = SPH_C32(0xFFFFFFFF); - memset(u.buf, 0, 56); - if (out_size_w32 == 8) - u.buf[55] = 1; - sph_enc32be_aligned(u.buf + 56, th); - sph_enc32be_aligned(u.buf + 60, tl); - blake32(sc, u.buf, 64); - } - out = dst; - for (k = 0; k < out_size_w32; k ++) - sph_enc32be(out + (k << 2), sc->H[k]); -} - -#if SPH_64 - -static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; - -static void -blake64_init(sph_blake_big_context *sc, - const sph_u64 *iv, const sph_u64 *salt) -{ - memcpy(sc->H, 
iv, 8 * sizeof(sph_u64)); - memcpy(sc->S, salt, 4 * sizeof(sph_u64)); - sc->T0 = sc->T1 = 0; - sc->ptr = 0; -} - -static void -blake64(sph_blake_big_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE64 - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE64(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - T1 = SPH_T64(T1 + 1); - COMPRESS64; - ptr = 0; - } - } - WRITE_STATE64(sc); - sc->ptr = ptr; -} - -static void -blake64_close(sph_blake_big_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_size_w64) -{ - union { - unsigned char buf[128]; - sph_u64 dummy; - } u; - size_t ptr, k; - unsigned bit_len; - unsigned z; - sph_u64 th, tl; - unsigned char *out; - - ptr = sc->ptr; - bit_len = ((unsigned)ptr << 3) + n; - z = 0x80 >> n; - u.buf[ptr] = ((ub & -z) | z) & 0xFF; - tl = sc->T0 + bit_len; - th = sc->T1; - if (ptr == 0 && n == 0) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); - } else if (sc->T0 == 0) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len; - sc->T1 = SPH_T64(sc->T1 - 1); - } else { - sc->T0 -= 1024 - bit_len; - } - if (bit_len <= 894) { - memset(u.buf + ptr + 1, 0, 111 - ptr); - if (out_size_w64 == 8) - u.buf[111] |= 1; - sph_enc64be_aligned(u.buf + 112, th); - sph_enc64be_aligned(u.buf + 120, tl); - blake64(sc, u.buf + ptr, 128 - ptr); - } else { - memset(u.buf + ptr + 1, 0, 127 - ptr); - blake64(sc, u.buf + ptr, 128 - ptr); - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); - memset(u.buf, 0, 112); - if (out_size_w64 == 8) - u.buf[111] = 1; - sph_enc64be_aligned(u.buf + 112, th); - 
sph_enc64be_aligned(u.buf + 120, tl); - blake64(sc, u.buf, 128); - } - out = dst; - for (k = 0; k < out_size_w64; k ++) - sph_enc64be(out + (k << 3), sc->H[k]); -} - -#endif - -/* see sph_blake.h */ -void -sph_blake224_init(void *cc) -{ - blake32_init(cc, IV224, salt_zero_small); -} - -/* see sph_blake.h */ -void -sph_blake224(void *cc, const void *data, size_t len) -{ - blake32(cc, data, len); -} - -/* see sph_blake.h */ -void -sph_blake224_close(void *cc, void *dst) -{ - sph_blake224_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_blake.h */ -void -sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake32_close(cc, ub, n, dst, 7); - sph_blake224_init(cc); -} - -/* see sph_blake.h */ -void -sph_blake256_init(void *cc) -{ - blake32_init(cc, IV256, salt_zero_small); -} - -/* see sph_blake.h */ -void -sph_blake256(void *cc, const void *data, size_t len) -{ - blake32(cc, data, len); -} - -/* see sph_blake.h */ -void -sph_blake256_close(void *cc, void *dst) -{ - sph_blake256_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_blake.h */ -void -sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake32_close(cc, ub, n, dst, 8); - sph_blake256_init(cc); -} - -#if SPH_64 - -/* see sph_blake.h */ -void -sph_blake384_init(void *cc) -{ - blake64_init(cc, IV384, salt_zero_big); -} - -/* see sph_blake.h */ -void -sph_blake384(void *cc, const void *data, size_t len) -{ - blake64(cc, data, len); -} - -/* see sph_blake.h */ -void -sph_blake384_close(void *cc, void *dst) -{ - sph_blake384_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_blake.h */ -void -sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake64_close(cc, ub, n, dst, 6); - sph_blake384_init(cc); -} - -/* see sph_blake.h */ -void -sph_blake512_init(void *cc) -{ - blake64_init(cc, IV512, salt_zero_big); -} - -/* see sph_blake.h */ -void -sph_blake512(void *cc, const void *data, size_t len) -{ - blake64(cc, data, len); 
-} - -/* see sph_blake.h */ -void -sph_blake512_close(void *cc, void *dst) -{ - sph_blake512_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_blake.h */ -void -sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake64_close(cc, ub, n, dst, 8); - sph_blake512_init(cc); -} - -#endif - -#ifdef __cplusplus -} -#endif +/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_blake.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE +#define SPH_SMALL_FOOTPRINT_BLAKE 1 +#endif + +#if SPH_SMALL_FOOTPRINT_BLAKE +#define SPH_COMPACT_BLAKE_32 1 +#endif + +#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE) +#define SPH_COMPACT_BLAKE_64 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), + SPH_C32(0x3070DD17), SPH_C32(0xF70E5939), + SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 IV256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), + SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), + SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +#if SPH_64 + +static const sph_u64 IV384[8] = { + SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507), + SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939), + SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511), + SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4) +}; + +static const sph_u64 IV512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; + +#endif + +#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 + +static const unsigned sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 
4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +/* + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + 14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3 + 11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4 + 7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8 + 9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13 + 2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9 + 12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11 + 13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10 + 6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5 + 10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0 +*/ +#endif + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define 
Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA 
SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#if SPH_COMPACT_BLAKE_32 + +static const sph_u32 CS[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) +}; + +#endif + +#if SPH_64 + +#define CBx(r, i) CBx_(Z ## r ## i) +#define CBx_(n) CBx__(n) +#define CBx__(n) CB ## n + +#define CB0 SPH_C64(0x243F6A8885A308D3) +#define CB1 SPH_C64(0x13198A2E03707344) +#define CB2 SPH_C64(0xA4093822299F31D0) +#define CB3 SPH_C64(0x082EFA98EC4E6C89) +#define CB4 SPH_C64(0x452821E638D01377) +#define CB5 SPH_C64(0xBE5466CF34E90C6C) +#define CB6 SPH_C64(0xC0AC29B7C97C50DD) +#define CB7 SPH_C64(0x3F84D5B5B5470917) +#define CB8 SPH_C64(0x9216D5D98979FB1B) +#define CB9 SPH_C64(0xD1310BA698DFB5AC) +#define CBA SPH_C64(0x2FFD72DBD01ADFB7) +#define CBB SPH_C64(0xB8E1AFED6A267E96) +#define CBC SPH_C64(0xBA7C9045F12C7F99) +#define CBD SPH_C64(0x24A19947B3916CF7) +#define CBE SPH_C64(0x0801F2E2858EFC16) +#define CBF SPH_C64(0x636920D871574E69) + +#if SPH_COMPACT_BLAKE_64 + +static const sph_u64 CB[16] = { + SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), + SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), + SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), + SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), + SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC), + SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), + SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), + SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) +}; + +#endif + +#endif + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = 
SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define ROUND_S(r) do { \ + GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ + GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ + GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ + GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ + GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ + GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ + GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ + GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#if SPH_64 + +#define GB(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T64(a + b + (m0 ^ c1)); \ + d = SPH_ROTR64(d ^ a, 32); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 25); \ + a = SPH_T64(a + b + (m1 ^ c0)); \ + d = SPH_ROTR64(d ^ a, 16); \ + c = SPH_T64(c + 
d); \ + b = SPH_ROTR64(b ^ c, 11); \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define ROUND_B(r) do { \ + GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ + GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ + GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ + GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ + GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ + GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ + GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ + GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_B(r) do { \ + GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ + GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ + GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ + GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ + GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ + GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ + GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ + GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#endif + +#define DECL_STATE32 \ + sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u32 S0, S1, S2, S3, T0, T1; + +#define READ_STATE32(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = 
(state)->T1; \ + } while (0) + +#define WRITE_STATE32(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define COMPRESS32 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 14; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, 
VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + ROUND_S(8); \ + ROUND_S(9); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#if SPH_64 + +#define DECL_STATE64 \ + sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u64 S0, S1, S2, S3, T0, T1; + +#define READ_STATE64(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE64(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + 
(state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define COMPRESS64 do { \ + sph_u64 M[16]; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M[0x0] = sph_dec64be_aligned(buf + 0); \ + M[0x1] = sph_dec64be_aligned(buf + 8); \ + M[0x2] = sph_dec64be_aligned(buf + 16); \ + M[0x3] = sph_dec64be_aligned(buf + 24); \ + M[0x4] = sph_dec64be_aligned(buf + 32); \ + M[0x5] = sph_dec64be_aligned(buf + 40); \ + M[0x6] = sph_dec64be_aligned(buf + 48); \ + M[0x7] = sph_dec64be_aligned(buf + 56); \ + M[0x8] = sph_dec64be_aligned(buf + 64); \ + M[0x9] = sph_dec64be_aligned(buf + 72); \ + M[0xA] = sph_dec64be_aligned(buf + 80); \ + M[0xB] = sph_dec64be_aligned(buf + 88); \ + M[0xC] = sph_dec64be_aligned(buf + 96); \ + M[0xD] = sph_dec64be_aligned(buf + 104); \ + M[0xE] = sph_dec64be_aligned(buf + 112); \ + M[0xF] = sph_dec64be_aligned(buf + 120); \ + for (r = 0; r < 16; r ++) \ + ROUND_B(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS64 do { \ + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + 
VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M0 = sph_dec64be_aligned(buf + 0); \ + M1 = sph_dec64be_aligned(buf + 8); \ + M2 = sph_dec64be_aligned(buf + 16); \ + M3 = sph_dec64be_aligned(buf + 24); \ + M4 = sph_dec64be_aligned(buf + 32); \ + M5 = sph_dec64be_aligned(buf + 40); \ + M6 = sph_dec64be_aligned(buf + 48); \ + M7 = sph_dec64be_aligned(buf + 56); \ + M8 = sph_dec64be_aligned(buf + 64); \ + M9 = sph_dec64be_aligned(buf + 72); \ + MA = sph_dec64be_aligned(buf + 80); \ + MB = sph_dec64be_aligned(buf + 88); \ + MC = sph_dec64be_aligned(buf + 96); \ + MD = sph_dec64be_aligned(buf + 104); \ + ME = sph_dec64be_aligned(buf + 112); \ + MF = sph_dec64be_aligned(buf + 120); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + ROUND_B(6); \ + ROUND_B(7); \ + ROUND_B(8); \ + ROUND_B(9); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#endif + +static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; + +static void +blake32_init(sph_blake_small_context *sc, + const sph_u32 *iv, const sph_u32 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u32)); + memcpy(sc->S, salt, 4 * sizeof(sph_u32)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake32(sph_blake_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE32 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = 
(const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T32(T0 + 512)) < 512) + T1 = SPH_T32(T1 + 1); + COMPRESS32; + ptr = 0; + } + } + WRITE_STATE32(sc); + sc->ptr = ptr; +} + +static void +blake32_close(sph_blake_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + union { + unsigned char buf[64]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u32 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } else { + sc->T0 -= 512 - bit_len; + } + if (bit_len <= 446) { + memset(u.buf + ptr + 1, 0, 55 - ptr); + if (out_size_w32 == 8) + u.buf[55] |= 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf + ptr, 64 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 63 - ptr); + blake32(sc, u.buf + ptr, 64 - ptr); + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + memset(u.buf, 0, 56); + if (out_size_w32 == 8) + u.buf[55] = 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf, 64); + } + out = dst; + for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + +#if SPH_64 + +static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; + +static void +blake64_init(sph_blake_big_context *sc, + const sph_u64 *iv, const sph_u64 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u64)); + memcpy(sc->S, salt, 4 * sizeof(sph_u64)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake64(sph_blake_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE64 + + buf = sc->buf; + ptr = sc->ptr; + if 
(len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE64(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + COMPRESS64; + ptr = 0; + } + } + WRITE_STATE64(sc); + sc->ptr = ptr; +} + +static void +blake64_close(sph_blake_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w64) +{ + union { + unsigned char buf[128]; + sph_u64 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u64 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len; + sc->T1 = SPH_T64(sc->T1 - 1); + } else { + sc->T0 -= 1024 - bit_len; + } + if (bit_len <= 894) { + memset(u.buf + ptr + 1, 0, 111 - ptr); + if (out_size_w64 == 8) + u.buf[111] |= 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf + ptr, 128 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 127 - ptr); + blake64(sc, u.buf + ptr, 128 - ptr); + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + memset(u.buf, 0, 112); + if (out_size_w64 == 8) + u.buf[111] = 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf, 128); + } + out = dst; + for (k = 0; k < out_size_w64; k ++) + sph_enc64be(out + (k << 3), sc->H[k]); +} + +#endif + +/* see sph_blake.h */ +void +sph_blake224_init(void *cc) +{ + blake32_init(cc, IV224, salt_zero_small); +} + +/* see 
sph_blake.h */ +void +sph_blake224(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake224_close(void *cc, void *dst) +{ + sph_blake224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 7); + sph_blake224_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake256_init(void *cc) +{ + blake32_init(cc, IV256, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake256(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake256_close(void *cc, void *dst) +{ + sph_blake256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + +#if SPH_64 + +/* see sph_blake.h */ +void +sph_blake384_init(void *cc) +{ + blake64_init(cc, IV384, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake384(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake384_close(void *cc, void *dst) +{ + sph_blake384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 6); + sph_blake384_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake512_init(void *cc) +{ + blake64_init(cc, IV512, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake512(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake512_close(void *cc, void *dst) +{ + sph_blake512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 8); + 
sph_blake512_init(cc); +} + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/sph/groestl.c b/sph/groestl.c index 5f19ed1..31b7b03 100644 --- a/sph/groestl.c +++ b/sph/groestl.c @@ -1,3124 +1,3124 @@ -/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ -/* - * Groestl implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ -#include -#include -#include - -#include "sph_groestl.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL -#define SPH_SMALL_FOOTPRINT_GROESTL 1 -#endif - -/* - * Apparently, the 32-bit-only version is not faster than the 64-bit - * version unless using the "small footprint" code on a 32-bit machine. 
- */ -#if !defined SPH_GROESTL_64 -#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE -#define SPH_GROESTL_64 0 -#else -#define SPH_GROESTL_64 1 -#endif -#endif - -#if !SPH_64 -#undef SPH_GROESTL_64 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * The internal representation may use either big-endian or - * little-endian. Using the platform default representation speeds up - * encoding and decoding between bytes and the matrix columns. - */ - -#undef USE_LE -#if SPH_GROESTL_LITTLE_ENDIAN -#define USE_LE 1 -#elif SPH_GROESTL_BIG_ENDIAN -#define USE_LE 0 -#elif SPH_LITTLE_ENDIAN -#define USE_LE 1 -#endif - -#if USE_LE - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) -#define dec32e_aligned sph_dec32le_aligned -#define enc32e sph_enc32le -#define B32_0(x) ((x) & 0xFF) -#define B32_1(x) (((x) >> 8) & 0xFF) -#define B32_2(x) (((x) >> 16) & 0xFF) -#define B32_3(x) ((x) >> 24) - -#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) -#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) - -#define PC32up(j, r) ((sph_u32)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) SPH_C32(0xFFFFFFFF) -#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#define B64_0(x) ((x) & 0xFF) -#define B64_1(x) (((x) >> 8) & 0xFF) -#define B64_2(x) (((x) >> 16) & 0xFF) -#define B64_3(x) 
(((x) >> 24) & 0xFF) -#define B64_4(x) (((x) >> 32) & 0xFF) -#define B64_5(x) (((x) >> 40) & 0xFF) -#define B64_6(x) (((x) >> 48) & 0xFF) -#define B64_7(x) ((x) >> 56) -#define R64 SPH_ROTL64 -#define PC64(j, r) ((sph_u64)((j) + (r))) -#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) -#endif - -#else - -#define C32e(x) SPH_C32(x) -#define dec32e_aligned sph_dec32be_aligned -#define enc32e sph_enc32be -#define B32_0(x) ((x) >> 24) -#define B32_1(x) (((x) >> 16) & 0xFF) -#define B32_2(x) (((x) >> 8) & 0xFF) -#define B32_3(x) ((x) & 0xFF) - -#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) -#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) - -#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) -#define PC32dn(j, r) 0 -#define QC32up(j, r) SPH_C32(0xFFFFFFFF) -#define QC32dn(j, r) ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j))) - -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#define B64_0(x) ((x) >> 56) -#define B64_1(x) (((x) >> 48) & 0xFF) -#define B64_2(x) (((x) >> 40) & 0xFF) -#define B64_3(x) (((x) >> 32) & 0xFF) -#define B64_4(x) (((x) >> 24) & 0xFF) -#define B64_5(x) (((x) >> 16) & 0xFF) -#define B64_6(x) (((x) >> 8) & 0xFF) -#define B64_7(x) ((x) & 0xFF) -#define R64 SPH_ROTR64 -#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) -#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) -#endif - -#endif - -#if SPH_GROESTL_64 - -static const sph_u64 T0[] = { - C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), - C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), - C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), - C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), - C64e(0x6090f050f0c05060), C64e(0x0207050305040302), - C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), - C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), - C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), - C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), - C64e(0x8949c040c0094089), 
C64e(0xfa68928792ef87fa), - C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), - C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), - C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), - C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), - C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), - C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), - C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), - C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), - C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), - C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), - C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), - C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), - C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), - C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), - C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), - C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), - C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), - C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), - C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), - C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), - C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), - C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), - C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), - C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), - C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), - C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), - C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), - C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), - C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), - C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), - C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), - C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), - C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), - C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), - C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), - C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), - C64e(0x94ed79de7933de94), 
C64e(0x98ff67d4672bd498), - C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), - C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), - C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), - C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), - C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), - C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), - C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), - C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), - C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), - C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), - C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), - C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), - C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), - C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), - C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), - C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), - C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), - C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), - C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), - C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), - C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), - C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), - C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), - C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), - C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), - C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), - C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), - C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), - C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), - C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), - C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), - C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), - C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), - C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), - C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), - C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), - C64e(0x48fcb46cb4906c48), 
C64e(0xb88f37e4376be4b8), - C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), - C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), - C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), - C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), - C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), - C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), - C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), - C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), - C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), - C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), - C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), - C64e(0x476720e9208ee947), C64e(0x1038281828201810), - C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), - C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), - C64e(0x38546c246c702438), C64e(0x575f08f108aef157), - C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), - C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), - C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), - C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), - C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), - C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), - C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), - C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), - C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), - C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), - C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), - C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), - C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), - C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), - C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), - C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), - C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), - C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), - C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), - C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), - C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), - C64e(0x03898a8f8a068f03), 
C64e(0x594a13f813b2f859), - C64e(0x09929b809b128009), C64e(0x1a2339173934171a), - C64e(0x651075da75cada65), C64e(0xd784533153b531d7), - C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), - C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), - C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), - C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), - C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) -}; - -#if !SPH_SMALL_FOOTPRINT_GROESTL - -static const sph_u64 T1[] = { - C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), - C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), - C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), - C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), - C64e(0x606090f050f0c050), C64e(0x0202070503050403), - C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), - C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), - C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), - C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), - C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), - C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), - C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), - C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), - C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), - C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), - C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), - C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), - C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), - C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), - C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), - C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), - C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), - C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), - C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), - C64e(0x08081c140c14100c), C64e(0x959563f652f63152), - C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), - C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), - C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), - 
C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), - C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), - C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), - C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), - C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), - C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), - C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), - C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), - C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), - C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), - C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), - C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), - C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), - C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), - C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), - C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), - C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), - C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), - C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), - C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), - C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), - C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), - C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), - C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), - C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), - C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), - C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), - C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), - C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), - C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), - C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), - C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), - C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), - C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), - C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), - C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), - C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), - 
C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), - C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), - C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), - C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), - C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), - C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), - C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), - C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), - C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), - C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), - C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), - C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), - C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), - C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), - C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), - C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), - C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), - C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), - C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), - C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), - C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), - C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), - C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), - C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), - C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), - C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), - C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), - C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), - C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), - C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), - C64e(0x47476720e9208ee9), C64e(0x1010382818282018), - C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), - C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), - C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), - C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), - C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), - C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), - 
C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), - C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), - C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), - C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), - C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), - C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), - C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), - C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), - C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), - C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), - C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), - C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), - C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), - C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), - C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), - C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), - C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), - C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), - C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), - C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), - C64e(0x65651075da75cada), C64e(0xd7d784533153b531), - C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), - C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), - C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), - C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), - C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) -}; - -static const sph_u64 T2[] = { - C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), - C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), - C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), - C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), - C64e(0x50606090f050f0c0), C64e(0x0302020705030504), - C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), - C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), - C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), - C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), - C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), - 
C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), - C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), - C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), - C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), - C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), - C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), - C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), - C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), - C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), - C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), - C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), - C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), - C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), - C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), - C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), - C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), - C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), - C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), - C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), - C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), - C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), - C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), - C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), - C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), - C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), - C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), - C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), - C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), - C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), - C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), - C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), - C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), - C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), - C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), - C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), - C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), - C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), - 
C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), - C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), - C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), - C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), - C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), - C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), - C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), - C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), - C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), - C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), - C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), - C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), - C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), - C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), - C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), - C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), - C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), - C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), - C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), - C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), - C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), - C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), - C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), - C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), - C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), - C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), - C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), - C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), - C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), - C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), - C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), - C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), - C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), - C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), - C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), - C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), - C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), - 
C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), - C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), - C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), - C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), - C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), - C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), - C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), - C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), - C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), - C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), - C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), - C64e(0xe947476720e9208e), C64e(0x1810103828182820), - C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), - C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), - C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), - C64e(0xc773732152c752e6), C64e(0x51979764f351f335), - C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), - C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), - C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), - C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), - C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), - C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), - C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), - C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), - C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), - C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), - C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), - C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), - C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), - C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), - C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), - C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), - C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), - C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), - C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), - C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), - C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), - 
C64e(0x800909929b809b12), C64e(0x171a1a2339173934), - C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), - C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), - C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), - C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), - C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), - C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) -}; - -static const sph_u64 T3[] = { - C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), - C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), - C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), - C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), - C64e(0xc050606090f050f0), C64e(0x0403020207050305), - C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), - C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), - C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), - C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), - C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), - C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), - C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), - C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), - C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), - C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), - C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), - C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), - C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), - C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), - C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), - C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), - C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), - C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), - C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), - C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), - C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), - C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), - C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), - C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), - 
C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), - C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), - C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), - C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), - C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), - C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), - C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), - C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), - C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), - C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), - C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), - C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), - C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), - C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), - C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), - C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), - C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), - C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), - C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), - C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), - C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), - C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), - C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), - C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), - C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), - C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), - C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), - C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), - C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), - C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), - C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), - C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), - C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), - C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), - C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), - C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), - C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), - 
C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), - C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), - C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), - C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), - C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), - C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), - C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), - C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), - C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), - C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), - C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), - C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), - C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), - C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), - C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), - C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), - C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), - C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), - C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), - C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), - C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), - C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), - C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), - C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), - C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), - C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), - C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), - C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), - C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), - C64e(0x8ee947476720e920), C64e(0x2018101038281828), - C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), - C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), - C64e(0x70243838546c246c), C64e(0xaef157575f08f108), - C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), - C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), - C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), - C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), - 
C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), - C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), - C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), - C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), - C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), - C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), - C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), - C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), - C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), - C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), - C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), - C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), - C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), - C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), - C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), - C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), - C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), - C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), - C64e(0x12800909929b809b), C64e(0x34171a1a23391739), - C64e(0xcada65651075da75), C64e(0xb531d7d784533153), - C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), - C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), - C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), - C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), - C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) -}; - -#endif - -static const sph_u64 T4[] = { - C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), - C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), - C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), - C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), - C64e(0xf0c050606090f050), C64e(0x0504030202070503), - C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), - C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), - C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), - C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), - C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), - C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), - 
C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), - C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), - C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), - C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), - C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), - C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), - C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), - C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), - C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), - C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), - C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), - C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), - C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), - C64e(0x14100c08081c140c), C64e(0xf63152959563f652), - C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), - C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), - C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), - C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), - C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), - C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), - C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), - C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), - C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), - C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), - C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), - C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), - C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), - C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), - C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), - C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), - C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), - C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), - C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), - C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), - C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), - C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), - C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), - 
C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), - C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), - C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), - C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), - C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), - C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), - C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), - C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), - C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), - C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), - C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), - C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), - C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), - C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), - C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), - C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), - C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), - C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), - C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), - C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), - C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), - C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), - C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), - C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), - C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), - C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), - C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), - C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), - C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), - C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), - C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), - C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), - C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), - C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), - C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), - C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), - C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), - 
C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), - C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), - C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), - C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), - C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), - C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), - C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), - C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), - C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), - C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), - C64e(0x208ee947476720e9), C64e(0x2820181010382818), - C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), - C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), - C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), - C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), - C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), - C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), - C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), - C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), - C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), - C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), - C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), - C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), - C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), - C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), - C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), - C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), - C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), - C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), - C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), - C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), - C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), - C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), - C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), - C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), - C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), - C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), - 
C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), - C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), - C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), - C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), - C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), - C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) -}; - -#if !SPH_SMALL_FOOTPRINT_GROESTL - -static const sph_u64 T5[] = { - C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), - C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), - C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), - C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), - C64e(0x50f0c050606090f0), C64e(0x0305040302020705), - C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), - C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), - C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), - C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), - C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), - C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), - C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), - C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), - C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), - C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), - C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), - C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), - C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), - C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), - C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), - C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), - C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), - C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), - C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), - C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), - C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), - C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), - C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), - C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), - C64e(0x9bb6369b1b1badb6), 
C64e(0x3d47a53ddfdf9847), - C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), - C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), - C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), - C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), - C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), - C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), - C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), - C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), - C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), - C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), - C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), - C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), - C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), - C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), - C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), - C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), - C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), - C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), - C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), - C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), - C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), - C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), - C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), - C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), - C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), - C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), - C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), - C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), - C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), - C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), - C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), - C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), - C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), - C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), - C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), - C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), - C64e(0xe13867e1bebe8638), 
C64e(0xa2fd6aa23535c8fd), - C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), - C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), - C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), - C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), - C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), - C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), - C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), - C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), - C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), - C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), - C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), - C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), - C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), - C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), - C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), - C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), - C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), - C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), - C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), - C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), - C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), - C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), - C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), - C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), - C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), - C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), - C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), - C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), - C64e(0xe9208ee947476720), C64e(0x1828201810103828), - C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), - C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), - C64e(0x246c70243838546c), C64e(0xf108aef157575f08), - C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), - C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), - C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), - C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), - C64e(0x86911a860d0d9c91), 
C64e(0x85941e850f0f9b94), - C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), - C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), - C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), - C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), - C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), - C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), - C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), - C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), - C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), - C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), - C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), - C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), - C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), - C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), - C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), - C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), - C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), - C64e(0x809b12800909929b), C64e(0x173934171a1a2339), - C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), - C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), - C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), - C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), - C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), - C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) -}; - -static const sph_u64 T6[] = { - C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), - C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), - C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), - C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), - C64e(0xf050f0c050606090), C64e(0x0503050403020207), - C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), - C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), - C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), - C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), - C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), - C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), - C64e(0x40c94007c98e8ece), 
C64e(0x1d0b1ded0bfbfbe6), - C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), - C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), - C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), - C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), - C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), - C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), - C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), - C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), - C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), - C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), - C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), - C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), - C64e(0x140c14100c08081c), C64e(0xf652f63152959563), - C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), - C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), - C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), - C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), - C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), - C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), - C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), - C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), - C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), - C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), - C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), - C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), - C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), - C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), - C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), - C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), - C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), - C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), - C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), - C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), - C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), - C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), - C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), - C64e(0xbd6bbd6d6bbbbb06), 
C64e(0x7e2a7e912ac5c5bb), - C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), - C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), - C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), - C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), - C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), - C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), - C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), - C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), - C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), - C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), - C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), - C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), - C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), - C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), - C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), - C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), - C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), - C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), - C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), - C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), - C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), - C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), - C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), - C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), - C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), - C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), - C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), - C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), - C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), - C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), - C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), - C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), - C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), - C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), - C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), - C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), - C64e(0x2aef2a86ef434369), 
C64e(0xf1a6f193a6c4c435), - C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), - C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), - C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), - C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), - C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), - C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), - C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), - C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), - C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), - C64e(0x20e9208ee9474767), C64e(0x2818282018101038), - C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), - C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), - C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), - C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), - C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), - C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), - C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), - C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), - C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), - C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), - C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), - C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), - C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), - C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), - C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), - C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), - C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), - C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), - C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), - C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), - C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), - C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), - C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), - C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), - C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), - C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), - C64e(0x75da75cada656510), 
C64e(0x533153b531d7d784), - C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), - C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), - C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), - C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), - C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) -}; - -static const sph_u64 T7[] = { - C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), - C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), - C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), - C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), - C64e(0x90f050f0c0506060), C64e(0x0705030504030202), - C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), - C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), - C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), - C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), - C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), - C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), - C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), - C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), - C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), - C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), - C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), - C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), - C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), - C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), - C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), - C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), - C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), - C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), - C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), - C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), - C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), - C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), - C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), - C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), - C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), - C64e(0xa76a266a8126cdcd), 
C64e(0xf5bb69bb9c694e4e), - C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), - C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), - C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), - C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), - C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), - C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), - C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), - C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), - C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), - C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), - C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), - C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), - C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), - C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), - C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), - C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), - C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), - C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), - C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), - C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), - C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), - C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), - C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), - C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), - C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), - C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), - C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), - C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), - C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), - C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), - C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), - C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), - C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), - C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), - C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), - C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), - C64e(0xc74fcc4f0bcc8888), 
C64e(0x654b394b5c392e2e), - C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), - C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), - C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), - C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), - C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), - C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), - C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), - C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), - C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), - C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), - C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), - C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), - C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), - C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), - C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), - C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), - C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), - C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), - C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), - C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), - C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), - C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), - C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), - C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), - C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), - C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), - C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), - C64e(0x6720e9208ee94747), C64e(0x3828182820181010), - C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), - C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), - C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), - C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), - C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), - C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), - C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), - C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), - C64e(0x4bab90abdb90e0e0), 
C64e(0xbac642c6f8427c7c), - C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), - C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), - C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), - C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), - C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), - C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), - C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), - C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), - C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), - C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), - C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), - C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), - C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), - C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), - C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), - C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), - C64e(0x929b809b12800909), C64e(0x2339173934171a1a), - C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), - C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), - C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), - C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), - C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), - C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) -}; - -#endif - -#define DECL_STATE_SMALL \ - sph_u64 H[8]; - -#define READ_STATE_SMALL(sc) do { \ - memcpy(H, (sc)->state.wide, sizeof H); \ - } while (0) - -#define WRITE_STATE_SMALL(sc) do { \ - memcpy((sc)->state.wide, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ R64(T0[B64_1(a[b1])], 8) \ - ^ R64(T0[B64_2(a[b2])], 16) \ - ^ R64(T0[B64_3(a[b3])], 24) \ - ^ T4[B64_4(a[b4])] \ - ^ R64(T4[B64_5(a[b5])], 8) \ - ^ R64(T4[B64_6(a[b6])], 16) \ - ^ R64(T4[B64_7(a[b7])], 24); \ - } while (0) - -#else - -#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ T1[B64_1(a[b1])] \ - ^ 
T2[B64_2(a[b2])] \ - ^ T3[B64_3(a[b3])] \ - ^ T4[B64_4(a[b4])] \ - ^ T5[B64_5(a[b5])] \ - ^ T6[B64_6(a[b6])] \ - ^ T7[B64_7(a[b7])]; \ - } while (0) - -#endif - -#define ROUND_SMALL_P(a, r) do { \ - sph_u64 t[8]; \ - a[0] ^= PC64(0x00, r); \ - a[1] ^= PC64(0x10, r); \ - a[2] ^= PC64(0x20, r); \ - a[3] ^= PC64(0x30, r); \ - a[4] ^= PC64(0x40, r); \ - a[5] ^= PC64(0x50, r); \ - a[6] ^= PC64(0x60, r); \ - a[7] ^= PC64(0x70, r); \ - RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ - RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ - RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ - RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ - RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ - RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ - RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ - RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ - a[0] = t[0]; \ - a[1] = t[1]; \ - a[2] = t[2]; \ - a[3] = t[3]; \ - a[4] = t[4]; \ - a[5] = t[5]; \ - a[6] = t[6]; \ - a[7] = t[7]; \ - } while (0) - -#define ROUND_SMALL_Q(a, r) do { \ - sph_u64 t[8]; \ - a[0] ^= QC64(0x00, r); \ - a[1] ^= QC64(0x10, r); \ - a[2] ^= QC64(0x20, r); \ - a[3] ^= QC64(0x30, r); \ - a[4] ^= QC64(0x40, r); \ - a[5] ^= QC64(0x50, r); \ - a[6] ^= QC64(0x60, r); \ - a[7] ^= QC64(0x70, r); \ - RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ - RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ - RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ - RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ - RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ - RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ - RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ - RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ - a[0] = t[0]; \ - a[1] = t[1]; \ - a[2] = t[2]; \ - a[3] = t[3]; \ - a[4] = t[4]; \ - a[5] = t[5]; \ - a[6] = t[6]; \ - a[7] = t[7]; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_P(a, r); \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_Q(a, r); \ - } while (0) - -#else - -/* - * Apparently, unrolling more than that confuses GCC, resulting in - * lower 
performance, even though L1 cache would be no problem. - */ -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_P(a, r + 0); \ - ROUND_SMALL_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_Q(a, r + 0); \ - ROUND_SMALL_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_SMALL do { \ - sph_u64 g[8], m[8]; \ - size_t u; \ - for (u = 0; u < 8; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_SMALL_P(g); \ - PERM_SMALL_Q(m); \ - for (u = 0; u < 8; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_SMALL do { \ - sph_u64 x[8]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_SMALL_P(x); \ - for (u = 0; u < 8; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#define DECL_STATE_BIG \ - sph_u64 H[16]; - -#define READ_STATE_BIG(sc) do { \ - memcpy(H, (sc)->state.wide, sizeof H); \ - } while (0) - -#define WRITE_STATE_BIG(sc) do { \ - memcpy((sc)->state.wide, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ R64(T0[B64_1(a[b1])], 8) \ - ^ R64(T0[B64_2(a[b2])], 16) \ - ^ R64(T0[B64_3(a[b3])], 24) \ - ^ T4[B64_4(a[b4])] \ - ^ R64(T4[B64_5(a[b5])], 8) \ - ^ R64(T4[B64_6(a[b6])], 16) \ - ^ R64(T4[B64_7(a[b7])], 24); \ - } while (0) - -#else - -#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ T1[B64_1(a[b1])] \ - ^ T2[B64_2(a[b2])] \ - ^ T3[B64_3(a[b3])] \ - ^ T4[B64_4(a[b4])] \ - ^ T5[B64_5(a[b5])] \ - ^ T6[B64_6(a[b6])] \ - ^ T7[B64_7(a[b7])]; \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define ROUND_BIG_P(a, r) do { \ - sph_u64 t[16]; \ - size_t u; \ - a[0x0] ^= PC64(0x00, r); \ - a[0x1] ^= PC64(0x10, r); \ - a[0x2] ^= PC64(0x20, r); \ - a[0x3] ^= PC64(0x30, r); \ - a[0x4] ^= PC64(0x40, r); \ - a[0x5] ^= PC64(0x50, r); \ - a[0x6] ^= 
PC64(0x60, r); \ - a[0x7] ^= PC64(0x70, r); \ - a[0x8] ^= PC64(0x80, r); \ - a[0x9] ^= PC64(0x90, r); \ - a[0xA] ^= PC64(0xA0, r); \ - a[0xB] ^= PC64(0xB0, r); \ - a[0xC] ^= PC64(0xC0, r); \ - a[0xD] ^= PC64(0xD0, r); \ - a[0xE] ^= PC64(0xE0, r); \ - a[0xF] ^= PC64(0xF0, r); \ - for (u = 0; u < 16; u += 4) { \ - RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ - (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ - (u + 5) & 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ - RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ - (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ - (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ - RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ - (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ - (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ - RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ - (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ - (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u64 t[16]; \ - size_t u; \ - a[0x0] ^= QC64(0x00, r); \ - a[0x1] ^= QC64(0x10, r); \ - a[0x2] ^= QC64(0x20, r); \ - a[0x3] ^= QC64(0x30, r); \ - a[0x4] ^= QC64(0x40, r); \ - a[0x5] ^= QC64(0x50, r); \ - a[0x6] ^= QC64(0x60, r); \ - a[0x7] ^= QC64(0x70, r); \ - a[0x8] ^= QC64(0x80, r); \ - a[0x9] ^= QC64(0x90, r); \ - a[0xA] ^= QC64(0xA0, r); \ - a[0xB] ^= QC64(0xB0, r); \ - a[0xC] ^= QC64(0xC0, r); \ - a[0xD] ^= QC64(0xD0, r); \ - a[0xE] ^= QC64(0xE0, r); \ - a[0xF] ^= QC64(0xF0, r); \ - for (u = 0; u < 16; u += 4) { \ - RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \ - (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ - (u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \ - RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ - (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ - (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ - RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ - (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ - (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ - RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ - (u + 8) & 
0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ - (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#else - -#define ROUND_BIG_P(a, r) do { \ - sph_u64 t[16]; \ - a[0x0] ^= PC64(0x00, r); \ - a[0x1] ^= PC64(0x10, r); \ - a[0x2] ^= PC64(0x20, r); \ - a[0x3] ^= PC64(0x30, r); \ - a[0x4] ^= PC64(0x40, r); \ - a[0x5] ^= PC64(0x50, r); \ - a[0x6] ^= PC64(0x60, r); \ - a[0x7] ^= PC64(0x70, r); \ - a[0x8] ^= PC64(0x80, r); \ - a[0x9] ^= PC64(0x90, r); \ - a[0xA] ^= PC64(0xA0, r); \ - a[0xB] ^= PC64(0xB0, r); \ - a[0xC] ^= PC64(0xC0, r); \ - a[0xD] ^= PC64(0xD0, r); \ - a[0xE] ^= PC64(0xE0, r); \ - a[0xF] ^= PC64(0xF0, r); \ - RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ - RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ - RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ - RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ - RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ - RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ - RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ - RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ - RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ - RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ - RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ - RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ - RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ - RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ - RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ - RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ - a[0x0] = t[0x0]; \ - a[0x1] = t[0x1]; \ - a[0x2] = t[0x2]; \ - a[0x3] = t[0x3]; \ - a[0x4] = t[0x4]; \ - a[0x5] = t[0x5]; \ - a[0x6] = t[0x6]; \ - a[0x7] = t[0x7]; \ - a[0x8] = t[0x8]; \ - a[0x9] = t[0x9]; \ - a[0xA] = t[0xA]; \ - a[0xB] = t[0xB]; \ - a[0xC] = t[0xC]; \ - a[0xD] = t[0xD]; \ - a[0xE] = t[0xE]; \ - a[0xF] = t[0xF]; \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u64 t[16]; \ - 
a[0x0] ^= QC64(0x00, r); \ - a[0x1] ^= QC64(0x10, r); \ - a[0x2] ^= QC64(0x20, r); \ - a[0x3] ^= QC64(0x30, r); \ - a[0x4] ^= QC64(0x40, r); \ - a[0x5] ^= QC64(0x50, r); \ - a[0x6] ^= QC64(0x60, r); \ - a[0x7] ^= QC64(0x70, r); \ - a[0x8] ^= QC64(0x80, r); \ - a[0x9] ^= QC64(0x90, r); \ - a[0xA] ^= QC64(0xA0, r); \ - a[0xB] ^= QC64(0xB0, r); \ - a[0xC] ^= QC64(0xC0, r); \ - a[0xD] ^= QC64(0xD0, r); \ - a[0xE] ^= QC64(0xE0, r); \ - a[0xF] ^= QC64(0xF0, r); \ - RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ - RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ - RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ - RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ - RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ - RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ - RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ - RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ - RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ - RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ - RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ - RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ - RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ - RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ - RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ - RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ - a[0x0] = t[0x0]; \ - a[0x1] = t[0x1]; \ - a[0x2] = t[0x2]; \ - a[0x3] = t[0x3]; \ - a[0x4] = t[0x4]; \ - a[0x5] = t[0x5]; \ - a[0x6] = t[0x6]; \ - a[0x7] = t[0x7]; \ - a[0x8] = t[0x8]; \ - a[0x9] = t[0x9]; \ - a[0xA] = t[0xA]; \ - a[0xB] = t[0xB]; \ - a[0xC] = t[0xC]; \ - a[0xD] = t[0xD]; \ - a[0xE] = t[0xE]; \ - a[0xF] = t[0xF]; \ - } while (0) - -#endif - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_P(a, r + 0); \ - ROUND_BIG_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_Q(a, r + 
0); \ - ROUND_BIG_Q(a, r + 1); \ - } \ - } while (0) - -/* obsolete -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define COMPRESS_BIG do { \ - sph_u64 g[16], m[16], *ya; \ - const sph_u64 *yc; \ - size_t u; \ - int i; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - ya = g; \ - yc = CP; \ - for (i = 0; i < 2; i ++) { \ - PERM_BIG(ya, yc); \ - ya = m; \ - yc = CQ; \ - } \ - for (u = 0; u < 16; u ++) { \ - H[u] ^= g[u] ^ m[u]; \ - } \ - } while (0) - -#else -*/ - -#define COMPRESS_BIG do { \ - sph_u64 g[16], m[16]; \ - size_t u; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_BIG_P(g); \ - PERM_BIG_Q(m); \ - for (u = 0; u < 16; u ++) { \ - H[u] ^= g[u] ^ m[u]; \ - } \ - } while (0) - -/* obsolete -#endif -*/ - -#define FINAL_BIG do { \ - sph_u64 x[16]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_BIG_P(x); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#else - -static const sph_u32 T0up[] = { - C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), - C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), - C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), - C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), - C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), - C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), - C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), - C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), - C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), - C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), - C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), - C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), - C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), 
C32e(0x9d7fe25e), - C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), - C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), - C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), - C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), - C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), - C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), - C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), - C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), - C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), - C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), - C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), - C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), - C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), - C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), - C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), - C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), - C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), - C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), - C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), - C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), - C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), - C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), - C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), - C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), - C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), - C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), - C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), 
C32e(0xad379a76), - C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), - C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), - C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), - C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), - C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), - C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), - C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), - C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), - C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), - C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), - C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), - C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), - C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), - C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), - C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), - C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), - C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), - C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), - C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), - C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), - C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), - C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), - C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), - C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) -}; - -static const sph_u32 T0dn[] = { - C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), - C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), - C32e(0xf0c05060), 
C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), - C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), - C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), - C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), - C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), - C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), - C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), - C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), - C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), - C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), - C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), - C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), - C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), - C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), - C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), - C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), - C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), - C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), - C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), - C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), - C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), - C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), - C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), - C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), - C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), - C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), - C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), - C32e(0xec7ead3f), 
C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), - C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), - C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), - C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), - C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), - C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), - C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), - C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), - C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), - C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), - C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), - C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), - C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), - C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), - C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), - C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), - C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), - C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), - C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), - C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), - C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), - C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), - C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), - C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), - C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), - C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), - C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), - C32e(0x48a938d9), 
C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), - C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), - C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), - C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), - C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), - C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), - C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), - C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) -}; - -static const sph_u32 T1up[] = { - C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), - C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), - C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), - C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), - C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), - C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), - C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), - C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), - C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), - C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), - C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), - C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), - C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), - C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), - C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), - C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), - C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), - C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), - C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), 
C32e(0x7d7d3449), - C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), - C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), - C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), - C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), - C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), - C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), - C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), - C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), - C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), - C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), - C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), - C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), - C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), - C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), - C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), - C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), - C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), - C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), - C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), - C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), - C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), - C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), - C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), - C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), - C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), - C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), - C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), 
C32e(0x4949723b), - C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), - C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), - C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), - C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), - C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), - C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), - C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), - C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), - C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), - C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), - C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), - C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), - C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), - C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), - C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), - C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), - C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), - C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) -}; - -static const sph_u32 T1dn[] = { - C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), - C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), - C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), - C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), - C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), - C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), - C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), - C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), - C32e(0xc25deac2), 
C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), - C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), - C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), - C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), - C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), - C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), - C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), - C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), - C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), - C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), - C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), - C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), - C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), - C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), - C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), - C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), - C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), - C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), - C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), - C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), - C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), - C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), - C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), - C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), - C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), - C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), - C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), - C32e(0xacef8bac), 
C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), - C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), - C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), - C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), - C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), - C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), - C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), - C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), - C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), - C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), - C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), - C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), - C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), - C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), - C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), - C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), - C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), - C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), - C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), - C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), - C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), - C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), - C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), - C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), - C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), - C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), - C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), - C32e(0xc35e1fc3), 
C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), - C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) -}; - -static const sph_u32 T2up[] = { - C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), - C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), - C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), - C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), - C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), - C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), - C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), - C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), - C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), - C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), - C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), - C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), - C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), - C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), - C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), - C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), - C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), - C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), - C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), - C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), - C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), - C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), - C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), - C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), - C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), 
C32e(0x16ededd7), - C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), - C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), - C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), - C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), - C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), - C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), - C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), - C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), - C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), - C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), - C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), - C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), - C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), - C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), - C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), - C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), - C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), - C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), - C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), - C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), - C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), - C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), - C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), - C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), - C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), - C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), - C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), 
C32e(0x850f0f9b), - C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), - C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), - C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), - C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), - C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), - C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), - C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), - C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), - C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), - C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), - C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), - C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) -}; - -static const sph_u32 T2dn[] = { - C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), - C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), - C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), - C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), - C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), - C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), - C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), - C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), - C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), - C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), - C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), - C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), - C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), - C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), - C32e(0x1b091b1c), 
C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), - C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), - C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), - C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), - C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), - C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), - C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), - C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), - C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), - C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), - C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), - C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), - C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), - C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), - C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), - C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), - C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), - C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), - C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), - C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), - C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), - C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), - C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), - C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), - C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), - C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), - C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), - C32e(0x76db763f), 
C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), - C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), - C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), - C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), - C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), - C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), - C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), - C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), - C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), - C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), - C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), - C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), - C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), - C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), - C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), - C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), - C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), - C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), - C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), - C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), - C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), - C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), - C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) -}; - -static const sph_u32 T3up[] = { - C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), - C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), - C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), - C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), 
C32e(0xc39aecec), - C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), - C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), - C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), - C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), - C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), - C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), - C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), - C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), - C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), - C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), - C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), - C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), - C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), - C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), - C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), - C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), - C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), - C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), - C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), - C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), - C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), - C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), - C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), - C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), - C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), - C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), - C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), 
C32e(0x84634242), - C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), - C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), - C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), - C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), - C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), - C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), - C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), - C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), - C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), - C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), - C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), - C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), - C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), - C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), - C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), - C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), - C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), - C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), - C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), - C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), - C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), - C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), - C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), - C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), - C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), - C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), - C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), 
C32e(0x66a73333), - C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), - C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), - C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), - C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), - C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), - C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) -}; - -static const sph_u32 T3dn[] = { - C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), - C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), - C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), - C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), - C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), - C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), - C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), - C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), - C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), - C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), - C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), - C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), - C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), - C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), - C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), - C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), - C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), - C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), - C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), - C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), - C32e(0xa204f504), 
C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), - C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), - C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), - C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), - C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), - C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), - C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), - C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), - C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), - C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), - C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), - C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), - C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), - C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), - C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), - C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), - C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), - C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), - C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), - C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), - C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), - C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), - C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), - C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), - C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), - C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), - C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), - C32e(0x20eaafea), 
C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), - C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), - C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), - C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), - C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), - C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), - C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), - C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), - C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), - C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), - C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), - C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), - C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), - C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), - C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), - C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), - C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) -}; - -#define DECL_STATE_SMALL \ - sph_u32 H[16]; - -#define READ_STATE_SMALL(sc) do { \ - memcpy(H, (sc)->state.narrow, sizeof H); \ - } while (0) - -#define WRITE_STATE_SMALL(sc) do { \ - memcpy((sc)->state.narrow, H, sizeof H); \ - } while (0) - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] \ - ^ T2up[B32_2(a[b2])] \ - ^ T3up[B32_3(a[b3])] \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ T2dn[B32_2(a[b6])] \ - ^ T3dn[B32_3(a[b7])]; \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ T2dn[B32_2(a[b2])] \ - ^ T3dn[B32_3(a[b3])] \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ T2up[B32_2(a[b6])] 
\ - ^ T3up[B32_3(a[b7])]; \ - } while (0) - -#define ROUND_SMALL_P(a, r) do { \ - sph_u32 t[16]; \ - a[0x0] ^= PC32up(0x00, r); \ - a[0x1] ^= PC32dn(0x00, r); \ - a[0x2] ^= PC32up(0x10, r); \ - a[0x3] ^= PC32dn(0x10, r); \ - a[0x4] ^= PC32up(0x20, r); \ - a[0x5] ^= PC32dn(0x20, r); \ - a[0x6] ^= PC32up(0x30, r); \ - a[0x7] ^= PC32dn(0x30, r); \ - a[0x8] ^= PC32up(0x40, r); \ - a[0x9] ^= PC32dn(0x40, r); \ - a[0xA] ^= PC32up(0x50, r); \ - a[0xB] ^= PC32dn(0x50, r); \ - a[0xC] ^= PC32up(0x60, r); \ - a[0xD] ^= PC32dn(0x60, r); \ - a[0xE] ^= PC32up(0x70, r); \ - a[0xF] ^= PC32dn(0x70, r); \ - RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); \ - RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ - RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ - RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ - RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ - RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ - RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ - RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_SMALL_Q(a, r) do { \ - sph_u32 t[16]; \ - a[0x0] ^= QC32up(0x00, r); \ - a[0x1] ^= QC32dn(0x00, r); \ - a[0x2] ^= QC32up(0x10, r); \ - a[0x3] ^= QC32dn(0x10, r); \ - a[0x4] ^= QC32up(0x20, r); \ - a[0x5] ^= QC32dn(0x20, r); \ - a[0x6] ^= QC32up(0x30, r); \ - a[0x7] ^= QC32dn(0x30, r); \ - a[0x8] ^= QC32up(0x40, r); \ - a[0x9] ^= QC32dn(0x40, r); \ - a[0xA] ^= QC32up(0x50, r); \ - a[0xB] ^= QC32dn(0x50, r); \ - a[0xC] ^= QC32up(0x60, r); \ - a[0xD] ^= QC32dn(0x60, r); \ - a[0xE] ^= QC32up(0x70, r); \ - a[0xF] ^= QC32dn(0x70, r); \ - RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ - RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ - RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ - RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ - RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 
0x1, 0x5); \ - RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ - RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ - RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_P(a, r); \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_Q(a, r); \ - } while (0) - -#else - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_P(a, r + 0); \ - ROUND_SMALL_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_Q(a, r + 0); \ - ROUND_SMALL_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_SMALL do { \ - sph_u32 g[16], m[16]; \ - size_t u; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec32e_aligned(buf + (u << 2)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_SMALL_P(g); \ - PERM_SMALL_Q(m); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_SMALL do { \ - sph_u32 x[16]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_SMALL_P(x); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#define DECL_STATE_BIG \ - sph_u32 H[32]; - -#define READ_STATE_BIG(sc) do { \ - memcpy(H, (sc)->state.narrow, sizeof H); \ - } while (0) - -#define WRITE_STATE_BIG(sc) do { \ - memcpy((sc)->state.narrow, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - sph_u32 fu2 = T0up[B32_2(a[b2])]; \ - sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ - sph_u32 fu3 = T1up[B32_3(a[b3])]; \ - sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ - sph_u32 fu6 = T0up[B32_2(a[b6])]; \ - sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ - sph_u32 fu7 = T1up[B32_3(a[b7])]; \ - sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] 
\ - ^ R32u(fu2, fd2) \ - ^ R32u(fu3, fd3) \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ R32d(fu6, fd6) \ - ^ R32d(fu7, fd7); \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ R32d(fu2, fd2) \ - ^ R32d(fu3, fd3) \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ R32u(fu6, fd6) \ - ^ R32u(fu7, fd7); \ - } while (0) - -#else - -#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] \ - ^ T2up[B32_2(a[b2])] \ - ^ T3up[B32_3(a[b3])] \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ T2dn[B32_2(a[b6])] \ - ^ T3dn[B32_3(a[b7])]; \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ T2dn[B32_2(a[b2])] \ - ^ T3dn[B32_3(a[b3])] \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ T2up[B32_2(a[b6])] \ - ^ T3up[B32_3(a[b7])]; \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define ROUND_BIG_P(a, r) do { \ - sph_u32 t[32]; \ - size_t u; \ - a[0x00] ^= PC32up(0x00, r); \ - a[0x01] ^= PC32dn(0x00, r); \ - a[0x02] ^= PC32up(0x10, r); \ - a[0x03] ^= PC32dn(0x10, r); \ - a[0x04] ^= PC32up(0x20, r); \ - a[0x05] ^= PC32dn(0x20, r); \ - a[0x06] ^= PC32up(0x30, r); \ - a[0x07] ^= PC32dn(0x30, r); \ - a[0x08] ^= PC32up(0x40, r); \ - a[0x09] ^= PC32dn(0x40, r); \ - a[0x0A] ^= PC32up(0x50, r); \ - a[0x0B] ^= PC32dn(0x50, r); \ - a[0x0C] ^= PC32up(0x60, r); \ - a[0x0D] ^= PC32dn(0x60, r); \ - a[0x0E] ^= PC32up(0x70, r); \ - a[0x0F] ^= PC32dn(0x70, r); \ - a[0x10] ^= PC32up(0x80, r); \ - a[0x11] ^= PC32dn(0x80, r); \ - a[0x12] ^= PC32up(0x90, r); \ - a[0x13] ^= PC32dn(0x90, r); \ - a[0x14] ^= PC32up(0xA0, r); \ - a[0x15] ^= PC32dn(0xA0, r); \ - a[0x16] ^= PC32up(0xB0, r); \ - a[0x17] ^= PC32dn(0xB0, r); \ - a[0x18] ^= PC32up(0xC0, r); \ - a[0x19] ^= PC32dn(0xC0, r); \ - a[0x1A] ^= PC32up(0xD0, r); \ - a[0x1B] ^= PC32dn(0xD0, r); \ - a[0x1C] ^= PC32up(0xE0, r); \ - a[0x1D] ^= PC32dn(0xE0, r); \ - a[0x1E] ^= PC32up(0xF0, r); \ - a[0x1F] ^= PC32dn(0xF0, r); \ - for (u = 0; u 
< 32; u += 8) { \ - RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ - u + 0x00, (u + 0x02) & 0x1F, \ - (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ - (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ - RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ - u + 0x02, (u + 0x04) & 0x1F, \ - (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ - (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ - RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ - u + 0x04, (u + 0x06) & 0x1F, \ - (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ - (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ - RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ - u + 0x06, (u + 0x08) & 0x1F, \ - (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ - (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u32 t[32]; \ - size_t u; \ - a[0x00] ^= QC32up(0x00, r); \ - a[0x01] ^= QC32dn(0x00, r); \ - a[0x02] ^= QC32up(0x10, r); \ - a[0x03] ^= QC32dn(0x10, r); \ - a[0x04] ^= QC32up(0x20, r); \ - a[0x05] ^= QC32dn(0x20, r); \ - a[0x06] ^= QC32up(0x30, r); \ - a[0x07] ^= QC32dn(0x30, r); \ - a[0x08] ^= QC32up(0x40, r); \ - a[0x09] ^= QC32dn(0x40, r); \ - a[0x0A] ^= QC32up(0x50, r); \ - a[0x0B] ^= QC32dn(0x50, r); \ - a[0x0C] ^= QC32up(0x60, r); \ - a[0x0D] ^= QC32dn(0x60, r); \ - a[0x0E] ^= QC32up(0x70, r); \ - a[0x0F] ^= QC32dn(0x70, r); \ - a[0x10] ^= QC32up(0x80, r); \ - a[0x11] ^= QC32dn(0x80, r); \ - a[0x12] ^= QC32up(0x90, r); \ - a[0x13] ^= QC32dn(0x90, r); \ - a[0x14] ^= QC32up(0xA0, r); \ - a[0x15] ^= QC32dn(0xA0, r); \ - a[0x16] ^= QC32up(0xB0, r); \ - a[0x17] ^= QC32dn(0xB0, r); \ - a[0x18] ^= QC32up(0xC0, r); \ - a[0x19] ^= QC32dn(0xC0, r); \ - a[0x1A] ^= QC32up(0xD0, r); \ - a[0x1B] ^= QC32dn(0xD0, r); \ - a[0x1C] ^= QC32up(0xE0, r); \ - a[0x1D] ^= QC32dn(0xE0, r); \ - a[0x1E] ^= QC32up(0xF0, r); \ - a[0x1F] ^= QC32dn(0xF0, r); \ - for (u = 0; u < 32; u += 8) { \ - 
RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ - (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ - (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ - (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ - (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ - RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ - (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ - (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ - (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ - (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ - RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ - (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ - (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ - (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ - RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ - (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ - (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ - (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#else - -#define ROUND_BIG_P(a, r) do { \ - sph_u32 t[32]; \ - a[0x00] ^= PC32up(0x00, r); \ - a[0x01] ^= PC32dn(0x00, r); \ - a[0x02] ^= PC32up(0x10, r); \ - a[0x03] ^= PC32dn(0x10, r); \ - a[0x04] ^= PC32up(0x20, r); \ - a[0x05] ^= PC32dn(0x20, r); \ - a[0x06] ^= PC32up(0x30, r); \ - a[0x07] ^= PC32dn(0x30, r); \ - a[0x08] ^= PC32up(0x40, r); \ - a[0x09] ^= PC32dn(0x40, r); \ - a[0x0A] ^= PC32up(0x50, r); \ - a[0x0B] ^= PC32dn(0x50, r); \ - a[0x0C] ^= PC32up(0x60, r); \ - a[0x0D] ^= PC32dn(0x60, r); \ - a[0x0E] ^= PC32up(0x70, r); \ - a[0x0F] ^= PC32dn(0x70, r); \ - a[0x10] ^= PC32up(0x80, r); \ - a[0x11] ^= PC32dn(0x80, r); \ - a[0x12] ^= PC32up(0x90, r); \ - a[0x13] ^= PC32dn(0x90, r); \ - a[0x14] ^= PC32up(0xA0, r); \ - a[0x15] ^= PC32dn(0xA0, r); \ - a[0x16] ^= PC32up(0xB0, r); \ - a[0x17] ^= PC32dn(0xB0, r); \ - a[0x18] ^= PC32up(0xC0, r); \ - a[0x19] ^= PC32dn(0xC0, r); \ - a[0x1A] ^= PC32up(0xD0, r); \ - a[0x1B] ^= PC32dn(0xD0, r); \ - a[0x1C] ^= PC32up(0xE0, r); \ - a[0x1D] ^= PC32dn(0xE0, r); \ - a[0x1E] ^= PC32up(0xF0, r); \ - a[0x1F] ^= PC32dn(0xF0, r); \ - RBTT(0x00, 0x01, a, \ - 
0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ - RBTT(0x02, 0x03, a, \ - 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ - RBTT(0x04, 0x05, a, \ - 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ - RBTT(0x06, 0x07, a, \ - 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ - RBTT(0x08, 0x09, a, \ - 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ - RBTT(0x0A, 0x0B, a, \ - 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ - RBTT(0x0C, 0x0D, a, \ - 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ - RBTT(0x0E, 0x0F, a, \ - 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ - RBTT(0x10, 0x11, a, \ - 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ - RBTT(0x12, 0x13, a, \ - 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ - RBTT(0x14, 0x15, a, \ - 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ - RBTT(0x16, 0x17, a, \ - 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ - RBTT(0x18, 0x19, a, \ - 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ - RBTT(0x1A, 0x1B, a, \ - 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ - RBTT(0x1C, 0x1D, a, \ - 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ - RBTT(0x1E, 0x1F, a, \ - 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u32 t[32]; \ - a[0x00] ^= QC32up(0x00, r); \ - a[0x01] ^= QC32dn(0x00, r); \ - a[0x02] ^= QC32up(0x10, r); \ - a[0x03] ^= QC32dn(0x10, r); \ - a[0x04] ^= QC32up(0x20, r); \ - a[0x05] ^= QC32dn(0x20, r); \ - a[0x06] ^= QC32up(0x30, r); \ - a[0x07] ^= QC32dn(0x30, r); \ - a[0x08] ^= QC32up(0x40, r); \ - a[0x09] ^= QC32dn(0x40, r); \ - a[0x0A] ^= QC32up(0x50, r); \ - a[0x0B] ^= QC32dn(0x50, r); \ - a[0x0C] ^= QC32up(0x60, r); \ - a[0x0D] ^= QC32dn(0x60, r); \ - a[0x0E] ^= QC32up(0x70, r); \ - a[0x0F] ^= QC32dn(0x70, r); \ - a[0x10] ^= QC32up(0x80, r); \ - a[0x11] ^= QC32dn(0x80, r); \ - a[0x12] ^= QC32up(0x90, r); \ - a[0x13] ^= QC32dn(0x90, r); \ - a[0x14] ^= QC32up(0xA0, r); \ - a[0x15] ^= 
QC32dn(0xA0, r); \ - a[0x16] ^= QC32up(0xB0, r); \ - a[0x17] ^= QC32dn(0xB0, r); \ - a[0x18] ^= QC32up(0xC0, r); \ - a[0x19] ^= QC32dn(0xC0, r); \ - a[0x1A] ^= QC32up(0xD0, r); \ - a[0x1B] ^= QC32dn(0xD0, r); \ - a[0x1C] ^= QC32up(0xE0, r); \ - a[0x1D] ^= QC32dn(0xE0, r); \ - a[0x1E] ^= QC32up(0xF0, r); \ - a[0x1F] ^= QC32dn(0xF0, r); \ - RBTT(0x00, 0x01, a, \ - 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ - RBTT(0x02, 0x03, a, \ - 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ - RBTT(0x04, 0x05, a, \ - 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ - RBTT(0x06, 0x07, a, \ - 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ - RBTT(0x08, 0x09, a, \ - 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ - RBTT(0x0A, 0x0B, a, \ - 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ - RBTT(0x0C, 0x0D, a, \ - 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ - RBTT(0x0E, 0x0F, a, \ - 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ - RBTT(0x10, 0x11, a, \ - 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ - RBTT(0x12, 0x13, a, \ - 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ - RBTT(0x14, 0x15, a, \ - 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ - RBTT(0x16, 0x17, a, \ - 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ - RBTT(0x18, 0x19, a, \ - 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ - RBTT(0x1A, 0x1B, a, \ - 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ - RBTT(0x1C, 0x1D, a, \ - 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \ - RBTT(0x1E, 0x1F, a, \ - 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r ++) \ - ROUND_BIG_P(a, r); \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r ++) \ - ROUND_BIG_Q(a, r); \ - } while (0) - -#else - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_P(a, r + 0); \ - 
ROUND_BIG_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_Q(a, r + 0); \ - ROUND_BIG_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_BIG do { \ - sph_u32 g[32], m[32]; \ - size_t u; \ - for (u = 0; u < 32; u ++) { \ - m[u] = dec32e_aligned(buf + (u << 2)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_BIG_P(g); \ - PERM_BIG_Q(m); \ - for (u = 0; u < 32; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_BIG do { \ - sph_u32 x[32]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_BIG_P(x); \ - for (u = 0; u < 32; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#endif - -static void -groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) -{ - size_t u; - - sc->ptr = 0; -#if SPH_GROESTL_64 - for (u = 0; u < 7; u ++) - sc->state.wide[u] = 0; -#if USE_LE - sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56) - | ((sph_u64)(out_size & 0xFF00) << 40); -#else - sc->state.wide[7] = (sph_u64)out_size; -#endif -#else - for (u = 0; u < 15; u ++) - sc->state.narrow[u] = 0; -#if USE_LE - sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24) - | ((sph_u32)(out_size & 0xFF00) << 8); -#else - sc->state.narrow[15] = (sph_u32)out_size; -#endif -#endif -#if SPH_64 - sc->count = 0; -#else - sc->count_high = 0; - sc->count_low = 0; -#endif -} - -static void -groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE_SMALL - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE_SMALL(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - COMPRESS_SMALL; -#if SPH_64 - sc->count ++; -#else - if 
((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) - sc->count_high = SPH_T32(sc->count_high + 1); -#endif - ptr = 0; - } - } - WRITE_STATE_SMALL(sc); - sc->ptr = ptr; -} - -static void -groestl_small_close(sph_groestl_small_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_len) -{ - unsigned char *buf; - unsigned char pad[72]; - size_t u, ptr, pad_len; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif - unsigned z; - DECL_STATE_SMALL - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - pad[0] = ((ub & -z) | z) & 0xFF; - if (ptr < 56) { - pad_len = 64 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 1); -#else - count_low = SPH_T32(sc->count_low + 1); - count_high = SPH_T32(sc->count_high); - if (count_low == 0) - count_high = SPH_T32(count_high + 1); -#endif - } else { - pad_len = 128 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 2); -#else - count_low = SPH_T32(sc->count_low + 2); - count_high = SPH_T32(sc->count_high); - if (count_low <= 1) - count_high = SPH_T32(count_high + 1); -#endif - } - memset(pad + 1, 0, pad_len - 9); -#if SPH_64 - sph_enc64be(pad + pad_len - 8, count); -#else - sph_enc64be(pad + pad_len - 8, count_high); - sph_enc64be(pad + pad_len - 4, count_low); -#endif - groestl_small_core(sc, pad, pad_len); - READ_STATE_SMALL(sc); - FINAL_SMALL; -#if SPH_GROESTL_64 - for (u = 0; u < 4; u ++) - enc64e(pad + (u << 3), H[u + 4]); -#else - for (u = 0; u < 8; u ++) - enc32e(pad + (u << 2), H[u + 8]); -#endif - memcpy(dst, pad + 32 - out_len, out_len); - groestl_small_init(sc, (unsigned)out_len << 3); -} - -static void -groestl_big_init(sph_groestl_big_context *sc, unsigned out_size) -{ - size_t u; - - sc->ptr = 0; -#if SPH_GROESTL_64 - for (u = 0; u < 15; u ++) - sc->state.wide[u] = 0; -#if USE_LE - sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56) - | ((sph_u64)(out_size & 0xFF00) << 40); -#else - sc->state.wide[15] = (sph_u64)out_size; -#endif -#else - for (u = 0; u < 31; u ++) - sc->state.narrow[u] = 
0; -#if USE_LE - sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24) - | ((sph_u32)(out_size & 0xFF00) << 8); -#else - sc->state.narrow[31] = (sph_u32)out_size; -#endif -#endif -#if SPH_64 - sc->count = 0; -#else - sc->count_high = 0; - sc->count_low = 0; -#endif -} - -static void -groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE_BIG - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE_BIG(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - COMPRESS_BIG; -#if SPH_64 - sc->count ++; -#else - if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) - sc->count_high = SPH_T32(sc->count_high + 1); -#endif - ptr = 0; - } - } - WRITE_STATE_BIG(sc); - sc->ptr = ptr; -} - -static void -groestl_big_close(sph_groestl_big_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_len) -{ - unsigned char *buf; - unsigned char pad[136]; - size_t ptr, pad_len, u; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif - unsigned z; - DECL_STATE_BIG - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - pad[0] = ((ub & -z) | z) & 0xFF; - if (ptr < 120) { - pad_len = 128 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 1); -#else - count_low = SPH_T32(sc->count_low + 1); - count_high = SPH_T32(sc->count_high); - if (count_low == 0) - count_high = SPH_T32(count_high + 1); -#endif - } else { - pad_len = 256 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 2); -#else - count_low = SPH_T32(sc->count_low + 2); - count_high = SPH_T32(sc->count_high); - if (count_low <= 1) - count_high = SPH_T32(count_high + 1); -#endif - } - memset(pad + 1, 0, pad_len - 9); - //fprintf(stderr, 
"%x\n", pad_len); -#if SPH_64 - sph_enc64be(pad + pad_len - 8, count); -#else - sph_enc64be(pad + pad_len - 8, count_high); - sph_enc64be(pad + pad_len - 4, count_low); -#endif - groestl_big_core(sc, pad, pad_len); - READ_STATE_BIG(sc); - FINAL_BIG; -#if SPH_GROESTL_64 - for (u = 0; u < 8; u ++) - enc64e(pad + (u << 3), H[u + 8]); -#else - for (u = 0; u < 16; u ++) - enc32e(pad + (u << 2), H[u + 16]); -#endif - memcpy(dst, pad + 64 - out_len, out_len); - groestl_big_init(sc, (unsigned)out_len << 3); -} - -/* see sph_groestl.h */ -void -sph_groestl224_init(void *cc) -{ - groestl_small_init(cc, 224); -} - -/* see sph_groestl.h */ -void -sph_groestl224(void *cc, const void *data, size_t len) -{ - groestl_small_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl224_close(void *cc, void *dst) -{ - groestl_small_close(cc, 0, 0, dst, 28); -} - -/* see sph_groestl.h */ -void -sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_small_close(cc, ub, n, dst, 28); -} - -/* see sph_groestl.h */ -void -sph_groestl256_init(void *cc) -{ - groestl_small_init(cc, 256); -} - -/* see sph_groestl.h */ -void -sph_groestl256(void *cc, const void *data, size_t len) -{ - groestl_small_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl256_close(void *cc, void *dst) -{ - groestl_small_close(cc, 0, 0, dst, 32); -} - -/* see sph_groestl.h */ -void -sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_small_close(cc, ub, n, dst, 32); -} - -/* see sph_groestl.h */ -void -sph_groestl384_init(void *cc) -{ - groestl_big_init(cc, 384); -} - -/* see sph_groestl.h */ -void -sph_groestl384(void *cc, const void *data, size_t len) -{ - groestl_big_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl384_close(void *cc, void *dst) -{ - groestl_big_close(cc, 0, 0, dst, 48); -} - -/* see sph_groestl.h */ -void -sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, 
void *dst) -{ - groestl_big_close(cc, ub, n, dst, 48); -} - -/* see sph_groestl.h */ -void -sph_groestl512_init(void *cc) -{ - groestl_big_init(cc, 512); -} - -/* see sph_groestl.h */ -void -sph_groestl512(void *cc, const void *data, size_t len) -{ - groestl_big_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl512_close(void *cc, void *dst) -{ - groestl_big_close(cc, 0, 0, dst, 64); -} - -/* see sph_groestl.h */ -void -sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_big_close(cc, ub, n, dst, 64); -} - -#ifdef __cplusplus -} -#endif +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ +#include +#include +#include + +#include "sph_groestl.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL +#define SPH_SMALL_FOOTPRINT_GROESTL 1 +#endif + +/* + * Apparently, the 32-bit-only version is not faster than the 64-bit + * version unless using the "small footprint" code on a 32-bit machine. + */ +#if !defined SPH_GROESTL_64 +#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE +#define SPH_GROESTL_64 0 +#else +#define SPH_GROESTL_64 1 +#endif +#endif + +#if !SPH_64 +#undef SPH_GROESTL_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/* + * The internal representation may use either big-endian or + * little-endian. Using the platform default representation speeds up + * encoding and decoding between bytes and the matrix columns. + */ + +#undef USE_LE +#if SPH_GROESTL_LITTLE_ENDIAN +#define USE_LE 1 +#elif SPH_GROESTL_BIG_ENDIAN +#define USE_LE 0 +#elif SPH_LITTLE_ENDIAN +#define USE_LE 1 +#endif + +#if USE_LE + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le +#define B32_0(x) ((x) & 0xFF) +#define B32_1(x) (((x) >> 8) & 0xFF) +#define B32_2(x) (((x) >> 16) & 0xFF) +#define B32_3(x) ((x) >> 24) + +#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) +#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | 
((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define B32_0(x) ((x) >> 24) +#define B32_1(x) (((x) >> 16) & 0xFF) +#define B32_2(x) (((x) >> 8) & 0xFF) +#define B32_3(x) ((x) & 0xFF) + +#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) +#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j))) + +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#define B64_0(x) ((x) >> 56) +#define B64_1(x) (((x) >> 48) & 0xFF) +#define B64_2(x) (((x) >> 40) & 0xFF) +#define B64_3(x) (((x) >> 32) & 0xFF) +#define B64_4(x) (((x) >> 24) & 0xFF) +#define B64_5(x) (((x) >> 16) & 0xFF) +#define B64_6(x) (((x) >> 8) & 0xFF) +#define B64_7(x) ((x) & 0xFF) +#define R64 SPH_ROTR64 +#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) +#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) +#endif + +#endif + +#if SPH_GROESTL_64 + +static const sph_u64 T0[] = { + C64e(0xc632f4a5f497a5c6), 
C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), 
C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), 
C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), 
C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), + C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + 
C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + 
C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + 
C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; + +static const sph_u64 T2[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + 
C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + 
C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + 
C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + 
C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +static const sph_u64 T3[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + 
C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + 
C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + 
C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +#endif + +static const sph_u64 T4[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + 
C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + 
C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + 
C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + 
C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T5[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), 
C64e(0xf407a2f451515607), + C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), 
C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), 
C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +static const sph_u64 T6[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), 
C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), 
C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), 
C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), 
C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +static const sph_u64 T7[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), 
C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), 
C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), 
C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; + +#endif + +#define DECL_STATE_SMALL \ + sph_u64 H[8]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } 
while (0) + /* RSTT(d, a, b0..b7): stores into t[d] the XOR of eight table lookups, indexed by B64_0(a[b0]) through B64_7(a[b7]). A t[] array must be in scope at the expansion site. */ +#if SPH_SMALL_FOOTPRINT_GROESTL + /* Small-footprint variant: only tables T0 and T4 are referenced; the entries for byte positions 1-3 and 5-7 are passed through R64 with 8, 16 and 24. */ +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + /* Full-table variant: byte position k is looked up in its own table Tk, for k = 0..7. */ +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + /* ROUND_SMALL_P(a, r): one round on the 8-word array a, modified in place. Step 1: XOR PC64(0x00, r) .. PC64(0x70, r) into a[0] .. a[7]. Step 2: build t[d] with RSTT, where output word d reads a[d], a[d+1], ..., a[d+7] (indices taken mod 8). Step 3: copy t[0..7] back into a[0..7]. */ +#define ROUND_SMALL_P(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + /* ROUND_SMALL_Q(a, r): same three steps as ROUND_SMALL_P, but the constants come from QC64 and output word d reads a[d+1], a[d+3], a[d+5], a[d+7], a[d], a[d+2], a[d+4], a[d+6] (indices taken mod 8). */ +#define ROUND_SMALL_Q(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + 
a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +/* + * Apparently, unrolling more than that confuses GCC, resulting in + * lower performance, even though L1 cache would be no problem. + */ +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u64 g[8], m[8]; \ + size_t u; \ + for (u = 0; u < 8; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u64 x[8]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u64 H[16]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = 
T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ + (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ + (u + 5) & 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ + RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ + (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ + (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ + RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ + (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ + (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ + RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ + (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ + (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, 
a, (u + 1) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ + (u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \ + RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ + (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ + (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ + RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ + (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ + (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ + RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ + (u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(0xE, a, 0xE, 0xF, 0x0, 
0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ + RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ + RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = 
t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#endif + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +/* obsolete +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16], *ya; \ + const sph_u64 *yc; \ + size_t u; \ + int i; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + ya = g; \ + yc = CP; \ + for (i = 0; i < 2; i ++) { \ + PERM_BIG(ya, yc); \ + ya = m; \ + yc = CQ; \ + } \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +#else +*/ + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +/* obsolete +#endif +*/ + +#define FINAL_BIG do { \ + sph_u64 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#else + +static const sph_u32 T0up[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + 
C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + 
C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + 
C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +static const sph_u32 T0dn[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), 
C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), 
C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +static const sph_u32 T1up[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + 
C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + 
C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +static const sph_u32 T1dn[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), 
C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), 
C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), 
C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +static const sph_u32 T2up[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + 
C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + 
C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +static const sph_u32 T2dn[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), 
C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), 
C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), 
C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +static const sph_u32 T3up[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + 
C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + 
C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +static const sph_u32 T3dn[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), 
C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), 
C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +#define DECL_STATE_SMALL \ + sph_u32 H[16]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#define 
XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#define ROUND_SMALL_P(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= PC32up(0x00, r); \ + a[0x1] ^= PC32dn(0x00, r); \ + a[0x2] ^= PC32up(0x10, r); \ + a[0x3] ^= PC32dn(0x10, r); \ + a[0x4] ^= PC32up(0x20, r); \ + a[0x5] ^= PC32dn(0x20, r); \ + a[0x6] ^= PC32up(0x30, r); \ + a[0x7] ^= PC32dn(0x30, r); \ + a[0x8] ^= PC32up(0x40, r); \ + a[0x9] ^= PC32dn(0x40, r); \ + a[0xA] ^= PC32up(0x50, r); \ + a[0xB] ^= PC32dn(0x50, r); \ + a[0xC] ^= PC32up(0x60, r); \ + a[0xD] ^= PC32dn(0x60, r); \ + a[0xE] ^= PC32up(0x70, r); \ + a[0xF] ^= PC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); \ + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= QC32up(0x00, r); \ + a[0x1] ^= QC32dn(0x00, r); \ + a[0x2] ^= QC32up(0x10, r); \ + a[0x3] ^= QC32dn(0x10, r); \ + a[0x4] ^= QC32up(0x20, r); \ + a[0x5] ^= QC32dn(0x20, r); \ + a[0x6] ^= QC32up(0x30, r); \ + a[0x7] ^= QC32dn(0x30, r); \ + a[0x8] ^= QC32up(0x40, r); \ + a[0x9] ^= 
QC32dn(0x40, r); \ + a[0xA] ^= QC32up(0x50, r); \ + a[0xB] ^= QC32dn(0x50, r); \ + a[0xC] ^= QC32up(0x60, r); \ + a[0xD] ^= QC32dn(0x60, r); \ + a[0xE] ^= QC32up(0x70, r); \ + a[0xF] ^= QC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ + RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ + RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ + RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ + RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \ + RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ + RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ + RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u32 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u32 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u32 H[32]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + 
memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + sph_u32 fu2 = T0up[B32_2(a[b2])]; \ + sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ + sph_u32 fu3 = T1up[B32_3(a[b3])]; \ + sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ + sph_u32 fu6 = T0up[B32_2(a[b6])]; \ + sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ + sph_u32 fu7 = T1up[B32_3(a[b7])]; \ + sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ R32u(fu2, fd2) \ + ^ R32u(fu3, fd3) \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ R32d(fu6, fd6) \ + ^ R32d(fu7, fd7); \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ R32d(fu2, fd2) \ + ^ R32d(fu3, fd3) \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ R32u(fu6, fd6) \ + ^ R32u(fu7, fd7); \ + } while (0) + +#else + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= 
PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + u + 0x00, (u + 0x02) & 0x1F, \ + (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + u + 0x02, (u + 0x04) & 0x1F, \ + (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + u + 0x04, (u + 0x06) & 0x1F, \ + (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ + (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + u + 0x06, (u + 0x08) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ + (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + 
a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ + (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ + (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ + (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ + (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= 
PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ + RBTT(0x02, 0x03, a, \ + 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ + RBTT(0x04, 0x05, a, \ + 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ + RBTT(0x06, 0x07, a, \ + 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ + RBTT(0x08, 0x09, a, \ + 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ + RBTT(0x0A, 0x0B, a, \ + 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ + RBTT(0x0C, 0x0D, a, \ + 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ + RBTT(0x0E, 0x0F, a, \ + 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ + RBTT(0x10, 0x11, a, \ + 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ + RBTT(0x12, 0x13, a, \ + 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ + RBTT(0x14, 0x15, a, \ + 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ + RBTT(0x16, 0x17, a, \ + 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ + RBTT(0x18, 0x19, a, \ + 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ + RBTT(0x1A, 0x1B, a, \ + 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ + RBTT(0x1C, 0x1D, a, \ + 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ + RBTT(0x1E, 0x1F, a, \ + 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] 
^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ + RBTT(0x02, 0x03, a, \ + 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ + RBTT(0x04, 0x05, a, \ + 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ + RBTT(0x06, 0x07, a, \ + 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ + RBTT(0x08, 0x09, a, \ + 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ + RBTT(0x0A, 0x0B, a, \ + 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ + RBTT(0x0C, 0x0D, a, \ + 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ + RBTT(0x0E, 0x0F, a, \ + 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ + RBTT(0x10, 0x11, a, \ + 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ + RBTT(0x12, 0x13, a, \ + 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ + RBTT(0x14, 0x15, a, \ + 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ + RBTT(0x16, 0x17, a, \ + 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ + RBTT(0x18, 0x19, a, \ + 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ + RBTT(0x1A, 0x1B, a, \ + 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ + RBTT(0x1C, 0x1D, a, \ + 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 
0x09); \ + RBTT(0x1E, 0x1F, a, \ + 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_P(a, r); \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_Q(a, r); \ + } while (0) + +#else + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_BIG do { \ + sph_u32 g[32], m[32]; \ + size_t u; \ + for (u = 0; u < 32; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_BIG do { \ + sph_u32 x[32]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#endif + +static void +groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 7; u ++) + sc->state.wide[u] = 0; +#if USE_LE + sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56) + | ((sph_u64)(out_size & 0xFF00) << 40); +#else + sc->state.wide[7] = (sph_u64)out_size; +#endif +#else + for (u = 0; u < 15; u ++) + sc->state.narrow[u] = 0; +#if USE_LE + sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24) + | ((sph_u32)(out_size & 0xFF00) << 8); +#else + sc->state.narrow[15] = (sph_u32)out_size; +#endif +#endif +#if SPH_64 + sc->count = 0; +#else + sc->count_high = 0; + sc->count_low = 0; +#endif +} + +static void +groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; 
+ DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_SMALL(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + COMPRESS_SMALL; +#if SPH_64 + sc->count ++; +#else + if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) + sc->count_high = SPH_T32(sc->count_high + 1); +#endif + ptr = 0; + } + } + WRITE_STATE_SMALL(sc); + sc->ptr = ptr; +} + +static void +groestl_small_close(sph_groestl_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_len) +{ + unsigned char *buf; + unsigned char pad[72]; + size_t u, ptr, pad_len; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif + unsigned z; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + pad[0] = ((ub & -z) | z) & 0xFF; + if (ptr < 56) { + pad_len = 64 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 1); +#else + count_low = SPH_T32(sc->count_low + 1); + count_high = SPH_T32(sc->count_high); + if (count_low == 0) + count_high = SPH_T32(count_high + 1); +#endif + } else { + pad_len = 128 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 2); +#else + count_low = SPH_T32(sc->count_low + 2); + count_high = SPH_T32(sc->count_high); + if (count_low <= 1) + count_high = SPH_T32(count_high + 1); +#endif + } + memset(pad + 1, 0, pad_len - 9); +#if SPH_64 + sph_enc64be(pad + pad_len - 8, count); +#else + sph_enc64be(pad + pad_len - 8, count_high); + sph_enc64be(pad + pad_len - 4, count_low); +#endif + groestl_small_core(sc, pad, pad_len); + READ_STATE_SMALL(sc); + FINAL_SMALL; +#if SPH_GROESTL_64 + for (u = 0; u < 4; u ++) + enc64e(pad + (u << 3), H[u + 4]); +#else + for (u = 0; u < 8; u ++) + enc32e(pad + (u << 2), H[u + 8]); +#endif + memcpy(dst, 
pad + 32 - out_len, out_len); + groestl_small_init(sc, (unsigned)out_len << 3); +} + +static void +groestl_big_init(sph_groestl_big_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 15; u ++) + sc->state.wide[u] = 0; +#if USE_LE + sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56) + | ((sph_u64)(out_size & 0xFF00) << 40); +#else + sc->state.wide[15] = (sph_u64)out_size; +#endif +#else + for (u = 0; u < 31; u ++) + sc->state.narrow[u] = 0; +#if USE_LE + sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24) + | ((sph_u32)(out_size & 0xFF00) << 8); +#else + sc->state.narrow[31] = (sph_u32)out_size; +#endif +#endif +#if SPH_64 + sc->count = 0; +#else + sc->count_high = 0; + sc->count_low = 0; +#endif +} + +static void +groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + COMPRESS_BIG; +#if SPH_64 + sc->count ++; +#else + if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) + sc->count_high = SPH_T32(sc->count_high + 1); +#endif + ptr = 0; + } + } + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +static void +groestl_big_close(sph_groestl_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_len) +{ + unsigned char *buf; + unsigned char pad[136]; + size_t ptr, pad_len, u; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif + unsigned z; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + pad[0] = ((ub & -z) | z) & 0xFF; + if (ptr < 120) { + pad_len = 128 - ptr; +#if 
SPH_64 + count = SPH_T64(sc->count + 1); +#else + count_low = SPH_T32(sc->count_low + 1); + count_high = SPH_T32(sc->count_high); + if (count_low == 0) + count_high = SPH_T32(count_high + 1); +#endif + } else { + pad_len = 256 - ptr; +#if SPH_64 + count = SPH_T64(sc->count + 2); +#else + count_low = SPH_T32(sc->count_low + 2); + count_high = SPH_T32(sc->count_high); + if (count_low <= 1) + count_high = SPH_T32(count_high + 1); +#endif + } + memset(pad + 1, 0, pad_len - 9); + //fprintf(stderr, "%x\n", pad_len); +#if SPH_64 + sph_enc64be(pad + pad_len - 8, count); +#else + sph_enc64be(pad + pad_len - 8, count_high); + sph_enc64be(pad + pad_len - 4, count_low); +#endif + groestl_big_core(sc, pad, pad_len); + READ_STATE_BIG(sc); + FINAL_BIG; +#if SPH_GROESTL_64 + for (u = 0; u < 8; u ++) + enc64e(pad + (u << 3), H[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(pad + (u << 2), H[u + 16]); +#endif + memcpy(dst, pad + 64 - out_len, out_len); + groestl_big_init(sc, (unsigned)out_len << 3); +} + +/* see sph_groestl.h */ +void +sph_groestl224_init(void *cc) +{ + groestl_small_init(cc, 224); +} + +/* see sph_groestl.h */ +void +sph_groestl224(void *cc, const void *data, size_t len) +{ + groestl_small_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl224_close(void *cc, void *dst) +{ + groestl_small_close(cc, 0, 0, dst, 28); +} + +/* see sph_groestl.h */ +void +sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_small_close(cc, ub, n, dst, 28); +} + +/* see sph_groestl.h */ +void +sph_groestl256_init(void *cc) +{ + groestl_small_init(cc, 256); +} + +/* see sph_groestl.h */ +void +sph_groestl256(void *cc, const void *data, size_t len) +{ + groestl_small_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl256_close(void *cc, void *dst) +{ + groestl_small_close(cc, 0, 0, dst, 32); +} + +/* see sph_groestl.h */ +void +sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + 
groestl_small_close(cc, ub, n, dst, 32); +} + +/* see sph_groestl.h */ +void +sph_groestl384_init(void *cc) +{ + groestl_big_init(cc, 384); +} + +/* see sph_groestl.h */ +void +sph_groestl384(void *cc, const void *data, size_t len) +{ + groestl_big_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl384_close(void *cc, void *dst) +{ + groestl_big_close(cc, 0, 0, dst, 48); +} + +/* see sph_groestl.h */ +void +sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 48); +} + +/* see sph_groestl.h */ +void +sph_groestl512_init(void *cc) +{ + groestl_big_init(cc, 512); +} + +/* see sph_groestl.h */ +void +sph_groestl512(void *cc, const void *data, size_t len) +{ + groestl_big_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl512_close(void *cc, void *dst) +{ + groestl_big_close(cc, 0, 0, dst, 64); +} + +/* see sph_groestl.h */ +void +sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 64); +} + +#ifdef __cplusplus +} +#endif diff --git a/sph/keccak.c b/sph/keccak.c index 8c90f3a..cff9f87 100644 --- a/sph/keccak.c +++ b/sph/keccak.c @@ -1,1824 +1,1824 @@ -/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ -/* - * Keccak implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_keccak.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * Parameters: - * - * SPH_KECCAK_64 use a 64-bit type - * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll) - * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) - * SPH_KECCAK_NOCOPY do not copy the state into local variables - * - * If there is no usable 64-bit type, the code automatically switches - * back to the 32-bit implementation. 
- * - * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 - * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core - * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, - * 8 kB L1 code cache), seem to show that the following are optimal: - * - * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, - * do not copy the state; unrolling 2, 6 or all rounds also provides - * near-optimal performance. - * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, - * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds - * also provides near-optimal performance. - * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, - * copy the state. Unrolling 4 or 6 rounds is near-optimal. - * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, - * copy the state. - * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy - * the state. Unrolling only 1 round is also near-optimal. - * - * Also, interleaving does not always yield actual improvements when - * using a 32-bit implementation; in particular when the architecture - * does not offer a native rotation opcode (interleaving replaces one - * 64-bit rotation with two 32-bit rotations, which is a gain only if - * there is a native 32-bit rotation opcode and not a native 64-bit - * rotation opcode; also, interleaving implies a small overhead when - * processing input words). - * - * To sum up: - * -- when possible, use the 64-bit code - * -- exception: on 32-bit x86, use 32-bit code - * -- when using 32-bit code, use interleaving - * -- copy the state, except on x86 - * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines - */ - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK -#define SPH_SMALL_FOOTPRINT_KECCAK 1 -#endif - -/* - * By default, we select the 64-bit implementation if a 64-bit type - * is available, unless a 32-bit x86 is detected. 
- */ -#if !defined SPH_KECCAK_64 && SPH_64 \ - && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) -#define SPH_KECCAK_64 1 -#endif - -/* - * If using a 32-bit implementation, we prefer to interleave. - */ -#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE -#define SPH_KECCAK_INTERLEAVE 1 -#endif - -/* - * Unroll 8 rounds on big systems, 2 rounds on small systems. - */ -#ifndef SPH_KECCAK_UNROLL -#if SPH_SMALL_FOOTPRINT_KECCAK -#define SPH_KECCAK_UNROLL 2 -#else -#define SPH_KECCAK_UNROLL 8 -#endif -#endif - -/* - * We do not want to copy the state to local variables on x86 (32-bit - * and 64-bit alike). - */ -#ifndef SPH_KECCAK_NOCOPY -#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC -#define SPH_KECCAK_NOCOPY 1 -#else -#define SPH_KECCAK_NOCOPY 0 -#endif -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -#if SPH_KECCAK_64 - -static const sph_u64 RC[] = { - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) -}; - -#if SPH_KECCAK_NOCOPY - -#define a00 (kc->u.wide[ 0]) -#define a10 (kc->u.wide[ 1]) -#define a20 (kc->u.wide[ 2]) -#define a30 (kc->u.wide[ 3]) -#define a40 (kc->u.wide[ 4]) -#define a01 (kc->u.wide[ 5]) -#define a11 (kc->u.wide[ 6]) -#define a21 (kc->u.wide[ 7]) -#define a31 (kc->u.wide[ 8]) -#define a41 (kc->u.wide[ 9]) -#define a02 
(kc->u.wide[10]) -#define a12 (kc->u.wide[11]) -#define a22 (kc->u.wide[12]) -#define a32 (kc->u.wide[13]) -#define a42 (kc->u.wide[14]) -#define a03 (kc->u.wide[15]) -#define a13 (kc->u.wide[16]) -#define a23 (kc->u.wide[17]) -#define a33 (kc->u.wide[18]) -#define a43 (kc->u.wide[19]) -#define a04 (kc->u.wide[20]) -#define a14 (kc->u.wide[21]) -#define a24 (kc->u.wide[22]) -#define a34 (kc->u.wide[23]) -#define a44 (kc->u.wide[24]) - -#define DECL_STATE -#define READ_STATE(sc) -#define WRITE_STATE(sc) - -#define INPUT_BUF(size) do { \ - size_t j; \ - for (j = 0; j < (size); j += 8) { \ - kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \ - } \ - } while (0) - -#define INPUT_BUF144 INPUT_BUF(144) -#define INPUT_BUF136 INPUT_BUF(136) -#define INPUT_BUF104 INPUT_BUF(104) -#define INPUT_BUF72 INPUT_BUF(72) - -#else - -#define DECL_STATE \ - sph_u64 a00, a01, a02, a03, a04; \ - sph_u64 a10, a11, a12, a13, a14; \ - sph_u64 a20, a21, a22, a23, a24; \ - sph_u64 a30, a31, a32, a33, a34; \ - sph_u64 a40, a41, a42, a43, a44; - -#define READ_STATE(state) do { \ - a00 = (state)->u.wide[ 0]; \ - a10 = (state)->u.wide[ 1]; \ - a20 = (state)->u.wide[ 2]; \ - a30 = (state)->u.wide[ 3]; \ - a40 = (state)->u.wide[ 4]; \ - a01 = (state)->u.wide[ 5]; \ - a11 = (state)->u.wide[ 6]; \ - a21 = (state)->u.wide[ 7]; \ - a31 = (state)->u.wide[ 8]; \ - a41 = (state)->u.wide[ 9]; \ - a02 = (state)->u.wide[10]; \ - a12 = (state)->u.wide[11]; \ - a22 = (state)->u.wide[12]; \ - a32 = (state)->u.wide[13]; \ - a42 = (state)->u.wide[14]; \ - a03 = (state)->u.wide[15]; \ - a13 = (state)->u.wide[16]; \ - a23 = (state)->u.wide[17]; \ - a33 = (state)->u.wide[18]; \ - a43 = (state)->u.wide[19]; \ - a04 = (state)->u.wide[20]; \ - a14 = (state)->u.wide[21]; \ - a24 = (state)->u.wide[22]; \ - a34 = (state)->u.wide[23]; \ - a44 = (state)->u.wide[24]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->u.wide[ 0] = a00; \ - (state)->u.wide[ 1] = a10; \ - (state)->u.wide[ 2] = a20; \ - 
(state)->u.wide[ 3] = a30; \ - (state)->u.wide[ 4] = a40; \ - (state)->u.wide[ 5] = a01; \ - (state)->u.wide[ 6] = a11; \ - (state)->u.wide[ 7] = a21; \ - (state)->u.wide[ 8] = a31; \ - (state)->u.wide[ 9] = a41; \ - (state)->u.wide[10] = a02; \ - (state)->u.wide[11] = a12; \ - (state)->u.wide[12] = a22; \ - (state)->u.wide[13] = a32; \ - (state)->u.wide[14] = a42; \ - (state)->u.wide[15] = a03; \ - (state)->u.wide[16] = a13; \ - (state)->u.wide[17] = a23; \ - (state)->u.wide[18] = a33; \ - (state)->u.wide[19] = a43; \ - (state)->u.wide[20] = a04; \ - (state)->u.wide[21] = a14; \ - (state)->u.wide[22] = a24; \ - (state)->u.wide[23] = a34; \ - (state)->u.wide[24] = a44; \ - } while (0) - -#define INPUT_BUF144 do { \ - a00 ^= sph_dec64le_aligned(buf + 0); \ - a10 ^= sph_dec64le_aligned(buf + 8); \ - a20 ^= sph_dec64le_aligned(buf + 16); \ - a30 ^= sph_dec64le_aligned(buf + 24); \ - a40 ^= sph_dec64le_aligned(buf + 32); \ - a01 ^= sph_dec64le_aligned(buf + 40); \ - a11 ^= sph_dec64le_aligned(buf + 48); \ - a21 ^= sph_dec64le_aligned(buf + 56); \ - a31 ^= sph_dec64le_aligned(buf + 64); \ - a41 ^= sph_dec64le_aligned(buf + 72); \ - a02 ^= sph_dec64le_aligned(buf + 80); \ - a12 ^= sph_dec64le_aligned(buf + 88); \ - a22 ^= sph_dec64le_aligned(buf + 96); \ - a32 ^= sph_dec64le_aligned(buf + 104); \ - a42 ^= sph_dec64le_aligned(buf + 112); \ - a03 ^= sph_dec64le_aligned(buf + 120); \ - a13 ^= sph_dec64le_aligned(buf + 128); \ - a23 ^= sph_dec64le_aligned(buf + 136); \ - } while (0) - -#define INPUT_BUF136 do { \ - a00 ^= sph_dec64le_aligned(buf + 0); \ - a10 ^= sph_dec64le_aligned(buf + 8); \ - a20 ^= sph_dec64le_aligned(buf + 16); \ - a30 ^= sph_dec64le_aligned(buf + 24); \ - a40 ^= sph_dec64le_aligned(buf + 32); \ - a01 ^= sph_dec64le_aligned(buf + 40); \ - a11 ^= sph_dec64le_aligned(buf + 48); \ - a21 ^= sph_dec64le_aligned(buf + 56); \ - a31 ^= sph_dec64le_aligned(buf + 64); \ - a41 ^= sph_dec64le_aligned(buf + 72); \ - a02 ^= sph_dec64le_aligned(buf + 80); \ - a12 ^= 
sph_dec64le_aligned(buf + 88); \ - a22 ^= sph_dec64le_aligned(buf + 96); \ - a32 ^= sph_dec64le_aligned(buf + 104); \ - a42 ^= sph_dec64le_aligned(buf + 112); \ - a03 ^= sph_dec64le_aligned(buf + 120); \ - a13 ^= sph_dec64le_aligned(buf + 128); \ - } while (0) - -#define INPUT_BUF104 do { \ - a00 ^= sph_dec64le_aligned(buf + 0); \ - a10 ^= sph_dec64le_aligned(buf + 8); \ - a20 ^= sph_dec64le_aligned(buf + 16); \ - a30 ^= sph_dec64le_aligned(buf + 24); \ - a40 ^= sph_dec64le_aligned(buf + 32); \ - a01 ^= sph_dec64le_aligned(buf + 40); \ - a11 ^= sph_dec64le_aligned(buf + 48); \ - a21 ^= sph_dec64le_aligned(buf + 56); \ - a31 ^= sph_dec64le_aligned(buf + 64); \ - a41 ^= sph_dec64le_aligned(buf + 72); \ - a02 ^= sph_dec64le_aligned(buf + 80); \ - a12 ^= sph_dec64le_aligned(buf + 88); \ - a22 ^= sph_dec64le_aligned(buf + 96); \ - } while (0) - -#define INPUT_BUF72 do { \ - a00 ^= sph_dec64le_aligned(buf + 0); \ - a10 ^= sph_dec64le_aligned(buf + 8); \ - a20 ^= sph_dec64le_aligned(buf + 16); \ - a30 ^= sph_dec64le_aligned(buf + 24); \ - a40 ^= sph_dec64le_aligned(buf + 32); \ - a01 ^= sph_dec64le_aligned(buf + 40); \ - a11 ^= sph_dec64le_aligned(buf + 48); \ - a21 ^= sph_dec64le_aligned(buf + 56); \ - a31 ^= sph_dec64le_aligned(buf + 64); \ - } while (0) - -#define INPUT_BUF(lim) do { \ - a00 ^= sph_dec64le_aligned(buf + 0); \ - a10 ^= sph_dec64le_aligned(buf + 8); \ - a20 ^= sph_dec64le_aligned(buf + 16); \ - a30 ^= sph_dec64le_aligned(buf + 24); \ - a40 ^= sph_dec64le_aligned(buf + 32); \ - a01 ^= sph_dec64le_aligned(buf + 40); \ - a11 ^= sph_dec64le_aligned(buf + 48); \ - a21 ^= sph_dec64le_aligned(buf + 56); \ - a31 ^= sph_dec64le_aligned(buf + 64); \ - if ((lim) == 72) \ - break; \ - a41 ^= sph_dec64le_aligned(buf + 72); \ - a02 ^= sph_dec64le_aligned(buf + 80); \ - a12 ^= sph_dec64le_aligned(buf + 88); \ - a22 ^= sph_dec64le_aligned(buf + 96); \ - if ((lim) == 104) \ - break; \ - a32 ^= sph_dec64le_aligned(buf + 104); \ - a42 ^= sph_dec64le_aligned(buf + 112); \ - 
a03 ^= sph_dec64le_aligned(buf + 120); \ - a13 ^= sph_dec64le_aligned(buf + 128); \ - if ((lim) == 136) \ - break; \ - a23 ^= sph_dec64le_aligned(buf + 136); \ - } while (0) - -#endif - -#define DECL64(x) sph_u64 x -#define MOV64(d, s) (d = s) -#define XOR64(d, a, b) (d = a ^ b) -#define AND64(d, a, b) (d = a & b) -#define OR64(d, a, b) (d = a | b) -#define NOT64(d, s) (d = SPH_T64(~s)) -#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) -#define XOR64_IOTA XOR64 - -#else - -static const struct { - sph_u32 high, low; -} RC[] = { -#if SPH_KECCAK_INTERLEAVE - { SPH_C32(0x00000000), SPH_C32(0x00000001) }, - { SPH_C32(0x00000089), SPH_C32(0x00000000) }, - { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, - { SPH_C32(0x80008080), SPH_C32(0x00000000) }, - { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, - { SPH_C32(0x00008000), SPH_C32(0x00000001) }, - { SPH_C32(0x80008088), SPH_C32(0x00000001) }, - { SPH_C32(0x80000082), SPH_C32(0x00000001) }, - { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, - { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, - { SPH_C32(0x00008082), SPH_C32(0x00000001) }, - { SPH_C32(0x00008003), SPH_C32(0x00000000) }, - { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, - { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, - { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, - { SPH_C32(0x80000081), SPH_C32(0x00000001) }, - { SPH_C32(0x80000081), SPH_C32(0x00000000) }, - { SPH_C32(0x80000008), SPH_C32(0x00000000) }, - { SPH_C32(0x00000083), SPH_C32(0x00000000) }, - { SPH_C32(0x80008003), SPH_C32(0x00000000) }, - { SPH_C32(0x80008088), SPH_C32(0x00000001) }, - { SPH_C32(0x80000088), SPH_C32(0x00000000) }, - { SPH_C32(0x00008000), SPH_C32(0x00000001) }, - { SPH_C32(0x80008082), SPH_C32(0x00000000) } -#else - { SPH_C32(0x00000000), SPH_C32(0x00000001) }, - { SPH_C32(0x00000000), SPH_C32(0x00008082) }, - { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, - { SPH_C32(0x80000000), SPH_C32(0x80008000) }, - { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, - { SPH_C32(0x00000000), SPH_C32(0x80000001) 
}, - { SPH_C32(0x80000000), SPH_C32(0x80008081) }, - { SPH_C32(0x80000000), SPH_C32(0x00008009) }, - { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, - { SPH_C32(0x00000000), SPH_C32(0x00000088) }, - { SPH_C32(0x00000000), SPH_C32(0x80008009) }, - { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, - { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, - { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, - { SPH_C32(0x80000000), SPH_C32(0x00008089) }, - { SPH_C32(0x80000000), SPH_C32(0x00008003) }, - { SPH_C32(0x80000000), SPH_C32(0x00008002) }, - { SPH_C32(0x80000000), SPH_C32(0x00000080) }, - { SPH_C32(0x00000000), SPH_C32(0x0000800A) }, - { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, - { SPH_C32(0x80000000), SPH_C32(0x80008081) }, - { SPH_C32(0x80000000), SPH_C32(0x00008080) }, - { SPH_C32(0x00000000), SPH_C32(0x80000001) }, - { SPH_C32(0x80000000), SPH_C32(0x80008008) } -#endif -}; - -#if SPH_KECCAK_INTERLEAVE - -#define INTERLEAVE(xl, xh) do { \ - sph_u32 l, h, t; \ - l = (xl); h = (xh); \ - t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ - t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ - t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ - t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ - t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ - t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ - t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ - t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ - t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ - l ^= t; h ^= t >> 16; \ - (xl) = l; (xh) = h; \ - } while (0) - -#define UNINTERLEAVE(xl, xh) do { \ - sph_u32 l, h, t; \ - l = (xl); h = (xh); \ - t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ - l ^= t; h ^= t >> 16; \ - t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ - t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ - t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ - t = (h ^ (h 
>> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ - t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ - t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ - t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ - t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ - (xl) = l; (xh) = h; \ - } while (0) - -#else - -#define INTERLEAVE(l, h) -#define UNINTERLEAVE(l, h) - -#endif - -#if SPH_KECCAK_NOCOPY - -#define a00l (kc->u.narrow[2 * 0 + 0]) -#define a00h (kc->u.narrow[2 * 0 + 1]) -#define a10l (kc->u.narrow[2 * 1 + 0]) -#define a10h (kc->u.narrow[2 * 1 + 1]) -#define a20l (kc->u.narrow[2 * 2 + 0]) -#define a20h (kc->u.narrow[2 * 2 + 1]) -#define a30l (kc->u.narrow[2 * 3 + 0]) -#define a30h (kc->u.narrow[2 * 3 + 1]) -#define a40l (kc->u.narrow[2 * 4 + 0]) -#define a40h (kc->u.narrow[2 * 4 + 1]) -#define a01l (kc->u.narrow[2 * 5 + 0]) -#define a01h (kc->u.narrow[2 * 5 + 1]) -#define a11l (kc->u.narrow[2 * 6 + 0]) -#define a11h (kc->u.narrow[2 * 6 + 1]) -#define a21l (kc->u.narrow[2 * 7 + 0]) -#define a21h (kc->u.narrow[2 * 7 + 1]) -#define a31l (kc->u.narrow[2 * 8 + 0]) -#define a31h (kc->u.narrow[2 * 8 + 1]) -#define a41l (kc->u.narrow[2 * 9 + 0]) -#define a41h (kc->u.narrow[2 * 9 + 1]) -#define a02l (kc->u.narrow[2 * 10 + 0]) -#define a02h (kc->u.narrow[2 * 10 + 1]) -#define a12l (kc->u.narrow[2 * 11 + 0]) -#define a12h (kc->u.narrow[2 * 11 + 1]) -#define a22l (kc->u.narrow[2 * 12 + 0]) -#define a22h (kc->u.narrow[2 * 12 + 1]) -#define a32l (kc->u.narrow[2 * 13 + 0]) -#define a32h (kc->u.narrow[2 * 13 + 1]) -#define a42l (kc->u.narrow[2 * 14 + 0]) -#define a42h (kc->u.narrow[2 * 14 + 1]) -#define a03l (kc->u.narrow[2 * 15 + 0]) -#define a03h (kc->u.narrow[2 * 15 + 1]) -#define a13l (kc->u.narrow[2 * 16 + 0]) -#define a13h (kc->u.narrow[2 * 16 + 1]) -#define a23l (kc->u.narrow[2 * 17 + 0]) -#define a23h (kc->u.narrow[2 * 17 + 1]) -#define a33l (kc->u.narrow[2 * 18 + 0]) -#define a33h (kc->u.narrow[2 * 18 + 1]) -#define 
a43l (kc->u.narrow[2 * 19 + 0]) -#define a43h (kc->u.narrow[2 * 19 + 1]) -#define a04l (kc->u.narrow[2 * 20 + 0]) -#define a04h (kc->u.narrow[2 * 20 + 1]) -#define a14l (kc->u.narrow[2 * 21 + 0]) -#define a14h (kc->u.narrow[2 * 21 + 1]) -#define a24l (kc->u.narrow[2 * 22 + 0]) -#define a24h (kc->u.narrow[2 * 22 + 1]) -#define a34l (kc->u.narrow[2 * 23 + 0]) -#define a34h (kc->u.narrow[2 * 23 + 1]) -#define a44l (kc->u.narrow[2 * 24 + 0]) -#define a44h (kc->u.narrow[2 * 24 + 1]) - -#define DECL_STATE -#define READ_STATE(state) -#define WRITE_STATE(state) - -#define INPUT_BUF(size) do { \ - size_t j; \ - for (j = 0; j < (size); j += 8) { \ - sph_u32 tl, th; \ - tl = sph_dec32le_aligned(buf + j + 0); \ - th = sph_dec32le_aligned(buf + j + 4); \ - INTERLEAVE(tl, th); \ - kc->u.narrow[(j >> 2) + 0] ^= tl; \ - kc->u.narrow[(j >> 2) + 1] ^= th; \ - } \ - } while (0) - -#define INPUT_BUF144 INPUT_BUF(144) -#define INPUT_BUF136 INPUT_BUF(136) -#define INPUT_BUF104 INPUT_BUF(104) -#define INPUT_BUF72 INPUT_BUF(72) - -#else - -#define DECL_STATE \ - sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ - sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ - sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ - sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ - sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; - -#define READ_STATE(state) do { \ - a00l = (state)->u.narrow[2 * 0 + 0]; \ - a00h = (state)->u.narrow[2 * 0 + 1]; \ - a10l = (state)->u.narrow[2 * 1 + 0]; \ - a10h = (state)->u.narrow[2 * 1 + 1]; \ - a20l = (state)->u.narrow[2 * 2 + 0]; \ - a20h = (state)->u.narrow[2 * 2 + 1]; \ - a30l = (state)->u.narrow[2 * 3 + 0]; \ - a30h = (state)->u.narrow[2 * 3 + 1]; \ - a40l = (state)->u.narrow[2 * 4 + 0]; \ - a40h = (state)->u.narrow[2 * 4 + 1]; \ - a01l = (state)->u.narrow[2 * 5 + 0]; \ - a01h = (state)->u.narrow[2 * 5 + 1]; \ - a11l = (state)->u.narrow[2 * 6 + 0]; \ - a11h = 
(state)->u.narrow[2 * 6 + 1]; \ - a21l = (state)->u.narrow[2 * 7 + 0]; \ - a21h = (state)->u.narrow[2 * 7 + 1]; \ - a31l = (state)->u.narrow[2 * 8 + 0]; \ - a31h = (state)->u.narrow[2 * 8 + 1]; \ - a41l = (state)->u.narrow[2 * 9 + 0]; \ - a41h = (state)->u.narrow[2 * 9 + 1]; \ - a02l = (state)->u.narrow[2 * 10 + 0]; \ - a02h = (state)->u.narrow[2 * 10 + 1]; \ - a12l = (state)->u.narrow[2 * 11 + 0]; \ - a12h = (state)->u.narrow[2 * 11 + 1]; \ - a22l = (state)->u.narrow[2 * 12 + 0]; \ - a22h = (state)->u.narrow[2 * 12 + 1]; \ - a32l = (state)->u.narrow[2 * 13 + 0]; \ - a32h = (state)->u.narrow[2 * 13 + 1]; \ - a42l = (state)->u.narrow[2 * 14 + 0]; \ - a42h = (state)->u.narrow[2 * 14 + 1]; \ - a03l = (state)->u.narrow[2 * 15 + 0]; \ - a03h = (state)->u.narrow[2 * 15 + 1]; \ - a13l = (state)->u.narrow[2 * 16 + 0]; \ - a13h = (state)->u.narrow[2 * 16 + 1]; \ - a23l = (state)->u.narrow[2 * 17 + 0]; \ - a23h = (state)->u.narrow[2 * 17 + 1]; \ - a33l = (state)->u.narrow[2 * 18 + 0]; \ - a33h = (state)->u.narrow[2 * 18 + 1]; \ - a43l = (state)->u.narrow[2 * 19 + 0]; \ - a43h = (state)->u.narrow[2 * 19 + 1]; \ - a04l = (state)->u.narrow[2 * 20 + 0]; \ - a04h = (state)->u.narrow[2 * 20 + 1]; \ - a14l = (state)->u.narrow[2 * 21 + 0]; \ - a14h = (state)->u.narrow[2 * 21 + 1]; \ - a24l = (state)->u.narrow[2 * 22 + 0]; \ - a24h = (state)->u.narrow[2 * 22 + 1]; \ - a34l = (state)->u.narrow[2 * 23 + 0]; \ - a34h = (state)->u.narrow[2 * 23 + 1]; \ - a44l = (state)->u.narrow[2 * 24 + 0]; \ - a44h = (state)->u.narrow[2 * 24 + 1]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->u.narrow[2 * 0 + 0] = a00l; \ - (state)->u.narrow[2 * 0 + 1] = a00h; \ - (state)->u.narrow[2 * 1 + 0] = a10l; \ - (state)->u.narrow[2 * 1 + 1] = a10h; \ - (state)->u.narrow[2 * 2 + 0] = a20l; \ - (state)->u.narrow[2 * 2 + 1] = a20h; \ - (state)->u.narrow[2 * 3 + 0] = a30l; \ - (state)->u.narrow[2 * 3 + 1] = a30h; \ - (state)->u.narrow[2 * 4 + 0] = a40l; \ - (state)->u.narrow[2 * 4 + 1] = a40h; \ 
- (state)->u.narrow[2 * 5 + 0] = a01l; \ - (state)->u.narrow[2 * 5 + 1] = a01h; \ - (state)->u.narrow[2 * 6 + 0] = a11l; \ - (state)->u.narrow[2 * 6 + 1] = a11h; \ - (state)->u.narrow[2 * 7 + 0] = a21l; \ - (state)->u.narrow[2 * 7 + 1] = a21h; \ - (state)->u.narrow[2 * 8 + 0] = a31l; \ - (state)->u.narrow[2 * 8 + 1] = a31h; \ - (state)->u.narrow[2 * 9 + 0] = a41l; \ - (state)->u.narrow[2 * 9 + 1] = a41h; \ - (state)->u.narrow[2 * 10 + 0] = a02l; \ - (state)->u.narrow[2 * 10 + 1] = a02h; \ - (state)->u.narrow[2 * 11 + 0] = a12l; \ - (state)->u.narrow[2 * 11 + 1] = a12h; \ - (state)->u.narrow[2 * 12 + 0] = a22l; \ - (state)->u.narrow[2 * 12 + 1] = a22h; \ - (state)->u.narrow[2 * 13 + 0] = a32l; \ - (state)->u.narrow[2 * 13 + 1] = a32h; \ - (state)->u.narrow[2 * 14 + 0] = a42l; \ - (state)->u.narrow[2 * 14 + 1] = a42h; \ - (state)->u.narrow[2 * 15 + 0] = a03l; \ - (state)->u.narrow[2 * 15 + 1] = a03h; \ - (state)->u.narrow[2 * 16 + 0] = a13l; \ - (state)->u.narrow[2 * 16 + 1] = a13h; \ - (state)->u.narrow[2 * 17 + 0] = a23l; \ - (state)->u.narrow[2 * 17 + 1] = a23h; \ - (state)->u.narrow[2 * 18 + 0] = a33l; \ - (state)->u.narrow[2 * 18 + 1] = a33h; \ - (state)->u.narrow[2 * 19 + 0] = a43l; \ - (state)->u.narrow[2 * 19 + 1] = a43h; \ - (state)->u.narrow[2 * 20 + 0] = a04l; \ - (state)->u.narrow[2 * 20 + 1] = a04h; \ - (state)->u.narrow[2 * 21 + 0] = a14l; \ - (state)->u.narrow[2 * 21 + 1] = a14h; \ - (state)->u.narrow[2 * 22 + 0] = a24l; \ - (state)->u.narrow[2 * 22 + 1] = a24h; \ - (state)->u.narrow[2 * 23 + 0] = a34l; \ - (state)->u.narrow[2 * 23 + 1] = a34h; \ - (state)->u.narrow[2 * 24 + 0] = a44l; \ - (state)->u.narrow[2 * 24 + 1] = a44h; \ - } while (0) - -#define READ64(d, off) do { \ - sph_u32 tl, th; \ - tl = sph_dec32le_aligned(buf + (off)); \ - th = sph_dec32le_aligned(buf + (off) + 4); \ - INTERLEAVE(tl, th); \ - d ## l ^= tl; \ - d ## h ^= th; \ - } while (0) - -#define INPUT_BUF144 do { \ - READ64(a00, 0); \ - READ64(a10, 8); \ - READ64(a20, 16); \ - 
READ64(a30, 24); \ - READ64(a40, 32); \ - READ64(a01, 40); \ - READ64(a11, 48); \ - READ64(a21, 56); \ - READ64(a31, 64); \ - READ64(a41, 72); \ - READ64(a02, 80); \ - READ64(a12, 88); \ - READ64(a22, 96); \ - READ64(a32, 104); \ - READ64(a42, 112); \ - READ64(a03, 120); \ - READ64(a13, 128); \ - READ64(a23, 136); \ - } while (0) - -#define INPUT_BUF136 do { \ - READ64(a00, 0); \ - READ64(a10, 8); \ - READ64(a20, 16); \ - READ64(a30, 24); \ - READ64(a40, 32); \ - READ64(a01, 40); \ - READ64(a11, 48); \ - READ64(a21, 56); \ - READ64(a31, 64); \ - READ64(a41, 72); \ - READ64(a02, 80); \ - READ64(a12, 88); \ - READ64(a22, 96); \ - READ64(a32, 104); \ - READ64(a42, 112); \ - READ64(a03, 120); \ - READ64(a13, 128); \ - } while (0) - -#define INPUT_BUF104 do { \ - READ64(a00, 0); \ - READ64(a10, 8); \ - READ64(a20, 16); \ - READ64(a30, 24); \ - READ64(a40, 32); \ - READ64(a01, 40); \ - READ64(a11, 48); \ - READ64(a21, 56); \ - READ64(a31, 64); \ - READ64(a41, 72); \ - READ64(a02, 80); \ - READ64(a12, 88); \ - READ64(a22, 96); \ - } while (0) - -#define INPUT_BUF72 do { \ - READ64(a00, 0); \ - READ64(a10, 8); \ - READ64(a20, 16); \ - READ64(a30, 24); \ - READ64(a40, 32); \ - READ64(a01, 40); \ - READ64(a11, 48); \ - READ64(a21, 56); \ - READ64(a31, 64); \ - } while (0) - -#define INPUT_BUF(lim) do { \ - READ64(a00, 0); \ - READ64(a10, 8); \ - READ64(a20, 16); \ - READ64(a30, 24); \ - READ64(a40, 32); \ - READ64(a01, 40); \ - READ64(a11, 48); \ - READ64(a21, 56); \ - READ64(a31, 64); \ - if ((lim) == 72) \ - break; \ - READ64(a41, 72); \ - READ64(a02, 80); \ - READ64(a12, 88); \ - READ64(a22, 96); \ - if ((lim) == 104) \ - break; \ - READ64(a32, 104); \ - READ64(a42, 112); \ - READ64(a03, 120); \ - READ64(a13, 128); \ - if ((lim) == 136) \ - break; \ - READ64(a23, 136); \ - } while (0) - -#endif - -#define DECL64(x) sph_u64 x ## l, x ## h -#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) -#define XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) 
-#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) -#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) -#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) -#define ROL64(d, v, n) ROL64_ ## n(d, v) - -#if SPH_KECCAK_INTERLEAVE - -#define ROL64_odd1(d, v) do { \ - sph_u32 tmp; \ - tmp = v ## l; \ - d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ - d ## h = tmp; \ - } while (0) - -#define ROL64_odd63(d, v) do { \ - sph_u32 tmp; \ - tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ - d ## l = v ## h; \ - d ## h = tmp; \ - } while (0) - -#define ROL64_odd(d, v, n) do { \ - sph_u32 tmp; \ - tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ - d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ - d ## h = tmp; \ - } while (0) - -#define ROL64_even(d, v, n) do { \ - d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ - d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ - } while (0) - -#define ROL64_0(d, v) -#define ROL64_1(d, v) ROL64_odd1(d, v) -#define ROL64_2(d, v) ROL64_even(d, v, 1) -#define ROL64_3(d, v) ROL64_odd( d, v, 2) -#define ROL64_4(d, v) ROL64_even(d, v, 2) -#define ROL64_5(d, v) ROL64_odd( d, v, 3) -#define ROL64_6(d, v) ROL64_even(d, v, 3) -#define ROL64_7(d, v) ROL64_odd( d, v, 4) -#define ROL64_8(d, v) ROL64_even(d, v, 4) -#define ROL64_9(d, v) ROL64_odd( d, v, 5) -#define ROL64_10(d, v) ROL64_even(d, v, 5) -#define ROL64_11(d, v) ROL64_odd( d, v, 6) -#define ROL64_12(d, v) ROL64_even(d, v, 6) -#define ROL64_13(d, v) ROL64_odd( d, v, 7) -#define ROL64_14(d, v) ROL64_even(d, v, 7) -#define ROL64_15(d, v) ROL64_odd( d, v, 8) -#define ROL64_16(d, v) ROL64_even(d, v, 8) -#define ROL64_17(d, v) ROL64_odd( d, v, 9) -#define ROL64_18(d, v) ROL64_even(d, v, 9) -#define ROL64_19(d, v) ROL64_odd( d, v, 10) -#define ROL64_20(d, v) ROL64_even(d, v, 10) -#define ROL64_21(d, v) ROL64_odd( d, v, 11) -#define ROL64_22(d, v) ROL64_even(d, v, 11) -#define ROL64_23(d, v) 
ROL64_odd( d, v, 12) -#define ROL64_24(d, v) ROL64_even(d, v, 12) -#define ROL64_25(d, v) ROL64_odd( d, v, 13) -#define ROL64_26(d, v) ROL64_even(d, v, 13) -#define ROL64_27(d, v) ROL64_odd( d, v, 14) -#define ROL64_28(d, v) ROL64_even(d, v, 14) -#define ROL64_29(d, v) ROL64_odd( d, v, 15) -#define ROL64_30(d, v) ROL64_even(d, v, 15) -#define ROL64_31(d, v) ROL64_odd( d, v, 16) -#define ROL64_32(d, v) ROL64_even(d, v, 16) -#define ROL64_33(d, v) ROL64_odd( d, v, 17) -#define ROL64_34(d, v) ROL64_even(d, v, 17) -#define ROL64_35(d, v) ROL64_odd( d, v, 18) -#define ROL64_36(d, v) ROL64_even(d, v, 18) -#define ROL64_37(d, v) ROL64_odd( d, v, 19) -#define ROL64_38(d, v) ROL64_even(d, v, 19) -#define ROL64_39(d, v) ROL64_odd( d, v, 20) -#define ROL64_40(d, v) ROL64_even(d, v, 20) -#define ROL64_41(d, v) ROL64_odd( d, v, 21) -#define ROL64_42(d, v) ROL64_even(d, v, 21) -#define ROL64_43(d, v) ROL64_odd( d, v, 22) -#define ROL64_44(d, v) ROL64_even(d, v, 22) -#define ROL64_45(d, v) ROL64_odd( d, v, 23) -#define ROL64_46(d, v) ROL64_even(d, v, 23) -#define ROL64_47(d, v) ROL64_odd( d, v, 24) -#define ROL64_48(d, v) ROL64_even(d, v, 24) -#define ROL64_49(d, v) ROL64_odd( d, v, 25) -#define ROL64_50(d, v) ROL64_even(d, v, 25) -#define ROL64_51(d, v) ROL64_odd( d, v, 26) -#define ROL64_52(d, v) ROL64_even(d, v, 26) -#define ROL64_53(d, v) ROL64_odd( d, v, 27) -#define ROL64_54(d, v) ROL64_even(d, v, 27) -#define ROL64_55(d, v) ROL64_odd( d, v, 28) -#define ROL64_56(d, v) ROL64_even(d, v, 28) -#define ROL64_57(d, v) ROL64_odd( d, v, 29) -#define ROL64_58(d, v) ROL64_even(d, v, 29) -#define ROL64_59(d, v) ROL64_odd( d, v, 30) -#define ROL64_60(d, v) ROL64_even(d, v, 30) -#define ROL64_61(d, v) ROL64_odd( d, v, 31) -#define ROL64_62(d, v) ROL64_even(d, v, 31) -#define ROL64_63(d, v) ROL64_odd63(d, v) - -#else - -#define ROL64_small(d, v, n) do { \ - sph_u32 tmp; \ - tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ - d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ - d 
## l = tmp; \ - } while (0) - -#define ROL64_0(d, v) 0 -#define ROL64_1(d, v) ROL64_small(d, v, 1) -#define ROL64_2(d, v) ROL64_small(d, v, 2) -#define ROL64_3(d, v) ROL64_small(d, v, 3) -#define ROL64_4(d, v) ROL64_small(d, v, 4) -#define ROL64_5(d, v) ROL64_small(d, v, 5) -#define ROL64_6(d, v) ROL64_small(d, v, 6) -#define ROL64_7(d, v) ROL64_small(d, v, 7) -#define ROL64_8(d, v) ROL64_small(d, v, 8) -#define ROL64_9(d, v) ROL64_small(d, v, 9) -#define ROL64_10(d, v) ROL64_small(d, v, 10) -#define ROL64_11(d, v) ROL64_small(d, v, 11) -#define ROL64_12(d, v) ROL64_small(d, v, 12) -#define ROL64_13(d, v) ROL64_small(d, v, 13) -#define ROL64_14(d, v) ROL64_small(d, v, 14) -#define ROL64_15(d, v) ROL64_small(d, v, 15) -#define ROL64_16(d, v) ROL64_small(d, v, 16) -#define ROL64_17(d, v) ROL64_small(d, v, 17) -#define ROL64_18(d, v) ROL64_small(d, v, 18) -#define ROL64_19(d, v) ROL64_small(d, v, 19) -#define ROL64_20(d, v) ROL64_small(d, v, 20) -#define ROL64_21(d, v) ROL64_small(d, v, 21) -#define ROL64_22(d, v) ROL64_small(d, v, 22) -#define ROL64_23(d, v) ROL64_small(d, v, 23) -#define ROL64_24(d, v) ROL64_small(d, v, 24) -#define ROL64_25(d, v) ROL64_small(d, v, 25) -#define ROL64_26(d, v) ROL64_small(d, v, 26) -#define ROL64_27(d, v) ROL64_small(d, v, 27) -#define ROL64_28(d, v) ROL64_small(d, v, 28) -#define ROL64_29(d, v) ROL64_small(d, v, 29) -#define ROL64_30(d, v) ROL64_small(d, v, 30) -#define ROL64_31(d, v) ROL64_small(d, v, 31) - -#define ROL64_32(d, v) do { \ - sph_u32 tmp; \ - tmp = v ## l; \ - d ## l = v ## h; \ - d ## h = tmp; \ - } while (0) - -#define ROL64_big(d, v, n) do { \ - sph_u32 trl, trh; \ - ROL64_small(tr, v, n); \ - d ## h = trl; \ - d ## l = trh; \ - } while (0) - -#define ROL64_33(d, v) ROL64_big(d, v, 1) -#define ROL64_34(d, v) ROL64_big(d, v, 2) -#define ROL64_35(d, v) ROL64_big(d, v, 3) -#define ROL64_36(d, v) ROL64_big(d, v, 4) -#define ROL64_37(d, v) ROL64_big(d, v, 5) -#define ROL64_38(d, v) ROL64_big(d, v, 6) -#define 
ROL64_39(d, v) ROL64_big(d, v, 7) -#define ROL64_40(d, v) ROL64_big(d, v, 8) -#define ROL64_41(d, v) ROL64_big(d, v, 9) -#define ROL64_42(d, v) ROL64_big(d, v, 10) -#define ROL64_43(d, v) ROL64_big(d, v, 11) -#define ROL64_44(d, v) ROL64_big(d, v, 12) -#define ROL64_45(d, v) ROL64_big(d, v, 13) -#define ROL64_46(d, v) ROL64_big(d, v, 14) -#define ROL64_47(d, v) ROL64_big(d, v, 15) -#define ROL64_48(d, v) ROL64_big(d, v, 16) -#define ROL64_49(d, v) ROL64_big(d, v, 17) -#define ROL64_50(d, v) ROL64_big(d, v, 18) -#define ROL64_51(d, v) ROL64_big(d, v, 19) -#define ROL64_52(d, v) ROL64_big(d, v, 20) -#define ROL64_53(d, v) ROL64_big(d, v, 21) -#define ROL64_54(d, v) ROL64_big(d, v, 22) -#define ROL64_55(d, v) ROL64_big(d, v, 23) -#define ROL64_56(d, v) ROL64_big(d, v, 24) -#define ROL64_57(d, v) ROL64_big(d, v, 25) -#define ROL64_58(d, v) ROL64_big(d, v, 26) -#define ROL64_59(d, v) ROL64_big(d, v, 27) -#define ROL64_60(d, v) ROL64_big(d, v, 28) -#define ROL64_61(d, v) ROL64_big(d, v, 29) -#define ROL64_62(d, v) ROL64_big(d, v, 30) -#define ROL64_63(d, v) ROL64_big(d, v, 31) - -#endif - -#define XOR64_IOTA(d, s, k) \ - (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) - -#endif - -#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ - DECL64(tt0); \ - DECL64(tt1); \ - DECL64(tt2); \ - DECL64(tt3); \ - XOR64(tt0, d0, d1); \ - XOR64(tt1, d2, d3); \ - XOR64(tt0, tt0, d4); \ - XOR64(tt0, tt0, tt1); \ - ROL64(tt0, tt0, 1); \ - XOR64(tt2, c0, c1); \ - XOR64(tt3, c2, c3); \ - XOR64(tt0, tt0, c4); \ - XOR64(tt2, tt2, tt3); \ - XOR64(t, tt0, tt2); \ - } while (0) - -#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(t0); \ - DECL64(t1); \ - DECL64(t2); \ - DECL64(t3); \ - DECL64(t4); \ - TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ - TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ - TH_ELT(t2, b10, b11, b12, b13, 
b14, b30, b31, b32, b33, b34); \ - TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ - TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ - XOR64(b00, b00, t0); \ - XOR64(b01, b01, t0); \ - XOR64(b02, b02, t0); \ - XOR64(b03, b03, t0); \ - XOR64(b04, b04, t0); \ - XOR64(b10, b10, t1); \ - XOR64(b11, b11, t1); \ - XOR64(b12, b12, t1); \ - XOR64(b13, b13, t1); \ - XOR64(b14, b14, t1); \ - XOR64(b20, b20, t2); \ - XOR64(b21, b21, t2); \ - XOR64(b22, b22, t2); \ - XOR64(b23, b23, t2); \ - XOR64(b24, b24, t2); \ - XOR64(b30, b30, t3); \ - XOR64(b31, b31, t3); \ - XOR64(b32, b32, t3); \ - XOR64(b33, b33, t3); \ - XOR64(b34, b34, t3); \ - XOR64(b40, b40, t4); \ - XOR64(b41, b41, t4); \ - XOR64(b42, b42, t4); \ - XOR64(b43, b43, t4); \ - XOR64(b44, b44, t4); \ - } while (0) - -#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - /* ROL64(b00, b00, 0); */ \ - ROL64(b01, b01, 36); \ - ROL64(b02, b02, 3); \ - ROL64(b03, b03, 41); \ - ROL64(b04, b04, 18); \ - ROL64(b10, b10, 1); \ - ROL64(b11, b11, 44); \ - ROL64(b12, b12, 10); \ - ROL64(b13, b13, 45); \ - ROL64(b14, b14, 2); \ - ROL64(b20, b20, 62); \ - ROL64(b21, b21, 6); \ - ROL64(b22, b22, 43); \ - ROL64(b23, b23, 15); \ - ROL64(b24, b24, 61); \ - ROL64(b30, b30, 28); \ - ROL64(b31, b31, 55); \ - ROL64(b32, b32, 25); \ - ROL64(b33, b33, 21); \ - ROL64(b34, b34, 56); \ - ROL64(b40, b40, 27); \ - ROL64(b41, b41, 20); \ - ROL64(b42, b42, 39); \ - ROL64(b43, b43, 8); \ - ROL64(b44, b44, 14); \ - } while (0) - -/* - * The KHI macro integrates the "lane complement" optimization. On input, - * some words are complemented: - * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 - * On output, the following words are complemented: - * a04 a10 a20 a22 a23 a31 - * - * The (implicit) permutation and the theta expansion will bring back - * the input mask for the next round. 
- */ - -#define KHI_XO(d, a, b, c) do { \ - DECL64(kt); \ - OR64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI_XA(d, a, b, c) do { \ - DECL64(kt); \ - AND64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - DECL64(c0); \ - DECL64(c1); \ - DECL64(c2); \ - DECL64(c3); \ - DECL64(c4); \ - DECL64(bnn); \ - NOT64(bnn, b20); \ - KHI_XO(c0, b00, b10, b20); \ - KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ - MOV64(b00, c0); \ - MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ - NOT64(bnn, b41); \ - KHI_XO(c0, b01, b11, b21); \ - KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ - MOV64(b01, c0); \ - MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ - NOT64(bnn, b32); \ - KHI_XO(c0, b02, b12, b22); \ - KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ - MOV64(b02, c0); \ - MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ - NOT64(bnn, b33); \ - KHI_XA(c0, b03, b13, b23); \ - KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ - MOV64(b03, c0); \ - MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ - NOT64(bnn, b14); \ - KHI_XA(c0, b04, bnn, b24); \ - KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ - MOV64(b04, c0); \ - MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ - } while (0) - -#define IOTA(r) XOR64_IOTA(a00, a00, r) - -#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ - a22, 
a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 -#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ - a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 -#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ - a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 -#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ - a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 -#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ - a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 -#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ - a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 -#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ - a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 -#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ - a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 -#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ - a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 -#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ - a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 -#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ - a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 -#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ - a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 -#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ - a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 -#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ - a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 -#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ - a12, a34, a01, a32, a04, a21, a43, a10, 
a41, a13, a30, a02, a24 -#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ - a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 -#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ - a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 -#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ - a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 -#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ - a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 -#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ - a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 -#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ - a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 -#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ - a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 -#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ - a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 -#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ - a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 - -#define P1_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a30); \ - MOV64(a30, a33); \ - MOV64(a33, a23); \ - MOV64(a23, a12); \ - MOV64(a12, a21); \ - MOV64(a21, a02); \ - MOV64(a02, a10); \ - MOV64(a10, a11); \ - MOV64(a11, a41); \ - MOV64(a41, a24); \ - MOV64(a24, a42); \ - MOV64(a42, a04); \ - MOV64(a04, a20); \ - MOV64(a20, a22); \ - MOV64(a22, a32); \ - MOV64(a32, a43); \ - MOV64(a43, a34); \ - MOV64(a34, a03); \ - MOV64(a03, a40); \ - MOV64(a40, a44); \ - MOV64(a44, a14); \ - MOV64(a14, a31); \ - MOV64(a31, a13); \ - MOV64(a13, t); \ - } while (0) - -#define P2_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a33); \ - MOV64(a33, a12); \ - MOV64(a12, a02); \ - 
MOV64(a02, a11); \ - MOV64(a11, a24); \ - MOV64(a24, a04); \ - MOV64(a04, a22); \ - MOV64(a22, a43); \ - MOV64(a43, a03); \ - MOV64(a03, a44); \ - MOV64(a44, a31); \ - MOV64(a31, t); \ - MOV64(t, a10); \ - MOV64(a10, a41); \ - MOV64(a41, a42); \ - MOV64(a42, a20); \ - MOV64(a20, a32); \ - MOV64(a32, a34); \ - MOV64(a34, a40); \ - MOV64(a40, a14); \ - MOV64(a14, a13); \ - MOV64(a13, a30); \ - MOV64(a30, a23); \ - MOV64(a23, a21); \ - MOV64(a21, t); \ - } while (0) - -#define P4_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a12); \ - MOV64(a12, a11); \ - MOV64(a11, a04); \ - MOV64(a04, a43); \ - MOV64(a43, a44); \ - MOV64(a44, t); \ - MOV64(t, a02); \ - MOV64(a02, a24); \ - MOV64(a24, a22); \ - MOV64(a22, a03); \ - MOV64(a03, a31); \ - MOV64(a31, a33); \ - MOV64(a33, t); \ - MOV64(t, a10); \ - MOV64(a10, a42); \ - MOV64(a42, a32); \ - MOV64(a32, a40); \ - MOV64(a40, a13); \ - MOV64(a13, a23); \ - MOV64(a23, t); \ - MOV64(t, a14); \ - MOV64(a14, a30); \ - MOV64(a30, a21); \ - MOV64(a21, a41); \ - MOV64(a41, a20); \ - MOV64(a20, a34); \ - MOV64(a34, t); \ - } while (0) - -#define P6_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a02); \ - MOV64(a02, a04); \ - MOV64(a04, a03); \ - MOV64(a03, t); \ - MOV64(t, a10); \ - MOV64(a10, a20); \ - MOV64(a20, a40); \ - MOV64(a40, a30); \ - MOV64(a30, t); \ - MOV64(t, a11); \ - MOV64(a11, a22); \ - MOV64(a22, a44); \ - MOV64(a44, a33); \ - MOV64(a33, t); \ - MOV64(t, a12); \ - MOV64(a12, a24); \ - MOV64(a24, a43); \ - MOV64(a43, a31); \ - MOV64(a31, t); \ - MOV64(t, a13); \ - MOV64(a13, a21); \ - MOV64(a21, a42); \ - MOV64(a42, a34); \ - MOV64(a34, t); \ - MOV64(t, a14); \ - MOV64(a14, a23); \ - MOV64(a23, a41); \ - MOV64(a41, a32); \ - MOV64(a32, t); \ - } while (0) - -#define P8_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a11); \ - MOV64(a11, a43); \ - MOV64(a43, t); \ - MOV64(t, a02); \ - MOV64(a02, a22); \ - MOV64(a22, a31); \ - MOV64(a31, t); \ - MOV64(t, a03); \ - MOV64(a03, a33); \ 
- MOV64(a33, a24); \ - MOV64(a24, t); \ - MOV64(t, a04); \ - MOV64(a04, a44); \ - MOV64(a44, a12); \ - MOV64(a12, t); \ - MOV64(t, a10); \ - MOV64(a10, a32); \ - MOV64(a32, a13); \ - MOV64(a13, t); \ - MOV64(t, a14); \ - MOV64(a14, a21); \ - MOV64(a21, a20); \ - MOV64(a20, t); \ - MOV64(t, a23); \ - MOV64(a23, a42); \ - MOV64(a42, a40); \ - MOV64(a40, t); \ - MOV64(t, a30); \ - MOV64(a30, a41); \ - MOV64(a41, a34); \ - MOV64(a34, t); \ - } while (0) - -#define P12_TO_P0 do { \ - DECL64(t); \ - MOV64(t, a01); \ - MOV64(a01, a04); \ - MOV64(a04, t); \ - MOV64(t, a02); \ - MOV64(a02, a03); \ - MOV64(a03, t); \ - MOV64(t, a10); \ - MOV64(a10, a40); \ - MOV64(a40, t); \ - MOV64(t, a11); \ - MOV64(a11, a44); \ - MOV64(a44, t); \ - MOV64(t, a12); \ - MOV64(a12, a43); \ - MOV64(a43, t); \ - MOV64(t, a13); \ - MOV64(a13, a42); \ - MOV64(a42, t); \ - MOV64(t, a14); \ - MOV64(a14, a41); \ - MOV64(a41, t); \ - MOV64(t, a20); \ - MOV64(a20, a30); \ - MOV64(a30, t); \ - MOV64(t, a21); \ - MOV64(a21, a34); \ - MOV64(a34, t); \ - MOV64(t, a22); \ - MOV64(a22, a33); \ - MOV64(a33, t); \ - MOV64(t, a23); \ - MOV64(a23, a32); \ - MOV64(a32, t); \ - MOV64(t, a24); \ - MOV64(a24, a31); \ - MOV64(a31, t); \ - } while (0) - -#define LPAR ( -#define RPAR ) - -#define KF_ELT(r, s, k) do { \ - THETA LPAR P ## r RPAR; \ - RHO LPAR P ## r RPAR; \ - KHI LPAR P ## s RPAR; \ - IOTA(k); \ - } while (0) - -#define DO(x) x - -#define KECCAK_F_1600 DO(KECCAK_F_1600_) - -#if SPH_KECCAK_UNROLL == 1 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j ++) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - P1_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 2 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j += 2) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - P2_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 4 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j += 4) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); 
\ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - P4_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 6 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j += 6) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - KF_ELT( 4, 5, RC[j + 4]); \ - KF_ELT( 5, 6, RC[j + 5]); \ - P6_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 8 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j += 8) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - KF_ELT( 4, 5, RC[j + 4]); \ - KF_ELT( 5, 6, RC[j + 5]); \ - KF_ELT( 6, 7, RC[j + 6]); \ - KF_ELT( 7, 8, RC[j + 7]); \ - P8_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 12 - -#define KECCAK_F_1600_ do { \ - int j; \ - for (j = 0; j < 24; j += 12) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - KF_ELT( 4, 5, RC[j + 4]); \ - KF_ELT( 5, 6, RC[j + 5]); \ - KF_ELT( 6, 7, RC[j + 6]); \ - KF_ELT( 7, 8, RC[j + 7]); \ - KF_ELT( 8, 9, RC[j + 8]); \ - KF_ELT( 9, 10, RC[j + 9]); \ - KF_ELT(10, 11, RC[j + 10]); \ - KF_ELT(11, 12, RC[j + 11]); \ - P12_TO_P0; \ - } \ - } while (0) - -#elif SPH_KECCAK_UNROLL == 0 - -#define KECCAK_F_1600_ do { \ - KF_ELT( 0, 1, RC[ 0]); \ - KF_ELT( 1, 2, RC[ 1]); \ - KF_ELT( 2, 3, RC[ 2]); \ - KF_ELT( 3, 4, RC[ 3]); \ - KF_ELT( 4, 5, RC[ 4]); \ - KF_ELT( 5, 6, RC[ 5]); \ - KF_ELT( 6, 7, RC[ 6]); \ - KF_ELT( 7, 8, RC[ 7]); \ - KF_ELT( 8, 9, RC[ 8]); \ - KF_ELT( 9, 10, RC[ 9]); \ - KF_ELT(10, 11, RC[10]); \ - KF_ELT(11, 12, RC[11]); \ - KF_ELT(12, 13, RC[12]); \ - KF_ELT(13, 14, RC[13]); \ - KF_ELT(14, 15, RC[14]); \ - KF_ELT(15, 16, RC[15]); \ - KF_ELT(16, 17, RC[16]); \ - KF_ELT(17, 18, RC[17]); \ - KF_ELT(18, 19, RC[18]); \ - KF_ELT(19, 20, RC[19]); \ - KF_ELT(20, 21, RC[20]); \ - KF_ELT(21, 22, RC[21]); \ - 
KF_ELT(22, 23, RC[22]); \ - KF_ELT(23, 0, RC[23]); \ - } while (0) - -#else - -#error Unimplemented unroll count for Keccak. - -#endif - -static void -keccak_init(sph_keccak_context *kc, unsigned out_size) -{ - int i; - -#if SPH_KECCAK_64 - for (i = 0; i < 25; i ++) - kc->u.wide[i] = 0; - /* - * Initialization for the "lane complement". - */ - kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); - kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); - kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); - kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); - kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); - kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); -#else - - for (i = 0; i < 50; i ++) - kc->u.narrow[i] = 0; - /* - * Initialization for the "lane complement". - * Note: since we set to all-one full 64-bit words, - * interleaving (if applicable) is a no-op. - */ - kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); - kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); -#endif - kc->ptr = 0; - kc->lim = 200 - (out_size >> 2); -} - -static void -keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE - - buf = kc->buf; - ptr = kc->ptr; - - if (len < (lim - ptr)) { - memcpy(buf + ptr, data, len); - kc->ptr = ptr + len; - return; - } - - READ_STATE(kc); - while (len > 0) { - size_t clen; - - clen = (lim - ptr); - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == lim) { - INPUT_BUF(lim); - KECCAK_F_1600; - ptr = 0; - } - } - 
WRITE_STATE(kc); - kc->ptr = ptr; -} - -#if SPH_KECCAK_64 - -#define DEFCLOSE(d, lim) \ - static void keccak_close ## d( \ - sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ - { \ - unsigned eb; \ - union { \ - unsigned char tmp[lim + 1]; \ - sph_u64 dummy; /* for alignment */ \ - } u; \ - size_t j; \ - \ - eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ - if (kc->ptr == (lim - 1)) { \ - if (n == 7) { \ - u.tmp[0] = eb; \ - memset(u.tmp + 1, 0, lim - 1); \ - u.tmp[lim] = 0x80; \ - j = 1 + lim; \ - } else { \ - u.tmp[0] = eb | 0x80; \ - j = 1; \ - } \ - } else { \ - j = lim - kc->ptr; \ - u.tmp[0] = eb; \ - memset(u.tmp + 1, 0, j - 2); \ - u.tmp[j - 1] = 0x80; \ - } \ - keccak_core(kc, u.tmp, j, lim); \ - /* Finalize the "lane complement" */ \ - kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ - kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ - kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ - kc->u.wide[12] = ~kc->u.wide[12]; \ - kc->u.wide[17] = ~kc->u.wide[17]; \ - kc->u.wide[20] = ~kc->u.wide[20]; \ - for (j = 0; j < d; j += 8) \ - sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ - memcpy(dst, u.tmp, d); \ - keccak_init(kc, (unsigned)d << 3); \ - } \ - -#else - -#define DEFCLOSE(d, lim) \ - static void keccak_close ## d( \ - sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ - { \ - unsigned eb; \ - union { \ - unsigned char tmp[lim + 1]; \ - sph_u64 dummy; /* for alignment */ \ - } u; \ - size_t j; \ - \ - eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ - if (kc->ptr == (lim - 1)) { \ - if (n == 7) { \ - u.tmp[0] = eb; \ - memset(u.tmp + 1, 0, lim - 1); \ - u.tmp[lim] = 0x80; \ - j = 1 + lim; \ - } else { \ - u.tmp[0] = eb | 0x80; \ - j = 1; \ - } \ - } else { \ - j = lim - kc->ptr; \ - u.tmp[0] = eb; \ - memset(u.tmp + 1, 0, j - 2); \ - u.tmp[j - 1] = 0x80; \ - } \ - keccak_core(kc, u.tmp, j, lim); \ - /* Finalize the "lane complement" */ \ - kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ - kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ - kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ - kc->u.narrow[ 
5] = ~kc->u.narrow[ 5]; \ - kc->u.narrow[16] = ~kc->u.narrow[16]; \ - kc->u.narrow[17] = ~kc->u.narrow[17]; \ - kc->u.narrow[24] = ~kc->u.narrow[24]; \ - kc->u.narrow[25] = ~kc->u.narrow[25]; \ - kc->u.narrow[34] = ~kc->u.narrow[34]; \ - kc->u.narrow[35] = ~kc->u.narrow[35]; \ - kc->u.narrow[40] = ~kc->u.narrow[40]; \ - kc->u.narrow[41] = ~kc->u.narrow[41]; \ - /* un-interleave */ \ - for (j = 0; j < 50; j += 2) \ - UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \ - for (j = 0; j < d; j += 4) \ - sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \ - memcpy(dst, u.tmp, d); \ - keccak_init(kc, (unsigned)d << 3); \ - } \ - -#endif - -DEFCLOSE(28, 144) -DEFCLOSE(32, 136) -DEFCLOSE(48, 104) -DEFCLOSE(64, 72) - -/* see sph_keccak.h */ -void -sph_keccak224_init(void *cc) -{ - keccak_init(cc, 224); -} - -/* see sph_keccak.h */ -void -sph_keccak224(void *cc, const void *data, size_t len) -{ - keccak_core(cc, data, len, 144); -} - -/* see sph_keccak.h */ -void -sph_keccak224_close(void *cc, void *dst) -{ - sph_keccak224_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - keccak_close28(cc, ub, n, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak256_init(void *cc) -{ - keccak_init(cc, 256); -} - -/* see sph_keccak.h */ -void -sph_keccak256(void *cc, const void *data, size_t len) -{ - keccak_core(cc, data, len, 136); -} - -/* see sph_keccak.h */ -void -sph_keccak256_close(void *cc, void *dst) -{ - sph_keccak256_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - keccak_close32(cc, ub, n, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak384_init(void *cc) -{ - keccak_init(cc, 384); -} - -/* see sph_keccak.h */ -void -sph_keccak384(void *cc, const void *data, size_t len) -{ - keccak_core(cc, data, len, 104); -} - -/* see sph_keccak.h */ -void 
-sph_keccak384_close(void *cc, void *dst) -{ - sph_keccak384_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - keccak_close48(cc, ub, n, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak512_init(void *cc) -{ - keccak_init(cc, 512); -} - -/* see sph_keccak.h */ -void -sph_keccak512(void *cc, const void *data, size_t len) -{ - keccak_core(cc, data, len, 72); -} - -/* see sph_keccak.h */ -void -sph_keccak512_close(void *cc, void *dst) -{ - sph_keccak512_addbits_and_close(cc, 0, 0, dst); -} - -/* see sph_keccak.h */ -void -sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - keccak_close64(cc, ub, n, dst); -} - - -#ifdef __cplusplus -} -#endif +/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ +/* + * Keccak implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_keccak.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/* + * Parameters: + * + * SPH_KECCAK_64 use a 64-bit type + * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll) + * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) + * SPH_KECCAK_NOCOPY do not copy the state into local variables + * + * If there is no usable 64-bit type, the code automatically switches + * back to the 32-bit implementation. + * + * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 + * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core + * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, + * 8 kB L1 code cache), seem to show that the following are optimal: + * + * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, + * do not copy the state; unrolling 2, 6 or all rounds also provides + * near-optimal performance. + * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, + * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds + * also provides near-optimal performance. + * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, + * copy the state. Unrolling 4 or 6 rounds is near-optimal. + * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, + * copy the state. + * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy + * the state. Unrolling only 1 round is also near-optimal. 
+ * + * Also, interleaving does not always yield actual improvements when + * using a 32-bit implementation; in particular when the architecture + * does not offer a native rotation opcode (interleaving replaces one + * 64-bit rotation with two 32-bit rotations, which is a gain only if + * there is a native 32-bit rotation opcode and not a native 64-bit + * rotation opcode; also, interleaving implies a small overhead when + * processing input words). + * + * To sum up: + * -- when possible, use the 64-bit code + * -- exception: on 32-bit x86, use 32-bit code + * -- when using 32-bit code, use interleaving + * -- copy the state, except on x86 + * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines + */ + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_SMALL_FOOTPRINT_KECCAK 1 +#endif + +/* + * By default, we select the 64-bit implementation if a 64-bit type + * is available, unless a 32-bit x86 is detected. + */ +#if !defined SPH_KECCAK_64 && SPH_64 \ + && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) +#define SPH_KECCAK_64 1 +#endif + +/* + * If using a 32-bit implementation, we prefer to interleave. + */ +#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE +#define SPH_KECCAK_INTERLEAVE 1 +#endif + +/* + * Unroll 8 rounds on big systems, 2 rounds on small systems. + */ +#ifndef SPH_KECCAK_UNROLL +#if SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_KECCAK_UNROLL 2 +#else +#define SPH_KECCAK_UNROLL 8 +#endif +#endif + +/* + * We do not want to copy the state to local variables on x86 (32-bit + * and 64-bit alike). 
+ */ +#ifndef SPH_KECCAK_NOCOPY +#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC +#define SPH_KECCAK_NOCOPY 1 +#else +#define SPH_KECCAK_NOCOPY 0 +#endif +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_KECCAK_64 + +static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + +#if SPH_KECCAK_NOCOPY + +#define a00 (kc->u.wide[ 0]) +#define a10 (kc->u.wide[ 1]) +#define a20 (kc->u.wide[ 2]) +#define a30 (kc->u.wide[ 3]) +#define a40 (kc->u.wide[ 4]) +#define a01 (kc->u.wide[ 5]) +#define a11 (kc->u.wide[ 6]) +#define a21 (kc->u.wide[ 7]) +#define a31 (kc->u.wide[ 8]) +#define a41 (kc->u.wide[ 9]) +#define a02 (kc->u.wide[10]) +#define a12 (kc->u.wide[11]) +#define a22 (kc->u.wide[12]) +#define a32 (kc->u.wide[13]) +#define a42 (kc->u.wide[14]) +#define a03 (kc->u.wide[15]) +#define a13 (kc->u.wide[16]) +#define a23 (kc->u.wide[17]) +#define a33 (kc->u.wide[18]) +#define a43 (kc->u.wide[19]) +#define a04 (kc->u.wide[20]) +#define a14 (kc->u.wide[21]) +#define a24 (kc->u.wide[22]) +#define a34 (kc->u.wide[23]) +#define a44 (kc->u.wide[24]) + +#define DECL_STATE +#define READ_STATE(sc) +#define WRITE_STATE(sc) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + kc->u.wide[j >> 3] ^= 
sph_dec64le_aligned(buf + j); \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u64 a00, a01, a02, a03, a04; \ + sph_u64 a10, a11, a12, a13, a14; \ + sph_u64 a20, a21, a22, a23, a24; \ + sph_u64 a30, a31, a32, a33, a34; \ + sph_u64 a40, a41, a42, a43, a44; + +#define READ_STATE(state) do { \ + a00 = (state)->u.wide[ 0]; \ + a10 = (state)->u.wide[ 1]; \ + a20 = (state)->u.wide[ 2]; \ + a30 = (state)->u.wide[ 3]; \ + a40 = (state)->u.wide[ 4]; \ + a01 = (state)->u.wide[ 5]; \ + a11 = (state)->u.wide[ 6]; \ + a21 = (state)->u.wide[ 7]; \ + a31 = (state)->u.wide[ 8]; \ + a41 = (state)->u.wide[ 9]; \ + a02 = (state)->u.wide[10]; \ + a12 = (state)->u.wide[11]; \ + a22 = (state)->u.wide[12]; \ + a32 = (state)->u.wide[13]; \ + a42 = (state)->u.wide[14]; \ + a03 = (state)->u.wide[15]; \ + a13 = (state)->u.wide[16]; \ + a23 = (state)->u.wide[17]; \ + a33 = (state)->u.wide[18]; \ + a43 = (state)->u.wide[19]; \ + a04 = (state)->u.wide[20]; \ + a14 = (state)->u.wide[21]; \ + a24 = (state)->u.wide[22]; \ + a34 = (state)->u.wide[23]; \ + a44 = (state)->u.wide[24]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.wide[ 0] = a00; \ + (state)->u.wide[ 1] = a10; \ + (state)->u.wide[ 2] = a20; \ + (state)->u.wide[ 3] = a30; \ + (state)->u.wide[ 4] = a40; \ + (state)->u.wide[ 5] = a01; \ + (state)->u.wide[ 6] = a11; \ + (state)->u.wide[ 7] = a21; \ + (state)->u.wide[ 8] = a31; \ + (state)->u.wide[ 9] = a41; \ + (state)->u.wide[10] = a02; \ + (state)->u.wide[11] = a12; \ + (state)->u.wide[12] = a22; \ + (state)->u.wide[13] = a32; \ + (state)->u.wide[14] = a42; \ + (state)->u.wide[15] = a03; \ + (state)->u.wide[16] = a13; \ + (state)->u.wide[17] = a23; \ + (state)->u.wide[18] = a33; \ + (state)->u.wide[19] = a43; \ + (state)->u.wide[20] = a04; \ + (state)->u.wide[21] = a14; \ + (state)->u.wide[22] = a24; \ + 
(state)->u.wide[23] = a34; \ + (state)->u.wide[24] = a44; \ + } while (0) + +#define INPUT_BUF144 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 
56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + if ((lim) == 72) \ + break; \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + if ((lim) == 104) \ + break; \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + if ((lim) == 136) \ + break; \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x +#define MOV64(d, s) (d = s) +#define XOR64(d, a, b) (d = a ^ b) +#define AND64(d, a, b) (d = a & b) +#define OR64(d, a, b) (d = a | b) +#define NOT64(d, s) (d = SPH_T64(~s)) +#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) +#define XOR64_IOTA XOR64 + +#else + +static const struct { + sph_u32 high, low; +} RC[] = { +#if SPH_KECCAK_INTERLEAVE + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { 
SPH_C32(0x00000089), SPH_C32(0x00000000) }, + { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, + { SPH_C32(0x80008080), SPH_C32(0x00000000) }, + { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000082), SPH_C32(0x00000001) }, + { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, + { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, + { SPH_C32(0x00008082), SPH_C32(0x00000001) }, + { SPH_C32(0x00008003), SPH_C32(0x00000000) }, + { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000000) }, + { SPH_C32(0x80000008), SPH_C32(0x00000000) }, + { SPH_C32(0x00000083), SPH_C32(0x00000000) }, + { SPH_C32(0x80008003), SPH_C32(0x00000000) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000088), SPH_C32(0x00000000) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008082), SPH_C32(0x00000000) } +#else + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000000), SPH_C32(0x00008082) }, + { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008000) }, + { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008009) }, + { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, + { SPH_C32(0x00000000), SPH_C32(0x00000088) }, + { SPH_C32(0x00000000), SPH_C32(0x80008009) }, + { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, + { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, + { SPH_C32(0x80000000), SPH_C32(0x00008089) }, + { SPH_C32(0x80000000), SPH_C32(0x00008003) }, + { SPH_C32(0x80000000), SPH_C32(0x00008002) }, + { SPH_C32(0x80000000), SPH_C32(0x00000080) }, + { SPH_C32(0x00000000), 
SPH_C32(0x0000800A) }, + { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008080) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008008) } +#endif +}; + +#if SPH_KECCAK_INTERLEAVE + +#define INTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + (xl) = l; (xh) = h; \ + } while (0) + +#define UNINTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + (xl) = l; (xh) = h; \ + } while (0) + +#else + +#define INTERLEAVE(l, h) +#define UNINTERLEAVE(l, h) + +#endif + +#if SPH_KECCAK_NOCOPY + +#define a00l (kc->u.narrow[2 * 0 + 0]) +#define a00h (kc->u.narrow[2 * 0 + 1]) +#define a10l (kc->u.narrow[2 * 1 + 0]) +#define a10h (kc->u.narrow[2 * 
1 + 1]) +#define a20l (kc->u.narrow[2 * 2 + 0]) +#define a20h (kc->u.narrow[2 * 2 + 1]) +#define a30l (kc->u.narrow[2 * 3 + 0]) +#define a30h (kc->u.narrow[2 * 3 + 1]) +#define a40l (kc->u.narrow[2 * 4 + 0]) +#define a40h (kc->u.narrow[2 * 4 + 1]) +#define a01l (kc->u.narrow[2 * 5 + 0]) +#define a01h (kc->u.narrow[2 * 5 + 1]) +#define a11l (kc->u.narrow[2 * 6 + 0]) +#define a11h (kc->u.narrow[2 * 6 + 1]) +#define a21l (kc->u.narrow[2 * 7 + 0]) +#define a21h (kc->u.narrow[2 * 7 + 1]) +#define a31l (kc->u.narrow[2 * 8 + 0]) +#define a31h (kc->u.narrow[2 * 8 + 1]) +#define a41l (kc->u.narrow[2 * 9 + 0]) +#define a41h (kc->u.narrow[2 * 9 + 1]) +#define a02l (kc->u.narrow[2 * 10 + 0]) +#define a02h (kc->u.narrow[2 * 10 + 1]) +#define a12l (kc->u.narrow[2 * 11 + 0]) +#define a12h (kc->u.narrow[2 * 11 + 1]) +#define a22l (kc->u.narrow[2 * 12 + 0]) +#define a22h (kc->u.narrow[2 * 12 + 1]) +#define a32l (kc->u.narrow[2 * 13 + 0]) +#define a32h (kc->u.narrow[2 * 13 + 1]) +#define a42l (kc->u.narrow[2 * 14 + 0]) +#define a42h (kc->u.narrow[2 * 14 + 1]) +#define a03l (kc->u.narrow[2 * 15 + 0]) +#define a03h (kc->u.narrow[2 * 15 + 1]) +#define a13l (kc->u.narrow[2 * 16 + 0]) +#define a13h (kc->u.narrow[2 * 16 + 1]) +#define a23l (kc->u.narrow[2 * 17 + 0]) +#define a23h (kc->u.narrow[2 * 17 + 1]) +#define a33l (kc->u.narrow[2 * 18 + 0]) +#define a33h (kc->u.narrow[2 * 18 + 1]) +#define a43l (kc->u.narrow[2 * 19 + 0]) +#define a43h (kc->u.narrow[2 * 19 + 1]) +#define a04l (kc->u.narrow[2 * 20 + 0]) +#define a04h (kc->u.narrow[2 * 20 + 1]) +#define a14l (kc->u.narrow[2 * 21 + 0]) +#define a14h (kc->u.narrow[2 * 21 + 1]) +#define a24l (kc->u.narrow[2 * 22 + 0]) +#define a24h (kc->u.narrow[2 * 22 + 1]) +#define a34l (kc->u.narrow[2 * 23 + 0]) +#define a34h (kc->u.narrow[2 * 23 + 1]) +#define a44l (kc->u.narrow[2 * 24 + 0]) +#define a44h (kc->u.narrow[2 * 24 + 1]) + +#define DECL_STATE +#define READ_STATE(state) +#define WRITE_STATE(state) + +#define INPUT_BUF(size) do { \ + size_t 
j; \ + for (j = 0; j < (size); j += 8) { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + j + 0); \ + th = sph_dec32le_aligned(buf + j + 4); \ + INTERLEAVE(tl, th); \ + kc->u.narrow[(j >> 2) + 0] ^= tl; \ + kc->u.narrow[(j >> 2) + 1] ^= th; \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ + sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ + sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ + sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ + sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; + +#define READ_STATE(state) do { \ + a00l = (state)->u.narrow[2 * 0 + 0]; \ + a00h = (state)->u.narrow[2 * 0 + 1]; \ + a10l = (state)->u.narrow[2 * 1 + 0]; \ + a10h = (state)->u.narrow[2 * 1 + 1]; \ + a20l = (state)->u.narrow[2 * 2 + 0]; \ + a20h = (state)->u.narrow[2 * 2 + 1]; \ + a30l = (state)->u.narrow[2 * 3 + 0]; \ + a30h = (state)->u.narrow[2 * 3 + 1]; \ + a40l = (state)->u.narrow[2 * 4 + 0]; \ + a40h = (state)->u.narrow[2 * 4 + 1]; \ + a01l = (state)->u.narrow[2 * 5 + 0]; \ + a01h = (state)->u.narrow[2 * 5 + 1]; \ + a11l = (state)->u.narrow[2 * 6 + 0]; \ + a11h = (state)->u.narrow[2 * 6 + 1]; \ + a21l = (state)->u.narrow[2 * 7 + 0]; \ + a21h = (state)->u.narrow[2 * 7 + 1]; \ + a31l = (state)->u.narrow[2 * 8 + 0]; \ + a31h = (state)->u.narrow[2 * 8 + 1]; \ + a41l = (state)->u.narrow[2 * 9 + 0]; \ + a41h = (state)->u.narrow[2 * 9 + 1]; \ + a02l = (state)->u.narrow[2 * 10 + 0]; \ + a02h = (state)->u.narrow[2 * 10 + 1]; \ + a12l = (state)->u.narrow[2 * 11 + 0]; \ + a12h = (state)->u.narrow[2 * 11 + 1]; \ + a22l = (state)->u.narrow[2 * 12 + 0]; \ + a22h = (state)->u.narrow[2 * 12 + 1]; \ + a32l = (state)->u.narrow[2 * 13 + 0]; \ + a32h = (state)->u.narrow[2 * 13 + 1]; \ 
+ a42l = (state)->u.narrow[2 * 14 + 0]; \ + a42h = (state)->u.narrow[2 * 14 + 1]; \ + a03l = (state)->u.narrow[2 * 15 + 0]; \ + a03h = (state)->u.narrow[2 * 15 + 1]; \ + a13l = (state)->u.narrow[2 * 16 + 0]; \ + a13h = (state)->u.narrow[2 * 16 + 1]; \ + a23l = (state)->u.narrow[2 * 17 + 0]; \ + a23h = (state)->u.narrow[2 * 17 + 1]; \ + a33l = (state)->u.narrow[2 * 18 + 0]; \ + a33h = (state)->u.narrow[2 * 18 + 1]; \ + a43l = (state)->u.narrow[2 * 19 + 0]; \ + a43h = (state)->u.narrow[2 * 19 + 1]; \ + a04l = (state)->u.narrow[2 * 20 + 0]; \ + a04h = (state)->u.narrow[2 * 20 + 1]; \ + a14l = (state)->u.narrow[2 * 21 + 0]; \ + a14h = (state)->u.narrow[2 * 21 + 1]; \ + a24l = (state)->u.narrow[2 * 22 + 0]; \ + a24h = (state)->u.narrow[2 * 22 + 1]; \ + a34l = (state)->u.narrow[2 * 23 + 0]; \ + a34h = (state)->u.narrow[2 * 23 + 1]; \ + a44l = (state)->u.narrow[2 * 24 + 0]; \ + a44h = (state)->u.narrow[2 * 24 + 1]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.narrow[2 * 0 + 0] = a00l; \ + (state)->u.narrow[2 * 0 + 1] = a00h; \ + (state)->u.narrow[2 * 1 + 0] = a10l; \ + (state)->u.narrow[2 * 1 + 1] = a10h; \ + (state)->u.narrow[2 * 2 + 0] = a20l; \ + (state)->u.narrow[2 * 2 + 1] = a20h; \ + (state)->u.narrow[2 * 3 + 0] = a30l; \ + (state)->u.narrow[2 * 3 + 1] = a30h; \ + (state)->u.narrow[2 * 4 + 0] = a40l; \ + (state)->u.narrow[2 * 4 + 1] = a40h; \ + (state)->u.narrow[2 * 5 + 0] = a01l; \ + (state)->u.narrow[2 * 5 + 1] = a01h; \ + (state)->u.narrow[2 * 6 + 0] = a11l; \ + (state)->u.narrow[2 * 6 + 1] = a11h; \ + (state)->u.narrow[2 * 7 + 0] = a21l; \ + (state)->u.narrow[2 * 7 + 1] = a21h; \ + (state)->u.narrow[2 * 8 + 0] = a31l; \ + (state)->u.narrow[2 * 8 + 1] = a31h; \ + (state)->u.narrow[2 * 9 + 0] = a41l; \ + (state)->u.narrow[2 * 9 + 1] = a41h; \ + (state)->u.narrow[2 * 10 + 0] = a02l; \ + (state)->u.narrow[2 * 10 + 1] = a02h; \ + (state)->u.narrow[2 * 11 + 0] = a12l; \ + (state)->u.narrow[2 * 11 + 1] = a12h; \ + (state)->u.narrow[2 * 12 + 0] = 
a22l; \ + (state)->u.narrow[2 * 12 + 1] = a22h; \ + (state)->u.narrow[2 * 13 + 0] = a32l; \ + (state)->u.narrow[2 * 13 + 1] = a32h; \ + (state)->u.narrow[2 * 14 + 0] = a42l; \ + (state)->u.narrow[2 * 14 + 1] = a42h; \ + (state)->u.narrow[2 * 15 + 0] = a03l; \ + (state)->u.narrow[2 * 15 + 1] = a03h; \ + (state)->u.narrow[2 * 16 + 0] = a13l; \ + (state)->u.narrow[2 * 16 + 1] = a13h; \ + (state)->u.narrow[2 * 17 + 0] = a23l; \ + (state)->u.narrow[2 * 17 + 1] = a23h; \ + (state)->u.narrow[2 * 18 + 0] = a33l; \ + (state)->u.narrow[2 * 18 + 1] = a33h; \ + (state)->u.narrow[2 * 19 + 0] = a43l; \ + (state)->u.narrow[2 * 19 + 1] = a43h; \ + (state)->u.narrow[2 * 20 + 0] = a04l; \ + (state)->u.narrow[2 * 20 + 1] = a04h; \ + (state)->u.narrow[2 * 21 + 0] = a14l; \ + (state)->u.narrow[2 * 21 + 1] = a14h; \ + (state)->u.narrow[2 * 22 + 0] = a24l; \ + (state)->u.narrow[2 * 22 + 1] = a24h; \ + (state)->u.narrow[2 * 23 + 0] = a34l; \ + (state)->u.narrow[2 * 23 + 1] = a34h; \ + (state)->u.narrow[2 * 24 + 0] = a44l; \ + (state)->u.narrow[2 * 24 + 1] = a44h; \ + } while (0) + +#define READ64(d, off) do { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + (off)); \ + th = sph_dec32le_aligned(buf + (off) + 4); \ + INTERLEAVE(tl, th); \ + d ## l ^= tl; \ + d ## h ^= th; \ + } while (0) + +#define INPUT_BUF144 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + READ64(a23, 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + 
READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + if ((lim) == 72) \ + break; \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + if ((lim) == 104) \ + break; \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + if ((lim) == 136) \ + break; \ + READ64(a23, 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x ## l, x ## h +#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) +#define XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) +#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) +#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) +#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) +#define ROL64(d, v, n) ROL64_ ## n(d, v) + +#if SPH_KECCAK_INTERLEAVE + +#define ROL64_odd1(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd63(d, v) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + 
+#define ROL64_odd(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ + d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_even(d, v, n) do { \ + d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + } while (0) + +#define ROL64_0(d, v) +#define ROL64_1(d, v) ROL64_odd1(d, v) +#define ROL64_2(d, v) ROL64_even(d, v, 1) +#define ROL64_3(d, v) ROL64_odd( d, v, 2) +#define ROL64_4(d, v) ROL64_even(d, v, 2) +#define ROL64_5(d, v) ROL64_odd( d, v, 3) +#define ROL64_6(d, v) ROL64_even(d, v, 3) +#define ROL64_7(d, v) ROL64_odd( d, v, 4) +#define ROL64_8(d, v) ROL64_even(d, v, 4) +#define ROL64_9(d, v) ROL64_odd( d, v, 5) +#define ROL64_10(d, v) ROL64_even(d, v, 5) +#define ROL64_11(d, v) ROL64_odd( d, v, 6) +#define ROL64_12(d, v) ROL64_even(d, v, 6) +#define ROL64_13(d, v) ROL64_odd( d, v, 7) +#define ROL64_14(d, v) ROL64_even(d, v, 7) +#define ROL64_15(d, v) ROL64_odd( d, v, 8) +#define ROL64_16(d, v) ROL64_even(d, v, 8) +#define ROL64_17(d, v) ROL64_odd( d, v, 9) +#define ROL64_18(d, v) ROL64_even(d, v, 9) +#define ROL64_19(d, v) ROL64_odd( d, v, 10) +#define ROL64_20(d, v) ROL64_even(d, v, 10) +#define ROL64_21(d, v) ROL64_odd( d, v, 11) +#define ROL64_22(d, v) ROL64_even(d, v, 11) +#define ROL64_23(d, v) ROL64_odd( d, v, 12) +#define ROL64_24(d, v) ROL64_even(d, v, 12) +#define ROL64_25(d, v) ROL64_odd( d, v, 13) +#define ROL64_26(d, v) ROL64_even(d, v, 13) +#define ROL64_27(d, v) ROL64_odd( d, v, 14) +#define ROL64_28(d, v) ROL64_even(d, v, 14) +#define ROL64_29(d, v) ROL64_odd( d, v, 15) +#define ROL64_30(d, v) ROL64_even(d, v, 15) +#define ROL64_31(d, v) ROL64_odd( d, v, 16) +#define ROL64_32(d, v) ROL64_even(d, v, 16) +#define ROL64_33(d, v) ROL64_odd( d, v, 17) +#define ROL64_34(d, v) ROL64_even(d, v, 17) +#define ROL64_35(d, v) ROL64_odd( d, v, 18) +#define ROL64_36(d, v) ROL64_even(d, v, 18) +#define 
ROL64_37(d, v) ROL64_odd( d, v, 19) +#define ROL64_38(d, v) ROL64_even(d, v, 19) +#define ROL64_39(d, v) ROL64_odd( d, v, 20) +#define ROL64_40(d, v) ROL64_even(d, v, 20) +#define ROL64_41(d, v) ROL64_odd( d, v, 21) +#define ROL64_42(d, v) ROL64_even(d, v, 21) +#define ROL64_43(d, v) ROL64_odd( d, v, 22) +#define ROL64_44(d, v) ROL64_even(d, v, 22) +#define ROL64_45(d, v) ROL64_odd( d, v, 23) +#define ROL64_46(d, v) ROL64_even(d, v, 23) +#define ROL64_47(d, v) ROL64_odd( d, v, 24) +#define ROL64_48(d, v) ROL64_even(d, v, 24) +#define ROL64_49(d, v) ROL64_odd( d, v, 25) +#define ROL64_50(d, v) ROL64_even(d, v, 25) +#define ROL64_51(d, v) ROL64_odd( d, v, 26) +#define ROL64_52(d, v) ROL64_even(d, v, 26) +#define ROL64_53(d, v) ROL64_odd( d, v, 27) +#define ROL64_54(d, v) ROL64_even(d, v, 27) +#define ROL64_55(d, v) ROL64_odd( d, v, 28) +#define ROL64_56(d, v) ROL64_even(d, v, 28) +#define ROL64_57(d, v) ROL64_odd( d, v, 29) +#define ROL64_58(d, v) ROL64_even(d, v, 29) +#define ROL64_59(d, v) ROL64_odd( d, v, 30) +#define ROL64_60(d, v) ROL64_even(d, v, 30) +#define ROL64_61(d, v) ROL64_odd( d, v, 31) +#define ROL64_62(d, v) ROL64_even(d, v, 31) +#define ROL64_63(d, v) ROL64_odd63(d, v) + +#else + +#define ROL64_small(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ + d ## l = tmp; \ + } while (0) + +#define ROL64_0(d, v) 0 +#define ROL64_1(d, v) ROL64_small(d, v, 1) +#define ROL64_2(d, v) ROL64_small(d, v, 2) +#define ROL64_3(d, v) ROL64_small(d, v, 3) +#define ROL64_4(d, v) ROL64_small(d, v, 4) +#define ROL64_5(d, v) ROL64_small(d, v, 5) +#define ROL64_6(d, v) ROL64_small(d, v, 6) +#define ROL64_7(d, v) ROL64_small(d, v, 7) +#define ROL64_8(d, v) ROL64_small(d, v, 8) +#define ROL64_9(d, v) ROL64_small(d, v, 9) +#define ROL64_10(d, v) ROL64_small(d, v, 10) +#define ROL64_11(d, v) ROL64_small(d, v, 11) +#define ROL64_12(d, v) ROL64_small(d, v, 12) +#define ROL64_13(d, v) 
ROL64_small(d, v, 13) +#define ROL64_14(d, v) ROL64_small(d, v, 14) +#define ROL64_15(d, v) ROL64_small(d, v, 15) +#define ROL64_16(d, v) ROL64_small(d, v, 16) +#define ROL64_17(d, v) ROL64_small(d, v, 17) +#define ROL64_18(d, v) ROL64_small(d, v, 18) +#define ROL64_19(d, v) ROL64_small(d, v, 19) +#define ROL64_20(d, v) ROL64_small(d, v, 20) +#define ROL64_21(d, v) ROL64_small(d, v, 21) +#define ROL64_22(d, v) ROL64_small(d, v, 22) +#define ROL64_23(d, v) ROL64_small(d, v, 23) +#define ROL64_24(d, v) ROL64_small(d, v, 24) +#define ROL64_25(d, v) ROL64_small(d, v, 25) +#define ROL64_26(d, v) ROL64_small(d, v, 26) +#define ROL64_27(d, v) ROL64_small(d, v, 27) +#define ROL64_28(d, v) ROL64_small(d, v, 28) +#define ROL64_29(d, v) ROL64_small(d, v, 29) +#define ROL64_30(d, v) ROL64_small(d, v, 30) +#define ROL64_31(d, v) ROL64_small(d, v, 31) + +#define ROL64_32(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_big(d, v, n) do { \ + sph_u32 trl, trh; \ + ROL64_small(tr, v, n); \ + d ## h = trl; \ + d ## l = trh; \ + } while (0) + +#define ROL64_33(d, v) ROL64_big(d, v, 1) +#define ROL64_34(d, v) ROL64_big(d, v, 2) +#define ROL64_35(d, v) ROL64_big(d, v, 3) +#define ROL64_36(d, v) ROL64_big(d, v, 4) +#define ROL64_37(d, v) ROL64_big(d, v, 5) +#define ROL64_38(d, v) ROL64_big(d, v, 6) +#define ROL64_39(d, v) ROL64_big(d, v, 7) +#define ROL64_40(d, v) ROL64_big(d, v, 8) +#define ROL64_41(d, v) ROL64_big(d, v, 9) +#define ROL64_42(d, v) ROL64_big(d, v, 10) +#define ROL64_43(d, v) ROL64_big(d, v, 11) +#define ROL64_44(d, v) ROL64_big(d, v, 12) +#define ROL64_45(d, v) ROL64_big(d, v, 13) +#define ROL64_46(d, v) ROL64_big(d, v, 14) +#define ROL64_47(d, v) ROL64_big(d, v, 15) +#define ROL64_48(d, v) ROL64_big(d, v, 16) +#define ROL64_49(d, v) ROL64_big(d, v, 17) +#define ROL64_50(d, v) ROL64_big(d, v, 18) +#define ROL64_51(d, v) ROL64_big(d, v, 19) +#define ROL64_52(d, v) ROL64_big(d, v, 20) +#define 
ROL64_53(d, v) ROL64_big(d, v, 21) +#define ROL64_54(d, v) ROL64_big(d, v, 22) +#define ROL64_55(d, v) ROL64_big(d, v, 23) +#define ROL64_56(d, v) ROL64_big(d, v, 24) +#define ROL64_57(d, v) ROL64_big(d, v, 25) +#define ROL64_58(d, v) ROL64_big(d, v, 26) +#define ROL64_59(d, v) ROL64_big(d, v, 27) +#define ROL64_60(d, v) ROL64_big(d, v, 28) +#define ROL64_61(d, v) ROL64_big(d, v, 29) +#define ROL64_62(d, v) ROL64_big(d, v, 30) +#define ROL64_63(d, v) ROL64_big(d, v, 31) + +#endif + +#define XOR64_IOTA(d, s, k) \ + (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) + +#endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ 
+ XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. 
+ */ + +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, 
a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, 
a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#define P1_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a30); \ + MOV64(a30, a33); \ + MOV64(a33, a23); \ + MOV64(a23, a12); \ + MOV64(a12, a21); \ + MOV64(a21, a02); \ + MOV64(a02, a10); \ + MOV64(a10, a11); \ + MOV64(a11, a41); \ + MOV64(a41, a24); \ + MOV64(a24, a42); \ + MOV64(a42, a04); \ + MOV64(a04, a20); \ + MOV64(a20, a22); \ + MOV64(a22, a32); \ + MOV64(a32, a43); \ + MOV64(a43, a34); \ + MOV64(a34, a03); \ + MOV64(a03, a40); \ + MOV64(a40, a44); \ + MOV64(a44, a14); \ + MOV64(a14, a31); \ + MOV64(a31, a13); \ + MOV64(a13, t); \ + } while (0) + +#define P2_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a33); \ + MOV64(a33, a12); \ + MOV64(a12, a02); \ + 
MOV64(a02, a11); \ + MOV64(a11, a24); \ + MOV64(a24, a04); \ + MOV64(a04, a22); \ + MOV64(a22, a43); \ + MOV64(a43, a03); \ + MOV64(a03, a44); \ + MOV64(a44, a31); \ + MOV64(a31, t); \ + MOV64(t, a10); \ + MOV64(a10, a41); \ + MOV64(a41, a42); \ + MOV64(a42, a20); \ + MOV64(a20, a32); \ + MOV64(a32, a34); \ + MOV64(a34, a40); \ + MOV64(a40, a14); \ + MOV64(a14, a13); \ + MOV64(a13, a30); \ + MOV64(a30, a23); \ + MOV64(a23, a21); \ + MOV64(a21, t); \ + } while (0) + +#define P4_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a12); \ + MOV64(a12, a11); \ + MOV64(a11, a04); \ + MOV64(a04, a43); \ + MOV64(a43, a44); \ + MOV64(a44, t); \ + MOV64(t, a02); \ + MOV64(a02, a24); \ + MOV64(a24, a22); \ + MOV64(a22, a03); \ + MOV64(a03, a31); \ + MOV64(a31, a33); \ + MOV64(a33, t); \ + MOV64(t, a10); \ + MOV64(a10, a42); \ + MOV64(a42, a32); \ + MOV64(a32, a40); \ + MOV64(a40, a13); \ + MOV64(a13, a23); \ + MOV64(a23, t); \ + MOV64(t, a14); \ + MOV64(a14, a30); \ + MOV64(a30, a21); \ + MOV64(a21, a41); \ + MOV64(a41, a20); \ + MOV64(a20, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P6_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a02); \ + MOV64(a02, a04); \ + MOV64(a04, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a20); \ + MOV64(a20, a40); \ + MOV64(a40, a30); \ + MOV64(a30, t); \ + MOV64(t, a11); \ + MOV64(a11, a22); \ + MOV64(a22, a44); \ + MOV64(a44, a33); \ + MOV64(a33, t); \ + MOV64(t, a12); \ + MOV64(a12, a24); \ + MOV64(a24, a43); \ + MOV64(a43, a31); \ + MOV64(a31, t); \ + MOV64(t, a13); \ + MOV64(a13, a21); \ + MOV64(a21, a42); \ + MOV64(a42, a34); \ + MOV64(a34, t); \ + MOV64(t, a14); \ + MOV64(a14, a23); \ + MOV64(a23, a41); \ + MOV64(a41, a32); \ + MOV64(a32, t); \ + } while (0) + +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ 
+ MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P12_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a04); \ + MOV64(a04, t); \ + MOV64(t, a02); \ + MOV64(a02, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a40); \ + MOV64(a40, t); \ + MOV64(t, a11); \ + MOV64(a11, a44); \ + MOV64(a44, t); \ + MOV64(t, a12); \ + MOV64(a12, a43); \ + MOV64(a43, t); \ + MOV64(t, a13); \ + MOV64(a13, a42); \ + MOV64(a42, t); \ + MOV64(t, a14); \ + MOV64(a14, a41); \ + MOV64(a41, t); \ + MOV64(t, a20); \ + MOV64(a20, a30); \ + MOV64(a30, t); \ + MOV64(t, a21); \ + MOV64(a21, a34); \ + MOV64(a34, t); \ + MOV64(t, a22); \ + MOV64(a22, a33); \ + MOV64(a33, t); \ + MOV64(t, a23); \ + MOV64(a23, a32); \ + MOV64(a32, t); \ + MOV64(t, a24); \ + MOV64(a24, a31); \ + MOV64(a31, t); \ + } while (0) + +#define LPAR ( +#define RPAR ) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + +#define DO(x) x + +#define KECCAK_F_1600 DO(KECCAK_F_1600_) + +#if SPH_KECCAK_UNROLL == 1 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j ++) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + P1_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 2 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 2) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + P2_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 4 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 4) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); 
\ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + P4_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 6 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 6) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + P6_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 8 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 8) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + P8_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 12 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 12) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + KF_ELT( 8, 9, RC[j + 8]); \ + KF_ELT( 9, 10, RC[j + 9]); \ + KF_ELT(10, 11, RC[j + 10]); \ + KF_ELT(11, 12, RC[j + 11]); \ + P12_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 0 + +#define KECCAK_F_1600_ do { \ + KF_ELT( 0, 1, RC[ 0]); \ + KF_ELT( 1, 2, RC[ 1]); \ + KF_ELT( 2, 3, RC[ 2]); \ + KF_ELT( 3, 4, RC[ 3]); \ + KF_ELT( 4, 5, RC[ 4]); \ + KF_ELT( 5, 6, RC[ 5]); \ + KF_ELT( 6, 7, RC[ 6]); \ + KF_ELT( 7, 8, RC[ 7]); \ + KF_ELT( 8, 9, RC[ 8]); \ + KF_ELT( 9, 10, RC[ 9]); \ + KF_ELT(10, 11, RC[10]); \ + KF_ELT(11, 12, RC[11]); \ + KF_ELT(12, 13, RC[12]); \ + KF_ELT(13, 14, RC[13]); \ + KF_ELT(14, 15, RC[14]); \ + KF_ELT(15, 16, RC[15]); \ + KF_ELT(16, 17, RC[16]); \ + KF_ELT(17, 18, RC[17]); \ + KF_ELT(18, 19, RC[18]); \ + KF_ELT(19, 20, RC[19]); \ + KF_ELT(20, 21, RC[20]); \ + KF_ELT(21, 22, RC[21]); \ + 
KF_ELT(22, 23, RC[22]); \ + KF_ELT(23, 0, RC[23]); \ + } while (0) + +#else + +#error Unimplemented unroll count for Keccak. + +#endif + +static void +keccak_init(sph_keccak_context *kc, unsigned out_size) +{ + int i; + +#if SPH_KECCAK_64 + for (i = 0; i < 25; i ++) + kc->u.wide[i] = 0; + /* + * Initialization for the "lane complement". + */ + kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); +#else + + for (i = 0; i < 50; i ++) + kc->u.narrow[i] = 0; + /* + * Initialization for the "lane complement". + * Note: since we set to all-one full 64-bit words, + * interleaving (if applicable) is a no-op. + */ + kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); +#endif + kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if (len < (lim - ptr)) { + memcpy(buf + ptr, data, len); + kc->ptr = ptr + len; + return; + } + + READ_STATE(kc); + while (len > 0) { + size_t clen; + + clen = (lim - ptr); + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == lim) { + INPUT_BUF(lim); + KECCAK_F_1600; + ptr = 0; + } + } + 
WRITE_STATE(kc); + kc->ptr = ptr; +} + +#if SPH_KECCAK_64 + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ + kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ + kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ + kc->u.wide[12] = ~kc->u.wide[12]; \ + kc->u.wide[17] = ~kc->u.wide[17]; \ + kc->u.wide[20] = ~kc->u.wide[20]; \ + for (j = 0; j < d; j += 8) \ + sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#else + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ + kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ + kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ + kc->u.narrow[ 
5] = ~kc->u.narrow[ 5]; \ + kc->u.narrow[16] = ~kc->u.narrow[16]; \ + kc->u.narrow[17] = ~kc->u.narrow[17]; \ + kc->u.narrow[24] = ~kc->u.narrow[24]; \ + kc->u.narrow[25] = ~kc->u.narrow[25]; \ + kc->u.narrow[34] = ~kc->u.narrow[34]; \ + kc->u.narrow[35] = ~kc->u.narrow[35]; \ + kc->u.narrow[40] = ~kc->u.narrow[40]; \ + kc->u.narrow[41] = ~kc->u.narrow[41]; \ + /* un-interleave */ \ + for (j = 0; j < 50; j += 2) \ + UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \ + for (j = 0; j < d; j += 4) \ + sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#endif + +DEFCLOSE(28, 144) +DEFCLOSE(32, 136) +DEFCLOSE(48, 104) +DEFCLOSE(64, 72) + +/* see sph_keccak.h */ +void +sph_keccak224_init(void *cc) +{ + keccak_init(cc, 224); +} + +/* see sph_keccak.h */ +void +sph_keccak224(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 144); +} + +/* see sph_keccak.h */ +void +sph_keccak224_close(void *cc, void *dst) +{ + sph_keccak224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close28(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_init(void *cc) +{ + keccak_init(cc, 256); +} + +/* see sph_keccak.h */ +void +sph_keccak256(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 136); +} + +/* see sph_keccak.h */ +void +sph_keccak256_close(void *cc, void *dst) +{ + sph_keccak256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close32(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_init(void *cc) +{ + keccak_init(cc, 384); +} + +/* see sph_keccak.h */ +void +sph_keccak384(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 104); +} + +/* see sph_keccak.h */ +void 
+sph_keccak384_close(void *cc, void *dst) +{ + sph_keccak384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close48(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_init(void *cc) +{ + keccak_init(cc, 512); +} + +/* see sph_keccak.h */ +void +sph_keccak512(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 72); +} + +/* see sph_keccak.h */ +void +sph_keccak512_close(void *cc, void *dst) +{ + sph_keccak512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close64(cc, ub, n, dst); +} + + +#ifdef __cplusplus +} +#endif diff --git a/sph/sph_blake.h b/sph/sph_blake.h index 0fc4295..d8d7943 100644 --- a/sph/sph_blake.h +++ b/sph/sph_blake.h @@ -1,327 +1,327 @@ -/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */ -/** - * BLAKE interface. BLAKE is a family of functions which differ by their - * output size; this implementation defines BLAKE for output sizes 224, - * 256, 384 and 512 bits. This implementation conforms to the "third - * round" specification. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_blake.h - * @author Thomas Pornin - */ - -#ifndef SPH_BLAKE_H__ -#define SPH_BLAKE_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -/** - * Output size (in bits) for BLAKE-224. - */ -#define SPH_SIZE_blake224 224 - -/** - * Output size (in bits) for BLAKE-256. - */ -#define SPH_SIZE_blake256 256 - -#if SPH_64 - -/** - * Output size (in bits) for BLAKE-384. - */ -#define SPH_SIZE_blake384 384 - -/** - * Output size (in bits) for BLAKE-512. - */ -#define SPH_SIZE_blake512 512 - -#endif - -/** - * This structure is a context for BLAKE-224 and BLAKE-256 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a BLAKE computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running BLAKE - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - sph_u32 H[8]; - sph_u32 S[4]; - sph_u32 T0, T1; -#endif -} sph_blake_small_context; - -/** - * This structure is a context for BLAKE-224 computations. It is - * identical to the common sph_blake_small_context. - */ -typedef sph_blake_small_context sph_blake224_context; - -/** - * This structure is a context for BLAKE-256 computations. 
It is - * identical to the common sph_blake_small_context. - */ -typedef sph_blake_small_context sph_blake256_context; - -#if SPH_64 - -/** - * This structure is a context for BLAKE-384 and BLAKE-512 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a BLAKE computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running BLAKE - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[128]; /* first field, for alignment */ - size_t ptr; - sph_u64 H[8]; - sph_u64 S[4]; - sph_u64 T0, T1; -#endif -} sph_blake_big_context; - -/** - * This structure is a context for BLAKE-384 computations. It is - * identical to the common sph_blake_small_context. - */ -typedef sph_blake_big_context sph_blake384_context; - -/** - * This structure is a context for BLAKE-512 computations. It is - * identical to the common sph_blake_small_context. - */ -typedef sph_blake_big_context sph_blake512_context; - -#endif - -/** - * Initialize a BLAKE-224 context. This process performs no memory allocation. - * - * @param cc the BLAKE-224 context (pointer to a - * sph_blake224_context) - */ -void sph_blake224_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the BLAKE-224 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_blake224(void *cc, const void *data, size_t len); - -/** - * Terminate the current BLAKE-224 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (28 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the BLAKE-224 context - * @param dst the destination buffer - */ -void sph_blake224_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (28 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the BLAKE-224 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_blake224_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a BLAKE-256 context. This process performs no memory allocation. - * - * @param cc the BLAKE-256 context (pointer to a - * sph_blake256_context) - */ -void sph_blake256_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the BLAKE-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_blake256(void *cc, const void *data, size_t len); - -/** - * Terminate the current BLAKE-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the BLAKE-256 context - * @param dst the destination buffer - */ -void sph_blake256_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (32 bytes). 
If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the BLAKE-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_blake256_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#if SPH_64 - -/** - * Initialize a BLAKE-384 context. This process performs no memory allocation. - * - * @param cc the BLAKE-384 context (pointer to a - * sph_blake384_context) - */ -void sph_blake384_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the BLAKE-384 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_blake384(void *cc, const void *data, size_t len); - -/** - * Terminate the current BLAKE-384 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (48 bytes). The context is automatically - * reinitialized. - * - * @param cc the BLAKE-384 context - * @param dst the destination buffer - */ -void sph_blake384_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (48 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. 
- * - * @param cc the BLAKE-384 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_blake384_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a BLAKE-512 context. This process performs no memory allocation. - * - * @param cc the BLAKE-512 context (pointer to a - * sph_blake512_context) - */ -void sph_blake512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the BLAKE-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_blake512(void *cc, const void *data, size_t len); - -/** - * Terminate the current BLAKE-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the BLAKE-512 context - * @param dst the destination buffer - */ -void sph_blake512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the BLAKE-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_blake512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#endif - -#ifdef __cplusplus -} -#endif - -#endif +/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */ +/** + * BLAKE interface. 
BLAKE is a family of functions which differ by their + * output size; this implementation defines BLAKE for output sizes 224, + * 256, 384 and 512 bits. This implementation conforms to the "third + * round" specification. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_blake.h + * @author Thomas Pornin + */ + +#ifndef SPH_BLAKE_H__ +#define SPH_BLAKE_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for BLAKE-224. + */ +#define SPH_SIZE_blake224 224 + +/** + * Output size (in bits) for BLAKE-256. + */ +#define SPH_SIZE_blake256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BLAKE-384. 
+ */ +#define SPH_SIZE_blake384 384 + +/** + * Output size (in bits) for BLAKE-512. + */ +#define SPH_SIZE_blake512 512 + +#endif + +/** + * This structure is a context for BLAKE-224 and BLAKE-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[8]; + sph_u32 S[4]; + sph_u32 T0, T1; +#endif +} sph_blake_small_context; + +/** + * This structure is a context for BLAKE-224 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_small_context sph_blake224_context; + +/** + * This structure is a context for BLAKE-256 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_small_context sph_blake256_context; + +#if SPH_64 + +/** + * This structure is a context for BLAKE-384 and BLAKE-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[8]; + sph_u64 S[4]; + sph_u64 T0, T1; +#endif +} sph_blake_big_context; + +/** + * This structure is a context for BLAKE-384 computations. It is + * identical to the common sph_blake_small_context. 
+ */ +typedef sph_blake_big_context sph_blake384_context; + +/** + * This structure is a context for BLAKE-512 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_big_context sph_blake512_context; + +#endif + +/** + * Initialize a BLAKE-224 context. This process performs no memory allocation. + * + * @param cc the BLAKE-224 context (pointer to a + * sph_blake224_context) + */ +void sph_blake224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-224 context + * @param dst the destination buffer + */ +void sph_blake224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-256 context. This process performs no memory allocation. 
+ * + * @param cc the BLAKE-256 context (pointer to a + * sph_blake256_context) + */ +void sph_blake256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-256 context + * @param dst the destination buffer + */ +void sph_blake256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BLAKE-384 context. This process performs no memory allocation. + * + * @param cc the BLAKE-384 context (pointer to a + * sph_blake384_context) + */ +void sph_blake384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
+ * + * @param cc the BLAKE-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-384 context + * @param dst the destination buffer + */ +void sph_blake384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-512 context. This process performs no memory allocation. + * + * @param cc the BLAKE-512 context (pointer to a + * sph_blake512_context) + */ +void sph_blake512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the BLAKE-512 context + * @param dst the destination buffer + */ +void sph_blake512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_groestl.h b/sph/sph_groestl.h index a997431..495f05e 100644 --- a/sph/sph_groestl.h +++ b/sph/sph_groestl.h @@ -1,329 +1,329 @@ -/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Groestl interface. This code implements Groestl with the recommended - * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_groestl.h - * @author Thomas Pornin - */ - -#ifndef SPH_GROESTL_H__ -#define SPH_GROESTL_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -/** - * Output size (in bits) for Groestl-224. - */ -#define SPH_SIZE_groestl224 224 - -/** - * Output size (in bits) for Groestl-256. - */ -#define SPH_SIZE_groestl256 256 - -/** - * Output size (in bits) for Groestl-384. - */ -#define SPH_SIZE_groestl384 384 - -/** - * Output size (in bits) for Groestl-512. - */ -#define SPH_SIZE_groestl512 512 - -/** - * This structure is a context for Groestl-224 and Groestl-256 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a Groestl computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running Groestl - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ - size_t ptr; - union { -#if SPH_64 - sph_u64 wide[8]; -#endif - sph_u32 narrow[16]; - } state; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_groestl_small_context; - -/** - * This structure is a context for Groestl-224 computations. It is - * identical to the common sph_groestl_small_context. 
- */ -typedef sph_groestl_small_context sph_groestl224_context; - -/** - * This structure is a context for Groestl-256 computations. It is - * identical to the common sph_groestl_small_context. - */ -typedef sph_groestl_small_context sph_groestl256_context; - -/** - * This structure is a context for Groestl-384 and Groestl-512 computations: - * it contains the intermediate values and some data from the last - * entered block. Once a Groestl computation has been performed, the - * context can be reused for another computation. - * - * The contents of this structure are private. A running Groestl - * computation can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[128]; /* first field, for alignment */ - size_t ptr; - union { -#if SPH_64 - sph_u64 wide[16]; -#endif - sph_u32 narrow[32]; - } state; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif -#endif -} sph_groestl_big_context; - -/** - * This structure is a context for Groestl-384 computations. It is - * identical to the common sph_groestl_small_context. - */ -typedef sph_groestl_big_context sph_groestl384_context; - -/** - * This structure is a context for Groestl-512 computations. It is - * identical to the common sph_groestl_small_context. - */ -typedef sph_groestl_big_context sph_groestl512_context; - -/** - * Initialize a Groestl-224 context. This process performs no memory allocation. - * - * @param cc the Groestl-224 context (pointer to a - * sph_groestl224_context) - */ -void sph_groestl224_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). 
- * - * @param cc the Groestl-224 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_groestl224(void *cc, const void *data, size_t len); - -/** - * Terminate the current Groestl-224 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (28 bytes). The context is automatically - * reinitialized. - * - * @param cc the Groestl-224 context - * @param dst the destination buffer - */ -void sph_groestl224_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (28 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Groestl-224 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_groestl224_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Groestl-256 context. This process performs no memory allocation. - * - * @param cc the Groestl-256 context (pointer to a - * sph_groestl256_context) - */ -void sph_groestl256_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Groestl-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_groestl256(void *cc, const void *data, size_t len); - -/** - * Terminate the current Groestl-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (32 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the Groestl-256 context - * @param dst the destination buffer - */ -void sph_groestl256_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (32 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Groestl-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_groestl256_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Groestl-384 context. This process performs no memory allocation. - * - * @param cc the Groestl-384 context (pointer to a - * sph_groestl384_context) - */ -void sph_groestl384_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Groestl-384 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_groestl384(void *cc, const void *data, size_t len); - -/** - * Terminate the current Groestl-384 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (48 bytes). The context is automatically - * reinitialized. - * - * @param cc the Groestl-384 context - * @param dst the destination buffer - */ -void sph_groestl384_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (48 bytes). 
If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Groestl-384 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_groestl384_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Groestl-512 context. This process performs no memory allocation. - * - * @param cc the Groestl-512 context (pointer to a - * sph_groestl512_context) - */ -void sph_groestl512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Groestl-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_groestl512(void *cc, const void *data, size_t len); - -/** - * Terminate the current Groestl-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the Groestl-512 context - * @param dst the destination buffer - */ -void sph_groestl512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. 
- * - * @param cc the Groestl-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_groestl512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif +/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Groestl interface. This code implements Groestl with the recommended + * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_groestl.h + * @author Thomas Pornin + */ + +#ifndef SPH_GROESTL_H__ +#define SPH_GROESTL_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Groestl-224. + */ +#define SPH_SIZE_groestl224 224 + +/** + * Output size (in bits) for Groestl-256. + */ +#define SPH_SIZE_groestl256 256 + +/** + * Output size (in bits) for Groestl-384. + */ +#define SPH_SIZE_groestl384 384 + +/** + * Output size (in bits) for Groestl-512. + */ +#define SPH_SIZE_groestl512 512 + +/** + * This structure is a context for Groestl-224 and Groestl-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[8]; +#endif + sph_u32 narrow[16]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_small_context; + +/** + * This structure is a context for Groestl-224 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl224_context; + +/** + * This structure is a context for Groestl-256 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl256_context; + +/** + * This structure is a context for Groestl-384 and Groestl-512 computations: + * it contains the intermediate values and some data from the last + * entered block. 
Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_big_context; + +/** + * This structure is a context for Groestl-384 computations. It is + * identical to the common sph_groestl_big_context. + */ +typedef sph_groestl_big_context sph_groestl384_context; + +/** + * This structure is a context for Groestl-512 computations. It is + * identical to the common sph_groestl_big_context. + */ +typedef sph_groestl_big_context sph_groestl512_context; + +/** + * Initialize a Groestl-224 context. This process performs no memory allocation. + * + * @param cc the Groestl-224 context (pointer to a + * sph_groestl224_context) + */ +void sph_groestl224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized.
+ * + * @param cc the Groestl-224 context + * @param dst the destination buffer + */ +void sph_groestl224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-256 context. This process performs no memory allocation. + * + * @param cc the Groestl-256 context (pointer to a + * sph_groestl256_context) + */ +void sph_groestl256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-256 context + * @param dst the destination buffer + */ +void sph_groestl256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes).
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-384 context. This process performs no memory allocation. + * + * @param cc the Groestl-384 context (pointer to a + * sph_groestl384_context) + */ +void sph_groestl384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-384 context + * @param dst the destination buffer + */ +void sph_groestl384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized.
+ * + * @param cc the Groestl-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-512 context. This process performs no memory allocation. + * + * @param cc the Groestl-512 context (pointer to a + * sph_groestl512_context) + */ +void sph_groestl512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-512 context + * @param dst the destination buffer + */ +void sph_groestl512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized.
+ * + * @param cc the Groestl-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_keccak.h b/sph/sph_keccak.h index 8760598..bdafdb8 100644 --- a/sph/sph_keccak.h +++ b/sph/sph_keccak.h @@ -1,293 +1,293 @@ -/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Keccak interface. This is the interface for Keccak with the - * recommended parameters for SHA-3, with output lengths 224, 256, - * 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_keccak.h - * @author Thomas Pornin - */ - -#ifndef SPH_KECCAK_H__ -#define SPH_KECCAK_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -/** - * Output size (in bits) for Keccak-224. - */ -#define SPH_SIZE_keccak224 224 - -/** - * Output size (in bits) for Keccak-256. - */ -#define SPH_SIZE_keccak256 256 - -/** - * Output size (in bits) for Keccak-384. - */ -#define SPH_SIZE_keccak384 384 - -/** - * Output size (in bits) for Keccak-512. - */ -#define SPH_SIZE_keccak512 512 - -/** - * This structure is a context for Keccak computations: it contains the - * intermediate values and some data from the last entered block. Once a - * Keccak computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Keccak computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - unsigned char buf[144]; /* first field, for alignment */ - size_t ptr, lim; - union { -#if SPH_64 - sph_u64 wide[25]; -#endif - sph_u32 narrow[50]; - } u; -#endif -} sph_keccak_context; - -/** - * Type for a Keccak-224 context (identical to the common context). - */ -typedef sph_keccak_context sph_keccak224_context; - -/** - * Type for a Keccak-256 context (identical to the common context). - */ -typedef sph_keccak_context sph_keccak256_context; - -/** - * Type for a Keccak-384 context (identical to the common context). - */ -typedef sph_keccak_context sph_keccak384_context; - -/** - * Type for a Keccak-512 context (identical to the common context). - */ -typedef sph_keccak_context sph_keccak512_context; - -/** - * Initialize a Keccak-224 context. This process performs no memory allocation. 
- * - * @param cc the Keccak-224 context (pointer to a - * sph_keccak224_context) - */ -void sph_keccak224_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Keccak-224 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_keccak224(void *cc, const void *data, size_t len); - -/** - * Terminate the current Keccak-224 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (28 bytes). The context is automatically - * reinitialized. - * - * @param cc the Keccak-224 context - * @param dst the destination buffer - */ -void sph_keccak224_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (28 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Keccak-224 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_keccak224_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Keccak-256 context. This process performs no memory allocation. - * - * @param cc the Keccak-256 context (pointer to a - * sph_keccak256_context) - */ -void sph_keccak256_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). 
- * - * @param cc the Keccak-256 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_keccak256(void *cc, const void *data, size_t len); - -/** - * Terminate the current Keccak-256 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (32 bytes). The context is automatically - * reinitialized. - * - * @param cc the Keccak-256 context - * @param dst the destination buffer - */ -void sph_keccak256_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (32 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Keccak-256 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_keccak256_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Keccak-384 context. This process performs no memory allocation. - * - * @param cc the Keccak-384 context (pointer to a - * sph_keccak384_context) - */ -void sph_keccak384_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Keccak-384 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_keccak384(void *cc, const void *data, size_t len); - -/** - * Terminate the current Keccak-384 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (48 bytes). The context is automatically - * reinitialized. 
- * - * @param cc the Keccak-384 context - * @param dst the destination buffer - */ -void sph_keccak384_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (48 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Keccak-384 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_keccak384_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -/** - * Initialize a Keccak-512 context. This process performs no memory allocation. - * - * @param cc the Keccak-512 context (pointer to a - * sph_keccak512_context) - */ -void sph_keccak512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the Keccak-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_keccak512(void *cc, const void *data, size_t len); - -/** - * Terminate the current Keccak-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the Keccak-512 context - * @param dst the destination buffer - */ -void sph_keccak512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). 
If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the Keccak-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_keccak512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif +/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Keccak interface. This is the interface for Keccak with the + * recommended parameters for SHA-3, with output lengths 224, 256, + * 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_keccak.h + * @author Thomas Pornin + */ + +#ifndef SPH_KECCAK_H__ +#define SPH_KECCAK_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Keccak-224. + */ +#define SPH_SIZE_keccak224 224 + +/** + * Output size (in bits) for Keccak-256. + */ +#define SPH_SIZE_keccak256 256 + +/** + * Output size (in bits) for Keccak-384. + */ +#define SPH_SIZE_keccak384 384 + +/** + * Output size (in bits) for Keccak-512. + */ +#define SPH_SIZE_keccak512 512 + +/** + * This structure is a context for Keccak computations: it contains the + * intermediate values and some data from the last entered block. Once a + * Keccak computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running Keccak computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[144]; /* first field, for alignment */ + size_t ptr, lim; + union { +#if SPH_64 + sph_u64 wide[25]; +#endif + sph_u32 narrow[50]; + } u; +#endif +} sph_keccak_context; + +/** + * Type for a Keccak-224 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak224_context; + +/** + * Type for a Keccak-256 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak256_context; + +/** + * Type for a Keccak-384 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak384_context; + +/** + * Type for a Keccak-512 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak512_context; + +/** + * Initialize a Keccak-224 context. This process performs no memory allocation. 
+ * + * @param cc the Keccak-224 context (pointer to a + * sph_keccak224_context) + */ +void sph_keccak224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-224 context + * @param dst the destination buffer + */ +void sph_keccak224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-256 context. This process performs no memory allocation. + * + * @param cc the Keccak-256 context (pointer to a + * sph_keccak256_context) + */ +void sph_keccak256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
+ * + * @param cc the Keccak-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-256 context + * @param dst the destination buffer + */ +void sph_keccak256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-384 context. This process performs no memory allocation. + * + * @param cc the Keccak-384 context (pointer to a + * sph_keccak384_context) + */ +void sph_keccak384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the Keccak-384 context + * @param dst the destination buffer + */ +void sph_keccak384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-512 context. This process performs no memory allocation. + * + * @param cc the Keccak-512 context (pointer to a + * sph_keccak512_context) + */ +void sph_keccak512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-512 context + * @param dst the destination buffer + */ +void sph_keccak512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_types.h b/sph/sph_types.h index 054c96f..7295b0b 100644 --- a/sph/sph_types.h +++ b/sph/sph_types.h @@ -1,1976 +1,1976 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. - */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. 
These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. - * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash function which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. 
A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit of 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions. In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash fonction has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. 
- * - * The contents of these structures are meant to be opaque, and private - * to the implementation. However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other mean which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independant and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() fonction, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. 
- * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. - * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. - * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. 
sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. 
- * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - 
WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). - * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: 
keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. 
- */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platform will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depends on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. 
- * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems for the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits. On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. 
The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). 
- */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for little-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. - */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be prefered - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e. 0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. 
This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void *dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void *dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. 
- * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void *dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). 
This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void *dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. 
This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void *dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64be() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64be() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. 
- * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be_aligned(void *dst, sph_u64 val); - -#endif - -/* ============== END documentation block for Doxygen ============= */ - -#ifndef DOXYGEN_IGNORE - -/* - * We want to define the types "sph_u32" and "sph_u64" which hold - * unsigned values of at least, respectively, 32 and 64 bits. These - * tests should select appropriate types for most platforms. The - * macro "SPH_64" is defined if the 64-bit is supported. - */ - -#undef SPH_64 -#undef SPH_64_TRUE - -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - -/* - * On C99 implementations, we can use to get an exact 64-bit - * type, if any, or otherwise use a wider type (which must exist, for - * C99 conformance). - */ - -#include - -#ifdef UINT32_MAX -typedef uint32_t sph_u32; -typedef int32_t sph_s32; -#else -typedef uint_fast32_t sph_u32; -typedef int_fast32_t sph_s32; -#endif -#if !SPH_NO_64 -#ifdef UINT64_MAX -typedef uint64_t sph_u64; -typedef int64_t sph_s64; -#else -typedef uint_fast64_t sph_u64; -typedef int_fast64_t sph_s64; -#endif -#endif - -#define SPH_C32(x) ((sph_u32)(x)) -#if !SPH_NO_64 -#define SPH_C64(x) ((sph_u64)(x)) -#define SPH_64 1 -#endif - -#else - -/* - * On non-C99 systems, we use "unsigned int" if it is wide enough, - * "unsigned long" otherwise. This supports all "reasonable" architectures. - * We have to be cautious: pre-C99 preprocessors handle constants - * differently in '#if' expressions. Hence the shifts to test UINT_MAX. - */ - -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF - -typedef unsigned int sph_u32; -typedef int sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## U)) - -#else - -typedef unsigned long sph_u32; -typedef long sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## UL)) - -#endif - -#if !SPH_NO_64 - -/* - * We want a 64-bit type. 
We use "unsigned long" if it is wide enough (as - * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), - * "unsigned long long" otherwise, if available. We use ULLONG_MAX to - * test whether "unsigned long long" is available; we also know that - * gcc features this type, even if the libc header do not know it. - */ - -#if ((ULONG_MAX >> 31) >> 31) >= 3 - -typedef unsigned long sph_u64; -typedef long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## UL)) - -#define SPH_64 1 - -#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ - -typedef unsigned long long sph_u64; -typedef long long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## ULL)) - -#define SPH_64 1 - -#else - -/* - * No 64-bit type... - */ - -#endif - -#endif - -#endif - -/* - * If the "unsigned long" type has length 64 bits or more, then this is - * a "true" 64-bit architectures. This is also true with Visual C on - * amd64, even though the "long" type is limited to 32 bits. - */ -#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) -#define SPH_64_TRUE 1 -#endif - -/* - * Implementation note: some processors have specific opcodes to perform - * a rotation. Recent versions of gcc recognize the expression above and - * use the relevant opcodes, when appropriate. - */ - -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -#if SPH_64 - -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) - -#endif - -#ifndef DOXYGEN_IGNORE -/* - * Define SPH_INLINE to be an "inline" qualifier, if available. We define - * some small macro-like functions which benefit greatly from being inlined. 
- */ -#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ -#define SPH_INLINE inline -#elif defined _MSC_VER -#define SPH_INLINE __inline -#else -#define SPH_INLINE -#endif -#endif - -/* - * We define some macros which qualify the architecture. These macros - * may be explicit set externally (e.g. as compiler parameters). The - * code below sets those macros if they are not already defined. - * - * Most macros are boolean, thus evaluate to either zero or non-zero. - * The SPH_UPTR macro is special, in that it evaluates to a C type, - * or is not defined. - * - * SPH_UPTR if defined: unsigned type to cast pointers into - * - * SPH_UNALIGNED non-zero if unaligned accesses are efficient - * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian - * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian - * SPH_LITTLE_FAST non-zero if little-endian decoding is fast - * SPH_BIG_FAST non-zero if big-endian decoding is fast - * - * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit - * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN - * _must_ be non-zero in those situations. The 32-bit and 64-bit types - * _must_ also have an exact width. - * - * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode - * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode - * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc - * SPH_I386_GCC x86-compatible (32-bit) with gcc - * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C - * SPH_AMD64_GCC x86-compatible (64-bit) with gcc - * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C - * SPH_PPC32_GCC PowerPC, 32-bit, with gcc - * SPH_PPC64_GCC PowerPC, 64-bit, with gcc - * - * TODO: enhance automatic detection, for more architectures and compilers. - * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with - * some very fast functions (e.g. MD4) when using unaligned input data. 
- * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ - || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) \ - && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. 
- */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ - || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. - */ -#elif defined __ia64 || defined __ia64__ \ - || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 
&& !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. 
- */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. - * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) - | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) - | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) - | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. 
The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__ ( \ - "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI \ - __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ - : "=r" (sph_sparcv9_tmp) : "r" (base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void -sph_enc16be(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = (val >> 8); - ((unsigned char *)dst)[1] = val; -} - -static SPH_INLINE unsigned -sph_dec16be(const void *src) -{ - return ((unsigned)(((const unsigned char *)src)[0]) << 8) - | (unsigned)(((const unsigned char *)src)[1]); -} - -static SPH_INLINE void -sph_enc16le(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned -sph_dec16le(const void *src) -{ - return (unsigned)(((const unsigned char *)src)[0]) - | ((unsigned)(((const unsigned char *)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32be(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32be_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif - } else { - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32 *)src; -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32le(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32le_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__ ( - "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. 
- * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return *(const sph_u32 *)src; -#endif - } else { - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u32 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64be(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64be_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif - } else { - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. 
- * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64 *)src; -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64le(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian 
convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64le_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return *(const sph_u64 *)src; -#endif - } else { - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u64 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif +/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ +/** + * Basic type definitions. + * + * This header file defines the generic integer types that will be used + * for the implementation of hash functions; it also contains helper + * functions which encode and decode multi-byte integer values, using + * either little-endian or big-endian conventions. + * + * This file contains a compile-time test on the size of a byte + * (the unsigned char C type). If bytes are not octets, + * i.e. if they do not have a size of exactly 8 bits, then compilation + * is aborted. Architectures where bytes are not octets are relatively + * rare, even in the embedded devices market. We forbid non-octet bytes + * because there is no clear convention on how octet streams are encoded + * on such systems. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_types.h + * @author Thomas Pornin + */ + +#ifndef SPH_TYPES_H__ +#define SPH_TYPES_H__ + +#include + +/* + * All our I/O functions are defined over octet streams. We do not know + * how to handle input data if bytes are not octets. + */ +#if CHAR_BIT != 8 +#error This code requires 8-bit bytes +#endif + +/* ============= BEGIN documentation block for Doxygen ============ */ + +#ifdef DOXYGEN_IGNORE + +/** @mainpage sphlib C code documentation + * + * @section overview Overview + * + * sphlib is a library which contains implementations of + * various cryptographic hash functions. 
These pages have been generated + * with doxygen and + * document the API for the C implementations. + * + * The API is described in appropriate header files, which are available + * in the "Files" section. Each hash function family has its own header, + * whose name begins with "sph_" and contains the family + * name. For instance, the API for the RIPEMD hash functions is available + * in the header file sph_ripemd.h. + * + * @section principles API structure and conventions + * + * @subsection io Input/output conventions + * + * In all generality, hash functions operate over strings of bits. + * Individual bits are rarely encountered in C programming or actual + * communication protocols; most protocols converge on the ubiquitous + * "octet" which is a group of eight bits. Data is thus expressed as a + * stream of octets. The C programming language contains the notion of a + * "byte", which is a data unit managed under the type "unsigned + * char". The C standard prescribes that a byte should hold at + * least eight bits, but possibly more. Most modern architectures, even + * in the embedded world, feature eight-bit bytes, i.e. map bytes to + * octets. + * + * Nevertheless, for some of the implemented hash functions, an extra + * API has been added, which allows the input of arbitrary sequences of + * bits: when the computation is about to be closed, 1 to 7 extra bits + * can be added. The functions for which this API is implemented include + * the SHA-2 functions and all SHA-3 candidates. + * + * sphlib defines hash function which may hash octet streams, + * i.e. streams of bits where the number of bits is a multiple of eight. + * The data input functions in the sphlib API expect data + * as anonymous pointers ("const void *") with a length + * (of type "size_t") which gives the input data chunk length + * in bytes. 
A byte is assumed to be an octet; the sph_types.h + * header contains a compile-time test which prevents compilation on + * architectures where this property is not met. + * + * The hash function output is also converted into bytes. All currently + * implemented hash functions have an output width which is a multiple of + * eight, and this is likely to remain true for new designs. + * + * Most hash functions internally convert input data into 32-bit or 64-bit + * words, using either little-endian or big-endian conversion. The hash + * output also often consists of such words, which are encoded into output + * bytes with a similar endianness convention. Some hash functions have + * been only loosely specified on that subject; when necessary, + * sphlib has been tested against published "reference" + * implementations in order to use the same conventions. + * + * @subsection shortname Function short name + * + * Each implemented hash function has a "short name" which is used + * internally to derive the identifiers for the functions and context + * structures which the function uses. For instance, MD5 has the short + * name "md5". Short names are listed in the next section, + * for the implemented hash functions. In subsequent sections, the + * short name will be assumed to be "XXX": replace with the + * actual hash function name to get the C identifier. + * + * Note: some functions within the same family share the same core + * elements, such as update function or context structure. Correspondingly, + * some of the defined types or functions may actually be macros which + * transparently evaluate to another type or function name. + * + * @subsection context Context structure + * + * Each implemented hash function has its own context structure, available + * under the type name "sph_XXX_context" for the hash function + * with short name "XXX". This structure holds all needed + * state for a running hash computation. 
+ * + * The contents of these structures are meant to be opaque, and private + * to the implementation. However, these contents are specified in the + * header files so that application code which uses sphlib + * may access the size of those structures. + * + * The caller is responsible for allocating the context structure, + * whether by dynamic allocation (malloc() or equivalent), + * static allocation (a global permanent variable), as an automatic + * variable ("on the stack"), or by any other means which ensures proper + * structure alignment. sphlib code performs no dynamic + * allocation by itself. + * + * The context must be initialized before use, using the + * sph_XXX_init() function. This function sets the context + * state to proper initial values for hashing. + * + * Since all state data is contained within the context structure, + * sphlib is thread-safe and reentrant: several hash + * computations may be performed in parallel, provided that they do not + * operate on the same context. Moreover, a running computation can be + * cloned by copying the context (with a simple memcpy()): + * the context and its clone are then independent and may be updated + * with new data and/or closed without interfering with each other. + * Similarly, a context structure can be moved in memory at will: + * context structures contain no pointer, in particular no pointer to + * themselves. + * + * @subsection dataio Data input + * + * Hashed data is input with the sph_XXX() function, which + * takes as parameters a pointer to the context, a pointer to the data + * to hash, and the number of data bytes to hash. The context is updated + * with the new data. + * + * Data can be input in one or several calls, with arbitrary input lengths. + * However, it is best, performance wise, to input data by relatively big + * chunks (say a few kilobytes), because this allows sphlib to + * optimize things and avoid internal copying. 
+ * + * When all data has been input, the context can be closed with + * sph_XXX_close(). The hash output is computed and written + * into the provided buffer. The caller must take care to provide a + * buffer of appropriate length; e.g., when using SHA-1, the output is + * a 20-byte word, therefore the output buffer must be at least 20-byte + * long. + * + * For some hash functions, the sph_XXX_addbits_and_close() + * function can be used instead of sph_XXX_close(). This + * function can take a few extra bits to be added at + * the end of the input message. This allows hashing messages with a + * bit length which is not a multiple of 8. The extra bits are provided + * as an unsigned integer value, and a bit count. The bit count must be + * between 0 and 7, inclusive. The extra bits are provided as bits 7 to + * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. + * For instance, to add three bits of value 1, 1 and 0, the unsigned + * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count + * will be 3. + * + * The SPH_SIZE_XXX macro is defined for each hash function; + * it evaluates to the function output size, expressed in bits. For instance, + * SPH_SIZE_sha1 evaluates to 160. + * + * When closed, the context is automatically reinitialized and can be + * immediately used for another computation. It is not necessary to call + * sph_XXX_init() after a close. Note that + * sph_XXX_init() can still be called to "reset" a context, + * i.e. forget previously input data, and get back to the initial state. + * + * @subsection alignment Data alignment + * + * "Alignment" is a property of data, which is said to be "properly + * aligned" when its emplacement in memory is such that the data can + * be optimally read by full words. This depends on the type of access; + * basically, some hash functions will read data by 32-bit or 64-bit + * words. 
sphlib does not mandate such alignment for input + * data, but using aligned data can substantially improve performance. + * + * As a rule, it is best to input data by chunks whose length (in bytes) + * is a multiple of eight, and which begins at "generally aligned" + * addresses, such as the base address returned by a call to + * malloc(). + * + * @section functions Implemented functions + * + * We give here the list of implemented functions. They are grouped by + * family; to each family corresponds a specific header file. Each + * individual function has its associated "short name". Please refer to + * the documentation for that header file to get details on the hash + * function denomination and provenance. + * + * Note: the functions marked with a '(64)' in the list below are + * available only if the C compiler provides an integer type of length + * 64 bits or more. Such a type is mandatory in the latest C standard + * (ISO 9899:1999, aka "C99") and is present in several older compilers + * as well, so chances are that such a type is available. 
+ * + * - HAVAL family: file sph_haval.h + * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 + * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 + * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 + * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 + * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 + * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 + * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 + * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 + * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 + * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 + * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 + * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 + * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 + * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 + * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 + * - MD2: file sph_md2.h, short name: md2 + * - MD4: file sph_md4.h, short name: md4 + * - MD5: file sph_md5.h, short name: md5 + * - PANAMA: file sph_panama.h, short name: panama + * - RadioGatun family: file sph_radiogatun.h + * - RadioGatun[32]: short name: radiogatun32 + * - RadioGatun[64]: short name: radiogatun64 (64) + * - RIPEMD family: file sph_ripemd.h + * - RIPEMD: short name: ripemd + * - RIPEMD-128: short name: ripemd128 + * - RIPEMD-160: short name: ripemd160 + * - SHA-0: file sph_sha0.h, short name: sha0 + * - SHA-1: file sph_sha1.h, short name: sha1 + * - SHA-2 family, 32-bit hashes: file sph_sha2.h + * - SHA-224: short name: sha224 + * - SHA-256: short name: sha256 + * - SHA-384: short name: sha384 (64) + * - SHA-512: short name: sha512 (64) + * - Tiger family: file sph_tiger.h + * - Tiger: short name: tiger (64) + * - Tiger2: short name: tiger2 (64) + * - WHIRLPOOL family: file sph_whirlpool.h + * - WHIRLPOOL-0: short name: whirlpool0 (64) + * - WHIRLPOOL-1: short name: whirlpool1 (64) + * - 
WHIRLPOOL: short name: whirlpool (64) + * + * The fourteen second-round SHA-3 candidates are also implemented; + * when applicable, the implementations follow the "final" specifications + * as published for the third round of the SHA-3 competition (BLAKE, + * Groestl, JH, Keccak and Skein have been tweaked for third round). + * + * - BLAKE family: file sph_blake.h + * - BLAKE-224: short name: blake224 + * - BLAKE-256: short name: blake256 + * - BLAKE-384: short name: blake384 + * - BLAKE-512: short name: blake512 + * - BMW (Blue Midnight Wish) family: file sph_bmw.h + * - BMW-224: short name: bmw224 + * - BMW-256: short name: bmw256 + * - BMW-384: short name: bmw384 (64) + * - BMW-512: short name: bmw512 (64) + * - CubeHash family: file sph_cubehash.h (specified as + * CubeHash16/32 in the CubeHash specification) + * - CubeHash-224: short name: cubehash224 + * - CubeHash-256: short name: cubehash256 + * - CubeHash-384: short name: cubehash384 + * - CubeHash-512: short name: cubehash512 + * - ECHO family: file sph_echo.h + * - ECHO-224: short name: echo224 + * - ECHO-256: short name: echo256 + * - ECHO-384: short name: echo384 + * - ECHO-512: short name: echo512 + * - Fugue family: file sph_fugue.h + * - Fugue-224: short name: fugue224 + * - Fugue-256: short name: fugue256 + * - Fugue-384: short name: fugue384 + * - Fugue-512: short name: fugue512 + * - Groestl family: file sph_groestl.h + * - Groestl-224: short name: groestl224 + * - Groestl-256: short name: groestl256 + * - Groestl-384: short name: groestl384 + * - Groestl-512: short name: groestl512 + * - Hamsi family: file sph_hamsi.h + * - Hamsi-224: short name: hamsi224 + * - Hamsi-256: short name: hamsi256 + * - Hamsi-384: short name: hamsi384 + * - Hamsi-512: short name: hamsi512 + * - JH family: file sph_jh.h + * - JH-224: short name: jh224 + * - JH-256: short name: jh256 + * - JH-384: short name: jh384 + * - JH-512: short name: jh512 + * - Keccak family: file sph_keccak.h + * - Keccak-224: short name: 
keccak224 + * - Keccak-256: short name: keccak256 + * - Keccak-384: short name: keccak384 + * - Keccak-512: short name: keccak512 + * - Luffa family: file sph_luffa.h + * - Luffa-224: short name: luffa224 + * - Luffa-256: short name: luffa256 + * - Luffa-384: short name: luffa384 + * - Luffa-512: short name: luffa512 + * - Shabal family: file sph_shabal.h + * - Shabal-192: short name: shabal192 + * - Shabal-224: short name: shabal224 + * - Shabal-256: short name: shabal256 + * - Shabal-384: short name: shabal384 + * - Shabal-512: short name: shabal512 + * - SHAvite-3 family: file sph_shavite.h + * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): + * short name: shavite224 + * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): + * short name: shavite256 + * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): + * short name: shavite384 + * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): + * short name: shavite512 + * - SIMD family: file sph_simd.h + * - SIMD-224: short name: simd224 + * - SIMD-256: short name: simd256 + * - SIMD-384: short name: simd384 + * - SIMD-512: short name: simd512 + * - Skein family: file sph_skein.h + * - Skein-224 (nominally specified as Skein-512-224): short name: + * skein224 (64) + * - Skein-256 (nominally specified as Skein-512-256): short name: + * skein256 (64) + * - Skein-384 (nominally specified as Skein-512-384): short name: + * skein384 (64) + * - Skein-512 (nominally specified as Skein-512-512): short name: + * skein512 (64) + * + * For the second-round SHA-3 candidates, the functions are as specified + * for round 2, i.e. with the "tweaks" that some candidates added + * between round 1 and round 2. Also, some of the submitted packages for + * round 2 contained errors, in the specification, reference code, or + * both. sphlib implements the corrected versions. 
+ */ + +/** @hideinitializer + * Unsigned integer type whose length is at least 32 bits; on most + * architectures, it will have a width of exactly 32 bits. Unsigned C + * types implement arithmetics modulo a power of 2; use the + * SPH_T32() macro to ensure that the value is truncated + * to exactly 32 bits. Unless otherwise specified, all macros and + * functions which accept sph_u32 values assume that these + * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures + * where sph_u32 is larger than that. + */ +typedef __arch_dependant__ sph_u32; + +/** @hideinitializer + * Signed integer type corresponding to sph_u32; it has + * width 32 bits or more. + */ +typedef __arch_dependant__ sph_s32; + +/** @hideinitializer + * Unsigned integer type whose length is at least 64 bits; on most + * architectures which feature such a type, it will have a width of + * exactly 64 bits. C99-compliant platform will have this type; it + * is also defined when the GNU compiler (gcc) is used, and on + * platforms where unsigned long is large enough. If this + * type is not available, then some hash functions which depends on + * a 64-bit type will not be available (most notably SHA-384, SHA-512, + * Tiger and WHIRLPOOL). + */ +typedef __arch_dependant__ sph_u64; + +/** @hideinitializer + * Signed integer type corresponding to sph_u64; it has + * width 64 bits or more. + */ +typedef __arch_dependant__ sph_s64; + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u32. Depending on + * how this type is defined, a suffix such as UL may + * be appended to the argument. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C32(x) + +/** + * Truncate a 32-bit value to exactly 32 bits. On most systems, this is + * a no-op, recognized as such by the compiler. 
+ * + * @param x the value to truncate (of type sph_u32) + */ +#define SPH_T32(x) + +/** + * Rotate a 32-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTL32(x, n) + +/** + * Rotate a 32-bit value by a number of bits to the right. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTR32(x, n) + +/** + * This macro is defined on systems for which a 64-bit type has been + * detected, and is used for sph_u64. + */ +#define SPH_64 + +/** + * This macro is defined on systems for which the "native" integer size is + * 64 bits (64-bit values fit in one register). + */ +#define SPH_64_TRUE + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u64. Depending on + * how this type is defined, a suffix such as ULL may + * be appended to the argument. This macro is defined only if a + * 64-bit type was detected and used for sph_u64. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C64(x) + +/** + * Truncate a 64-bit value to exactly 64 bits. On most systems, this is + * a no-op, recognized as such by the compiler. This macro is defined only + * if a 64-bit type was detected and used for sph_u64. + * + * @param x the value to truncate (of type sph_u64) + */ +#define SPH_T64(x) + +/** + * Rotate a 64-bit value by a number of bits to the left. 
The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTL64(x, n) + +/** + * Rotate a 64-bit value by a number of bits to the right. The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTR64(x, n) + +/** + * This macro evaluates to inline or an equivalent construction, + * if available on the compilation platform, or to nothing otherwise. This + * is used to declare inline functions, for which the compiler should + * endeavour to include the code directly in the caller. Inline functions + * are typically defined in header files as replacement for macros. + */ +#define SPH_INLINE + +/** + * This macro is defined if the platform has been detected as using + * little-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). + */ +#define SPH_LITTLE_ENDIAN + +/** + * This macro is defined if the platform has been detected as using + * big-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). 
+ */ +#define SPH_BIG_ENDIAN + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in little-endian + * convention. This is the case for little-endian platforms, and also + * for the big-endian platforms which have special little-endian access + * opcodes (e.g. Ultrasparc). + */ +#define SPH_LITTLE_FAST + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in big-endian + * convention. This is the case for big-endian platforms, and also + * for the little-endian platforms which have special big-endian access + * opcodes. + */ +#define SPH_BIG_FAST + +/** + * On some platforms, this macro is defined to an unsigned integer type + * into which pointer values may be cast. The resulting value can then + * be tested for being a multiple of 2, 4 or 8, indicating an aligned + * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. + */ +#define SPH_UPTR + +/** + * When defined, this macro indicates that unaligned memory accesses + * are possible with only a minor penalty, and thus should be preferred + * over strategies which first copy data to an aligned buffer. + */ +#define SPH_UNALIGNED + +/** + * Byte-swap a 32-bit word (i.e. 0x12345678 becomes + * 0x78563412). This is an inline function which resorts + * to inline assembly on some platforms, for better performance. + * + * @param x the 32-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u32 sph_bswap32(sph_u32 x); + +/** + * Byte-swap a 64-bit word. This is an inline function which resorts + * to inline assembly on some platforms, for better performance. 
This + * function is defined only if a suitable 64-bit type was found for + * sph_u64 + * + * @param x the 64-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u64 sph_bswap64(sph_u64 x); + +/** + * Decode a 16-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16le(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16le(void *dst, unsigned val); + +/** + * Decode a 16-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16be(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16be(void *dst, unsigned val); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32le() function. 
+ * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32le() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le_aligned(void *dst, sph_u32 val); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32be() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). 
This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32be() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be_aligned(void *dst, sph_u32 val); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64le() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64le() function. 
This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le_aligned(void *dst, sph_u64 val); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64be() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64be(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64be() function. This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. 
+ * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64be_aligned(void *dst, sph_u64 val); + +#endif + +/* ============== END documentation block for Doxygen ============= */ + +#ifndef DOXYGEN_IGNORE + +/* + * We want to define the types "sph_u32" and "sph_u64" which hold + * unsigned values of at least, respectively, 32 and 64 bits. These + * tests should select appropriate types for most platforms. The + * macro "SPH_64" is defined if the 64-bit is supported. + */ + +#undef SPH_64 +#undef SPH_64_TRUE + +#if defined __STDC__ && __STDC_VERSION__ >= 199901L + +/* + * On C99 implementations, we can use to get an exact 64-bit + * type, if any, or otherwise use a wider type (which must exist, for + * C99 conformance). + */ + +#include + +#ifdef UINT32_MAX +typedef uint32_t sph_u32; +typedef int32_t sph_s32; +#else +typedef uint_fast32_t sph_u32; +typedef int_fast32_t sph_s32; +#endif +#if !SPH_NO_64 +#ifdef UINT64_MAX +typedef uint64_t sph_u64; +typedef int64_t sph_s64; +#else +typedef uint_fast64_t sph_u64; +typedef int_fast64_t sph_s64; +#endif +#endif + +#define SPH_C32(x) ((sph_u32)(x)) +#if !SPH_NO_64 +#define SPH_C64(x) ((sph_u64)(x)) +#define SPH_64 1 +#endif + +#else + +/* + * On non-C99 systems, we use "unsigned int" if it is wide enough, + * "unsigned long" otherwise. This supports all "reasonable" architectures. + * We have to be cautious: pre-C99 preprocessors handle constants + * differently in '#if' expressions. Hence the shifts to test UINT_MAX. + */ + +#if ((UINT_MAX >> 11) >> 11) >= 0x3FF + +typedef unsigned int sph_u32; +typedef int sph_s32; + +#define SPH_C32(x) ((sph_u32)(x ## U)) + +#else + +typedef unsigned long sph_u32; +typedef long sph_s32; + +#define SPH_C32(x) ((sph_u32)(x ## UL)) + +#endif + +#if !SPH_NO_64 + +/* + * We want a 64-bit type. 
We use "unsigned long" if it is wide enough (as + * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), + * "unsigned long long" otherwise, if available. We use ULLONG_MAX to + * test whether "unsigned long long" is available; we also know that + * gcc features this type, even if the libc header do not know it. + */ + +#if ((ULONG_MAX >> 31) >> 31) >= 3 + +typedef unsigned long sph_u64; +typedef long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## UL)) + +#define SPH_64 1 + +#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ + +typedef unsigned long long sph_u64; +typedef long long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## ULL)) + +#define SPH_64 1 + +#else + +/* + * No 64-bit type... + */ + +#endif + +#endif + +#endif + +/* + * If the "unsigned long" type has length 64 bits or more, then this is + * a "true" 64-bit architectures. This is also true with Visual C on + * amd64, even though the "long" type is limited to 32 bits. + */ +#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) +#define SPH_64_TRUE 1 +#endif + +/* + * Implementation note: some processors have specific opcodes to perform + * a rotation. Recent versions of gcc recognize the expression above and + * use the relevant opcodes, when appropriate. + */ + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#if SPH_64 + +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#endif + +#ifndef DOXYGEN_IGNORE +/* + * Define SPH_INLINE to be an "inline" qualifier, if available. We define + * some small macro-like functions which benefit greatly from being inlined. 
+ */ +#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ +#define SPH_INLINE inline +#elif defined _MSC_VER +#define SPH_INLINE __inline +#else +#define SPH_INLINE +#endif +#endif + +/* + * We define some macros which qualify the architecture. These macros + * may be explicit set externally (e.g. as compiler parameters). The + * code below sets those macros if they are not already defined. + * + * Most macros are boolean, thus evaluate to either zero or non-zero. + * The SPH_UPTR macro is special, in that it evaluates to a C type, + * or is not defined. + * + * SPH_UPTR if defined: unsigned type to cast pointers into + * + * SPH_UNALIGNED non-zero if unaligned accesses are efficient + * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian + * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian + * SPH_LITTLE_FAST non-zero if little-endian decoding is fast + * SPH_BIG_FAST non-zero if big-endian decoding is fast + * + * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit + * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN + * _must_ be non-zero in those situations. The 32-bit and 64-bit types + * _must_ also have an exact width. + * + * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode + * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode + * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc + * SPH_I386_GCC x86-compatible (32-bit) with gcc + * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C + * SPH_AMD64_GCC x86-compatible (64-bit) with gcc + * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C + * SPH_PPC32_GCC PowerPC, 32-bit, with gcc + * SPH_PPC64_GCC PowerPC, 64-bit, with gcc + * + * TODO: enhance automatic detection, for more architectures and compilers. + * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with + * some very fast functions (e.g. MD4) when using unaligned input data. 
+ * The CPU-specific-with-GCC macros are useful only for inline assembly, + * normally restrained to this header file. + */ + +/* + * 32-bit x86, aka "i386 compatible". + */ +#if defined __i386__ || defined _M_IX86 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#ifdef __GNUC__ +#define SPH_DETECT_I386_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_I386_MSVC 1 +#endif + +/* + * 64-bit x86, hereafter known as "amd64". + */ +#elif defined __x86_64 || defined _M_X64 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_AMD64_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_AMD64_MSVC 1 +#endif + +/* + * 64-bit Sparc architecture (implies v9). + */ +#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ + || defined __sparcv9 + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_SPARCV9_GCC_64 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * 32-bit Sparc. + */ +#elif (defined __sparc__ || defined __sparc) \ + && !(defined __sparcv9 || defined __arch64__) + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#if defined __GNUC__ && defined __sparc_v9__ +#define SPH_DETECT_SPARCV9_GCC_32 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * ARM, little-endian. + */ +#elif defined __arm__ && __ARMEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, little-endian. + */ +#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, big-endian. + */ +#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ + +#define SPH_DETECT_BIG_ENDIAN 1 + +/* + * PowerPC. 
+ */ +#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ + || defined _ARCH_PPC + +/* + * Note: we do not declare cross-endian access to be "fast": even if + * using inline assembly, implementation should still assume that + * keeping the decoded word in a temporary is faster than decoding + * it again. + */ +#if defined __GNUC__ +#if SPH_64_TRUE +#define SPH_DETECT_PPC64_GCC 1 +#else +#define SPH_DETECT_PPC32_GCC 1 +#endif +#endif + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif + +/* + * Itanium, 64-bit. + */ +#elif defined __ia64 || defined __ia64__ \ + || defined __itanium__ || defined _M_IA64 + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#else +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif +#if defined __LP64__ || defined _LP64 +#define SPH_DETECT_UPTR sph_u64 +#else +#define SPH_DETECT_UPTR sph_u32 +#endif + +#endif + +#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 +#define SPH_DETECT_SPARCV9_GCC 1 +#endif + +#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED +#define SPH_UNALIGNED SPH_DETECT_UNALIGNED +#endif +#if defined SPH_DETECT_UPTR && !defined SPH_UPTR +#define SPH_UPTR SPH_DETECT_UPTR +#endif +#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN +#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN +#endif +#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN +#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN +#endif +#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST +#endif +#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST +#define SPH_BIG_FAST SPH_DETECT_BIG_FAST +#endif +#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 +#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 +#endif +#if defined SPH_DETECT_SPARCV9_GCC_64 
&& !defined SPH_SPARCV9_GCC_64 +#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 +#endif +#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC +#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC +#endif +#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC +#define SPH_I386_GCC SPH_DETECT_I386_GCC +#endif +#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC +#define SPH_I386_MSVC SPH_DETECT_I386_MSVC +#endif +#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC +#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC +#endif +#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC +#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC +#endif +#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC +#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC +#endif +#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC +#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC +#endif + +#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST 1 +#endif +#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST +#define SPH_BIG_FAST 1 +#endif + +#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) +#error SPH_UPTR defined, but endianness is not known. +#endif + +#if SPH_I386_GCC && !SPH_NO_ASM + +/* + * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit + * values. + */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + +#elif SPH_AMD64_GCC && !SPH_NO_ASM + +/* + * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit + * and 64-bit values. 
+ */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); + return x; +} + +#endif + +/* + * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough + * to generate proper opcodes for endianness swapping with the pure C + * implementation below. + * + +#elif SPH_I386_MSVC && !SPH_NO_ASM + +static __inline sph_u32 __declspec(naked) __fastcall +sph_bswap32(sph_u32 x) +{ + __asm { + bswap ecx + mov eax,ecx + ret + } +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + + * + * [end of disabled code] + */ + +#else + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + x = SPH_T32((x << 16) | (x >> 16)); + x = ((x & SPH_C32(0xFF00FF00)) >> 8) + | ((x & SPH_C32(0x00FF00FF)) << 8); + return x; +} + +#if SPH_64 + +/** + * Byte-swap a 64-bit value. + * + * @param x the input value + * @return the byte-swapped value + */ +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + x = SPH_T64((x << 32) | (x >> 32)); + x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) + | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); + x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) + | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); + return x; +} + +#endif + +#endif + +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + +/* + * On UltraSPARC systems, native ordering is big-endian, but it is + * possible to perform little-endian read accesses by specifying the + * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use + * the opcode "lda [%reg]0x88,%dst", where %reg is the register which + * contains the source address and %dst is the destination register, + * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register + * to get the address space name. 
The latter format is better since it + * combines an addition and the actual access in a single opcode; but + * it requires the setting (and subsequent resetting) of %asi, which is + * slow. Some operations (i.e. MD5 compression function) combine many + * successive little-endian read accesses, which may share the same + * %asi setting. The macros below contain the appropriate inline + * assembly. + */ + +#define SPH_SPARCV9_SET_ASI \ + sph_u32 sph_sparcv9_asi; \ + __asm__ __volatile__ ( \ + "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_RESET_ASI \ + __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ + sph_u32 sph_sparcv9_tmp; \ + __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ + : "=r" (sph_sparcv9_tmp) : "r" (base)); \ + sph_sparcv9_tmp; \ + }) + +#endif + +static SPH_INLINE void +sph_enc16be(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = (val >> 8); + ((unsigned char *)dst)[1] = val; +} + +static SPH_INLINE unsigned +sph_dec16be(const void *src) +{ + return ((unsigned)(((const unsigned char *)src)[0]) << 8) + | (unsigned)(((const unsigned char *)src)[1]); +} + +static SPH_INLINE void +sph_enc16le(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = val >> 8; +} + +static SPH_INLINE unsigned +sph_dec16le(const void *src) +{ + return (unsigned)(((const unsigned char *)src)[0]) + | ((unsigned)(((const unsigned char *)src)[1]) << 8); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32be(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32be_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). 
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif + } else { + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); + } +#endif +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u32 *)src; +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32le(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32le_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (little endian convention). 
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + /* + * "__volatile__" is needed here because without it, + * gcc-3.4.3 miscompiles the code and performs the + * access before the test on the address, thus triggering + * a bus error... + */ + __asm__ __volatile__ ( + "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * On PowerPC, this turns out not to be worth the effort: the inline + * assembly makes GCC optimizer uncomfortable, which tends to nullify + * the decoding gains. + * + * For most hash functions, using this inline assembly trick changes + * hashing speed by less than 5% and often _reduces_ it. The biggest + * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is + * less then 10%. The speed gain on CubeHash is probably due to the + * chronic shortage of registers that CubeHash endures; for the other + * functions, the generic code appears to be efficient enough already. 
+ * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ( + "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return *(const sph_u32 *)src; +#endif + } else { + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); + } +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (little endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u32 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +#if SPH_64 + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). 
+ * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64be(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64be_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). 
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif + } else { + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); + } +#endif +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. 
+ * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u64 *)src; +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64le(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian 
convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64le_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. 
+ * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned( + (const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return *(const sph_u64 *)src; +#endif + } else { + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); + } +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u64 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. 
+ * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +#endif + +#endif /* Doxygen excluded block */ + +#endif diff --git a/util.c b/util.c index 3459c23..451aaed 100644 --- a/util.c +++ b/util.c @@ -1,1320 +1,1320 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(WIN32) -#include -#include -#else -#include -#include -#include -#include -#endif -#include "compat.h" -#include "miner.h" -#include "elist.h" - -struct data_buffer { - void *buf; - size_t len; -}; - -struct upload_buffer { - const void *buf; - size_t len; - size_t pos; -}; - -struct header_info { - char *lp_path; - char *reason; - char *stratum_url; -}; - -struct tq_ent { - void *data; - struct list_head q_node; -}; - -struct thread_q { - struct list_head q; - - bool frozen; - - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -void applog(int prio, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - va_list ap2; - char *buf; - int len; - - va_copy(ap2, ap); - len = vsnprintf(NULL, 0, fmt, ap2) + 1; - va_end(ap2); - buf = alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) - syslog(prio, "%s", buf); - } -#else - if (0) {} -#endif - else { - char *f; - int len; - time_t now; - struct tm tm, *tm_p; - - time(&now); - - pthread_mutex_lock(&applog_lock); - tm_p = localtime(&now); - memcpy(&tm, tm_p, sizeof(tm)); - pthread_mutex_unlock(&applog_lock); - - len = (int)(40 + strlen(fmt) + 2); - f = (char*)alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - fmt); - pthread_mutex_lock(&applog_lock); - vfprintf(stderr, f, ap); /* atomic write to stderr */ - fflush(stderr); - pthread_mutex_unlock(&applog_lock); - } - va_end(ap); -} - -static void databuf_free(struct data_buffer *db) -{ - if (!db) - return; - - free(db->buf); - - memset(db, 0, sizeof(*db)); -} - -static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct data_buffer *db = (struct data_buffer *)user_data; - size_t len = size * 
nmemb; - size_t oldlen, newlen; - void *newmem; - static const unsigned char zero = 0; - - oldlen = db->len; - newlen = oldlen + len; - - newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; - - db->buf = newmem; - db->len = newlen; - memcpy((char*)db->buf + oldlen, ptr, len); - memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ - - return len; -} - -static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - unsigned int len = (unsigned int)(size * nmemb); - - if (len > ub->len - ub->pos) - len = (unsigned int)(ub->len - ub->pos); - - if (len) { - memcpy(ptr, (char*)ub->buf + ub->pos, len); - ub->pos += len; - } - - return len; -} - -#if LIBCURL_VERSION_NUM >= 0x071200 -static int seek_data_cb(void *user_data, curl_off_t offset, int origin) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { - case SEEK_SET: - ub->pos = (size_t)offset; - break; - case SEEK_CUR: - ub->pos += (size_t)offset; - break; - case SEEK_END: - ub->pos = ub->len + (size_t)offset; - break; - default: - return 1; /* CURL_SEEKFUNC_FAIL */ - } - - return 0; /* CURL_SEEKFUNC_OK */ -} -#endif - -static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) -{ - struct header_info *hi = (struct header_info *)user_data; - size_t remlen, slen, ptrlen = size * nmemb; - char *rem, *val = NULL, *key = NULL; - void *tmp; - - val = (char*)calloc(1, ptrlen); - key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; - - tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ - goto out; - slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ - goto out; - memcpy(key, ptr, slen); /* store & nul term key */ - key[slen] = 0; - - rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ - remlen = ptrlen - slen - 1; - while ((remlen > 0) && 
(isspace(*rem))) { - remlen--; - rem++; - } - - memcpy(val, rem, remlen); /* store value, trim trailing ws */ - val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { - val[strlen(val) - 1] = 0; - } - if (!*val) /* skip blank value */ - goto out; - - if (!strcasecmp("X-Long-Polling", key)) { - hi->lp_path = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Reject-Reason", key)) { - hi->reason = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Stratum", key)) { - hi->stratum_url = val; /* steal memory reference */ - val = NULL; - } - -out: - free(key); - free(val); - return ptrlen; -} - -#if LIBCURL_VERSION_NUM >= 0x070f06 -static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) -{ - int keepalive = 1; - int tcp_keepcnt = 3; - int tcp_keepidle = 50; - int tcp_keepintvl = 50; -#ifdef WIN32 - DWORD outputBytes; -#endif - -#ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, - sizeof(keepalive)))) - return 1; -#ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, - &tcp_keepcnt, sizeof(tcp_keepcnt)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, - &tcp_keepidle, sizeof(tcp_keepidle)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __linux */ -#ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __APPLE_CC__ */ -#else /* WIN32 */ - struct tcp_keepalive vals; - vals.onoff = 1; - vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), - NULL, 0, &outputBytes, NULL, NULL))) - return 1; -#endif /* WIN32 */ - - return 0; -} -#endif - -json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, const char *rpc_req, - bool 
longpoll_scan, bool longpoll, int *curl_err) -{ - json_t *val, *err_val, *res_val; - int rc; - struct data_buffer all_data = {0}; - struct upload_buffer upload_data; - json_error_t err; - struct curl_slist *headers = NULL; - char len_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE]; - long timeout = longpoll ? opt_timeout : 30; - struct header_info hi = {0}; - bool lp_scanning = longpoll_scan && !have_longpoll; - - /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) - curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); - curl_easy_setopt(curl, CURLOPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); - curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); -#if LIBCURL_VERSION_NUM >= 0x071200 - curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); - curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); -#endif - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - if (userpass) { - curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif - curl_easy_setopt(curl, CURLOPT_POST, 1); - - 
if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); - - upload_data.buf = rpc_req; - upload_data.len = strlen(rpc_req); - upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", - (unsigned long) upload_data.len); - - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, len_hdr); - headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); - headers = curl_slist_append(headers, "X-Mining-Extensions: midstate"); - headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ - headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - - rc = curl_easy_perform(curl); - if (curl_err != NULL) - *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); - goto err_out; - } - - /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { - have_stratum = true; - tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); - hi.stratum_url = NULL; - } - - /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); - hi.lp_path = NULL; - } - - if (!all_data.buf) { - applog(LOG_ERR, "Empty data received in json_rpc_call."); - goto err_out; - } - - val = JSON_LOADS((const char*)all_data.buf, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto err_out; - } - - if (opt_protocol) { - char *s = json_dumps(val, JSON_INDENT(3)); - applog(LOG_DEBUG, "JSON protocol response:\n%s", s); - free(s); - } - - /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. 
*/ - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; - - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - - free(s); - - goto err_out; - } - - if (hi.reason) - json_object_set_new(val, "reject-reason", json_string(hi.reason)); - - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return val; - -err_out: - free(hi.lp_path); - free(hi.reason); - free(hi.stratum_url); - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return NULL; -} - -char *bin2hex(const unsigned char *p, size_t len) -{ - unsigned int i; - char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; - - for (i = 0; i < len; i++) - sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); - - return s; -} - -bool hex2bin(unsigned char *p, const char *hexstr, size_t len) -{ - char hex_byte[3]; - char *ep; - - hex_byte[2] = '\0'; - - while (*hexstr && len) { - if (!hexstr[1]) { - applog(LOG_ERR, "hex2bin str truncated"); - return false; - } - hex_byte[0] = hexstr[0]; - hex_byte[1] = hexstr[1]; - *p = (unsigned char) strtol(hex_byte, &ep, 16); - if (*ep) { - applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); - return false; - } - p++; - hexstr += 2; - len--; - } - - return (len == 0 && *hexstr == 0) ? true : false; -} - -/* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. - Return 1 if the difference is negative, otherwise 0. */ -int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating Y. 
*/ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - * `tv_usec' is certainly positive. */ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. */ - return x->tv_sec < y->tv_sec; -} - -bool fulltest(const uint32_t *hash, const uint32_t *target) -{ - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if (hash[i] < target[i]) { - rc = true; - break; - } - } - - if (opt_debug) { - uint32_t hash_be[8], target_be[8]; - char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { - be32enc(hash_be + i, hash[7 - i]); - be32enc(target_be + i, target[7 - i]); - } - hash_str = bin2hex((unsigned char *)hash_be, 32); - target_str = bin2hex((unsigned char *)target_be, 32); - - applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? 
"hash <= target" - : "hash > target (false positive)", - hash_str, - target_str); - - free(hash_str); - free(target_str); - } - - return rc; -} - -void diff_to_target(uint32_t *target, double diff) -{ - uint64_t m; - int k; - - for (k = 6; k > 0 && diff > 1.0; k--) - diff /= 4294967296.0; - m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) - memset(target, 0xff, 32); - else { - memset(target, 0, 32); - target[k] = (uint32_t)m; - target[k + 1] = (uint32_t)(m >> 32); - } -} - -#ifdef WIN32 -#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) -#else -#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) -#endif - -static bool send_line(curl_socket_t sock, char *s) -{ - ssize_t len, sent = 0; - - len = (ssize_t)strlen(s); - s[len++] = '\n'; - - while (len > 0) { - struct timeval timeout = {0, 0}; - ssize_t n; - fd_set wd; - - FD_ZERO(&wd); - FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) - return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) - return false; - n = 0; - } - sent += n; - len -= n; - } - - return true; -} - -bool stratum_send_line(struct stratum_ctx *sctx, char *s) -{ - bool ret = false; - - if (opt_protocol) - applog(LOG_DEBUG, "> %s", s); - - pthread_mutex_lock(&sctx->sock_lock); - ret = send_line(sctx->sock, s); - pthread_mutex_unlock(&sctx->sock_lock); - - return ret; -} - -static bool socket_full(curl_socket_t sock, int timeout) -{ - struct timeval tv; - fd_set rd; - - FD_ZERO(&rd); - FD_SET(sock, &rd); - tv.tv_sec = timeout; - tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) - return true; - return false; -} - -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) -{ - return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); -} - -#define RBUFSIZE 2048 -#define RECVSIZE (RBUFSIZE - 4) - -static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) -{ - size_t old, snew; - - old = 
strlen(sctx->sockbuf); - snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { - sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); - sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); - } - strcpy(sctx->sockbuf + old, s); -} - -char *stratum_recv_line(struct stratum_ctx *sctx) -{ - ssize_t len, buflen; - char *tok, *sret = NULL; - - if (!strstr(sctx->sockbuf, "\n")) { - bool ret = true; - time_t rstart; - - time(&rstart); - if (!socket_full(sctx->sock, 60)) { - applog(LOG_ERR, "stratum_recv_line timed out"); - goto out; - } - do { - char s[RBUFSIZE]; - ssize_t n; - - memset(s, 0, RBUFSIZE); - n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { - ret = false; - break; - } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { - ret = false; - break; - } - } else - stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); - - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); - goto out; - } - } - - buflen = (ssize_t)strlen(sctx->sockbuf); - tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { - applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); - goto out; - } - sret = strdup(tok); - len = (ssize_t)strlen(sret); - - if (buflen > len + 1) - memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); - else - sctx->sockbuf[0] = '\0'; - -out: - if (sret && opt_protocol) - applog(LOG_DEBUG, "< %s", sret); - return sret; -} - -#if LIBCURL_VERSION_NUM >= 0x071101 -static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) -{ - curl_socket_t *sock = (curl_socket_t *)clientp; - *sock = socket(addr->family, addr->socktype, addr->protocol); - return *sock; -} -#endif - -bool stratum_connect(struct stratum_ctx *sctx, const char *url) -{ - CURL *curl; - int rc; - - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) - curl_easy_cleanup(sctx->curl); - sctx->curl = curl_easy_init(); - if 
(!sctx->curl) { - applog(LOG_ERR, "CURL initialization failed"); - pthread_mutex_unlock(&sctx->sock_lock); - return false; - } - curl = sctx->curl; - if (!sctx->sockbuf) { - sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); - sctx->sockbuf_size = RBUFSIZE; - } - sctx->sockbuf[0] = '\0'; - pthread_mutex_unlock(&sctx->sock_lock); - - if (url != sctx->url) { - free(sctx->url); - sctx->url = strdup(url); - } - free(sctx->curl_url); - sctx->curl_url = (char*)malloc(strlen(url)); - sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if (getenv("ALL_PROXY")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); - else - curl_easy_setopt(curl, CURLOPT_PROXY, ""); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif -#if LIBCURL_VERSION_NUM >= 0x071101 - curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); - curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); -#endif - curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); - curl_easy_cleanup(curl); - sctx->curl = NULL; - return false; - } - -#if LIBCURL_VERSION_NUM < 0x071101 - /* CURLINFO_LASTSOCKET is broken on Win64; only use it 
as a last resort */ - curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); -#endif - - return true; -} - -void stratum_disconnect(struct stratum_ctx *sctx) -{ - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { - curl_easy_cleanup(sctx->curl); - sctx->curl = NULL; - sctx->sockbuf[0] = '\0'; - } - pthread_mutex_unlock(&sctx->sock_lock); -} - -static const char *get_stratum_session_id(json_t *val) -{ - json_t *arr_val; - int i, n; - - arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) - return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { - const char *notify; - json_t *arr = json_array_get(arr_val, i); - - if (!arr || !json_is_array(arr)) - break; - notify = json_string_value(json_array_get(arr, 0)); - if (!notify) - continue; - if (!strcasecmp(notify, "mining.notify")) - return json_string_value(json_array_get(arr, 1)); - } - return NULL; -} - -bool stratum_subscribe(struct stratum_ctx *sctx) -{ - char *s, *sret = NULL; - const char *sid, *xnonce1; - int xn2_size; - json_t *val = NULL, *res_val, *err_val; - json_error_t err; - bool ret = false, retry = false; - -start: - s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); - if (retry) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); - else - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - if (!socket_full(sctx->sock, 30)) { - applog(LOG_ERR, "stratum_subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { - free(s); - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - } - goto out; - } - - sid = get_stratum_session_id(res_val); - if (opt_debug && !sid) - applog(LOG_DEBUG, "Failed to get Stratum session id"); - xnonce1 = json_string_value(json_array_get(res_val, 1)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = json_integer_value(json_array_get(res_val, 2)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; - } - - pthread_mutex_lock(&sctx->work_lock); - free(sctx->session_id); - free(sctx->xnonce1); - sctx->session_id = sid ? 
strdup(sid) : NULL; - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size); - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug && sid) - applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } - } - - return ret; -} - -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) -{ - json_t *val = NULL, *res_val, *err_val; - char *s, *sret; - json_error_t err; - bool ret = false; - - s = (char*)malloc(80 + strlen(user) + strlen(pass)); - sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); - - if (!stratum_send_line(sctx, s)) - goto out; - - while (1) { - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - if (!stratum_handle_method(sctx, sret)) - break; - free(sret); - } - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { - applog(LOG_ERR, "Stratum authentication failed"); - goto out; - } - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - return ret; -} - -static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) -{ - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward; - size_t coinb1_size, coinb2_size; - bool clean, ret = false; - int merkle_count, i; - json_t *merkle_arr; - unsigned char **merkle; - - job_id = json_string_value(json_array_get(params, 0)); - prevhash = json_string_value(json_array_get(params, 1)); - coinb1 = 
json_string_value(json_array_get(params, 2)); - coinb2 = json_string_value(json_array_get(params, 3)); - merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) - goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); - nbits = json_string_value(json_array_get(params, 6)); - ntime = json_string_value(json_array_get(params, 7)); - clean = json_is_true(json_array_get(params, 8)); - nreward = json_string_value(json_array_get(params, 9)); - - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(ntime) != 8) { - applog(LOG_ERR, "Stratum notify: invalid parameters"); - goto out; - } - merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { - const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) - free(merkle[i]); - free(merkle); - applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); - goto out; - } - merkle[i] = (unsigned char*)malloc(32); - hex2bin(merkle[i], s, 32); - } - - pthread_mutex_lock(&sctx->work_lock); - - coinb1_size = strlen(coinb1) / 2; - coinb2_size = strlen(coinb2) / 2; - sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; - sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); - sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; - hex2bin(sctx->job.coinbase, coinb1, coinb1_size); - memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) - memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); - hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); - - free(sctx->job.job_id); - sctx->job.job_id = strdup(job_id); - hex2bin(sctx->job.prevhash, prevhash, 
32); - - for (i = 0; i < sctx->job.merkle_count; i++) - free(sctx->job.merkle[i]); - free(sctx->job.merkle); - sctx->job.merkle = merkle; - sctx->job.merkle_count = merkle_count; - - hex2bin(sctx->job.version, version, 4); - hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, ntime, 4); - if(nreward != NULL) - { - if(strlen(nreward) == 4) - hex2bin(sctx->job.nreward, nreward, 2); - } - sctx->job.clean = clean; - - sctx->job.diff = sctx->next_diff; - - pthread_mutex_unlock(&sctx->work_lock); - - ret = true; - -out: - return ret; -} - -static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) -{ - double diff; - - diff = json_number_value(json_array_get(params, 0)); - if (diff == 0) - return false; - - pthread_mutex_lock(&sctx->work_lock); - sctx->next_diff = diff; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) - applog(LOG_DEBUG, "Stratum difficulty set to %g", diff); - - return true; -} - -static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) -{ - json_t *port_val; - const char *host; - int port; - - host = json_string_value(json_array_get(params, 0)); - port_val = json_array_get(params, 1); - if (json_is_string(port_val)) - port = atoi(json_string_value(port_val)); - else - port = json_integer_value(port_val); - if (!host || !port) - return false; - - free(sctx->url); - sctx->url = (char*)malloc(32 + strlen(host)); - sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); - - applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); - - stratum_disconnect(sctx); - - return true; -} - -static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret; - - if (!id || json_is_null(id)) - return false; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(USER_AGENT)); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - 
free(s); - - return ret; -} - -static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; - - val = json_array_get(params, 0); - if (val) - applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) - return true; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_true()); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) -{ - json_t *val, *id, *params; - json_error_t err; - const char *method; - bool ret = false; - - val = JSON_LOADS(s, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - method = json_string_value(json_object_get(val, "method")); - if (!method) - goto out; - id = json_object_get(val, "id"); - params = json_object_get(val, "params"); - - if (!strcasecmp(method, "mining.notify")) { - ret = stratum_notify(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.set_difficulty")) { - ret = stratum_set_difficulty(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.reconnect")) { - ret = stratum_reconnect(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.get_version")) { - ret = stratum_get_version(sctx, id); - goto out; - } - if (!strcasecmp(method, "client.show_message")) { - ret = stratum_show_message(sctx, id, params); - goto out; - } - -out: - if (val) - json_decref(val); - - return ret; -} - -struct thread_q *tq_new(void) -{ - struct thread_q *tq; - - tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; - - INIT_LIST_HEAD(&tq->q); - pthread_mutex_init(&tq->mutex, NULL); - pthread_cond_init(&tq->cond, NULL); - - return tq; -} - -void tq_free(struct thread_q *tq) -{ - struct tq_ent *ent, *iter; - 
- if (!tq) - return; - - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { - list_del(&ent->q_node); - free(ent); - } - - pthread_cond_destroy(&tq->cond); - pthread_mutex_destroy(&tq->mutex); - - memset(tq, 0, sizeof(*tq)); /* poison */ - free(tq); -} - -static void tq_freezethaw(struct thread_q *tq, bool frozen) -{ - pthread_mutex_lock(&tq->mutex); - - tq->frozen = frozen; - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); -} - -void tq_freeze(struct thread_q *tq) -{ - tq_freezethaw(tq, true); -} - -void tq_thaw(struct thread_q *tq) -{ - tq_freezethaw(tq, false); -} - -bool tq_push(struct thread_q *tq, void *data) -{ - struct tq_ent *ent; - bool rc = true; - - ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; - - ent->data = data; - INIT_LIST_HEAD(&ent->q_node); - - pthread_mutex_lock(&tq->mutex); - - if (!tq->frozen) { - list_add_tail(&ent->q_node, &tq->q); - } else { - free(ent); - rc = false; - } - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); - - return rc; -} - -void *tq_pop(struct thread_q *tq, const struct timespec *abstime) -{ - struct tq_ent *ent; - void *rval = NULL; - int rc; - - pthread_mutex_lock(&tq->mutex); - - if (!list_empty(&tq->q)) - goto pop; - - if (abstime) - rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); - else - rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) - goto out; - if (list_empty(&tq->q)) - goto out; - -pop: - ent = list_entry(tq->q.next, struct tq_ent, q_node); - rval = ent->data; - - list_del(&ent->q_node); - free(ent); - -out: - pthread_mutex_unlock(&tq->mutex); - return rval; -} +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
See COPYING for more details. + */ + +#define _GNU_SOURCE +#include "cpuminer-config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(WIN32) +#include +#include +#else +#include +#include +#include +#include +#endif +#include "compat.h" +#include "miner.h" +#include "elist.h" + +struct data_buffer { + void *buf; + size_t len; +}; + +struct upload_buffer { + const void *buf; + size_t len; + size_t pos; +}; + +struct header_info { + char *lp_path; + char *reason; + char *stratum_url; +}; + +struct tq_ent { + void *data; + struct list_head q_node; +}; + +struct thread_q { + struct list_head q; + + bool frozen; + + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +void applog(int prio, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + +#ifdef HAVE_SYSLOG_H + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } +#else + if (0) {} +#endif + else { + char *f; + int len; + time_t now; + struct tm tm, *tm_p; + + time(&now); + + pthread_mutex_lock(&applog_lock); + tm_p = localtime(&now); + memcpy(&tm, tm_p, sizeof(tm)); + pthread_mutex_unlock(&applog_lock); + + len = (int)(40 + strlen(fmt) + 2); + f = (char*)alloca(len); + sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + fmt); + pthread_mutex_lock(&applog_lock); + vfprintf(stderr, f, ap); /* atomic write to stderr */ + fflush(stderr); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +static void databuf_free(struct data_buffer *db) +{ + if (!db) + return; + + free(db->buf); + + memset(db, 0, sizeof(*db)); +} + +static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = (struct data_buffer 
*)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const unsigned char zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + unsigned int len = (unsigned int)(size * nmemb); + + if (len > ub->len - ub->pos) + len = (unsigned int)(ub->len - ub->pos); + + if (len) { + memcpy(ptr, (char*)ub->buf + ub->pos, len); + ub->pos += len; + } + + return len; +} + +#if LIBCURL_VERSION_NUM >= 0x071200 +static int seek_data_cb(void *user_data, curl_off_t offset, int origin) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + + switch (origin) { + case SEEK_SET: + ub->pos = (size_t)offset; + break; + case SEEK_CUR: + ub->pos += (size_t)offset; + break; + case SEEK_END: + ub->pos = ub->len + (size_t)offset; + break; + default: + return 1; /* CURL_SEEKFUNC_FAIL */ + } + + return 0; /* CURL_SEEKFUNC_OK */ +} +#endif + +static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) +{ + struct header_info *hi = (struct header_info *)user_data; + size_t remlen, slen, ptrlen = size * nmemb; + char *rem, *val = NULL, *key = NULL; + void *tmp; + + val = (char*)calloc(1, ptrlen); + key = (char*)calloc(1, ptrlen); + if (!key || !val) + goto out; + + tmp = memchr(ptr, ':', ptrlen); + if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + goto out; + slen = (size_t)((char*)tmp - (char*)ptr); + if ((slen + 1) == ptrlen) /* skip key w/ no value */ + goto out; + memcpy(key, ptr, slen); /* store & nul term key */ + key[slen] = 0; + + rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ + remlen = ptrlen - 
slen - 1; + while ((remlen > 0) && (isspace(*rem))) { + remlen--; + rem++; + } + + memcpy(val, rem, remlen); /* store value, trim trailing ws */ + val[remlen] = 0; + while ((*val) && (isspace(val[strlen(val) - 1]))) { + val[strlen(val) - 1] = 0; + } + if (!*val) /* skip blank value */ + goto out; + + if (!strcasecmp("X-Long-Polling", key)) { + hi->lp_path = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Reject-Reason", key)) { + hi->reason = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Stratum", key)) { + hi->stratum_url = val; /* steal memory reference */ + val = NULL; + } + +out: + free(key); + free(val); + return ptrlen; +} + +#if LIBCURL_VERSION_NUM >= 0x070f06 +static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, + curlsocktype purpose) +{ + int keepalive = 1; + int tcp_keepcnt = 3; + int tcp_keepidle = 50; + int tcp_keepintvl = 50; +#ifdef WIN32 + DWORD outputBytes; +#endif + +#ifndef WIN32 + if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + sizeof(keepalive)))) + return 1; +#ifdef __linux + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + &tcp_keepcnt, sizeof(tcp_keepcnt)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + &tcp_keepidle, sizeof(tcp_keepidle)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __linux */ +#ifdef __APPLE_CC__ + if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __APPLE_CC__ */ +#else /* WIN32 */ + struct tcp_keepalive vals; + vals.onoff = 1; + vals.keepalivetime = tcp_keepidle * 1000; + vals.keepaliveinterval = tcp_keepintvl * 1000; + if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + NULL, 0, &outputBytes, NULL, NULL))) + return 1; +#endif /* WIN32 */ + + return 0; +} +#endif + +json_t *json_rpc_call(CURL *curl, const char *url, + const char 
*userpass, const char *rpc_req, + bool longpoll_scan, bool longpoll, int *curl_err) +{ + json_t *val, *err_val, *res_val; + int rc; + struct data_buffer all_data = {0}; + struct upload_buffer upload_data; + json_error_t err; + struct curl_slist *headers = NULL; + char len_hdr[64]; + char curl_err_str[CURL_ERROR_SIZE]; + long timeout = longpoll ? opt_timeout : 30; + struct header_info hi = {0}; + bool lp_scanning = longpoll_scan && !have_longpoll; + + /* it is assumed that 'curl' is freshly [re]initialized at this pt */ + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + if (opt_cert) + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); + curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); +#if LIBCURL_VERSION_NUM >= 0x071200 + curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); + curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); +#endif + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + if (userpass) { + curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + if (longpoll) + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif + 
curl_easy_setopt(curl, CURLOPT_POST, 1); + + if (opt_protocol) + applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); + + upload_data.buf = rpc_req; + upload_data.len = strlen(rpc_req); + upload_data.pos = 0; + sprintf(len_hdr, "Content-Length: %lu", + (unsigned long) upload_data.len); + + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, len_hdr); + headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + headers = curl_slist_append(headers, "X-Mining-Extensions: midstate"); + headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ + headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + rc = curl_easy_perform(curl); + if (curl_err != NULL) + *curl_err = rc; + if (rc) { + if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + goto err_out; + } + + /* If X-Stratum was found, activate Stratum */ + if (want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && + !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { + have_stratum = true; + tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); + hi.stratum_url = NULL; + } + + /* If X-Long-Polling was found, activate long polling */ + if (lp_scanning && hi.lp_path && !have_stratum) { + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); + hi.lp_path = NULL; + } + + if (!all_data.buf) { + applog(LOG_ERR, "Empty data received in json_rpc_call."); + goto err_out; + } + + val = JSON_LOADS((const char*)all_data.buf, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto err_out; + } + + if (opt_protocol) { + char *s = json_dumps(val, JSON_INDENT(3)); + applog(LOG_DEBUG, "JSON protocol response:\n%s", s); + free(s); + } + + /* JSON-RPC valid response returns a non-null 'result', + * and a null 'error'. 
*/ + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + char *s; + + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + + free(s); + + goto err_out; + } + + if (hi.reason) + json_object_set_new(val, "reject-reason", json_string(hi.reason)); + + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return val; + +err_out: + free(hi.lp_path); + free(hi.reason); + free(hi.stratum_url); + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return NULL; +} + +char *bin2hex(const unsigned char *p, size_t len) +{ + unsigned int i; + char *s = (char*)malloc((len * 2) + 1); + if (!s) + return NULL; + + for (i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); + + return s; +} + +bool hex2bin(unsigned char *p, const char *hexstr, size_t len) +{ + char hex_byte[3]; + char *ep; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return false; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (unsigned char) strtol(hex_byte, &ep, 16); + if (*ep) { + applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return false; + } + p++; + hexstr += 2; + len--; + } + + return (len == 0 && *hexstr == 0) ? true : false; +} + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ +int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y) +{ + /* Perform the carry for the later subtraction by updating Y. 
*/ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + * `tv_usec' is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +bool fulltest(const uint32_t *hash, const uint32_t *target) +{ + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > target[i]) { + rc = false; + break; + } + if (hash[i] < target[i]) { + rc = true; + break; + } + } + + if (opt_debug) { + uint32_t hash_be[8], target_be[8]; + char *hash_str, *target_str; + + for (i = 0; i < 8; i++) { + be32enc(hash_be + i, hash[7 - i]); + be32enc(target_be + i, target[7 - i]); + } + hash_str = bin2hex((unsigned char *)hash_be, 32); + target_str = bin2hex((unsigned char *)target_be, 32); + + applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", + rc ? 
"hash <= target" + : "hash > target (false positive)", + hash_str, + target_str); + + free(hash_str); + free(target_str); + } + + return rc; +} + +void diff_to_target(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k] = (uint32_t)m; + target[k + 1] = (uint32_t)(m >> 32); + } +} + +#ifdef WIN32 +#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) +#else +#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) +#endif + +static bool send_line(curl_socket_t sock, char *s) +{ + ssize_t len, sent = 0; + + len = (ssize_t)strlen(s); + s[len++] = '\n'; + + while (len > 0) { + struct timeval timeout = {0, 0}; + ssize_t n; + fd_set wd; + + FD_ZERO(&wd); + FD_SET(sock, &wd); + if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) + return false; + n = send(sock, s + sent, len, 0); + if (n < 0) { + if (!socket_blocks()) + return false; + n = 0; + } + sent += n; + len -= n; + } + + return true; +} + +bool stratum_send_line(struct stratum_ctx *sctx, char *s) +{ + bool ret = false; + + if (opt_protocol) + applog(LOG_DEBUG, "> %s", s); + + pthread_mutex_lock(&sctx->sock_lock); + ret = send_line(sctx->sock, s); + pthread_mutex_unlock(&sctx->sock_lock); + + return ret; +} + +static bool socket_full(curl_socket_t sock, int timeout) +{ + struct timeval tv; + fd_set rd; + + FD_ZERO(&rd); + FD_SET(sock, &rd); + tv.tv_sec = timeout; + tv.tv_usec = 0; + if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) + return true; + return false; +} + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) +{ + return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); +} + +#define RBUFSIZE 2048 +#define RECVSIZE (RBUFSIZE - 4) + +static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) +{ + size_t old, snew; + + old = 
strlen(sctx->sockbuf); + snew = old + strlen(s) + 1; + if (snew >= sctx->sockbuf_size) { + sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); + sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); + } + strcpy(sctx->sockbuf + old, s); +} + +char *stratum_recv_line(struct stratum_ctx *sctx) +{ + ssize_t len, buflen; + char *tok, *sret = NULL; + + if (!strstr(sctx->sockbuf, "\n")) { + bool ret = true; + time_t rstart; + + time(&rstart); + if (!socket_full(sctx->sock, 60)) { + applog(LOG_ERR, "stratum_recv_line timed out"); + goto out; + } + do { + char s[RBUFSIZE]; + ssize_t n; + + memset(s, 0, RBUFSIZE); + n = recv(sctx->sock, s, RECVSIZE, 0); + if (!n) { + ret = false; + break; + } + if (n < 0) { + if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + ret = false; + break; + } + } else + stratum_buffer_append(sctx, s); + } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + + if (!ret) { + applog(LOG_ERR, "stratum_recv_line failed"); + goto out; + } + } + + buflen = (ssize_t)strlen(sctx->sockbuf); + tok = strtok(sctx->sockbuf, "\n"); + if (!tok) { + applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); + goto out; + } + sret = strdup(tok); + len = (ssize_t)strlen(sret); + + if (buflen > len + 1) + memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); + else + sctx->sockbuf[0] = '\0'; + +out: + if (sret && opt_protocol) + applog(LOG_DEBUG, "< %s", sret); + return sret; +} + +#if LIBCURL_VERSION_NUM >= 0x071101 +static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, + struct curl_sockaddr *addr) +{ + curl_socket_t *sock = (curl_socket_t *)clientp; + *sock = socket(addr->family, addr->socktype, addr->protocol); + return *sock; +} +#endif + +bool stratum_connect(struct stratum_ctx *sctx, const char *url) +{ + CURL *curl; + int rc; + + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) + curl_easy_cleanup(sctx->curl); + sctx->curl = curl_easy_init(); + if 
(!sctx->curl) { + applog(LOG_ERR, "CURL initialization failed"); + pthread_mutex_unlock(&sctx->sock_lock); + return false; + } + curl = sctx->curl; + if (!sctx->sockbuf) { + sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); + sctx->sockbuf_size = RBUFSIZE; + } + sctx->sockbuf[0] = '\0'; + pthread_mutex_unlock(&sctx->sock_lock); + + if (url != sctx->url) { + free(sctx->url); + sctx->url = strdup(url); + } + free(sctx->curl_url); + sctx->curl_url = (char*)malloc(strlen(url)); + sprintf(sctx->curl_url, "http%s", strstr(url, "://")); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } else if (getenv("http_proxy")) { + if (getenv("all_proxy")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); + else if (getenv("ALL_PROXY")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); + else + curl_easy_setopt(curl, CURLOPT_PROXY, ""); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif +#if LIBCURL_VERSION_NUM >= 0x071101 + curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); +#endif + curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); + + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + curl_easy_cleanup(curl); + sctx->curl = NULL; + return false; + } + +#if LIBCURL_VERSION_NUM < 0x071101 + /* CURLINFO_LASTSOCKET is broken on Win64; only use it 
as a last resort */ + curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); +#endif + + return true; +} + +void stratum_disconnect(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) { + curl_easy_cleanup(sctx->curl); + sctx->curl = NULL; + sctx->sockbuf[0] = '\0'; + } + pthread_mutex_unlock(&sctx->sock_lock); +} + +static const char *get_stratum_session_id(json_t *val) +{ + json_t *arr_val; + int i, n; + + arr_val = json_array_get(val, 0); + if (!arr_val || !json_is_array(arr_val)) + return NULL; + n = json_array_size(arr_val); + for (i = 0; i < n; i++) { + const char *notify; + json_t *arr = json_array_get(arr_val, i); + + if (!arr || !json_is_array(arr)) + break; + notify = json_string_value(json_array_get(arr, 0)); + if (!notify) + continue; + if (!strcasecmp(notify, "mining.notify")) + return json_string_value(json_array_get(arr, 1)); + } + return NULL; +} + +bool stratum_subscribe(struct stratum_ctx *sctx) +{ + char *s, *sret = NULL; + const char *sid, *xnonce1; + int xn2_size; + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false, retry = false; + +start: + s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); + if (retry) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); + else if (sctx->session_id) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); + else + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); + + if (!stratum_send_line(sctx, s)) + goto out; + + if (!socket_full(sctx->sock, 30)) { + applog(LOG_ERR, "stratum_subscribe timed out"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + if (opt_debug || retry) { + free(s); + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + } + goto out; + } + + sid = get_stratum_session_id(res_val); + if (opt_debug && !sid) + applog(LOG_DEBUG, "Failed to get Stratum session id"); + xnonce1 = json_string_value(json_array_get(res_val, 1)); + if (!xnonce1) { + applog(LOG_ERR, "Failed to get extranonce1"); + goto out; + } + xn2_size = json_integer_value(json_array_get(res_val, 2)); + if (!xn2_size) { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } + + pthread_mutex_lock(&sctx->work_lock); + free(sctx->session_id); + free(sctx->xnonce1); + sctx->session_id = sid ? 
strdup(sid) : NULL; + sctx->xnonce1_size = strlen(xnonce1) / 2; + sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size); + hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); + sctx->xnonce2_size = xn2_size; + sctx->next_diff = 1.0; + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug && sid) + applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); + + ret = true; + +out: + free(s); + if (val) + json_decref(val); + + if (!ret) { + if (sret && !retry) { + retry = true; + goto start; + } + } + + return ret; +} + +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +{ + json_t *val = NULL, *res_val, *err_val; + char *s, *sret; + json_error_t err; + bool ret = false; + + s = (char*)malloc(80 + strlen(user) + strlen(pass)); + sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", + user, pass); + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + applog(LOG_ERR, "Stratum authentication failed"); + goto out; + } + + ret = true; + +out: + free(s); + if (val) + json_decref(val); + + return ret; +} + +static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int merkle_count, i; + json_t *merkle_arr; + unsigned char **merkle; + + job_id = json_string_value(json_array_get(params, 0)); + prevhash = json_string_value(json_array_get(params, 1)); + coinb1 = 
json_string_value(json_array_get(params, 2)); + coinb2 = json_string_value(json_array_get(params, 3)); + merkle_arr = json_array_get(params, 4); + if (!merkle_arr || !json_is_array(merkle_arr)) + goto out; + merkle_count = json_array_size(merkle_arr); + version = json_string_value(json_array_get(params, 5)); + nbits = json_string_value(json_array_get(params, 6)); + ntime = json_string_value(json_array_get(params, 7)); + clean = json_is_true(json_array_get(params, 8)); + nreward = json_string_value(json_array_get(params, 9)); + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(nbits) != 8 || strlen(ntime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); + for (i = 0; i < merkle_count; i++) { + const char *s = json_string_value(json_array_get(merkle_arr, i)); + if (!s || strlen(s) != 64) { + while (i--) + free(merkle[i]); + free(merkle); + applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + goto out; + } + merkle[i] = (unsigned char*)malloc(32); + hex2bin(merkle[i], s, 32); + } + + pthread_mutex_lock(&sctx->work_lock); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + + sctx->xnonce2_size + coinb2_size; + sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + hex2bin(sctx->job.prevhash, prevhash, 
32); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = merkle; + sctx->job.merkle_count = merkle_count; + + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, ntime, 4); + if(nreward != NULL) + { + if(strlen(nreward) == 4) + hex2bin(sctx->job.nreward, nreward, 2); + } + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + + pthread_mutex_unlock(&sctx->work_lock); + + ret = true; + +out: + return ret; +}

/*
 * Handle "mining.set_difficulty".
 *
 * Reads the first element of `params` as a number and stores it in
 * sctx->next_diff while holding sctx->work_lock.
 *
 * Returns false if the value is 0 (json_number_value() also yields 0 when
 * the element is missing or not a number), true once the value is stored.
 */
static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
{
	double diff;

	diff = json_number_value(json_array_get(params, 0));
	if (diff == 0)
		return false;

	pthread_mutex_lock(&sctx->work_lock);
	sctx->next_diff = diff;
	pthread_mutex_unlock(&sctx->work_lock);

	if (opt_debug)
		applog(LOG_DEBUG, "Stratum difficulty set to %g", diff);

	return true;
}

/*
 * Handle "client.reconnect".
 *
 * params[0] is the host name, params[1] the port (accepted either as a JSON
 * string or as a JSON integer).  On success sctx->url is replaced by a newly
 * allocated "stratum+tcp://host:port" string and the current connection is
 * dropped through stratum_disconnect().
 *
 * Returns false, leaving sctx->url untouched, when the host is missing, the
 * port evaluates to 0, or the new URL cannot be allocated.
 */
static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params)
{
	json_t *port_val;
	const char *host;
	char *url;
	int port;

	host = json_string_value(json_array_get(params, 0));
	port_val = json_array_get(params, 1);
	if (json_is_string(port_val))
		port = atoi(json_string_value(port_val));
	else
		port = json_integer_value(port_val);
	if (!host || !port)
		return false;

	/*
	 * Build the new URL before releasing the old one so that a failed
	 * allocation cannot leave sctx->url dangling.  The 32 extra bytes cover
	 * the "stratum+tcp://" prefix, the ':' separator, the decimal port and
	 * the terminating NUL.
	 */
	url = (char*)malloc(32 + strlen(host));
	if (!url)
		return false;
	sprintf(url, "stratum+tcp://%s:%d", host, port);

	free(sctx->url);
	sctx->url = url;

	applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url);

	stratum_disconnect(sctx);

	return true;
}

/*
 * Serialize and send the reply {"id": id, "error": null, "result": result}.
 *
 * Takes ownership of `result` (it is attached with json_object_set_new());
 * `id` is only borrowed.  Returns the status of stratum_send_line(), or
 * false when the reply could not be serialized.
 */
static bool stratum_reply_result(struct stratum_ctx *sctx, json_t *id, json_t *result)
{
	char *s;
	json_t *val;
	bool ret;

	val = json_object();
	json_object_set(val, "id", id);
	json_object_set_new(val, "error", json_null());
	json_object_set_new(val, "result", result);
	s = json_dumps(val, 0);
	/* json_dumps() returns NULL on failure; never hand that to the sender */
	ret = s ? stratum_send_line(sctx, s) : false;
	json_decref(val);
	free(s);

	return ret;
}

/*
 * Handle "client.get_version": answer the request with USER_AGENT.
 *
 * Returns false when the request carries no usable id (nothing to reply to)
 * or when the reply could not be sent.
 */
static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id)
{
	if (!id || json_is_null(id))
		return false;

	return stratum_reply_result(sctx, id, json_string(USER_AGENT));
}

/*
 * Handle "client.show_message".
 *
 * Logs params[0] when it is a string.  If the request carries an id, a
 * `true` result is sent back; without an id the call is treated as a
 * notification and the function returns true without replying.
 */
static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params)
{
	const char *msg;

	/* json_string_value() yields NULL for a missing or non-string element;
	 * passing NULL to "%s" would be undefined behaviour. */
	msg = json_string_value(json_array_get(params, 0));
	if (msg)
		applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", msg);

	if (!id || json_is_null(id))
		return true;

	return stratum_reply_result(sctx, id, json_true());
}

/*
 * Parse one JSON line `s` and dispatch it to the matching method handler.
 *
 * The method name is compared case-insensitively.  Returns the handler's
 * result; returns false when the line is not valid JSON, has no string
 * "method" member, or names a method that is not handled here.
 */
bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
{
	json_t *val, *id, *params;
	json_error_t err;
	const char *method;
	bool ret = false;

	val = JSON_LOADS(s, &err);
	if (!val) {
		applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text);
		goto out;
	}

	method = json_string_value(json_object_get(val, "method"));
	if (!method)
		goto out;
	id = json_object_get(val, "id");
	params = json_object_get(val, "params");

	if (!strcasecmp(method, "mining.notify"))
		ret = stratum_notify(sctx, params);
	else if (!strcasecmp(method, "mining.set_difficulty"))
		ret = stratum_set_difficulty(sctx, params);
	else if (!strcasecmp(method, "client.reconnect"))
		ret = stratum_reconnect(sctx, params);
	else if (!strcasecmp(method, "client.get_version"))
		ret = stratum_get_version(sctx, id);
	else if (!strcasecmp(method, "client.show_message"))
		ret = stratum_show_message(sctx, id, params);

out:
	if (val)
		json_decref(val);

	return ret;
}

/*
 * Allocate and initialise an empty thread queue.
 *
 * Returns NULL if the allocation fails.  The result must be released with
 * tq_free().
 */
struct thread_q *tq_new(void)
{
	struct thread_q *tq;

	tq = (struct thread_q *)calloc(1, sizeof(*tq));
	if (!tq)
		return NULL;

	INIT_LIST_HEAD(&tq->q);
	pthread_mutex_init(&tq->mutex, NULL);
	pthread_cond_init(&tq->cond, NULL);

	return tq;
}

/*
 * Destroy a thread queue; a NULL argument is ignored.
 *
 * Every queued entry is unlinked and freed.  The data pointers carried by
 * the entries are NOT freed here.
 */
void tq_free(struct thread_q *tq)
{
	struct tq_ent *ent, *iter;

	if (!tq)
		return;

	list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) {
		list_del(&ent->q_node);
		free(ent);
	}

	pthread_cond_destroy(&tq->cond);
	pthread_mutex_destroy(&tq->mutex);

	memset(tq, 0, sizeof(*tq));	/* poison */
	free(tq);
}

/*
 * Set the queue's frozen flag under the mutex and signal the condition
 * variable so that a thread blocked in tq_pop() wakes up.
 */
static void tq_freezethaw(struct thread_q *tq, bool frozen)
{
	pthread_mutex_lock(&tq->mutex);

	tq->frozen = frozen;

	pthread_cond_signal(&tq->cond);
	pthread_mutex_unlock(&tq->mutex);
}

/* Freeze the queue: subsequent tq_push() calls are rejected. */
void tq_freeze(struct thread_q *tq)
{
	tq_freezethaw(tq, true);
}

/* Thaw the queue: tq_push() accepts entries again. */
void tq_thaw(struct thread_q *tq)
{
	tq_freezethaw(tq, false);
}

/*
 * Append `data` to the tail of the queue and signal one waiter.
 *
 * Only the pointer is stored; the pointed-to object is not copied.
 * Returns false if the entry cannot be allocated or the queue is frozen.
 */
bool tq_push(struct thread_q *tq, void *data)
{
	struct tq_ent *ent;
	bool rc = true;

	ent = (struct tq_ent *)calloc(1, sizeof(*ent));
	if (!ent)
		return false;

	ent->data = data;
	INIT_LIST_HEAD(&ent->q_node);

	pthread_mutex_lock(&tq->mutex);

	if (!tq->frozen) {
		list_add_tail(&ent->q_node, &tq->q);
	} else {
		free(ent);
		rc = false;
	}

	pthread_cond_signal(&tq->cond);
	pthread_mutex_unlock(&tq->mutex);

	return rc;
}

/*
 * Remove and return the data pointer at the head of the queue.
 *
 * If the queue is empty the caller blocks on the condition variable once:
 * until `abstime` when it is non-NULL, indefinitely otherwise.
 *
 * Returns NULL when the wait fails or times out, or when the queue is still
 * empty after the wake-up.  The latter happens, for instance, when the
 * wake-up comes from tq_freezethaw(), which signals without queueing
 * anything.  The wait is deliberately not retried in a loop.
 */
void *tq_pop(struct thread_q *tq, const struct timespec *abstime)
{
	struct tq_ent *ent;
	void *rval = NULL;
	int rc;

	pthread_mutex_lock(&tq->mutex);

	if (!list_empty(&tq->q))
		goto pop;

	if (abstime)
		rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime);
	else
		rc = pthread_cond_wait(&tq->cond, &tq->mutex);
	if (rc)
		goto out;
	if (list_empty(&tq->q))
		goto out;

pop:
	ent = list_entry(tq->q.next, struct tq_ent, q_node);
	rval = ent->data;

	list_del(&ent->q_node);
	free(ent);

out:
	pthread_mutex_unlock(&tq->mutex);
	return rval;
}