-
Notifications
You must be signed in to change notification settings - Fork 1
/
sha2.lua
2962 lines (2736 loc) · 148 KB
/
sha2.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
--------------------------------------------------------------------------------------------------------------------------
-- sha2.lua
--------------------------------------------------------------------------------------------------------------------------
-- VERSION: 9 (2020-05-10)
-- AUTHOR: Egor Skriptunoff
-- LICENSE: MIT (the same license as Lua itself)
--
--
-- DESCRIPTION:
-- This module contains functions to calculate SHA digest:
-- MD5, SHA-1,
-- SHA-224, SHA-256, SHA-512/224, SHA-512/256, SHA-384, SHA-512,
-- SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256,
-- HMAC
-- Written in pure Lua.
-- Compatible with:
-- Lua 5.1, Lua 5.2, Lua 5.3, Lua 5.4, Fengari, LuaJIT 2.0/2.1 (any CPU endianness).
-- Main feature of this module: it was heavily optimized for speed.
-- For every Lua version the module contains particular implementation branch to get benefits from version-specific features.
-- - branch for Lua 5.1 (emulating bitwise operators using look-up table)
-- - branch for Lua 5.2 (using bit32/bit library), suitable for both Lua 5.2 with native "bit32" and Lua 5.1 with external library "bit"
-- - branch for Lua 5.3/5.4 (using native 64-bit bitwise operators)
-- - branch for Lua 5.3/5.4 (using native 32-bit bitwise operators) for Lua built with LUA_INT_TYPE=LUA_INT_INT
-- - branch for LuaJIT without FFI library (useful in a sandboxed environment)
-- - branch for LuaJIT x86 without FFI library (LuaJIT x86 has oddity because of lack of CPU registers)
-- - branch for LuaJIT 2.0 with FFI library (bit.* functions work only with Lua numbers)
-- - branch for LuaJIT 2.1 with FFI library (bit.* functions can work with "int64_t" arguments)
--
--
-- USAGE:
-- Input data should be provided as a binary string: either as a whole string or as a sequence of substrings (chunk-by-chunk loading, total length < 9*10^15 bytes).
-- Result (SHA digest) is returned in hexadecimal representation as a string of lowercase hex digits.
-- Simplest usage example:
-- local sha = require("sha2")
-- local your_hash = sha.sha256("your string")
-- See file "sha2_test.lua" for more examples.
--
--
-- CHANGELOG:
-- version date description
-- ------- ---------- -----------
-- 9 2020-05-10 Now works in OpenWrt's Lua (dialect of Lua 5.1 with "double" + "invisible int32")
-- 8 2019-09-03 SHA3 functions added
-- 7 2019-03-17 Added functions to convert to/from base64
-- 6 2018-11-12 HMAC added
-- 5 2018-11-10 SHA-1 added
-- 4 2018-11-03 MD5 added
-- 3 2018-11-02 Bug fixed: incorrect hashing of long (2 GByte) data streams on Lua 5.3/5.4 built with "int32" integers
-- 2 2018-10-07 Decreased module loading time in Lua 5.1 implementation branch (thanks to Peter Melnichenko for giving a hint)
-- 1 2018-10-06 First release (only SHA-2 functions)
-----------------------------------------------------------------------------
local print_debug_messages = false -- set to true to view some messages about your system's abilities and implementation branch chosen for your system
local unpack, table_concat, byte, char, string_rep, sub, gsub, gmatch, string_format, floor, ceil, math_min, math_max, tonumber, type =
table.unpack or unpack, table.concat, string.byte, string.char, string.rep, string.sub, string.gsub, string.gmatch, string.format, math.floor, math.ceil, math.min, math.max, tonumber, type
--------------------------------------------------------------------------------
-- EXAMINING YOUR SYSTEM
--------------------------------------------------------------------------------
local function get_precision(one)
-- "one" must be either float 1.0 or integer 1
-- returns bits_precision, is_integer
-- This function works correctly with all floating point datatypes (including non-IEEE-754)
local k, n, m, prev_n = 0, one, one
while true do
k, prev_n, n, m = k + 1, n, n + n + 1, m + m + k % 2
if k > 256 or n - (n - 1) ~= 1 or m - (m - 1) ~= 1 or n == m then
return k, false -- floating point datatype
elseif n == prev_n then
return k, true -- integer datatype
end
end
end
-- Make sure Lua has "double" numbers
local x = 2/3
local Lua_has_double = x * 5 > 3 and x * 4 < 3 and get_precision(1.0) >= 53
assert(Lua_has_double, "at least 53-bit floating point numbers are required")
-- Q:
-- SHA2 was designed for FPU-less machines.
-- So, why floating point numbers are needed for this module?
-- A:
-- 53-bit "double" numbers are useful to calculate "magic numbers" used in SHA.
-- I prefer to write 50 LOC "magic numbers calculator" instead of storing more than 200 constants explicitly in this source file.
local int_prec, Lua_has_integers = get_precision(1)
local Lua_has_int64 = Lua_has_integers and int_prec == 64
local Lua_has_int32 = Lua_has_integers and int_prec == 32
assert(Lua_has_int64 or Lua_has_int32 or not Lua_has_integers, "Lua integers must be either 32-bit or 64-bit")
-- Q:
-- Does it mean that almost all non-standard configurations are not supported?
-- A:
-- Yes. Sorry, too many problems to support all possible Lua numbers configurations.
-- Lua 5.1/5.2 with "int32" will not work.
-- Lua 5.1/5.2 with "int64" will not work.
-- Lua 5.1/5.2 with "int128" will not work.
-- Lua 5.1/5.2 with "float" will not work.
-- Lua 5.1/5.2 with "double" is OK. (default config for Lua 5.1, Lua 5.2, LuaJIT)
-- Lua 5.3/5.4 with "int32" + "float" will not work.
-- Lua 5.3/5.4 with "int64" + "float" will not work.
-- Lua 5.3/5.4 with "int128" + "float" will not work.
-- Lua 5.3/5.4 with "int32" + "double" is OK. (config used by Fengari)
-- Lua 5.3/5.4 with "int64" + "double" is OK. (default config for Lua 5.3, Lua 5.4)
-- Lua 5.3/5.4 with "int128" + "double" will not work.
-- Using floating point numbers better than "double" instead of "double" is OK (non-IEEE-754 floating point implementation are allowed).
-- Using "int128" instead of "int64" is not OK: "int128" would require different branch of implementation for optimized SHA512.
-- Check for LuaJIT and 32-bit bitwise libraries
local is_LuaJIT = ({false, [1] = true})[1] and (type(jit) ~= "table" or jit.version_num >= 20000) -- LuaJIT 1.x.x is treated as vanilla Lua 5.1
local is_LuaJIT_21 -- LuaJIT 2.1+
local LuaJIT_arch
local ffi -- LuaJIT FFI library (as a table)
local b -- 32-bit bitwise library (as a table)
local library_name
if is_LuaJIT then
-- Assuming "bit" library is always available on LuaJIT
b = require"bit"
library_name = "bit"
-- "ffi" is intentionally disabled on some systems for safety reason
local LuaJIT_has_FFI, result = pcall(require, "ffi")
if LuaJIT_has_FFI then
ffi = result
end
is_LuaJIT_21 = not not loadstring"b=0b0"
LuaJIT_arch = type(jit) == "table" and jit.arch or ffi and ffi.arch or nil
else
-- For vanilla Lua, "bit"/"bit32" libraries are searched in global namespace only. No attempt is made to load a library if it's not loaded yet.
for _, libname in ipairs(_VERSION == "Lua 5.2" and {"bit32", "bit"} or {"bit", "bit32"}) do
if type(_G[libname]) == "table" and _G[libname].bxor then
b = _G[libname]
library_name = libname
break
end
end
end
--------------------------------------------------------------------------------
-- You can disable here some of your system's abilities (for testing purposes)
--------------------------------------------------------------------------------
-- is_LuaJIT = nil
-- is_LuaJIT_21 = nil
-- ffi = nil
-- Lua_has_int32 = nil
-- Lua_has_int64 = nil
-- b, library_name = nil
--------------------------------------------------------------------------------
if print_debug_messages then
-- Printing list of abilities of your system
print("Abilities:")
print(" Lua version: "..(is_LuaJIT and "LuaJIT "..(is_LuaJIT_21 and "2.1 " or "2.0 ")..(LuaJIT_arch or "")..(ffi and " with FFI" or " without FFI") or _VERSION))
print(" Integer bitwise operators: "..(Lua_has_int64 and "int64" or Lua_has_int32 and "int32" or "no"))
print(" 32-bit bitwise library: "..(library_name or "not found"))
end
-- Selecting the most suitable implementation for given set of abilities
local method, branch
if is_LuaJIT and ffi then
method = "Using 'ffi' library of LuaJIT"
branch = "FFI"
elseif is_LuaJIT then
method = "Using special code for FFI-less LuaJIT"
branch = "LJ"
elseif Lua_has_int64 then
method = "Using native int64 bitwise operators"
branch = "INT64"
elseif Lua_has_int32 then
method = "Using native int32 bitwise operators"
branch = "INT32"
elseif library_name then -- when bitwise library is available (Lua 5.2 with native library "bit32" or Lua 5.1 with external library "bit")
method = "Using '"..library_name.."' library"
branch = "LIB32"
else
method = "Emulating bitwise operators using look-up table"
branch = "EMUL"
end
if print_debug_messages then
-- Printing the implementation selected to be used on your system
print("Implementation selected:")
print(" "..method)
end
--------------------------------------------------------------------------------
-- BASIC 32-BIT BITWISE FUNCTIONS
--------------------------------------------------------------------------------
local AND, OR, XOR, SHL, SHR, ROL, ROR, NOT, NORM, HEX, XOR_BYTE
-- Only low 32 bits of function arguments matter, high bits are ignored
-- The result of all functions (except HEX) is an integer inside "correct range":
-- for "bit" library: (-2^31)..(2^31-1)
-- for "bit32" library: 0..(2^32-1)
if branch == "FFI" or branch == "LJ" or branch == "LIB32" then
-- Your system has 32-bit bitwise library (either "bit" or "bit32")
AND = b.band -- 2 arguments
OR = b.bor -- 2 arguments
XOR = b.bxor -- 2..5 arguments
SHL = b.lshift -- second argument is integer 0..31
SHR = b.rshift -- second argument is integer 0..31
ROL = b.rol or b.lrotate -- second argument is integer 0..31
ROR = b.ror or b.rrotate -- second argument is integer 0..31
NOT = b.bnot -- only for LuaJIT
NORM = b.tobit -- only for LuaJIT
HEX = b.tohex -- returns string of 8 lowercase hexadecimal digits
assert(AND and OR and XOR and SHL and SHR and ROL and ROR and NOT, "Library '"..library_name.."' is incomplete")
XOR_BYTE = XOR -- XOR of two bytes (0..255)
elseif branch == "EMUL" then
-- Emulating 32-bit bitwise operations using 53-bit floating point arithmetic
function SHL(x, n)
return (x * 2^n) % 2^32
end
function SHR(x, n)
-- return (x % 2^32 - x % 2^n) / 2^n
x = x % 2^32 / 2^n
return x - x % 1
end
function ROL(x, n)
x = x % 2^32 * 2^n
local r = x % 2^32
return r + (x - r) / 2^32
end
function ROR(x, n)
x = x % 2^32 / 2^n
local r = x % 1
return r * 2^32 + (x - r)
end
local AND_of_two_bytes = {[0] = 0} -- look-up table (256*256 entries)
local idx = 0
for y = 0, 127 * 256, 256 do
for x = y, y + 127 do
x = AND_of_two_bytes[x] * 2
AND_of_two_bytes[idx] = x
AND_of_two_bytes[idx + 1] = x
AND_of_two_bytes[idx + 256] = x
AND_of_two_bytes[idx + 257] = x + 1
idx = idx + 2
end
idx = idx + 256
end
local function and_or_xor(x, y, operation)
-- operation: nil = AND, 1 = OR, 2 = XOR
local x0 = x % 2^32
local y0 = y % 2^32
local rx = x0 % 256
local ry = y0 % 256
local res = AND_of_two_bytes[rx + ry * 256]
x = x0 - rx
y = (y0 - ry) / 256
rx = x % 65536
ry = y % 256
res = res + AND_of_two_bytes[rx + ry] * 256
x = (x - rx) / 256
y = (y - ry) / 256
rx = x % 65536 + y % 256
res = res + AND_of_two_bytes[rx] * 65536
res = res + AND_of_two_bytes[(x + y - rx) / 256] * 16777216
if operation then
res = x0 + y0 - operation * res
end
return res
end
function AND(x, y)
return and_or_xor(x, y)
end
function OR(x, y)
return and_or_xor(x, y, 1)
end
function XOR(x, y, z, t, u) -- 2..5 arguments
if z then
if t then
if u then
t = and_or_xor(t, u, 2)
end
z = and_or_xor(z, t, 2)
end
y = and_or_xor(y, z, 2)
end
return and_or_xor(x, y, 2)
end
function XOR_BYTE(x, y)
return x + y - 2 * AND_of_two_bytes[x + y * 256]
end
end
HEX = HEX
or
pcall(string_format, "%x", 2^31) and
function (x) -- returns string of 8 lowercase hexadecimal digits
return string_format("%08x", x % 4294967296)
end
or
function (x) -- for OpenWrt's dialect of Lua
return string_format("%08x", (x + 2^31) % 2^32 - 2^31)
end
local function XOR32A5(x)
return XOR(x, 0xA5A5A5A5) % 4294967296
end
local function create_array_of_lanes()
return {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
end
--------------------------------------------------------------------------------
-- CREATING OPTIMIZED INNER LOOP
--------------------------------------------------------------------------------
-- Inner loop functions
local sha256_feed_64, sha512_feed_128, md5_feed_64, sha1_feed_64, keccak_feed
-- Arrays of SHA2 "magic numbers" (in "INT64" and "FFI" branches "*_lo" arrays contain 64-bit values)
local sha2_K_lo, sha2_K_hi, sha2_H_lo, sha2_H_hi, sha3_RC_lo, sha3_RC_hi = {}, {}, {}, {}, {}, {}
local sha2_H_ext256 = {[224] = {}, [256] = sha2_H_hi}
local sha2_H_ext512_lo, sha2_H_ext512_hi = {[384] = {}, [512] = sha2_H_lo}, {[384] = {}, [512] = sha2_H_hi}
local md5_K, md5_sha1_H = {}, {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
local md5_next_shift = {0, 0, 0, 0, 0, 0, 0, 0, 28, 25, 26, 27, 0, 0, 10, 9, 11, 12, 0, 15, 16, 17, 18, 0, 20, 22, 23, 21}
local HEX64, XOR64A5, lanes_index_base -- defined only for branches that internally use 64-bit integers: "INT64" and "FFI"
local common_W = {} -- temporary table shared between all calculations (to avoid creating new temporary table every time)
local K_lo_modulo, hi_factor, hi_factor_keccak = 4294967296, 0, 0
local function build_keccak_format(elem)
local keccak_format = {}
for _, size in ipairs{1, 9, 13, 17, 18, 21} do
keccak_format[size] = "<"..string_rep(elem, size)
end
return keccak_format
end
if branch == "FFI" then
-- SHA256 implementation for "LuaJIT with FFI" branch
local common_W_FFI_int32 = ffi.new"int32_t[80]" -- 64 is enough for SHA256, but 80 is needed for SHA-1
function sha256_feed_64(H, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 64
local W, K = common_W_FFI_int32, sha2_K_hi
for pos = offs, offs + size - 1, 64 do
for j = 0, 15 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
end
for j = 16, 63 do
local a, b = W[j-15], W[j-2]
W[j] = NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) + W[j-7] + W[j-16] )
end
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
for j = 0, 63, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
local z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j] + K[j+1] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+1] + K[j+2] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+2] + K[j+3] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+3] + K[j+4] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+4] + K[j+5] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+5] + K[j+6] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+6] + K[j+7] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(g, AND(e, XOR(f, g))) + XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + (W[j+7] + K[j+8] + h) )
h, g, f, e = g, f, e, NORM( d + z )
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
end
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
end
end
local common_W_FFI_int64 = ffi.new"int64_t[80]"
local int64 = ffi.typeof"int64_t"
local int32 = ffi.typeof"int32_t"
local uint32 = ffi.typeof"uint32_t"
hi_factor = int64(2^32)
if is_LuaJIT_21 then -- LuaJIT 2.1 supports bitwise 64-bit operations
local AND64, OR64, XOR64, NOT64, SHL64, SHR64, ROL64, ROR64 -- introducing synonyms for better code readability
= AND, OR, XOR, NOT, SHL, SHR, ROL, ROR
HEX64 = HEX
-- SHA3 implementation for "LuaJIT 2.1 + FFI" branch
local lanes_arr64 = ffi.typeof"int64_t[30]" -- 25 + 5 for temporary usage
-- lanes array is indexed from 0
lanes_index_base = 0
hi_factor_keccak = int64(2^32)
function create_array_of_lanes()
return lanes_arr64()
end
function keccak_feed(lanes, _, str, offs, size, block_size_in_bytes)
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
local RC = sha3_RC_lo
local qwords_qty = SHR(block_size_in_bytes, 3)
for pos = offs, offs + size - 1, block_size_in_bytes do
for j = 0, qwords_qty - 1 do
pos = pos + 8
local h, g, f, e, d, c, b, a = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
lanes[j] = XOR64(lanes[j], OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))))
end
for round_idx = 1, 24 do
for j = 0, 4 do
lanes[25 + j] = XOR64(lanes[j], lanes[j+5], lanes[j+10], lanes[j+15], lanes[j+20])
end
local D = XOR64(lanes[25], ROL64(lanes[27], 1))
lanes[1], lanes[6], lanes[11], lanes[16] = ROL64(XOR64(D, lanes[6]), 44), ROL64(XOR64(D, lanes[16]), 45), ROL64(XOR64(D, lanes[1]), 1), ROL64(XOR64(D, lanes[11]), 10)
lanes[21] = ROL64(XOR64(D, lanes[21]), 2)
D = XOR64(lanes[26], ROL64(lanes[28], 1))
lanes[2], lanes[7], lanes[12], lanes[22] = ROL64(XOR64(D, lanes[12]), 43), ROL64(XOR64(D, lanes[22]), 61), ROL64(XOR64(D, lanes[7]), 6), ROL64(XOR64(D, lanes[2]), 62)
lanes[17] = ROL64(XOR64(D, lanes[17]), 15)
D = XOR64(lanes[27], ROL64(lanes[29], 1))
lanes[3], lanes[8], lanes[18], lanes[23] = ROL64(XOR64(D, lanes[18]), 21), ROL64(XOR64(D, lanes[3]), 28), ROL64(XOR64(D, lanes[23]), 56), ROL64(XOR64(D, lanes[8]), 55)
lanes[13] = ROL64(XOR64(D, lanes[13]), 25)
D = XOR64(lanes[28], ROL64(lanes[25], 1))
lanes[4], lanes[14], lanes[19], lanes[24] = ROL64(XOR64(D, lanes[24]), 14), ROL64(XOR64(D, lanes[19]), 8), ROL64(XOR64(D, lanes[4]), 27), ROL64(XOR64(D, lanes[14]), 39)
lanes[9] = ROL64(XOR64(D, lanes[9]), 20)
D = XOR64(lanes[29], ROL64(lanes[26], 1))
lanes[5], lanes[10], lanes[15], lanes[20] = ROL64(XOR64(D, lanes[10]), 3), ROL64(XOR64(D, lanes[20]), 18), ROL64(XOR64(D, lanes[5]), 36), ROL64(XOR64(D, lanes[15]), 41)
lanes[0] = XOR64(D, lanes[0])
lanes[0], lanes[1], lanes[2], lanes[3], lanes[4] = XOR64(lanes[0], AND64(NOT64(lanes[1]), lanes[2]), RC[round_idx]), XOR64(lanes[1], AND64(NOT64(lanes[2]), lanes[3])), XOR64(lanes[2], AND64(NOT64(lanes[3]), lanes[4])), XOR64(lanes[3], AND64(NOT64(lanes[4]), lanes[0])), XOR64(lanes[4], AND64(NOT64(lanes[0]), lanes[1]))
lanes[5], lanes[6], lanes[7], lanes[8], lanes[9] = XOR64(lanes[8], AND64(NOT64(lanes[9]), lanes[5])), XOR64(lanes[9], AND64(NOT64(lanes[5]), lanes[6])), XOR64(lanes[5], AND64(NOT64(lanes[6]), lanes[7])), XOR64(lanes[6], AND64(NOT64(lanes[7]), lanes[8])), XOR64(lanes[7], AND64(NOT64(lanes[8]), lanes[9]))
lanes[10], lanes[11], lanes[12], lanes[13], lanes[14] = XOR64(lanes[11], AND64(NOT64(lanes[12]), lanes[13])), XOR64(lanes[12], AND64(NOT64(lanes[13]), lanes[14])), XOR64(lanes[13], AND64(NOT64(lanes[14]), lanes[10])), XOR64(lanes[14], AND64(NOT64(lanes[10]), lanes[11])), XOR64(lanes[10], AND64(NOT64(lanes[11]), lanes[12]))
lanes[15], lanes[16], lanes[17], lanes[18], lanes[19] = XOR64(lanes[19], AND64(NOT64(lanes[15]), lanes[16])), XOR64(lanes[15], AND64(NOT64(lanes[16]), lanes[17])), XOR64(lanes[16], AND64(NOT64(lanes[17]), lanes[18])), XOR64(lanes[17], AND64(NOT64(lanes[18]), lanes[19])), XOR64(lanes[18], AND64(NOT64(lanes[19]), lanes[15]))
lanes[20], lanes[21], lanes[22], lanes[23], lanes[24] = XOR64(lanes[22], AND64(NOT64(lanes[23]), lanes[24])), XOR64(lanes[23], AND64(NOT64(lanes[24]), lanes[20])), XOR64(lanes[24], AND64(NOT64(lanes[20]), lanes[21])), XOR64(lanes[20], AND64(NOT64(lanes[21]), lanes[22])), XOR64(lanes[21], AND64(NOT64(lanes[22]), lanes[23]))
end
end
end
-- SHA512 implementation for "LuaJIT 2.1 + FFI" branch
local A5_long = 0xA5A5A5A5 * int64(2^32 + 1) -- It's impossible to use constant 0xA5A5A5A5A5A5A5A5LL because it will raise syntax error on other Lua versions
function XOR64A5(long)
return XOR64(long, A5_long)
end
function sha512_feed_128(H, _, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 128
local W, K = common_W_FFI_int64, sha2_K_lo
for pos = offs, offs + size - 1, 128 do
for j = 0, 15 do
pos = pos + 8
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
W[j] = OR64(OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32), uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h))))
end
for j = 16, 79 do
local a, b = W[j-15], W[j-2]
W[j] = XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7)) + XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6)) + W[j-7] + W[j-16]
end
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
for j = 0, 79, 8 do
local z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+1] + W[j]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+2] + W[j+1]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+3] + W[j+2]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+4] + W[j+3]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+5] + W[j+4]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+6] + W[j+5]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+7] + W[j+6]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
z = XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23)) + XOR64(g, AND64(e, XOR64(f, g))) + h + K[j+8] + W[j+7]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XOR64(AND64(XOR64(a, b), c), AND64(a, b)) + XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30)) + z
end
H[1] = a + H[1]
H[2] = b + H[2]
H[3] = c + H[3]
H[4] = d + H[4]
H[5] = e + H[5]
H[6] = f + H[6]
H[7] = g + H[7]
H[8] = h + H[8]
end
end
else -- LuaJIT 2.0 doesn't support 64-bit bitwise operations
-- SHA512 implementation for "LuaJIT 2.0 + FFI" branch
local union64 = ffi.typeof"union{int64_t i64; struct{int32_t lo, hi;} i32;}"
do -- make sure the struct is endianness-compatible
local u = union64(1)
if u.i32.lo < u.i32.hi then
union64 = ffi.typeof"union{int64_t i64; struct{int32_t hi, lo;} i32;}"
end
end
local unions64 = ffi.typeof("$[?]", union64)
local U = unions64(3) -- this array of unions is used for fast splitting int64 into int32_high and int32_low
-- "xorrific" 64-bit functions :-)
-- int64 input is splitted into two int32 parts, some bitwise 32-bit operations are performed, finally the result is converted to int64
-- these functions are needed because bit.* functions in LuaJIT 2.0 don't work with int64_t
local function XORROR64_1(a)
-- return XOR64(ROR64(a, 1), ROR64(a, 8), SHR64(a, 7))
U[0].i64 = a
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
return t_hi * int64(2^32) + uint32(int32(t_lo))
end
local function XORROR64_2(b)
-- return XOR64(ROR64(b, 19), ROL64(b, 3), SHR64(b, 6))
U[0].i64 = b
local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi
local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
return u_hi * int64(2^32) + uint32(int32(u_lo))
end
local function XORROR64_3(e)
-- return XOR64(ROR64(e, 14), ROR64(e, 18), ROL64(e, 23))
U[0].i64 = e
local e_lo, e_hi = U[0].i32.lo, U[0].i32.hi
local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
return u_hi * int64(2^32) + uint32(int32(u_lo))
end
local function XORROR64_6(a)
-- return XOR64(ROR64(a, 28), ROL64(a, 25), ROL64(a, 30))
U[0].i64 = a
local b_lo, b_hi = U[0].i32.lo, U[0].i32.hi
local u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
local u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
return u_hi * int64(2^32) + uint32(int32(u_lo))
end
local function XORROR64_4(e, f, g)
-- return XOR64(g, AND64(e, XOR64(f, g)))
U[0].i64 = f
U[1].i64 = g
U[2].i64 = e
local f_lo, f_hi = U[0].i32.lo, U[0].i32.hi
local g_lo, g_hi = U[1].i32.lo, U[1].i32.hi
local e_lo, e_hi = U[2].i32.lo, U[2].i32.hi
local result_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
local result_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
return result_hi * int64(2^32) + uint32(int32(result_lo))
end
local function XORROR64_5(a, b, c)
-- return XOR64(AND64(XOR64(a, b), c), AND64(a, b))
U[0].i64 = a
U[1].i64 = b
U[2].i64 = c
local a_lo, a_hi = U[0].i32.lo, U[0].i32.hi
local b_lo, b_hi = U[1].i32.lo, U[1].i32.hi
local c_lo, c_hi = U[2].i32.lo, U[2].i32.hi
local result_lo = XOR(AND(XOR(a_lo, b_lo), c_lo), AND(a_lo, b_lo))
local result_hi = XOR(AND(XOR(a_hi, b_hi), c_hi), AND(a_hi, b_hi))
return result_hi * int64(2^32) + uint32(int32(result_lo))
end
function XOR64A5(long)
-- return XOR64(long, 0xA5A5A5A5A5A5A5A5)
U[0].i64 = long
local lo32, hi32 = U[0].i32.lo, U[0].i32.hi
lo32 = XOR(lo32, 0xA5A5A5A5)
hi32 = XOR(hi32, 0xA5A5A5A5)
return hi32 * int64(2^32) + uint32(int32(lo32))
end
function HEX64(long)
U[0].i64 = long
return HEX(U[0].i32.hi)..HEX(U[0].i32.lo)
end
function sha512_feed_128(H, _, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 128
local W, K = common_W_FFI_int64, sha2_K_lo
for pos = offs, offs + size - 1, 128 do
for j = 0, 15 do
pos = pos + 8
local a, b, c, d, e, f, g, h = byte(str, pos - 7, pos) -- slow, but doesn't depend on endianness
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d) * int64(2^32) + uint32(int32(OR(SHL(e, 24), SHL(f, 16), SHL(g, 8), h)))
end
for j = 16, 79 do
W[j] = XORROR64_1(W[j-15]) + XORROR64_2(W[j-2]) + W[j-7] + W[j-16]
end
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
for j = 0, 79, 8 do
local z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+1] + W[j]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+2] + W[j+1]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+3] + W[j+2]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+4] + W[j+3]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+5] + W[j+4]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+6] + W[j+5]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+7] + W[j+6]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
z = XORROR64_3(e) + XORROR64_4(e, f, g) + h + K[j+8] + W[j+7]
h, g, f, e = g, f, e, z + d
d, c, b, a = c, b, a, XORROR64_5(a, b, c) + XORROR64_6(a) + z
end
H[1] = a + H[1]
H[2] = b + H[2]
H[3] = c + H[3]
H[4] = d + H[4]
H[5] = e + H[5]
H[6] = f + H[6]
H[7] = g + H[7]
H[8] = h + H[8]
end
end
end
-- MD5 implementation for "LuaJIT with FFI" branch
function md5_feed_64(H, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 64
local W, K = common_W_FFI_int32, md5_K
for pos = offs, offs + size - 1, 64 do
for j = 0, 15 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
W[j] = OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a)
end
local a, b, c, d = H[1], H[2], H[3], H[4]
for j = 0, 15, 4 do
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+1] + W[j ] + a), 7) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+2] + W[j+1] + a), 12) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+3] + W[j+2] + a), 17) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(d, AND(b, XOR(c, d))) + (K[j+4] + W[j+3] + a), 22) + b)
end
for j = 16, 31, 4 do
local g = 5*j
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+1] + W[AND(g + 1, 15)] + a), 5) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+2] + W[AND(g + 6, 15)] + a), 9) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+3] + W[AND(g - 5, 15)] + a), 14) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, AND(d, XOR(b, c))) + (K[j+4] + W[AND(g , 15)] + a), 20) + b)
end
for j = 32, 47, 4 do
local g = 3*j
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+1] + W[AND(g + 5, 15)] + a), 4) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+2] + W[AND(g + 8, 15)] + a), 11) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+3] + W[AND(g - 5, 15)] + a), 16) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(b, c, d) + (K[j+4] + W[AND(g - 2, 15)] + a), 23) + b)
end
for j = 48, 63, 4 do
local g = 7*j
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+1] + W[AND(g , 15)] + a), 6) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+2] + W[AND(g + 7, 15)] + a), 10) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+3] + W[AND(g - 2, 15)] + a), 15) + b)
a, d, c, b = d, c, b, NORM(ROL(XOR(c, OR(b, NOT(d))) + (K[j+4] + W[AND(g + 5, 15)] + a), 21) + b)
end
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
end
end
-- SHA-1 implementation for "LuaJIT with FFI" branch
function sha1_feed_64(H, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 64
local W = common_W_FFI_int32
for pos = offs, offs + size - 1, 64 do
for j = 0, 15 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos) -- slow, but doesn't depend on endianness
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
end
for j = 16, 79 do
W[j] = ROL(XOR(W[j-3], W[j-8], W[j-14], W[j-16]), 1)
end
local a, b, c, d, e = H[1], H[2], H[3], H[4], H[5]
for j = 0, 19, 5 do
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j] + 0x5A827999 + e)) -- constant = floor(2^30 * sqrt(2))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+1] + 0x5A827999 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+2] + 0x5A827999 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+3] + 0x5A827999 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(d, AND(b, XOR(d, c))) + (W[j+4] + 0x5A827999 + e))
end
for j = 20, 39, 5 do
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0x6ED9EBA1 + e)) -- 2^30 * sqrt(3)
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0x6ED9EBA1 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0x6ED9EBA1 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0x6ED9EBA1 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0x6ED9EBA1 + e))
end
for j = 40, 59, 5 do
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j] + 0x8F1BBCDC + e)) -- 2^30 * sqrt(5)
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+1] + 0x8F1BBCDC + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+2] + 0x8F1BBCDC + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+3] + 0x8F1BBCDC + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(AND(d, XOR(b, c)), AND(b, c)) + (W[j+4] + 0x8F1BBCDC + e))
end
for j = 60, 79, 5 do
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j] + 0xCA62C1D6 + e)) -- 2^30 * sqrt(10)
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+1] + 0xCA62C1D6 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+2] + 0xCA62C1D6 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+3] + 0xCA62C1D6 + e))
e, d, c, b, a = d, c, ROR(b, 2), a, NORM(ROL(a, 5) + XOR(b, c, d) + (W[j+4] + 0xCA62C1D6 + e))
end
H[1], H[2], H[3], H[4], H[5] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4]), NORM(e + H[5])
end
end
end
-- SHA3 implementation for "LuaJIT 2.0 + FFI" and "LuaJIT without FFI" branches
if branch == "FFI" and not is_LuaJIT_21 or branch == "LJ" then
if branch == "FFI" then
local lanes_arr32 = ffi.typeof"int32_t[31]" -- 25 + 5 + 1 (due to 1-based indexing)
function create_array_of_lanes()
return lanes_arr32()
end
end
function keccak_feed(lanes_lo, lanes_hi, str, offs, size, block_size_in_bytes)
-- offs >= 0, size >= 0, size is multiple of block_size_in_bytes, block_size_in_bytes is positive multiple of 8
local RC_lo, RC_hi = sha3_RC_lo, sha3_RC_hi
local qwords_qty = SHR(block_size_in_bytes, 3)
for pos = offs, offs + size - 1, block_size_in_bytes do
for j = 1, qwords_qty do
local a, b, c, d = byte(str, pos + 1, pos + 4)
lanes_lo[j] = XOR(lanes_lo[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
pos = pos + 8
a, b, c, d = byte(str, pos - 3, pos)
lanes_hi[j] = XOR(lanes_hi[j], OR(SHL(d, 24), SHL(c, 16), SHL(b, 8), a))
end
for round_idx = 1, 24 do
for j = 1, 5 do
lanes_lo[25 + j] = XOR(lanes_lo[j], lanes_lo[j + 5], lanes_lo[j + 10], lanes_lo[j + 15], lanes_lo[j + 20])
end
for j = 1, 5 do
lanes_hi[25 + j] = XOR(lanes_hi[j], lanes_hi[j + 5], lanes_hi[j + 10], lanes_hi[j + 15], lanes_hi[j + 20])
end
local D_lo = XOR(lanes_lo[26], SHL(lanes_lo[28], 1), SHR(lanes_hi[28], 31))
local D_hi = XOR(lanes_hi[26], SHL(lanes_hi[28], 1), SHR(lanes_lo[28], 31))
lanes_lo[2], lanes_hi[2], lanes_lo[7], lanes_hi[7], lanes_lo[12], lanes_hi[12], lanes_lo[17], lanes_hi[17] = XOR(SHR(XOR(D_lo, lanes_lo[7]), 20), SHL(XOR(D_hi, lanes_hi[7]), 12)), XOR(SHR(XOR(D_hi, lanes_hi[7]), 20), SHL(XOR(D_lo, lanes_lo[7]), 12)), XOR(SHR(XOR(D_lo, lanes_lo[17]), 19), SHL(XOR(D_hi, lanes_hi[17]), 13)), XOR(SHR(XOR(D_hi, lanes_hi[17]), 19), SHL(XOR(D_lo, lanes_lo[17]), 13)), XOR(SHL(XOR(D_lo, lanes_lo[2]), 1), SHR(XOR(D_hi, lanes_hi[2]), 31)), XOR(SHL(XOR(D_hi, lanes_hi[2]), 1), SHR(XOR(D_lo, lanes_lo[2]), 31)), XOR(SHL(XOR(D_lo, lanes_lo[12]), 10), SHR(XOR(D_hi, lanes_hi[12]), 22)), XOR(SHL(XOR(D_hi, lanes_hi[12]), 10), SHR(XOR(D_lo, lanes_lo[12]), 22))
local L, H = XOR(D_lo, lanes_lo[22]), XOR(D_hi, lanes_hi[22])
lanes_lo[22], lanes_hi[22] = XOR(SHL(L, 2), SHR(H, 30)), XOR(SHL(H, 2), SHR(L, 30))
D_lo = XOR(lanes_lo[27], SHL(lanes_lo[29], 1), SHR(lanes_hi[29], 31))
D_hi = XOR(lanes_hi[27], SHL(lanes_hi[29], 1), SHR(lanes_lo[29], 31))
lanes_lo[3], lanes_hi[3], lanes_lo[8], lanes_hi[8], lanes_lo[13], lanes_hi[13], lanes_lo[23], lanes_hi[23] = XOR(SHR(XOR(D_lo, lanes_lo[13]), 21), SHL(XOR(D_hi, lanes_hi[13]), 11)), XOR(SHR(XOR(D_hi, lanes_hi[13]), 21), SHL(XOR(D_lo, lanes_lo[13]), 11)), XOR(SHR(XOR(D_lo, lanes_lo[23]), 3), SHL(XOR(D_hi, lanes_hi[23]), 29)), XOR(SHR(XOR(D_hi, lanes_hi[23]), 3), SHL(XOR(D_lo, lanes_lo[23]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[8]), 6), SHR(XOR(D_hi, lanes_hi[8]), 26)), XOR(SHL(XOR(D_hi, lanes_hi[8]), 6), SHR(XOR(D_lo, lanes_lo[8]), 26)), XOR(SHR(XOR(D_lo, lanes_lo[3]), 2), SHL(XOR(D_hi, lanes_hi[3]), 30)), XOR(SHR(XOR(D_hi, lanes_hi[3]), 2), SHL(XOR(D_lo, lanes_lo[3]), 30))
L, H = XOR(D_lo, lanes_lo[18]), XOR(D_hi, lanes_hi[18])
lanes_lo[18], lanes_hi[18] = XOR(SHL(L, 15), SHR(H, 17)), XOR(SHL(H, 15), SHR(L, 17))
D_lo = XOR(lanes_lo[28], SHL(lanes_lo[30], 1), SHR(lanes_hi[30], 31))
D_hi = XOR(lanes_hi[28], SHL(lanes_hi[30], 1), SHR(lanes_lo[30], 31))
lanes_lo[4], lanes_hi[4], lanes_lo[9], lanes_hi[9], lanes_lo[19], lanes_hi[19], lanes_lo[24], lanes_hi[24] = XOR(SHL(XOR(D_lo, lanes_lo[19]), 21), SHR(XOR(D_hi, lanes_hi[19]), 11)), XOR(SHL(XOR(D_hi, lanes_hi[19]), 21), SHR(XOR(D_lo, lanes_lo[19]), 11)), XOR(SHL(XOR(D_lo, lanes_lo[4]), 28), SHR(XOR(D_hi, lanes_hi[4]), 4)), XOR(SHL(XOR(D_hi, lanes_hi[4]), 28), SHR(XOR(D_lo, lanes_lo[4]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[24]), 8), SHL(XOR(D_hi, lanes_hi[24]), 24)), XOR(SHR(XOR(D_hi, lanes_hi[24]), 8), SHL(XOR(D_lo, lanes_lo[24]), 24)), XOR(SHR(XOR(D_lo, lanes_lo[9]), 9), SHL(XOR(D_hi, lanes_hi[9]), 23)), XOR(SHR(XOR(D_hi, lanes_hi[9]), 9), SHL(XOR(D_lo, lanes_lo[9]), 23))
L, H = XOR(D_lo, lanes_lo[14]), XOR(D_hi, lanes_hi[14])
lanes_lo[14], lanes_hi[14] = XOR(SHL(L, 25), SHR(H, 7)), XOR(SHL(H, 25), SHR(L, 7))
D_lo = XOR(lanes_lo[29], SHL(lanes_lo[26], 1), SHR(lanes_hi[26], 31))
D_hi = XOR(lanes_hi[29], SHL(lanes_hi[26], 1), SHR(lanes_lo[26], 31))
lanes_lo[5], lanes_hi[5], lanes_lo[15], lanes_hi[15], lanes_lo[20], lanes_hi[20], lanes_lo[25], lanes_hi[25] = XOR(SHL(XOR(D_lo, lanes_lo[25]), 14), SHR(XOR(D_hi, lanes_hi[25]), 18)), XOR(SHL(XOR(D_hi, lanes_hi[25]), 14), SHR(XOR(D_lo, lanes_lo[25]), 18)), XOR(SHL(XOR(D_lo, lanes_lo[20]), 8), SHR(XOR(D_hi, lanes_hi[20]), 24)), XOR(SHL(XOR(D_hi, lanes_hi[20]), 8), SHR(XOR(D_lo, lanes_lo[20]), 24)), XOR(SHL(XOR(D_lo, lanes_lo[5]), 27), SHR(XOR(D_hi, lanes_hi[5]), 5)), XOR(SHL(XOR(D_hi, lanes_hi[5]), 27), SHR(XOR(D_lo, lanes_lo[5]), 5)), XOR(SHR(XOR(D_lo, lanes_lo[15]), 25), SHL(XOR(D_hi, lanes_hi[15]), 7)), XOR(SHR(XOR(D_hi, lanes_hi[15]), 25), SHL(XOR(D_lo, lanes_lo[15]), 7))
L, H = XOR(D_lo, lanes_lo[10]), XOR(D_hi, lanes_hi[10])
lanes_lo[10], lanes_hi[10] = XOR(SHL(L, 20), SHR(H, 12)), XOR(SHL(H, 20), SHR(L, 12))
D_lo = XOR(lanes_lo[30], SHL(lanes_lo[27], 1), SHR(lanes_hi[27], 31))
D_hi = XOR(lanes_hi[30], SHL(lanes_hi[27], 1), SHR(lanes_lo[27], 31))
lanes_lo[6], lanes_hi[6], lanes_lo[11], lanes_hi[11], lanes_lo[16], lanes_hi[16], lanes_lo[21], lanes_hi[21] = XOR(SHL(XOR(D_lo, lanes_lo[11]), 3), SHR(XOR(D_hi, lanes_hi[11]), 29)), XOR(SHL(XOR(D_hi, lanes_hi[11]), 3), SHR(XOR(D_lo, lanes_lo[11]), 29)), XOR(SHL(XOR(D_lo, lanes_lo[21]), 18), SHR(XOR(D_hi, lanes_hi[21]), 14)), XOR(SHL(XOR(D_hi, lanes_hi[21]), 18), SHR(XOR(D_lo, lanes_lo[21]), 14)), XOR(SHR(XOR(D_lo, lanes_lo[6]), 28), SHL(XOR(D_hi, lanes_hi[6]), 4)), XOR(SHR(XOR(D_hi, lanes_hi[6]), 28), SHL(XOR(D_lo, lanes_lo[6]), 4)), XOR(SHR(XOR(D_lo, lanes_lo[16]), 23), SHL(XOR(D_hi, lanes_hi[16]), 9)), XOR(SHR(XOR(D_hi, lanes_hi[16]), 23), SHL(XOR(D_lo, lanes_lo[16]), 9))
lanes_lo[1], lanes_hi[1] = XOR(D_lo, lanes_lo[1]), XOR(D_hi, lanes_hi[1])
lanes_lo[1], lanes_lo[2], lanes_lo[3], lanes_lo[4], lanes_lo[5] = XOR(lanes_lo[1], AND(NOT(lanes_lo[2]), lanes_lo[3]), RC_lo[round_idx]), XOR(lanes_lo[2], AND(NOT(lanes_lo[3]), lanes_lo[4])), XOR(lanes_lo[3], AND(NOT(lanes_lo[4]), lanes_lo[5])), XOR(lanes_lo[4], AND(NOT(lanes_lo[5]), lanes_lo[1])), XOR(lanes_lo[5], AND(NOT(lanes_lo[1]), lanes_lo[2]))
lanes_lo[6], lanes_lo[7], lanes_lo[8], lanes_lo[9], lanes_lo[10] = XOR(lanes_lo[9], AND(NOT(lanes_lo[10]), lanes_lo[6])), XOR(lanes_lo[10], AND(NOT(lanes_lo[6]), lanes_lo[7])), XOR(lanes_lo[6], AND(NOT(lanes_lo[7]), lanes_lo[8])), XOR(lanes_lo[7], AND(NOT(lanes_lo[8]), lanes_lo[9])), XOR(lanes_lo[8], AND(NOT(lanes_lo[9]), lanes_lo[10]))
lanes_lo[11], lanes_lo[12], lanes_lo[13], lanes_lo[14], lanes_lo[15] = XOR(lanes_lo[12], AND(NOT(lanes_lo[13]), lanes_lo[14])), XOR(lanes_lo[13], AND(NOT(lanes_lo[14]), lanes_lo[15])), XOR(lanes_lo[14], AND(NOT(lanes_lo[15]), lanes_lo[11])), XOR(lanes_lo[15], AND(NOT(lanes_lo[11]), lanes_lo[12])), XOR(lanes_lo[11], AND(NOT(lanes_lo[12]), lanes_lo[13]))
lanes_lo[16], lanes_lo[17], lanes_lo[18], lanes_lo[19], lanes_lo[20] = XOR(lanes_lo[20], AND(NOT(lanes_lo[16]), lanes_lo[17])), XOR(lanes_lo[16], AND(NOT(lanes_lo[17]), lanes_lo[18])), XOR(lanes_lo[17], AND(NOT(lanes_lo[18]), lanes_lo[19])), XOR(lanes_lo[18], AND(NOT(lanes_lo[19]), lanes_lo[20])), XOR(lanes_lo[19], AND(NOT(lanes_lo[20]), lanes_lo[16]))
lanes_lo[21], lanes_lo[22], lanes_lo[23], lanes_lo[24], lanes_lo[25] = XOR(lanes_lo[23], AND(NOT(lanes_lo[24]), lanes_lo[25])), XOR(lanes_lo[24], AND(NOT(lanes_lo[25]), lanes_lo[21])), XOR(lanes_lo[25], AND(NOT(lanes_lo[21]), lanes_lo[22])), XOR(lanes_lo[21], AND(NOT(lanes_lo[22]), lanes_lo[23])), XOR(lanes_lo[22], AND(NOT(lanes_lo[23]), lanes_lo[24]))
lanes_hi[1], lanes_hi[2], lanes_hi[3], lanes_hi[4], lanes_hi[5] = XOR(lanes_hi[1], AND(NOT(lanes_hi[2]), lanes_hi[3]), RC_hi[round_idx]), XOR(lanes_hi[2], AND(NOT(lanes_hi[3]), lanes_hi[4])), XOR(lanes_hi[3], AND(NOT(lanes_hi[4]), lanes_hi[5])), XOR(lanes_hi[4], AND(NOT(lanes_hi[5]), lanes_hi[1])), XOR(lanes_hi[5], AND(NOT(lanes_hi[1]), lanes_hi[2]))
lanes_hi[6], lanes_hi[7], lanes_hi[8], lanes_hi[9], lanes_hi[10] = XOR(lanes_hi[9], AND(NOT(lanes_hi[10]), lanes_hi[6])), XOR(lanes_hi[10], AND(NOT(lanes_hi[6]), lanes_hi[7])), XOR(lanes_hi[6], AND(NOT(lanes_hi[7]), lanes_hi[8])), XOR(lanes_hi[7], AND(NOT(lanes_hi[8]), lanes_hi[9])), XOR(lanes_hi[8], AND(NOT(lanes_hi[9]), lanes_hi[10]))
lanes_hi[11], lanes_hi[12], lanes_hi[13], lanes_hi[14], lanes_hi[15] = XOR(lanes_hi[12], AND(NOT(lanes_hi[13]), lanes_hi[14])), XOR(lanes_hi[13], AND(NOT(lanes_hi[14]), lanes_hi[15])), XOR(lanes_hi[14], AND(NOT(lanes_hi[15]), lanes_hi[11])), XOR(lanes_hi[15], AND(NOT(lanes_hi[11]), lanes_hi[12])), XOR(lanes_hi[11], AND(NOT(lanes_hi[12]), lanes_hi[13]))
lanes_hi[16], lanes_hi[17], lanes_hi[18], lanes_hi[19], lanes_hi[20] = XOR(lanes_hi[20], AND(NOT(lanes_hi[16]), lanes_hi[17])), XOR(lanes_hi[16], AND(NOT(lanes_hi[17]), lanes_hi[18])), XOR(lanes_hi[17], AND(NOT(lanes_hi[18]), lanes_hi[19])), XOR(lanes_hi[18], AND(NOT(lanes_hi[19]), lanes_hi[20])), XOR(lanes_hi[19], AND(NOT(lanes_hi[20]), lanes_hi[16]))
lanes_hi[21], lanes_hi[22], lanes_hi[23], lanes_hi[24], lanes_hi[25] = XOR(lanes_hi[23], AND(NOT(lanes_hi[24]), lanes_hi[25])), XOR(lanes_hi[24], AND(NOT(lanes_hi[25]), lanes_hi[21])), XOR(lanes_hi[25], AND(NOT(lanes_hi[21]), lanes_hi[22])), XOR(lanes_hi[21], AND(NOT(lanes_hi[22]), lanes_hi[23])), XOR(lanes_hi[22], AND(NOT(lanes_hi[23]), lanes_hi[24]))
end
end
end
end
if branch == "LJ" then
-- SHA256 implementation for "LuaJIT without FFI" branch
function sha256_feed_64(H, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 64
local W, K = common_W, sha2_K_hi
for pos = offs, offs + size - 1, 64 do
for j = 1, 16 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos)
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
end
for j = 17, 64 do
local a, b = W[j-15], W[j-2]
W[j] = NORM( NORM( XOR(ROR(a, 7), ROL(a, 14), SHR(a, 3)) + XOR(ROL(b, 15), ROL(b, 13), SHR(b, 10)) ) + NORM( W[j-7] + W[j-16] ) )
end
local a, b, c, d, e, f, g, h = H[1], H[2], H[3], H[4], H[5], H[6], H[7], H[8]
for j = 1, 64, 8 do -- Thanks to Peter Cawley for this workaround (unroll the loop to avoid "PHI shuffling too complex" due to PHIs overlap)
local z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j] + W[j] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+1] + W[j+1] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+2] + W[j+2] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+3] + W[j+3] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+4] + W[j+4] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+5] + W[j+5] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+6] + W[j+6] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
z = NORM( XOR(ROR(e, 6), ROR(e, 11), ROL(e, 7)) + XOR(g, AND(e, XOR(f, g))) + (K[j+7] + W[j+7] + h) )
h, g, f, e = g, f, e, NORM(d + z)
d, c, b, a = c, b, a, NORM( XOR(AND(a, XOR(b, c)), AND(b, c)) + XOR(ROR(a, 2), ROR(a, 13), ROL(a, 10)) + z )
end
H[1], H[2], H[3], H[4] = NORM(a + H[1]), NORM(b + H[2]), NORM(c + H[3]), NORM(d + H[4])
H[5], H[6], H[7], H[8] = NORM(e + H[5]), NORM(f + H[6]), NORM(g + H[7]), NORM(h + H[8])
end
end
local function ADD64_4(a_lo, a_hi, b_lo, b_hi, c_lo, c_hi, d_lo, d_hi)
local sum_lo = a_lo % 2^32 + b_lo % 2^32 + c_lo % 2^32 + d_lo % 2^32
local sum_hi = a_hi + b_hi + c_hi + d_hi
local result_lo = NORM( sum_lo )
local result_hi = NORM( sum_hi + floor(sum_lo / 2^32) )
return result_lo, result_hi
end
if LuaJIT_arch == "x86" then -- Special trick is required to avoid "PHI shuffling too complex" on x86 platform
-- SHA512 implementation for "LuaJIT x86 without FFI" branch
function sha512_feed_128(H_lo, H_hi, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 128
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
for pos = offs, offs + size - 1, 128 do
for j = 1, 16*2 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos)
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
end
for jj = 17*2, 80*2, 2 do
local a_lo, a_hi = W[jj-30], W[jj-31]
local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
local b_lo, b_hi = W[jj-4], W[jj-5]
local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
end
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
local zero = 0
for j = 1, 80 do
local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
zero = zero + zero -- this thick is needed to avoid "PHI shuffling too complex" due to PHIs overlap
h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = OR(zero, g_lo), OR(zero, g_hi), OR(zero, f_lo), OR(zero, f_hi), OR(zero, e_lo), OR(zero, e_hi)
local sum_lo = z_lo % 2^32 + d_lo % 2^32
e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = OR(zero, c_lo), OR(zero, c_hi), OR(zero, b_lo), OR(zero, b_hi), OR(zero, a_lo), OR(zero, a_hi)
u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
local sum_lo = z_lo % 2^32 + t_lo % 2^32 + u_lo % 2^32
a_lo, a_hi = NORM( sum_lo ), NORM( z_hi + t_hi + u_hi + floor(sum_lo / 2^32) )
end
H_lo[1], H_hi[1] = ADD64_4(H_lo[1], H_hi[1], a_lo, a_hi, 0, 0, 0, 0)
H_lo[2], H_hi[2] = ADD64_4(H_lo[2], H_hi[2], b_lo, b_hi, 0, 0, 0, 0)
H_lo[3], H_hi[3] = ADD64_4(H_lo[3], H_hi[3], c_lo, c_hi, 0, 0, 0, 0)
H_lo[4], H_hi[4] = ADD64_4(H_lo[4], H_hi[4], d_lo, d_hi, 0, 0, 0, 0)
H_lo[5], H_hi[5] = ADD64_4(H_lo[5], H_hi[5], e_lo, e_hi, 0, 0, 0, 0)
H_lo[6], H_hi[6] = ADD64_4(H_lo[6], H_hi[6], f_lo, f_hi, 0, 0, 0, 0)
H_lo[7], H_hi[7] = ADD64_4(H_lo[7], H_hi[7], g_lo, g_hi, 0, 0, 0, 0)
H_lo[8], H_hi[8] = ADD64_4(H_lo[8], H_hi[8], h_lo, h_hi, 0, 0, 0, 0)
end
end
else -- all platforms except x86
-- SHA512 implementation for "LuaJIT non-x86 without FFI" branch
function sha512_feed_128(H_lo, H_hi, str, offs, size)
-- offs >= 0, size >= 0, size is multiple of 128
-- W1_hi, W1_lo, W2_hi, W2_lo, ... Wk_hi = W[2*k-1], Wk_lo = W[2*k]
local W, K_lo, K_hi = common_W, sha2_K_lo, sha2_K_hi
for pos = offs, offs + size - 1, 128 do
for j = 1, 16*2 do
pos = pos + 4
local a, b, c, d = byte(str, pos - 3, pos)
W[j] = OR(SHL(a, 24), SHL(b, 16), SHL(c, 8), d)
end
for jj = 17*2, 80*2, 2 do
local a_lo, a_hi = W[jj-30], W[jj-31]
local t_lo = XOR(OR(SHR(a_lo, 1), SHL(a_hi, 31)), OR(SHR(a_lo, 8), SHL(a_hi, 24)), OR(SHR(a_lo, 7), SHL(a_hi, 25)))
local t_hi = XOR(OR(SHR(a_hi, 1), SHL(a_lo, 31)), OR(SHR(a_hi, 8), SHL(a_lo, 24)), SHR(a_hi, 7))
local b_lo, b_hi = W[jj-4], W[jj-5]
local u_lo = XOR(OR(SHR(b_lo, 19), SHL(b_hi, 13)), OR(SHL(b_lo, 3), SHR(b_hi, 29)), OR(SHR(b_lo, 6), SHL(b_hi, 26)))
local u_hi = XOR(OR(SHR(b_hi, 19), SHL(b_lo, 13)), OR(SHL(b_hi, 3), SHR(b_lo, 29)), SHR(b_hi, 6))
W[jj], W[jj-1] = ADD64_4(t_lo, t_hi, u_lo, u_hi, W[jj-14], W[jj-15], W[jj-32], W[jj-33])
end
local a_lo, b_lo, c_lo, d_lo, e_lo, f_lo, g_lo, h_lo = H_lo[1], H_lo[2], H_lo[3], H_lo[4], H_lo[5], H_lo[6], H_lo[7], H_lo[8]
local a_hi, b_hi, c_hi, d_hi, e_hi, f_hi, g_hi, h_hi = H_hi[1], H_hi[2], H_hi[3], H_hi[4], H_hi[5], H_hi[6], H_hi[7], H_hi[8]
for j = 1, 80 do
local t_lo = XOR(g_lo, AND(e_lo, XOR(f_lo, g_lo)))
local t_hi = XOR(g_hi, AND(e_hi, XOR(f_hi, g_hi)))
local u_lo = XOR(OR(SHR(e_lo, 14), SHL(e_hi, 18)), OR(SHR(e_lo, 18), SHL(e_hi, 14)), OR(SHL(e_lo, 23), SHR(e_hi, 9)))
local u_hi = XOR(OR(SHR(e_hi, 14), SHL(e_lo, 18)), OR(SHR(e_hi, 18), SHL(e_lo, 14)), OR(SHL(e_hi, 23), SHR(e_lo, 9)))
local sum_lo = u_lo % 2^32 + t_lo % 2^32 + h_lo % 2^32 + K_lo[j] + W[2*j] % 2^32
local z_lo, z_hi = NORM( sum_lo ), NORM( u_hi + t_hi + h_hi + K_hi[j] + W[2*j-1] + floor(sum_lo / 2^32) )
h_lo, h_hi, g_lo, g_hi, f_lo, f_hi = g_lo, g_hi, f_lo, f_hi, e_lo, e_hi
local sum_lo = z_lo % 2^32 + d_lo % 2^32
e_lo, e_hi = NORM( sum_lo ), NORM( z_hi + d_hi + floor(sum_lo / 2^32) )
d_lo, d_hi, c_lo, c_hi, b_lo, b_hi = c_lo, c_hi, b_lo, b_hi, a_lo, a_hi
u_lo = XOR(OR(SHR(b_lo, 28), SHL(b_hi, 4)), OR(SHL(b_lo, 30), SHR(b_hi, 2)), OR(SHL(b_lo, 25), SHR(b_hi, 7)))
u_hi = XOR(OR(SHR(b_hi, 28), SHL(b_lo, 4)), OR(SHL(b_hi, 30), SHR(b_lo, 2)), OR(SHL(b_hi, 25), SHR(b_lo, 7)))
t_lo = OR(AND(d_lo, c_lo), AND(b_lo, XOR(d_lo, c_lo)))
t_hi = OR(AND(d_hi, c_hi), AND(b_hi, XOR(d_hi, c_hi)))
local sum_lo = z_lo % 2^32 + u_lo % 2^32 + t_lo % 2^32