/* X25519-AArch64 by Emil Lenngren (2018)
*
* To the extent possible under law, the person who associated CC0 with
* X25519-AArch64 has waived all copyright and related or neighboring rights
* to X25519-AArch64.
*
* You should have received a copy of the CC0 legalcode along with this
* work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
/*
* This is an AArch64 implementation of X25519.
* It follows the reference implementation, where a field element in [0..2^255-19)
* is represented by a 256-bit little-endian integer, reduced modulo 2^256-38,
* and may therefore still lie in the range [2^256-38..2^256).
* The scalar is a 256-bit integer in which certain bits are hardcoded per the specification.
*
* The implementation runs in constant time (~145k cycles on Cortex-A53),
* and no conditional branches or memory access patterns depend on secret data.
*/
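// Note (added for clarity; inferred from the masks and carry chains below):
// internally the arithmetic uses an unreduced 10-limb radix-2^25.5 form,
//   f = f0 + f1*2^26 + f2*2^51 + f3*2^77 + f4*2^102
//         + f5*2^128 + f6*2^153 + f7*2^179 + f8*2^204 + f9*2^230,
// with even limbs of up to 26 bits (mask 0x3ffffff) and odd limbs of up to
// 25 bits (mask 0x1ffffff). Carries out of limb 9 are folded back into
// limb 0 with a factor of 19, using 2^255 = 19 (mod 2^255-19).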
/*
* Implementation manually de-interleaved and modularized for use with SLOTHY. See
*
* Fast and Clean: Auditable High Performance Assembly via Constraint Solving
* (Abdulrahman, Becker, Kannwischer, Klein)
*/
#include <hal_env.h>
#include "instruction_wrappers.i"
.macro fcsel_dform out, in0, in1, cond // @slothy:no-unfold
fcsel dform_\out, dform_\in0, dform_\in1, \cond
.endm
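// Stack frame layout: byte offsets from sp used throughout the routine below.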
#define STACK_MASK1 0
#define STACK_MASK2 8
#define STACK_A_0 16
#define STACK_A_8 (STACK_A_0+ 8)
#define STACK_A_16 (STACK_A_0+16)
#define STACK_A_24 (STACK_A_0+24)
#define STACK_A_32 (STACK_A_0+32)
#define STACK_B_0 64
#define STACK_B_8 (STACK_B_0+ 8)
#define STACK_B_16 (STACK_B_0+16)
#define STACK_B_24 (STACK_B_0+24)
#define STACK_B_32 (STACK_B_0+32)
#define STACK_CTR 104
#define STACK_LASTBIT 108
#define STACK_SCALAR 112
#define STACK_X_0 168
#define STACK_X_8 (STACK_X_0+ 8)
#define STACK_X_16 (STACK_X_0+16)
#define STACK_X_24 (STACK_X_0+24)
#define STACK_X_32 (STACK_X_0+32)
#define STACK_OUT_PTR (STACK_X_0+48)
.cpu generic+fp+simd
.text
.align 2
// in: x0: pointer
// out: x0: loaded value
// .type load64unaligned, %function
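// Assembles eight consecutive bytes into one little-endian 64-bit word, so the
// input pointer does not need to be 8-byte aligned.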
load64unaligned:
ldrb w1, [x0]
ldrb w2, [x0, #1]
ldrb w3, [x0, #2]
ldrb w4, [x0, #3]
ldrb w5, [x0, #4]
ldrb w6, [x0, #5]
ldrb w7, [x0, #6]
ldrb w8, [x0, #7]
orr w1, w1, w2, lsl #8
orr w3, w3, w4, lsl #8
orr w5, w5, w6, lsl #8
orr w7, w7, w8, lsl #8
orr w1, w1, w3, lsl #16
orr w5, w5, w7, lsl #16
orr x0, x1, x5, lsl #32
ret
// .size load64unaligned, .-load64unaligned
// in: x0: pointer
// out: x0-x3: loaded value
// .type load256unaligned, %function
load256unaligned:
stp x29, x30, [sp, #-64]!
mov x29, sp
stp x19, x20, [sp, #16]
stp x21, x22, [sp, #32]
mov x19, x0
bl load64unaligned
mov x20, x0
add x0, x19, #8
bl load64unaligned
mov x21, x0
add x0, x19, #16
bl load64unaligned
mov x22, x0
add x0, x19, #24
bl load64unaligned
mov x3, x0
mov x0, x20
mov x1, x21
mov x2, x22
ldp x19, x20, [sp, #16]
ldp x21, x22, [sp, #32]
ldp x29, x30, [sp], #64
ret
// .size load256unaligned, .-load256unaligned
vAB0 .req v0
vAB1 .req v1
vAB2 .req v2
vAB3 .req v3
vAB4 .req v4
vAB5 .req v5
vAB6 .req v6
vAB7 .req v7
vAB8 .req v8
vAB9 .req v9
vT0 .req vAB0
vT1 .req vAB1
vT2 .req vAB2
vT3 .req vAB3
vT4 .req vAB4
vT5 .req vAB5
vT6 .req vAB6
vT7 .req vAB7
vT8 .req vAB8
vT9 .req vAB9
vTA0 .req vAB0
vTA1 .req vAB1
vTA2 .req vAB2
vTA3 .req vAB3
vTA4 .req vAB4
vTA5 .req vAB5
vTA6 .req vAB6
vTA7 .req vAB7
vTA8 .req vAB8
vTA9 .req vAB9
vBX0 .req v10
vBX1 .req v11
vBX2 .req v12
vBX3 .req v13
vBX4 .req v14
vBX5 .req v15
vBX6 .req v16
vBX7 .req v17
vBX8 .req v18
vBX9 .req v19
vDC0 .req vBX0
vDC1 .req vBX1
vDC2 .req vBX2
vDC3 .req vBX3
vDC4 .req vBX4
vDC5 .req vBX5
vDC6 .req vBX6
vDC7 .req vBX7
vDC8 .req vBX8
vDC9 .req vBX9
vADBC0 .req v20
vADBC1 .req v21
vADBC2 .req v22
vADBC3 .req v23
vADBC4 .req v24
vADBC5 .req v25
vADBC6 .req v26
vADBC7 .req v27
vADBC8 .req v28
vADBC9 .req v29
vX4Z50 .req vADBC0
vX4Z51 .req vADBC1
vX4Z52 .req vADBC2
vX4Z53 .req vADBC3
vX4Z54 .req vADBC4
vX4Z55 .req vADBC5
vX4Z56 .req vADBC6
vX4Z57 .req vADBC7
vX4Z58 .req vADBC8
vX4Z59 .req vADBC9
vMaskA .req v30
vMaskB .req v15
vZ20 .req v1
vZ22 .req v3
vZ24 .req v5
vZ26 .req v7
vZ28 .req v9
vZ30 .req v11
vZ32 .req v13
vZ34 .req v15
vZ36 .req v17
vZ38 .req v19
vX20 .req v0
vX22 .req v2
vX24 .req v4
vX26 .req v6
vX28 .req v8
vX30 .req v10
vX32 .req v12
vX34 .req v14
vX36 .req v16
vX38 .req v18
vB0 .req v20
vB2 .req v21
vB4 .req v22
vB6 .req v23
vB8 .req v24
vA0 .req v0
vA2 .req v2
vA4 .req v4
vA6 .req v6
vA8 .req v8
vC0 .req v10
vC2 .req v12
vC4 .req v14
vC6 .req v16
vC8 .req v18
vD0 .req v25
vD2 .req v26
vD4 .req v27
vD6 .req v28
vD8 .req v29
vF0 .req v1
vF2 .req v3
vF4 .req v5
vF6 .req v7
vF8 .req v9
vG0 .req v20
vG2 .req v21
vG4 .req v22
vG6 .req v23
vG8 .req v24
// F
sF0 .req x0
sF1 .req x1
sF2 .req x2
sF3 .req x3
sF4 .req x4
sF5 .req x5
sF6 .req x6
sF7 .req x7
sF8 .req x8
sF9 .req x9
sAA0 .req x20
sAA1 .req x21
sAA2 .req x22
sAA3 .req x23
sAA4 .req x24
sAA5 .req x25
sAA6 .req x26
sAA7 .req x27
sAA8 .req x28
sAA9 .req x19
stmp .req x2
// G
sG0 .req x0
sG1 .req x1
sG2 .req x2
sG3 .req x3
sG4 .req x4
sG5 .req x5
sG6 .req x6
sG7 .req x7
sG8 .req x8
sG9 .req x9
sBB0 .req x0
sBB1 .req x1
sBB2 .req x2
sBB3 .req x3
sBB4 .req x4
sBB5 .req x5
sBB6 .req x6
sBB7 .req x7
sBB8 .req x8
sBB9 .req x9
// E
sE0 .req x10
sE1 .req x11
sE2 .req x12
sE3 .req x13
sE4 .req x14
sE5 .req x15
sE6 .req x16
sE7 .req x17
sE8 .req x19
sE9 .req x20
sZ40 .req x23
sZ41 .req x3
sZ42 .req x21
sZ44 .req x7
sZ45 .req x6
sZ46 .req x24
sZ48 .req x22
START:
.macro scalar_stack_ldr sA, offset, name
ldr \sA\()0, [sp, #\offset\()_0] // @slothy:reads=[\name\()0]
ldr \sA\()2, [sp, #\offset\()_8] // @slothy:reads=[\name\()8]
ldr \sA\()4, [sp, #\offset\()_16] // @slothy:reads=[\name\()16]
ldr \sA\()6, [sp, #\offset\()_24] // @slothy:reads=[\name\()24]
ldr \sA\()8, [sp, #\offset\()_32] // @slothy:reads=[\name\()32]
.endm
.macro scalar_stack_str offset, sA, name
stp \sA\()0, \sA\()2, [sp, #\offset\()_0] // @slothy:writes=[\name\()0,\name\()8]
stp \sA\()4, \sA\()6, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24]
str \sA\()8, [sp, #\offset\()_32] // @slothy:writes=[\name\()32]
.endm
.macro vector_stack_str offset, vA, name
stp D<\vA\()0>, D<\vA\()2>, [sp, #\offset\()_0] // @slothy:writes=[\name\()0,\name\()8]
stp D<\vA\()4>, D<\vA\()6>, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24]
str D<\vA\()8>, [sp, #\offset\()_32] // @slothy:writes=[\name\()32]
.endm
// TODO: eliminate this explicit register assignment by converting stack_vld2_lane to AArch64Instruction
xvector_load_lane_tmp .req x26
.macro vector_load_lane vA, offset, lane, name
add xvector_load_lane_tmp, sp, #\offset\()_0
ld2 { \vA\()0.s, \vA\()1.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()0]
ld2 { \vA\()2.s, \vA\()3.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()8]
ld2 { \vA\()4.s, \vA\()5.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()16]
ld2 { \vA\()6.s, \vA\()7.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()24]
ld2 { \vA\()8.s, \vA\()9.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()32]
.endm
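// vC = vA + 4*(2^255-19) - vB, limb-wise on the .2s lanes. v28/v29 are assumed
// to already hold the limb constants of 4*(2^255-19); adding the bias first
// keeps every limb nonnegative, so the subtraction cannot underflow.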
.macro vector_sub_inner vC0, vC2, vC4, vC6, vC8, \
vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
// (2^255-19)*4 - vB
sub \vC0\().2s, v28.2s, \vB0\().2s
sub \vC2\().2s, v29.2s, \vB2\().2s
sub \vC4\().2s, v29.2s, \vB4\().2s
sub \vC6\().2s, v29.2s, \vB6\().2s
sub \vC8\().2s, v29.2s, \vB8\().2s
// ... + vA
add \vC0\().2s, \vA0\().2s, \vC0\().2s
add \vC2\().2s, \vA2\().2s, \vC2\().2s
add \vC4\().2s, \vA4\().2s, \vC4\().2s
add \vC6\().2s, \vA6\().2s, \vC6\().2s
add \vC8\().2s, \vA8\().2s, \vC8\().2s
.endm
.macro vector_sub vC, vA, vB
vector_sub_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
.endm
.macro vector_add_inner vC0, vC2, vC4, vC6, vC8, vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
add \vC0\().2s, \vA0\().2s, \vB0\().2s
add \vC2\().2s, \vA2\().2s, \vB2\().2s
add \vC4\().2s, \vA4\().2s, \vB4\().2s
add \vC6\().2s, \vA6\().2s, \vB6\().2s
add \vC8\().2s, \vA8\().2s, \vB8\().2s
.endm
.macro vector_add vC, vA, vB
vector_add_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
.endm
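// Branch-free conditional move: copies vB or vC into vA depending on the eq
// condition set up by the caller, using fcsel on the d-register views so that
// no secret-dependent branch is taken.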
.macro vector_cmov_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8
fcsel_dform \vA0, \vB0, \vC0, eq
fcsel_dform \vA2, \vB2, \vC2, eq
fcsel_dform \vA4, \vB4, \vC4, eq
fcsel_dform \vA6, \vB6, \vC6, eq
fcsel_dform \vA8, \vB8, \vC8, eq
.endm
.macro vector_cmov vA, vB, vC
vector_cmov_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8,
.endm
.macro vector_transpose_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8
trn2 \vA1\().2s, \vB0\().2s, \vC0\().2s
trn1 \vA0\().2s, \vB0\().2s, \vC0\().2s
trn2 \vA3\().2s, \vB2\().2s, \vC2\().2s
trn1 \vA2\().2s, \vB2\().2s, \vC2\().2s
trn2 \vA5\().2s, \vB4\().2s, \vC4\().2s
trn1 \vA4\().2s, \vB4\().2s, \vC4\().2s
trn2 \vA7\().2s, \vB6\().2s, \vC6\().2s
trn1 \vA6\().2s, \vB6\().2s, \vC6\().2s
trn2 \vA9\().2s, \vB8\().2s, \vC8\().2s
trn1 \vA8\().2s, \vB8\().2s, \vC8\().2s
.endm
.macro vector_transpose vA, vB, vC
vector_transpose_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8,
.endm
.macro vector_to_scalar_inner sA0, sA2, sA4, sA6, sA8, vB0, vB2, vB4, vB6, vB8
mov \sA0, \vB0\().d[0]
mov \sA2, \vB2\().d[0]
mov \sA4, \vB4\().d[0]
mov \sA6, \vB6\().d[0]
mov \sA8, \vB8\().d[0]
.endm
.macro vector_to_scalar sA, vB
vector_to_scalar_inner \sA\()0, \sA\()2, \sA\()4, \sA\()6, \sA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
.endm
.macro scalar_to_vector_inner vA0, vA2, vA4, vA6, vA8, sB0, sB2, sB4, sB6, sB8
mov \vA0\().d[0], \sB0
mov \vA2\().d[0], \sB2
mov \vA4\().d[0], \sB4
mov \vA6\().d[0], \sB6
mov \vA8\().d[0], \sB8
.endm
.macro scalar_to_vector vA, sB
scalar_to_vector_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \sB\()0, \sB\()2, \sB\()4, \sB\()6, \sB\()8
.endm
.macro vector_extract_upper_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
mov \vA0\().d[0], \vB0\().d[1]
mov \vA2\().d[0], \vB2\().d[1]
mov \vA4\().d[0], \vB4\().d[1]
mov \vA6\().d[0], \vB6\().d[1]
mov \vA8\().d[0], \vB8\().d[1]
.endm
.macro vector_extract_upper vA, vB
vector_extract_upper_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
.endm
.macro vector_compress_inner vA0, vA2, vA4, vA6, vA8, vB0, vB1, vB2, vB3, vB4, vB5, vB6, vB7, vB8, vB9
trn1 \vA0\().4s, \vB0\().4s, \vB1\().4s
trn1 \vA2\().4s, \vB2\().4s, \vB3\().4s
trn1 \vA4\().4s, \vB4\().4s, \vB5\().4s
trn1 \vA6\().4s, \vB6\().4s, \vB7\().4s
trn1 \vA8\().4s, \vB8\().4s, \vB9\().4s
.endm
.macro vector_compress vA, vB
vector_compress_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()1, \vB\()2, \vB\()3, \vB\()4, \vB\()5, \vB\()6, \vB\()7, \vB\()8, \vB\()9,
.endm
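// Clears leftover carry bits in the packed limb registers: odd limbs are
// masked down to 25 bits and the even registers are truncated to their low
// 32 bits (the `mov W, W` writes zero-extend).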
.macro scalar_clear_carries_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
and \sA1, \sA1, #0x1ffffff
and \sA3, \sA3, #0x1ffffff
and \sA5, \sA5, #0x1ffffff
and \sA7, \sA7, #0x1ffffff
mov W<\sA0>, W<\sA0>
mov W<\sA2>, W<\sA2>
mov W<\sA4>, W<\sA4>
mov W<\sA6>, W<\sA6>
mov W<\sA8>, W<\sA8>
.endm
.macro scalar_clear_carries sA
scalar_clear_carries_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
.endm
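// Unpacks the two-limbs-per-register form: each even register carries limb i
// in its low 32 bits and limb i+1 in its high 32 bits (see the bfi sequences
// in scalar_sqr/scalar_mul), so the odd limbs are recovered by a 32-bit shift.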
.macro scalar_decompress_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
lsr \sA1, \sA0, #32
lsr \sA3, \sA2, #32
lsr \sA5, \sA4, #32
lsr \sA7, \sA6, #32
lsr \sA9, \sA8, #32
.endm
.macro scalar_decompress sA
scalar_decompress_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
.endm
// TODO: eliminate these. Should be easy.
vR_l4h4l5h5 .req vADBC4
vR_l6h6l7h7 .req vADBC5
vR_l0h0l1h1 .req vADBC0
vR_l2h2l3h3 .req vADBC1
vR_l0123 .req vADBC4
vR_l4567 .req vADBC6
vR_h0123 .req vADBC5
vR_h4567 .req vADBC7
vR_l89h89 .req vADBC8
vR_h89xx .req vADBC9
vSum0123 .req vADBC0
vSum4567 .req vADBC1
vSum89xx .req vADBC2
vDiff0123 .req v10
vDiff4567 .req v11
vDiff89xx .req v12
// TODO: eliminate those explicit register assignments by converting stack_vld1r and stack_vldr_bform to AArch64Instruction
vrepack_inner_tmp .req v19
vrepack_inner_tmp2 .req v0
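// Repacks the interleaved multiply results and forms their lane-wise sum and
// difference in one pass. The value loaded from STACK_MASK1/STACK_MASK2 is
// assumed to be a bias that keeps the difference nonnegative; this is a
// reading of the code below, not a comment from the original author.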
.macro vector_addsub_repack_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vC0, vC1, vC2, vC3, vC4, vC5, vC6, vC7, vC8, vC9
uzp1 vR_l4h4l5h5.4s, \vC4\().4s, \vC5\().4s
uzp1 vR_l6h6l7h7.4s, \vC6\().4s, \vC7\().4s
ld1r {vrepack_inner_tmp.2d}, [sp] // @slothy:reads=mask1
uzp1 vR_l4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s
uzp2 vR_h4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s
trn1 vR_l89h89.4s, \vC8\().4s, \vC9\().4s
ldr B<vrepack_inner_tmp2>, [sp, #STACK_MASK2] // @slothy:reads=mask2
uzp1 vR_l0h0l1h1.4s, \vC0\().4s, \vC1\().4s
uzp1 vR_l2h2l3h3.4s, \vC2\().4s, \vC3\().4s
mov vR_h89xx.d[0], vR_l89h89.d[1]
uzp1 vR_l0123.4s, vR_l0h0l1h1.4s, vR_l2h2l3h3.4s
uzp2 vR_h0123.4s, vR_l0h0l1h1.4s, vR_l2h2l3h3.4s
add vDiff4567.4s, vR_l4567.4s, vrepack_inner_tmp.4s
add vDiff89xx.2s, vR_l89h89.2s, vrepack_inner_tmp.2s
mov vrepack_inner_tmp.b[0], vrepack_inner_tmp2.b[0]
add vSum0123.4s, vR_l0123.4s, vR_h0123.4s
add vSum4567.4s, vR_l4567.4s, vR_h4567.4s
add vSum89xx.2s, vR_l89h89.2s, vR_h89xx.2s
add vDiff0123.4s, vR_l0123.4s, vrepack_inner_tmp.4s
sub vDiff4567.4s, vDiff4567.4s, vR_h4567.4s
sub vDiff0123.4s, vDiff0123.4s, vR_h0123.4s
sub vDiff89xx.2s, vDiff89xx.2s, vR_h89xx.2s
zip1 \vA0\().4s, vDiff0123.4s, vSum0123.4s
zip2 \vA2\().4s, vDiff0123.4s, vSum0123.4s
zip1 \vA4\().4s, vDiff4567.4s, vSum4567.4s
zip2 \vA6\().4s, vDiff4567.4s, vSum4567.4s
zip1 \vA8\().2s, vDiff89xx.2s, vSum89xx.2s
zip2 \vA9\().2s, vDiff89xx.2s, vSum89xx.2s
mov \vA1\().d[0], \vA0\().d[1]
mov \vA3\().d[0], \vA2\().d[1]
mov \vA5\().d[0], \vA4\().d[1]
mov \vA7\().d[0], \vA6\().d[1]
.endm
.macro vector_addsub_repack vA, vC
vector_addsub_repack_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vC\()0, \vC\()1, \vC\()2, \vC\()3, \vC\()4, \vC\()5, \vC\()6, \vC\()7, \vC\()8, \vC\()9
.endm
// sAA0 .. sAA9 output AA = A^2
// sA0 .. sA9 input A
// TODO: simplify (this is still the same instruction order as before; we can make it simpler and leave the re-ordering to SLOTHY)
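// Structure sketch (added; not from the original): with limb weights as in the
// radix-2^25.5 note above, each product a_i*a_j lands in output limb i+j;
// cross terms (i != j) occur twice, hence the precomputed doubled copies
// tmp_scalar_sqr_dbl_*, and terms with i+j >= 10 are folded back with a factor
// of 19 via const19 (assumed to hold 19), using 2^255 = 19 (mod 2^255-19).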
.macro scalar_sqr_inner sAA0, sAA1, sAA2, sAA3, sAA4, sAA5, sAA6, sAA7, sAA8, sAA9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
lsr \sA1, \sA0, #32
lsr \sA3, \sA2, #32
lsr \sA5, \sA4, #32
lsr \sA7, \sA6, #32
lsr \sA9, \sA8, #32
add X<tmp_scalar_sqr_dbl_9>, \sA9, \sA9
add X<tmp_scalar_sqr_dbl_8>, \sA8, \sA8
add X<tmp_scalar_sqr_dbl_7>, \sA7, \sA7
add X<tmp_scalar_sqr_dbl_6>, \sA6, \sA6
add X<tmp_scalar_sqr_dbl_5>, \sA5, \sA5
add X<tmp_scalar_sqr_dbl_4>, \sA4, \sA4
add X<tmp_scalar_sqr_dbl_3>, \sA3, \sA3
add X<tmp_scalar_sqr_dbl_2>, \sA2, \sA2
add X<tmp_scalar_sqr_dbl_1>, \sA1, \sA1
umull X<tmp_scalar_sqr_8>, W<\sA4>, W<\sA4>
umull X<tmp_scalar_sqr_9>, W<\sA4>, W<tmp_scalar_sqr_dbl_5>
mul W<\sA9>, W<\sA9>, W<const19>
mul W<\sA7>, W<\sA7>, W<const19>
mul W<\sA5>, W<\sA5>, W<const19>
umaddl X<tmp_scalar_sqr_8>, W<\sA9>, W<tmp_scalar_sqr_dbl_9>, X<tmp_scalar_sqr_8>
umaddl X<tmp_scalar_sqr_9>, W<\sA0>, W<tmp_scalar_sqr_dbl_9>, X<tmp_scalar_sqr_9>
umull X<tmp_scalar_sqr_0>, W<\sA0>, W<\sA0>
umull X<tmp_scalar_sqr_1>, W<\sA0>, W<tmp_scalar_sqr_dbl_1>
umull X<tmp_scalar_sqr_2>, W<\sA0>, W<tmp_scalar_sqr_dbl_2>
umull X<tmp_scalar_sqr_3>, W<\sA0>, W<tmp_scalar_sqr_dbl_3>
umull X<tmp_scalar_sqr_4>, W<\sA0>, W<tmp_scalar_sqr_dbl_4>
umull X<tmp_scalar_sqr_5>, W<\sA0>, W<tmp_scalar_sqr_dbl_5>
umull X<tmp_scalar_sqr_6>, W<\sA0>, W<tmp_scalar_sqr_dbl_6>
umull X<tmp_scalar_sqr_7>, W<\sA0>, W<tmp_scalar_sqr_dbl_7>
umaddl X<tmp_scalar_sqr_8>, W<\sA0>, W<tmp_scalar_sqr_dbl_8>, X<tmp_scalar_sqr_8>
mul W<tmp_scalar_sqr_tw_6>, W<\sA6>, W<const19>
umaddl X<tmp_scalar_sqr_2>, W<\sA1>, W<tmp_scalar_sqr_dbl_1>, X<tmp_scalar_sqr_2>
umaddl X<tmp_scalar_sqr_3>, W<\sA1>, W<tmp_scalar_sqr_dbl_2>, X<tmp_scalar_sqr_3>
umaddl X<tmp_scalar_sqr_4>, W<tmp_scalar_sqr_dbl_1>, W<tmp_scalar_sqr_dbl_3>, X<tmp_scalar_sqr_4>
umaddl X<tmp_scalar_sqr_5>, W<\sA1>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_5>
umaddl X<tmp_scalar_sqr_6>, W<tmp_scalar_sqr_dbl_1>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_6>
umaddl X<tmp_scalar_sqr_7>, W<\sA1>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_7>
umaddl X<tmp_scalar_sqr_8>, W<tmp_scalar_sqr_dbl_1>, W<tmp_scalar_sqr_dbl_7>, X<tmp_scalar_sqr_8>
umaddl X<tmp_scalar_sqr_9>, W<\sA1>, W<tmp_scalar_sqr_dbl_8>, X<tmp_scalar_sqr_9>
mul W<tmp_scalar_sqr_tw_8>, W<\sA8>, W<const19>
umaddl X<tmp_scalar_sqr_4>, W<\sA2>, W<\sA2>, X<tmp_scalar_sqr_4>
umaddl X<tmp_scalar_sqr_5>, W<\sA2>, W<tmp_scalar_sqr_dbl_3>, X<tmp_scalar_sqr_5>
umaddl X<tmp_scalar_sqr_6>, W<\sA2>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_6>
umaddl X<tmp_scalar_sqr_7>, W<\sA2>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_7>
umaddl X<tmp_scalar_sqr_8>, W<\sA2>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_8>
umaddl X<tmp_scalar_sqr_9>, W<\sA2>, W<tmp_scalar_sqr_dbl_7>, X<tmp_scalar_sqr_9>
umaddl X<tmp_scalar_sqr_6>, W<\sA3>, W<tmp_scalar_sqr_dbl_3>, X<tmp_scalar_sqr_6>
umaddl X<tmp_scalar_sqr_7>, W<\sA3>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_7>
umaddl X<tmp_scalar_sqr_8>, W<tmp_scalar_sqr_dbl_3>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_8>
umaddl X<tmp_scalar_sqr_9>, W<\sA3>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_9>
umaddl X<tmp_scalar_sqr_6>, W<\sA8>, W<tmp_scalar_sqr_tw_8>, X<tmp_scalar_sqr_6>
umaddl X<tmp_scalar_sqr_2>, W<\sA6>, W<tmp_scalar_sqr_tw_6>, X<tmp_scalar_sqr_2>
add X<tmp_scalar_sqr_9>, X<tmp_scalar_sqr_9>, X<tmp_scalar_sqr_8>, lsr #26
umaddl X<tmp_scalar_sqr_0>, W<\sA5>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_0>
add X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_9>, lsr #25
bic X<tmp_scalar_sqr_10>, X<tmp_scalar_sqr_9>, #0x1ffffff
add X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_10>, lsr #24
and X<tmp_scalar_sqr_9>, X<tmp_scalar_sqr_9>, #0x1ffffff
add X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_0>, X<tmp_scalar_sqr_10>, lsr #21
umaddl X<tmp_scalar_sqr_4>, W<\sA7>, W<tmp_scalar_sqr_dbl_7>, X<tmp_scalar_sqr_4>
add X<tmp_scalar_sqr_quad_1>, X<tmp_scalar_sqr_dbl_1>, X<tmp_scalar_sqr_dbl_1>
add X<tmp_scalar_sqr_quad_3>, X<tmp_scalar_sqr_dbl_3>, X<tmp_scalar_sqr_dbl_3>
add X<tmp_scalar_sqr_quad_5>, X<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_dbl_5>
add X<tmp_scalar_sqr_quad_7>, X<tmp_scalar_sqr_dbl_7>, X<tmp_scalar_sqr_dbl_7>
umaddl X<tmp_scalar_sqr_0>, W<tmp_scalar_sqr_tw_6>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_0>
umaddl X<tmp_scalar_sqr_1>, W<tmp_scalar_sqr_tw_6>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_1>
and X<tmp_scalar_sqr_8>, X<tmp_scalar_sqr_8>, #0x3ffffff
umaddl X<tmp_scalar_sqr_0>, W<\sA7>, W<tmp_scalar_sqr_quad_3>, X<tmp_scalar_sqr_0>
umaddl X<tmp_scalar_sqr_1>, W<\sA7>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_1>
umaddl X<tmp_scalar_sqr_2>, W<\sA7>, W<tmp_scalar_sqr_quad_5>, X<tmp_scalar_sqr_2>
umaddl X<tmp_scalar_sqr_3>, W<\sA7>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_3>
umaddl X<tmp_scalar_sqr_0>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_2>, X<tmp_scalar_sqr_0>
umaddl X<tmp_scalar_sqr_1>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_3>, X<tmp_scalar_sqr_1>
umaddl X<tmp_scalar_sqr_2>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_2>
umaddl X<tmp_scalar_sqr_3>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_5>, X<tmp_scalar_sqr_3>
umaddl X<tmp_scalar_sqr_4>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_4>
umaddl X<tmp_scalar_sqr_5>, W<tmp_scalar_sqr_tw_8>, W<tmp_scalar_sqr_dbl_7>, X<tmp_scalar_sqr_5>
umaddl X<tmp_scalar_sqr_0>, W<\sA9>, W<tmp_scalar_sqr_quad_1>, X<tmp_scalar_sqr_0>
umaddl X<tmp_scalar_sqr_1>, W<\sA9>, W<tmp_scalar_sqr_dbl_2>, X<tmp_scalar_sqr_1>
umaddl X<tmp_scalar_sqr_2>, W<\sA9>, W<tmp_scalar_sqr_quad_3>, X<tmp_scalar_sqr_2>
umaddl X<tmp_scalar_sqr_3>, W<\sA9>, W<tmp_scalar_sqr_dbl_4>, X<tmp_scalar_sqr_3>
umaddl X<tmp_scalar_sqr_4>, W<\sA9>, W<tmp_scalar_sqr_quad_5>, X<tmp_scalar_sqr_4>
umaddl X<tmp_scalar_sqr_5>, W<\sA9>, W<tmp_scalar_sqr_dbl_6>, X<tmp_scalar_sqr_5>
umaddl X<tmp_scalar_sqr_6>, W<\sA9>, W<tmp_scalar_sqr_quad_7>, X<tmp_scalar_sqr_6>
umaddl X<tmp_scalar_sqr_7>, W<\sA9>, W<tmp_scalar_sqr_dbl_8>, X<tmp_scalar_sqr_7>
add \sAA1, X<tmp_scalar_sqr_1>, X<tmp_scalar_sqr_0>, lsr #26
and \sAA0, X<tmp_scalar_sqr_0>, #0x3ffffff
add \sAA2, X<tmp_scalar_sqr_2>, \sAA1, lsr #25
bfi \sAA0, \sAA1, #32, #25
add \sAA3, X<tmp_scalar_sqr_3>, \sAA2, lsr #26
and \sAA2, \sAA2, #0x3ffffff
add \sAA4, X<tmp_scalar_sqr_4>, \sAA3, lsr #25
bfi \sAA2, \sAA3, #32, #25
add \sAA5, X<tmp_scalar_sqr_5>, \sAA4, lsr #26
and \sAA4, \sAA4, #0x3ffffff
add \sAA6, X<tmp_scalar_sqr_6>, \sAA5, lsr #25
bfi \sAA4, \sAA5, #32, #25
add \sAA7, X<tmp_scalar_sqr_7>, \sAA6, lsr #26
and \sAA6, \sAA6, #0x3ffffff
add \sAA8, X<tmp_scalar_sqr_8>, \sAA7, lsr #25
bfi \sAA6, \sAA7, #32, #25
add \sAA9, X<tmp_scalar_sqr_9>, \sAA8, lsr #26
and \sAA8, \sAA8, #0x3ffffff
bfi \sAA8, \sAA9, #32, #26
.endm
.macro scalar_sqr sAA, sA
scalar_sqr_inner \sAA\()0, \sAA\()1, \sAA\()2, \sAA\()3, \sAA\()4, \sAA\()5, \sAA\()6, \sAA\()7, \sAA\()8, \sAA\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
.endm
// sC0 .. sC9 output C = A*B
// sA0 .. sA9 input A
// sB0 .. sB9 input B
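// Structure sketch (added; not from the original): output limb k accumulates
// all a_i*b_j with i+j = k; products with i+j >= 10 use the 19-scaled copies
// tmp_scalar_mul_tw_* (again 2^255 = 19 mod p), and the occasional
// `add X, X, X` supplies the extra factor of 2 required by the alternating
// 26/25-bit limb widths.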
.macro scalar_mul_inner sC0, sC1, sC2, sC3, sC4, sC5, sC6, sC7, sC8, sC9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9, sB0, sB1, sB2, sB3, sB4, sB5, sB6, sB7, sB8, sB9
mul W<tmp_scalar_mul_tw_1>, W<\sA1>, W<const19>
mul W<tmp_scalar_mul_tw_2>, W<\sA2>, W<const19>
mul W<tmp_scalar_mul_tw_3>, W<\sA3>, W<const19>
mul W<tmp_scalar_mul_tw_5>, W<\sA5>, W<const19>
mul W<tmp_scalar_mul_tw_6>, W<\sA6>, W<const19>
mul W<tmp_scalar_mul_tw_7>, W<\sA7>, W<const19>
mul W<tmp_scalar_mul_tw_8>, W<\sA8>, W<const19>
mul W<tmp_scalar_mul_tw_9>, W<\sA9>, W<const19>
umull X<tmp_scalar_mul_9>, W<\sA1>, W<\sB8>
umaddl X<tmp_scalar_mul_9>, W<\sA3>, W<\sB6>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA5>, W<\sB4>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA7>, W<\sB2>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA9>, W<\sB0>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA0>, W<\sB9>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA2>, W<\sB7>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA4>, W<\sB5>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA6>, W<\sB3>, X<tmp_scalar_mul_9>
umaddl X<tmp_scalar_mul_9>, W<\sA8>, W<\sB1>, X<tmp_scalar_mul_9>
umull X<tmp_scalar_mul_8>, W<\sA1>, W<\sB7>
umaddl X<tmp_scalar_mul_8>, W<\sA3>, W<\sB5>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA5>, W<\sB3>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA7>, W<\sB1>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<tmp_scalar_mul_tw_9>, W<\sB9>, X<tmp_scalar_mul_8>
add X<tmp_scalar_mul_8>, X<tmp_scalar_mul_8>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA0>, W<\sB8>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA2>, W<\sB6>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA4>, W<\sB4>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA6>, W<\sB2>, X<tmp_scalar_mul_8>
umaddl X<tmp_scalar_mul_8>, W<\sA8>, W<\sB0>, X<tmp_scalar_mul_8>
umull X<tmp_scalar_mul_7>, W<\sA1>, W<\sB6>
umaddl X<tmp_scalar_mul_7>, W<\sA3>, W<\sB4>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA5>, W<\sB2>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA7>, W<\sB0>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<tmp_scalar_mul_tw_9>, W<\sB8>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA0>, W<\sB7>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA2>, W<\sB5>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA4>, W<\sB3>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<\sA6>, W<\sB1>, X<tmp_scalar_mul_7>
umaddl X<tmp_scalar_mul_7>, W<tmp_scalar_mul_tw_8>, W<\sB9>, X<tmp_scalar_mul_7>
umull X<tmp_scalar_mul_6>, W<\sA1>, W<\sB5>
umaddl X<tmp_scalar_mul_6>, W<\sA3>, W<\sB3>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<\sA5>, W<\sB1>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<tmp_scalar_mul_tw_7>, W<\sB9>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<tmp_scalar_mul_tw_9>, W<\sB7>, X<tmp_scalar_mul_6>
add X<tmp_scalar_mul_6>, X<tmp_scalar_mul_6>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<\sA0>, W<\sB6>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<\sA2>, W<\sB4>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<\sA4>, W<\sB2>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<\sA6>, W<\sB0>, X<tmp_scalar_mul_6>
umaddl X<tmp_scalar_mul_6>, W<tmp_scalar_mul_tw_8>, W<\sB8>, X<tmp_scalar_mul_6>
umull X<tmp_scalar_mul_5>, W<tmp_scalar_mul_tw_9>, W<\sB6>
umaddl X<tmp_scalar_mul_5>, W<\sA5>, W<\sB0>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<tmp_scalar_mul_tw_7>, W<\sB8>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<\sA3>, W<\sB2>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<\sA1>, W<\sB4>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<tmp_scalar_mul_tw_8>, W<\sB7>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<tmp_scalar_mul_tw_6>, W<\sB9>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<\sA4>, W<\sB1>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<\sA2>, W<\sB3>, X<tmp_scalar_mul_5>
umaddl X<tmp_scalar_mul_5>, W<\sA0>, W<\sB5>, X<tmp_scalar_mul_5>
umull X<tmp_scalar_mul_4>, W<tmp_scalar_mul_tw_9>, W<\sB5>
umaddl X<tmp_scalar_mul_4>, W<tmp_scalar_mul_tw_7>, W<\sB7>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<tmp_scalar_mul_tw_5>, W<\sB9>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<\sA3>, W<\sB1>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<\sA1>, W<\sB3>, X<tmp_scalar_mul_4>
add X<tmp_scalar_mul_4>, X<tmp_scalar_mul_4>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<tmp_scalar_mul_tw_8>, W<\sB6>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<tmp_scalar_mul_tw_6>, W<\sB8>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<\sA4>, W<\sB0>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<\sA2>, W<\sB2>, X<tmp_scalar_mul_4>
umaddl X<tmp_scalar_mul_4>, W<\sA0>, W<\sB4>, X<tmp_scalar_mul_4>
umull X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_9>, W<\sB4>
umaddl X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_7>, W<\sB6>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_5>, W<\sB8>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<\sA3>, W<\sB0>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<\sA1>, W<\sB2>, X<tmp_scalar_mul_3>
mul W<tmp_scalar_mul_tw_4>, W<\sA4>, W<const19>
umaddl X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_8>, W<\sB5>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_6>, W<\sB7>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<tmp_scalar_mul_tw_4>, W<\sB9>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<\sA2>, W<\sB1>, X<tmp_scalar_mul_3>
umaddl X<tmp_scalar_mul_3>, W<\sA0>, W<\sB3>, X<tmp_scalar_mul_3>
add X<tmp_scalar_mul_5>, X<tmp_scalar_mul_5>, X<tmp_scalar_mul_4>, lsr #26
and \sC4, X<tmp_scalar_mul_4>, #0x3ffffff
add X<tmp_scalar_mul_6>, X<tmp_scalar_mul_6>, X<tmp_scalar_mul_5>, lsr #25
and \sC5, X<tmp_scalar_mul_5>, #0x1ffffff
add X<tmp_scalar_mul_7>, X<tmp_scalar_mul_7>, X<tmp_scalar_mul_6>, lsr #26
and \sC6, X<tmp_scalar_mul_6>, #0x3ffffff
add X<tmp_scalar_mul_8>, X<tmp_scalar_mul_8>, X<tmp_scalar_mul_7>, lsr #25
bfi \sC6, X<tmp_scalar_mul_7>, #32, #25
add X<tmp_scalar_mul_9>, X<tmp_scalar_mul_9>, X<tmp_scalar_mul_8>, lsr #26
and \sC8, X<tmp_scalar_mul_8>, #0x3ffffff
bic X<tmp_scalar_mul_0b>, X<tmp_scalar_mul_9>, #0x3ffffff
lsr X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0b>, #26
bfi \sC8, X<tmp_scalar_mul_9>, #32, #26
add X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0b>, lsr #25
add X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0b>, lsr #22
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_9>, W<\sB1>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_7>, W<\sB3>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_5>, W<\sB5>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_3>, W<\sB7>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_1>, W<\sB9>, X<tmp_scalar_mul_0>
add X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_8>, W<\sB2>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_6>, W<\sB4>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_4>, W<\sB6>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<tmp_scalar_mul_tw_2>, W<\sB8>, X<tmp_scalar_mul_0>
umaddl X<tmp_scalar_mul_0>, W<\sA0>, W<\sB0>, X<tmp_scalar_mul_0>
umull X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_9>, W<\sB2>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_7>, W<\sB4>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_5>, W<\sB6>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_3>, W<\sB8>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<\sA1>, W<\sB0>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_8>, W<\sB3>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_6>, W<\sB5>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_4>, W<\sB7>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<tmp_scalar_mul_tw_2>, W<\sB9>, X<tmp_scalar_mul_1>
umaddl X<tmp_scalar_mul_1>, W<\sA0>, W<\sB1>, X<tmp_scalar_mul_1>
umull X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_9>, W<\sB3>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_7>, W<\sB5>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_5>, W<\sB7>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_3>, W<\sB9>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<\sA1>, W<\sB1>, X<tmp_scalar_mul_2>
add X<tmp_scalar_mul_2>, X<tmp_scalar_mul_2>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_8>, W<\sB4>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_6>, W<\sB6>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<tmp_scalar_mul_tw_4>, W<\sB8>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<\sA2>, W<\sB0>, X<tmp_scalar_mul_2>
umaddl X<tmp_scalar_mul_2>, W<\sA0>, W<\sB2>, X<tmp_scalar_mul_2>
add \sC1, X<tmp_scalar_mul_1>, X<tmp_scalar_mul_0>, lsr #26
and \sC0, X<tmp_scalar_mul_0>, #0x3ffffff
add \sC2, X<tmp_scalar_mul_2>, \sC1, lsr #25
bfi \sC0, \sC1, #32, #25
add X<tmp_scalar_mul_3>, X<tmp_scalar_mul_3>, \sC2, lsr #26
and \sC2, \sC2, #0x3ffffff
add \sC4, \sC4, X<tmp_scalar_mul_3>, lsr #25
bfi \sC2, X<tmp_scalar_mul_3>, #32, #25
add \sC5, \sC5, \sC4, lsr #26
and \sC4, \sC4, #0x3ffffff
bfi \sC4, \sC5, #32, #26
.endm
.macro scalar_mul sC, sA, sB
scalar_mul_inner \sC\()0, \sC\()1, \sC\()2, \sC\()3, \sC\()4, \sC\()5, \sC\()6, \sC\()7, \sC\()8, \sC\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9, \sB\()0, \sB\()1, \sB\()2, \sB\()3, \sB\()4, \sB\()5, \sB\()6, \sB\()7, \sB\()8, \sB\()9
.endm
xtmp_scalar_sub_0 .req x21
// sC0 .. sC4 output C = A + 4p - B (registers may be the same as A)
// sA0 .. sA4 first operand A
// sB0 .. sB4 second operand B
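// Adding a 4*(2^255-19) bias before subtracting keeps every packed limb
// nonnegative. The loaded constant is assumed to encode that bias in the
// two-limbs-per-register form, with the movk adjusting the word that covers
// limb 0 (where the -19 lives).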
.macro scalar_sub_inner sC0, sC1, sC2, sC3, sC4, sA0, sA1, sA2, sA3, sA4, sB0, sB1, sB2, sB3, sB4
ldr xtmp_scalar_sub_0, #=0x07fffffe07fffffc
add \sC1, \sA1, xtmp_scalar_sub_0
add \sC2, \sA2, xtmp_scalar_sub_0
add \sC3, \sA3, xtmp_scalar_sub_0
add \sC4, \sA4, xtmp_scalar_sub_0
movk xtmp_scalar_sub_0, #0xffb4
add \sC0, \sA0, xtmp_scalar_sub_0
sub \sC0, \sC0, \sB0
sub \sC1, \sC1, \sB1
sub \sC2, \sC2, \sB2
sub \sC3, \sC3, \sB3
sub \sC4, \sC4, \sB4
.endm
.macro scalar_sub sC, sA, sB
scalar_sub_inner \sC\()0, \sC\()2, \sC\()4, \sC\()6, \sC\()8, \sA\()0, \sA\()2, \sA\()4, \sA\()6, \sA\()8, \sB\()0, \sB\()2, \sB\()4, \sB\()6, \sB\()8
.endm
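// sC0 .. sC9 output C = A + multconst*B, with a full carry-propagation pass
// sA0 .. sA9 input A
// sB0 .. sB9 input B (only the low 32 bits of each limb register are used)
// The carry out of limb 9 is folded back into limb 0 as 19*carry
// (carry + 2*carry + 16*carry).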
.macro scalar_addm_inner sC0, sC1, sC2, sC3, sC4, sC5, sC6, sC7, sC8, sC9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9, sB0, sB1, sB2, sB3, sB4, sB5, sB6, sB7, sB8, sB9, multconst
ldr X<tmp_scalar_addm_0>, #=\multconst
umaddl \sC9, W<\sB9>, W<tmp_scalar_addm_0>, \sA9
umaddl \sC0, W<\sB0>, W<tmp_scalar_addm_0>, \sA0
umaddl \sC1, W<\sB1>, W<tmp_scalar_addm_0>, \sA1
umaddl \sC2, W<\sB2>, W<tmp_scalar_addm_0>, \sA2
lsr X<tmp_scalar_addm_1>, \sC9, #25
umaddl \sC3, W<\sB3>, W<tmp_scalar_addm_0>, \sA3
and \sC9, \sC9, #0x1ffffff
umaddl \sC4, W<\sB4>, W<tmp_scalar_addm_0>, \sA4
add \sC0, \sC0, X<tmp_scalar_addm_1>
umaddl \sC5, W<\sB5>, W<tmp_scalar_addm_0>, \sA5
add \sC0, \sC0, X<tmp_scalar_addm_1>, lsl #1
umaddl \sC6, W<\sB6>, W<tmp_scalar_addm_0>, \sA6
add \sC0, \sC0, X<tmp_scalar_addm_1>, lsl #4
umaddl \sC7, W<\sB7>, W<tmp_scalar_addm_0>, \sA7
umaddl \sC8, W<\sB8>, W<tmp_scalar_addm_0>, \sA8
add \sC1, \sC1, \sC0, lsr #26
and \sC0, \sC0, #0x3ffffff
add \sC2, \sC2, \sC1, lsr #25
and \sC1, \sC1, #0x1ffffff
add \sC3, \sC3, \sC2, lsr #26
and \sC2, \sC2, #0x3ffffff
add \sC4, \sC4, \sC3, lsr #25
and \sC3, \sC3, #0x1ffffff
add \sC5, \sC5, \sC4, lsr #26
and \sC4, \sC4, #0x3ffffff
add \sC6, \sC6, \sC5, lsr #25
and \sC5, \sC5, #0x1ffffff
add \sC7, \sC7, \sC6, lsr #26
and \sC6, \sC6, #0x3ffffff
add \sC8, \sC8, \sC7, lsr #25
and \sC7, \sC7, #0x1ffffff
add \sC9, \sC9, \sC8, lsr #26
and \sC8, \sC8, #0x3ffffff
.endm
.macro scalar_addm sC, sA, sB, multconst
scalar_addm_inner \sC\()0, \sC\()1, \sC\()2, \sC\()3, \sC\()4, \sC\()5, \sC\()6, \sC\()7, \sC\()8, \sC\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9, \sB\()0, \sB\()1, \sB\()2, \sB\()3, \sB\()4, \sB\()5, \sB\()6, \sB\()7, \sB\()8, \sB\()9, \multconst
.endm
// vAA0 .. vAA9 output AA = A^2
// vA0 .. vA9 input A
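// Two-way vectorized counterpart of scalar_sqr: each of the two .2s lanes
// squares an independent field element. vconst19, vMaskA and vMaskB (set up
// elsewhere) are assumed to hold 19, the 26-bit mask and the 25-bit mask,
// respectively.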
.macro vector_sqr_inner vAA0, vAA1, vAA2, vAA3, vAA4, vAA5, vAA6, vAA7, vAA8, vAA9, vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9
shl V<tmp_vector_sqr_dbl_9>.2s, \vA9\().2s, #1
shl V<tmp_vector_sqr_dbl_8>.2s, \vA8\().2s, #1
shl V<tmp_vector_sqr_dbl_7>.2s, \vA7\().2s, #1
shl V<tmp_vector_sqr_dbl_6>.2s, \vA6\().2s, #1
shl V<tmp_vector_sqr_dbl_5>.2s, \vA5\().2s, #1
shl V<tmp_vector_sqr_dbl_4>.2s, \vA4\().2s, #1
shl V<tmp_vector_sqr_dbl_3>.2s, \vA3\().2s, #1
shl V<tmp_vector_sqr_dbl_2>.2s, \vA2\().2s, #1
shl V<tmp_vector_sqr_dbl_1>.2s, \vA1\().2s, #1
umull V<tmp_vector_sqr_9>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_9>.2s
umlal V<tmp_vector_sqr_9>.2d, \vA1\().2s, V<tmp_vector_sqr_dbl_8>.2s
umlal V<tmp_vector_sqr_9>.2d, \vA2\().2s, V<tmp_vector_sqr_dbl_7>.2s
umlal V<tmp_vector_sqr_9>.2d, \vA3\().2s, V<tmp_vector_sqr_dbl_6>.2s
umlal V<tmp_vector_sqr_9>.2d, \vA4\().2s, V<tmp_vector_sqr_dbl_5>.2s
umull V<tmp_vector_sqr_8>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_8>.2s
umlal V<tmp_vector_sqr_8>.2d, V<tmp_vector_sqr_dbl_1>.2s, V<tmp_vector_sqr_dbl_7>.2s
umlal V<tmp_vector_sqr_8>.2d, \vA2\().2s, V<tmp_vector_sqr_dbl_6>.2s
umlal V<tmp_vector_sqr_8>.2d, V<tmp_vector_sqr_dbl_3>.2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_8>.2d, \vA4\().2s, \vA4\().2s
mul V<tmp_vector_sqr_tw_9>.2s, \vA9\().2s, vconst19.2s
umull V<tmp_vector_sqr_7>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_7>.2s
umlal V<tmp_vector_sqr_7>.2d, \vA1\().2s, V<tmp_vector_sqr_dbl_6>.2s
umlal V<tmp_vector_sqr_7>.2d, \vA2\().2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_7>.2d, \vA3\().2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_8>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_dbl_9>.2s
umull V<tmp_vector_sqr_6>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_6>.2s
umlal V<tmp_vector_sqr_6>.2d, V<tmp_vector_sqr_dbl_1>.2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_6>.2d, \vA2\().2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_6>.2d, V<tmp_vector_sqr_dbl_3>.2s, \vA3\().2s
umull V<tmp_vector_sqr_5>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_5>.2d, \vA1\().2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_5>.2d, \vA2\().2s, V<tmp_vector_sqr_dbl_3>.2s
umull V<tmp_vector_sqr_4>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_4>.2d, V<tmp_vector_sqr_dbl_1>.2s, V<tmp_vector_sqr_dbl_3>.2s
umlal V<tmp_vector_sqr_4>.2d, \vA2\().2s, \vA2\().2s
umull V<tmp_vector_sqr_3>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_3>.2s
umlal V<tmp_vector_sqr_3>.2d, \vA1\().2s, V<tmp_vector_sqr_dbl_2>.2s
umull V<tmp_vector_sqr_2>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_2>.2s
umlal V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_dbl_1>.2s, \vA1\().2s
umull V<tmp_vector_sqr_1>.2d, \vA0\().2s, V<tmp_vector_sqr_dbl_1>.2s
umull V<tmp_vector_sqr_0>.2d, \vA0\().2s, \vA0\().2s
usra V<tmp_vector_sqr_9>.2d, V<tmp_vector_sqr_8>.2d, #26
and V<tmp_vector_sqr_8>.16b, V<tmp_vector_sqr_8>.16b, vMaskA.16b
mul V<tmp_vector_sqr_tw_8>.2s, \vA8\().2s, vconst19.2s
bic V<tmp_vector_sqr_dbl_9>.16b, V<tmp_vector_sqr_9>.16b, vMaskB.16b
and \vA9\().16b, V<tmp_vector_sqr_9>.16b, vMaskB.16b
usra V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_dbl_9>.2d, #25
mul V<tmp_vector_sqr_tw_7>.2s, \vA7\().2s, vconst19.2s
usra V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_dbl_9>.2d, #24
mul V<tmp_vector_sqr_tw_6>.2s, \vA6\().2s, vconst19.2s
usra V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_dbl_9>.2d, #21
mul V<tmp_vector_sqr_tw_5>.2s, \vA5\().2s, vconst19.2s
shl V<tmp_vector_sqr_quad_1>.2s, V<tmp_vector_sqr_dbl_1>.2s, #1
shl V<tmp_vector_sqr_quad_3>.2s, V<tmp_vector_sqr_dbl_3>.2s, #1
shl V<tmp_vector_sqr_quad_5>.2s, V<tmp_vector_sqr_dbl_5>.2s, #1
shl V<tmp_vector_sqr_quad_7>.2s, V<tmp_vector_sqr_dbl_7>.2s, #1
umlal V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_tw_5>.2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_quad_1>.2s
umlal V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_2>.2s
umlal V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_tw_7>.2s, V<tmp_vector_sqr_quad_3>.2s
umlal V<tmp_vector_sqr_0>.2d, V<tmp_vector_sqr_tw_6>.2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_1>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_dbl_2>.2s
umlal V<tmp_vector_sqr_1>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_3>.2s
umlal V<tmp_vector_sqr_1>.2d, V<tmp_vector_sqr_tw_7>.2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_1>.2d, V<tmp_vector_sqr_tw_6>.2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_tw_6>.2s, \vA6\().2s
umlal V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_quad_3>.2s
umlal V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_tw_7>.2s, V<tmp_vector_sqr_quad_5>.2s
usra V<tmp_vector_sqr_1>.2d, V<tmp_vector_sqr_0>.2d, #26
umlal V<tmp_vector_sqr_3>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_dbl_4>.2s
umlal V<tmp_vector_sqr_3>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_5>.2s
umlal V<tmp_vector_sqr_3>.2d, V<tmp_vector_sqr_tw_7>.2s, V<tmp_vector_sqr_dbl_6>.2s
usra V<tmp_vector_sqr_2>.2d, V<tmp_vector_sqr_1>.2d, #25
umlal V<tmp_vector_sqr_4>.2d, V<tmp_vector_sqr_tw_7>.2s, V<tmp_vector_sqr_dbl_7>.2s
umlal V<tmp_vector_sqr_4>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_quad_5>.2s
umlal V<tmp_vector_sqr_4>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_6>.2s
usra V<tmp_vector_sqr_3>.2d, V<tmp_vector_sqr_2>.2d, #26
umlal V<tmp_vector_sqr_5>.2d, V<tmp_vector_sqr_tw_9>.2s, V<tmp_vector_sqr_dbl_6>.2s
umlal V<tmp_vector_sqr_5>.2d, V<tmp_vector_sqr_tw_8>.2s, V<tmp_vector_sqr_dbl_7>.2s
usra V<tmp_vector_sqr_4>.2d, V<tmp_vector_sqr_3>.2d, #25