/* (Web-extraction residue removed here: GitHub page chrome ("forked from
 * bavison/arm-mem", notification banner, file stats) and a 284-entry
 * line-number gutter from the hosted view of memcmp.S. None of it is part
 * of the assembler source, and it would not assemble.) */
/*
Copyright (c) 2013, Raspberry Pi Foundation
Copyright (c) 2013, RISC OS Open Ltd
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "arm-mem.h"
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.arch armv6
.object_arch armv4
.arm
.altmacro
.p2align 2
/* Load the next 16 bytes from each source, post-incrementing both pointers:
 * S_1 data -> DAT0-DAT3, S_2 data -> DAT4-DAT7.
 * unaligned: nonzero when S_1 is not word-aligned. ldm requires word
 * alignment, so the unaligned variant uses four separate ldr loads
 * (ARMv6 permits unaligned ldr — assumes SCTLR.U is set, as is normal
 * on Linux; TODO confirm for other platforms).
 * S_2 is always word-aligned by the time this macro is used, so its
 * ldmia is safe in both variants. */
.macro memcmp_process_head unaligned
.if unaligned
ldr DAT0, [S_1], #4
ldr DAT1, [S_1], #4
ldr DAT2, [S_1], #4
ldr DAT3, [S_1], #4
.else
ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
.endif
ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
.endm
/* Compare the 16 bytes loaded by memcmp_process_head. The CPU is in
 * big-endian data mode here (setend be in the function prologue), so an
 * unsigned word comparison ranks the lowest-addressed differing byte
 * first — exactly the ordering bytewise memcmp requires.
 * On any mismatch, branch to local label 200 (the difference handler,
 * defined in the function body) with the HI/LO condition flags from the
 * first differing word pair still set. */
.macro memcmp_process_tail
cmp DAT0, DAT4
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
cmpeq DAT3, DAT7
bne 200f
.endm
/* Compare the leading OFF (1..31) bytes so that S_2 becomes 32-byte
 * (cacheline) aligned. The bits of OFF select which sub-steps run, read
 * out via the flag side-effects of shifting OFF:
 *   movs OFF, lsl #31 -> N flag = OFF bit 0 (1 byte), C flag = bit 1 (2 bytes)
 *   movs OFF, lsl #29 -> N flag = OFF bit 2 (4 bytes), C flag = bit 3 (8 bytes)
 *   tst OFF, #16      -> 16-byte step via memcmp_process_head
 * DAT registers for steps that don't run are zeroed (movpl/movcc #0) so
 * the shared compare chain sees equal values in unused lanes.
 * N is decremented by the number of bytes consumed; any difference
 * branches to 200f with HI/LO flags set. S_1's alignment is unknown
 * here, hence the unaligned (1) form of memcmp_process_head. */
.macro memcmp_leading_31bytes
movs DAT0, OFF, lsl #31        /* N flag = OFF bit 0, C = OFF bit 1 */
ldrmib DAT0, [S_1], #1
ldrcsh DAT1, [S_1], #2
ldrmib DAT4, [S_2], #1
ldrcsh DAT5, [S_2], #2
movpl DAT0, #0                 /* zero lanes not loaded so compares match */
movcc DAT1, #0
movpl DAT4, #0
movcc DAT5, #0
submi N, N, #1
subcs N, N, #2
cmp DAT0, DAT4
cmpeq DAT1, DAT5
bne 200f
movs DAT0, OFF, lsl #29        /* N flag = OFF bit 2, C = OFF bit 3 */
ldrmi DAT0, [S_1], #4
ldrcs DAT1, [S_1], #4
ldrcs DAT2, [S_1], #4
ldrmi DAT4, [S_2], #4
ldmcsia S_2!, {DAT5, DAT6}     /* S_2 is at least 8-byte aligned here */
movpl DAT0, #0
movcc DAT1, #0
movcc DAT2, #0
movpl DAT4, #0
movcc DAT5, #0
movcc DAT6, #0
submi N, N, #4
subcs N, N, #8
cmp DAT0, DAT4
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
bne 200f
tst OFF, #16
beq 105f
memcmp_process_head 1          /* S_1 alignment unknown: unaligned loads */
sub N, N, #16
memcmp_process_tail
105:
.endm
/* Compare the final (N & 15) bytes. Only the low 4 bits of N matter on
 * entry; they are read out by shifting N and testing the flags:
 *   movs N, lsl #29 -> C = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes)
 *   movs N, lsl #2  -> C = original bit 1 (2 bytes), N flag = bit 0 (1 byte)
 *     (total shift is then 31, so C/N pick out the remaining two bits)
 * unaligned: nonzero when S_1 is not word-aligned — word loads are then
 * split into ldr since ldm needs alignment. Unused lanes are zeroed so
 * the compare chain matches on them. Branches to 200f on any difference;
 * N is destroyed. */
.macro memcmp_trailing_15bytes unaligned
movs N, N, lsl #29             /* C = N bit 3, N flag = N bit 2 */
.if unaligned
ldrcs DAT0, [S_1], #4
ldrcs DAT1, [S_1], #4
.else
ldmcsia S_1!, {DAT0, DAT1}
.endif
ldrmi DAT2, [S_1], #4
ldmcsia S_2!, {DAT4, DAT5}
ldrmi DAT6, [S_2], #4
movcc DAT0, #0                 /* zero lanes not loaded */
movcc DAT1, #0
movpl DAT2, #0
movcc DAT4, #0
movcc DAT5, #0
movpl DAT6, #0
cmp DAT0, DAT4
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
bne 200f
movs N, N, lsl #2              /* C = original N bit 1, N flag = bit 0 */
ldrcsh DAT0, [S_1], #2
ldrmib DAT1, [S_1]
ldrcsh DAT4, [S_2], #2
ldrmib DAT5, [S_2]
movcc DAT0, #0
movpl DAT1, #0
movcc DAT4, #0
movpl DAT5, #0
cmp DAT0, DAT4
cmpeq DAT1, DAT5
bne 200f
.endm
/* Main loop for the long (prefetching) path: compare 32 bytes per
 * iteration while issuing preloads for both sources, then drain the last
 * (prefetch_distance+1) 32-byte blocks plus the remainder without
 * prefetching beyond either buffer. On entry N has already been biased
 * down by (prefetch_distance+2)*32 so the subs/bhs pair stops the main
 * loop at the right point. unaligned selects the S_1 load form.
 * Falls through to a successful (equal) return of 0; any difference
 * branches to 200f. preload_trailing comes from arm-mem.h (not visible
 * in this file). */
.macro memcmp_long_inner_loop unaligned
110:
memcmp_process_head unaligned
pld [S_2, #prefetch_distance*32 + 16]  /* S_2 is cacheline-aligned here */
memcmp_process_tail
memcmp_process_head unaligned
pld [S_1, OFF]                 /* OFF aims this at the start of a line */
memcmp_process_tail
subs N, N, #32
bhs 110b
/* Just before the final (prefetch_distance+1) 32-byte blocks,
 * deal with final preloads */
preload_trailing 0, S_1, N, DAT0
preload_trailing 0, S_2, N, DAT0
add N, N, #(prefetch_distance+2)*32 - 16   /* re-bias N for 16-byte loop */
120:
memcmp_process_head unaligned
memcmp_process_tail
subs N, N, #16
bhs 120b
/* Trailing words and bytes */
tst N, #15
beq 199f
memcmp_trailing_15bytes unaligned
199: /* Reached end without detecting a difference */
mov a1, #0
setend le                      /* restore little-endian data mode for caller */
pop {DAT1-DAT6, pc}
.endm
/* Main loop for the short path (all preloads already issued up front by
 * preload_all, so no pld here): 16 bytes per iteration, then the
 * trailing 0-15 bytes. Returns 0 via a1 when the buffers compare equal;
 * any difference branches to 200f. unaligned selects the S_1 load form. */
.macro memcmp_short_inner_loop unaligned
subs N, N, #16 /* simplifies inner loop termination */
blo 122f
120:
memcmp_process_head unaligned
memcmp_process_tail
subs N, N, #16
bhs 120b
122: /* Trailing words and bytes */
tst N, #15
beq 199f
memcmp_trailing_15bytes unaligned
199: /* Reached end without detecting a difference */
mov a1, #0
setend le                      /* restore little-endian data mode for caller */
pop {DAT1-DAT6, pc}
.endm
/*
* int memcmp(const void *s1, const void *s2, size_t n);
* On entry:
* a1 = pointer to buffer 1
* a2 = pointer to buffer 2
* a3 = number of bytes to compare (as unsigned chars)
* On exit:
* a1 = >0/=0/<0 if s1 >/=/< s2
*/
/* Number of cachelines to preload ahead of the data being compared
 * (tuning knob for the long path). */
.set prefetch_distance, 2
/* myfunc (from arm-mem.h, not visible here) presumably emits the symbol
 * and function directives — TODO confirm against the header.
 * Register roles:
 *   S_1/S_2    - current read pointers into the two buffers (advance)
 *   N          - bytes remaining
 *   DAT0-DAT3  - 16 bytes loaded from S_1
 *   DAT4-DAT7  - 16 bytes loaded from S_2
 *   OFF        - leading-alignment byte count, then prefetch offset
 * Strategy: run in big-endian data mode so word-sized unsigned compares
 * order bytes the way memcmp requires; align S_2 (cacheline-align on the
 * long path, word-align on the short path) and pick the S_1 load form by
 * its residual word alignment. */
myfunc memcmp
S_1 .req a1
S_2 .req a2
N .req a3
DAT0 .req a4
DAT1 .req v1
DAT2 .req v2
DAT3 .req v3
DAT4 .req v4
DAT5 .req v5
DAT6 .req v6
DAT7 .req ip
OFF .req lr
push {DAT1-DAT6, lr}
setend be /* lowest-addressed bytes are most significant */
/* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
cmp N, #(prefetch_distance+3)*32 - 1
blo 170f
/* Long case */
/* Adjust N so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub N, N, #(prefetch_distance+2)*32
preload_leading_step1 0, DAT0, S_1
preload_leading_step1 0, DAT1, S_2
tst S_2, #31
beq 154f
rsb OFF, S_2, #0 /* no need to AND with 15 here */
preload_leading_step2 0, DAT0, S_1, OFF, DAT2
preload_leading_step2 0, DAT1, S_2, OFF, DAT2
memcmp_leading_31bytes
154: /* Second source now cacheline (32-byte) aligned; we have at
 * least one prefetch to go. */
/* Prefetch offset is best selected such that it lies in the
 * first 8 of each 32 bytes - but it's just as easy to aim for
 * the first one */
and OFF, S_1, #31
rsb OFF, OFF, #32*prefetch_distance
tst S_1, #3                    /* pick S_1 load form by word alignment */
bne 140f
memcmp_long_inner_loop 0
140: memcmp_long_inner_loop 1
170: /* Short case */
teq N, #0
beq 199f                       /* n == 0: resolves into the macro expansion below */
preload_all 0, 0, 0, S_1, N, DAT0, DAT1
preload_all 0, 0, 0, S_2, N, DAT0, DAT1
tst S_2, #3
beq 174f
172: subs N, N, #1             /* bytewise until S_2 is word-aligned */
blo 199f
ldrb DAT0, [S_1], #1
ldrb DAT4, [S_2], #1
cmp DAT0, DAT4
bne 200f
tst S_2, #3
bne 172b
174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
tst S_1, #3
bne 140f
memcmp_short_inner_loop 0
140: memcmp_short_inner_loop 1
200: /* Difference found: determine sign. */
/* HI/LO flags survive from the big-endian word (or byte) compare that
 * detected the mismatch, so they directly give the memcmp sign. */
movhi a1, #1
movlo a1, #-1
setend le                      /* restore little-endian data mode for caller */
pop {DAT1-DAT6, pc}
.unreq S_1
.unreq S_2
.unreq N
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq DAT4
.unreq DAT5
.unreq DAT6
.unreq DAT7
.unreq OFF
.endfunc