forked from slcz/arm-assembly-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sieve.S
258 lines (229 loc) · 3.68 KB
/
sieve.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#
# Sieve of Eratosthenes
# Segmented sieve, https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes#Segmented_sieve
# Running on Quad Core Raspberry PI 2, to take 60 seconds to find primes from 2 to
# 2 ^ 32 (if disable printf)
#
# Compile by: gcc sieve.S -O2 -g -o sieve -lpthread
#
#define __ASSEMBLY__
#include <asm/unistd.h>
#define O_ACCMODE 00000003
#define O_RDONLY 00000000
#define O_WRONLY 00000001
#define O_RDWR 00000002
/* THREADS = NUMBER OF CORES, LOG2STAGE2 = L1_DCACHE_SIZE * 8 */
#define THREADS (4)
#define LOG2STAGE2 (18)
#define LOG2MAX (16)
#define MAX (1<<LOG2MAX)
/* stage 1 prime number stored in half word */
#define STAGE1 (MAX * 2)
#define STAGE2 (1<<LOG2STAGE2)
#define BMAP (MAX / 8)
#define LOG2BMAP2 (LOG2STAGE2 - 3)
#define BMAP2 (STAGE2 / 8 * THREADS)
.data
format: .asciz "%u "
.bss
.balign 4
stage1: . = . + STAGE1
bitmap: . = . + BMAP
bmap2: . = . + BMAP2
number: . = . + 16
thread: . = . + (THREADS * 4)
#define OS(x) mov r7, $__NR_ ## x; swi #0
.text
#
# Bit map operation
# R0: starting index
# R12: base address of the bitmap
#
set:
mov r1, r0, lsr #5
ldr r2, [r12, r1, lsl #2]
and r0, r0, #0x1F
rsb r0, r0, #31
mov r3, #1
orr r2, r2, r3, lsl r0
str r2, [r12, r1, lsl #2]
mov pc, lr
find:
mov r1, r0, lsr #5
ldr r2, [r12, r1, lsl #2]
and r3, r0, #0x1F
mov r1, #1
mov r1, r1, lsl r3
sub r1, r1, #1
mov r1, r1, ror r3
orr r2, r2, r1
bic r0, r0, #31
1:
mvn r2, r2
clz r3, r2
add r0, r0, r3
cmp r3, #32
movne pc, lr
cmp r0, r11
moveq pc, lr
ldr r2, [r12, r0, lsr #3]
b 1b
# RETURN: R0 MODULU, R1 QUOTIENT
divide:
cmp r0, #0
moveq pc, lr
clz r10, r0
mov r2, #0
mov r3, r2
rsb r10, r10, #31
1:
mov r9, r0, lsr r10
and r9, r9, #1
orr r2, r9, r2, lsl #1
cmp r2, r1
mov r9, #1
subhs r2, r2, r1
orrhs r3, r3, r9, lsl r10
cmp r10, #0
sub r10, r10, #1
bne 1b
mov r0, r2
mov r1, r3
moveq pc, lr
display_number:
push {r12, lr}
mov r1, r0
ldr r0, =format
bl printf
pop {r12, pc}
sieve1:
push {lr}
2:
cmp r4, r11
bhs 2f
mov r0, r4
bl display_number
strh r4, [r8], #2
mov r0, r4
1:
add r0, r0, r4
cmp r0, r11
bhs 3f
mov r5, r0
bl set
mov r0, r5
b 1b
3:
mov r0, r4
add r0, r0, #1
bl find
mov r4, r0
b 2b
2:
mov r0, #0
strh r0, [r8], #2
pop {pc}
sieve2:
push {r6, lr}
mov r6, r4
3:
mov r4, r6
ldrh r5, [r8], #2
cmp r5, #0
beq 1f
mov r0, r4
mov r1, r5
bl divide
mov r4, #0
subs r4, r4, r0
addlt r4, r4, r5
2:
cmp r4, r11
bge 3b
mov r0, r4
bl set
add r4, r4, r5
b 2b
1:
mov r4, r6
mov r0, #0
1:
bl find
cmp r0, r11
bge 1f
mov r6, r0
add r0, r0, r4
bl display_number
add r0, r6, #1
cmp r0, r11
blt 1b
1:
pop {r6, pc}
partition:
push {r4-r11,lr}
mov r6, r0
# R8 / 2 number of primes in sqrt(n)
ldr r8, =stage1
mov r4, #STAGE2
add r4, r4, r6, lsl #LOG2STAGE2
2:
#if 0
mov r0, r4
mov r0, r0, lsr #26
mov r0, r0, lsl #26
cmp r0, r4
bleq display_number
#endif
# memset(bmap2, 0)
ldr r12, =bmap2
add r12, r12, r6, lsl #LOG2BMAP2
mov r0, #0
mov r1, r0
mov r2, r0
mov r3, r0
mov r5, #BMAP
1:
stmia r12!,{r0-r3}
stmia r12!,{r0-r3}
stmia r12!,{r0-r3}
stmia r12!,{r0-r3}
subs r5, r5, #64
bne 1b
ldr r12, =bmap2
add r12, r12, r6, lsl #15
mov r11, #STAGE2
ldr r8, =stage1
bl sieve2
adds r4, r4, #(STAGE2 * THREADS)
bcc 2b
pop {r4-r11,pc}
.globl main
main:
push {r4-r11,lr}
ldr r12, =bitmap
ldr r8, =stage1
mov r4, #2
mov r11, #MAX
bl sieve1
mov r4, #0
ldr r5, =thread
1:
mov r0, r5
mov r3, r4
mov r1, #0
ldr r2, =partition
bl pthread_create
add r5, r5, #4
add r4, r4, #1
cmp r4, #THREADS
blt 1b
mov r4, #0
ldr r5, =thread
1:
ldr r0, [r5], #4
mov r1, #0
bl pthread_join
add r4, r4, #1
cmp r4, #THREADS
blt 1b
pop {r4-r11,pc}