# memcpy.S
#if 0
void *memcpy_rvv(void *restrict dest, void const *restrict src, size_t n) {
unsigned char *d = dest;
unsigned char const *s = src;
for (size_t vl; n > 0; n -= vl, s += vl, d += vl) {
vl = __riscv_vsetvl_e8m8(n);
vuint8m8_t vec_src = __riscv_vle8_v_u8m8(s, vl);
__riscv_vse8_v_u8m8(d, vec_src, vl);
}
return dest;
}
#endif
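#if 0
/* Usage sketch, assuming MX(name) appends an LMUL suffix such as "m1" when
 * this file is assembled: the routines below take a0 = dest, a1 = src,
 * a2 = len and return dest in a0, so they can be declared and called from C
 * roughly as follows. The "_m1" symbol names and example() are illustrative
 * assumptions, not definitions made by this file. */
#include <stddef.h>

void *memcpy_rvv_m1(void *dest, void const *src, size_t n);
void *memcpy_rvv_tail_m1(void *dest, void const *src, size_t n);

static void example(void) {
    unsigned char src[64] = {1, 2, 3}, dst[64];
    memcpy_rvv_m1(dst, src, sizeof src);      /* simple stripmining loop */
    memcpy_rvv_tail_m1(dst, src, sizeof src); /* bulk loop plus tail copy */
}
#endif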
#ifdef MX
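# Note: MX is assumed to be supplied by whoever assembles this file; MX(name)
# presumably appends an LMUL suffix to the symbol name and MX() expands to the
# matching LMUL (m1/m2/m4/m8), so the same source yields one routine per LMUL.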
# a0 = dest, a1 = src, a2 = len
.global MX(memcpy_rvv_)
MX(memcpy_rvv_):
mv a3, a0 # a0 must be preserved as the return value; copy via a3
1:
vsetvli t0, a2, e8, MX(), ta, ma # vl = bytes handled this iteration
vle8.v v0, (a1)
add a1, a1, t0 # src += vl
sub a2, a2, t0 # len -= vl
vse8.v v0, (a3)
add a3, a3, t0 # dest += vl
bnez a2, 1b
ret
.global MX(memcpy_rvv_align_dest_)
MX(memcpy_rvv_align_dest_):
mv a3, a0
vsetvli t0, zero, e8, MX(), ta, ma # t0 = VLMAX
bltu a2, t0, 2f # len < VLMAX: skip the alignment step
# align dest to a VLMAX-byte boundary
sub t1, zero, a0
addi t2, t0, -1
and t1, t1, t2 # align = (-dest) & (VLMAX-1)
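# worked example: VLMAX = 16 and dest = 0x1003 gives align = (-0x1003) & 0xf = 13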
vsetvli t0, t1, e8, MX(), ta, ma
1:
vle8.v v0, (a1)
add a1, a1, t0
sub a2, a2, t0
vse8.v v0, (a3)
add a3, a3, t0
2:
vsetvli t0, a2, e8, MX(), ta, ma
bnez a2, 1b
ret
.global MX(memcpy_rvv_align_src_)
MX(memcpy_rvv_align_src_):
mv a3, a0
vsetvli t0, zero, e8, MX(), ta, ma # t0 = VLMAX
bltu a2, t0, 2f # len < VLMAX: skip the alignment step
# align src to a VLMAX-byte boundary
sub t1, zero, a1
addi t2, t0, -1
and t1, t1, t2 # align = (-src) & (VLMAX-1)
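# same (-ptr) & (VLMAX-1) computation as in memcpy_rvv_align_dest_, but on src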
vsetvli t0, t1, e8, MX(), ta, ma
1:
vle8.v v0, (a1)
add a1, a1, t0
sub a2, a2, t0
vse8.v v0, (a3)
add a3, a3, t0
2:
vsetvli t0, a2, e8, MX(), ta, ma
bnez a2, 1b
ret
# combination of memcpy_rvv_align_dest and memcpy_rvv
.global MX(memcpy_rvv_align_dest_hybrid_)
MX(memcpy_rvv_align_dest_hybrid_):
mv a3, a0
vsetvli t0, zero, e8, MX(), ta, ma # t0 = VLMAX
slli t1, t0, 8 # threshold = VLMAX*256 (a cheap shift, no costly division)
bltu a2, t1, 2f # len < VLMAX*256: not worth aligning, go straight to the copy loop
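# e.g. with VLMAX = 16 the destination is only aligned for copies of 4 KiB or more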
sub t1, zero, a0
addi t2, t0, -1
and t1, t1, t2 # align = (-dest) & (VLMAX-1)
vsetvli t0, t1, e8, MX(), ta, ma # align dest to a VLMAX-byte boundary
1:
vle8.v v0, (a1)
add a1, a1, t0
sub a2, a2, t0
vse8.v v0, (a3)
add a3, a3, t0
2:
vsetvli t0, a2, e8, MX(), ta, ma
bnez a2, 1b
ret
.global MX(memcpy_rvv_tail_)
MX(memcpy_rvv_tail_):
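# strategy: round len down to a multiple of vl, copy that bulk in a loop with a
# known trip count, then copy the remaining tail bytes with one short vsetvli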
vsetvli t0, a2, e8, MX(), ta, ma
remu a3, a2, t0 # tail = n % vl
sub a2, a2, a3 # n -= tail
add a4, a0, a2 # end = dest + n
mv a2, a0 # reuse a2 as the running dest pointer
1:
vle8.v v8, (a1)
add a1, a1, t0 # src += vl
vse8.v v8, (a2)
add a2, a2, t0 # dest += vl
bltu a2, a4, 1b # dest < end
# copy tail
vsetvli zero, a3, e8, MX(), ta, ma
vle8.v v8, (a1)
vse8.v v8, (a2)
ret
# this is supposed to test how well the implementation handles
# operations with a vl smaller than VLMAX
.global MX(memcpy_rvv_128_)
MX(memcpy_rvv_128_):
li t0, 128/8 # cap the requested AVL at 16 bytes (128 bits)
bgt a2, t0, 1f # keep the cap if n > 16
mv t0, a2 # otherwise request only n bytes
1:
vsetvli t0, t0, e8, MX(), ta, ma
remu a3, a2, t0 # tail = n % vl
sub a2, a2, a3 # n -= tail
add a4, a0, a2 # end = dest + n
mv a2, a0 # reuse a2 as the running dest pointer
1:
vle8.v v8, (a1)
add a1, a1, t0 # src += vl
vse8.v v8, (a2)
add a2, a2, t0 # dest += vl
bltu a2, a4, 1b # dest < end
# copy tail
vsetvli zero, a3, e8, MX(), ta, ma
vle8.v v8, (a1)
vse8.v v8, (a2)
ret
#endif