-
Notifications
You must be signed in to change notification settings - Fork 1
/
convolver_xmm.c
64 lines (54 loc) · 1.9 KB
/
convolver_xmm.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
* (c) Copyright 2013 -- Anders Torger
*
* This program is open source. For license terms, see the LICENSE file.
*
*/
#include "asmprot.h"
#include <xmmintrin.h>
/*
 * Complex multiply-accumulate over single-precision SSE vectors:
 * output += input * coeffs.
 *
 * Each loop iteration handles one group of eight floats, stored as two
 * __m128 vectors: the first vector is used as four real parts and the
 * second as the four matching imaginary parts.
 *
 * Floats 0 and 4 of the first group are the exception: they are updated as
 * two independent real products (d[0] += b[0] * c[0], d[4] += b[4] * c[4])
 * instead of one complex product.
 * NOTE(review): presumably these are the purely real DC and Nyquist bins of
 * a packed half-complex spectrum -- confirm against the caller.
 *
 * input_cbuf    input buffer, read only, loop_counter * 8 floats
 * coeffs        coefficient buffer, read only, loop_counter * 8 floats
 * output_cbuf   accumulator buffer, updated in place, loop_counter * 8 floats
 * loop_counter  number of eight-float groups to process; nothing is read
 *               or written when it is zero or negative
 *
 * All three buffers are accessed through __m128 pointers and must therefore
 * be 16-byte aligned.
 */
void
convolver_sse_convolve_add(void *input_cbuf,
                           void *coeffs,
                           void *output_cbuf,
                           int loop_counter)
{
    const __m128 *b = (const __m128 *)input_cbuf;
    const __m128 *c = (const __m128 *)coeffs;
    __m128 *d = (__m128 *)output_cbuf;
    float d1s, d2s;
    int i;

    /* With no groups to process even floats 0 and 4 lie outside the data,
       so return before touching any buffer. */
    if (loop_counter <= 0) {
        return;
    }

    /* Compute the two special elements from the pre-loop buffer contents;
       the loop below would otherwise fold them into a complex product. */
    d1s = ((float *)d)[0] + ((const float *)b)[0] * ((const float *)c)[0];
    d2s = ((float *)d)[4] + ((const float *)b)[4] * ((const float *)c)[4];

    for (i = 0; i < loop_counter; i++) {
        int n = i << 1;

        /* real += b_re * c_re - b_im * c_im */
        d[n+0] = _mm_add_ps(d[n+0],
                            _mm_sub_ps(_mm_mul_ps(b[n+0], c[n+0]),
                                       _mm_mul_ps(b[n+1], c[n+1])));
        /* imag += b_re * c_im + b_im * c_re */
        d[n+1] = _mm_add_ps(d[n+1],
                            _mm_add_ps(_mm_mul_ps(b[n+0], c[n+1]),
                                       _mm_mul_ps(b[n+1], c[n+0])));
    }

    /* Replace what the loop stored in the two special slots. */
    ((float *)d)[0] = d1s;
    ((float *)d)[4] = d2s;
}
#ifdef __SSE2__
/*
 * Complex multiply-accumulate over double-precision SSE2 vectors:
 * output += input * coeffs.
 *
 * Each loop iteration handles one group of eight doubles, stored as four
 * __m128d vectors: the first two vectors are used as four real parts and
 * the last two as the four matching imaginary parts.
 *
 * Doubles 0 and 4 of the first group are the exception: they are updated as
 * two independent real products (d[0] += b[0] * c[0], d[4] += b[4] * c[4])
 * instead of one complex product.
 * NOTE(review): presumably these are the purely real DC and Nyquist bins of
 * a packed half-complex spectrum -- confirm against the caller.
 *
 * input_cbuf    input buffer, read only, loop_counter * 8 doubles
 * coeffs        coefficient buffer, read only, loop_counter * 8 doubles
 * output_cbuf   accumulator buffer, updated in place, loop_counter * 8 doubles
 * loop_counter  number of eight-double groups to process; nothing is read
 *               or written when it is zero or negative
 *
 * All three buffers are accessed through __m128d pointers and must therefore
 * be 16-byte aligned.
 */
void
convolver_sse2_convolve_add(void *input_cbuf,
                            void *coeffs,
                            void *output_cbuf,
                            int loop_counter)
{
    const __m128d *b = (const __m128d *)input_cbuf;
    const __m128d *c = (const __m128d *)coeffs;
    __m128d *d = (__m128d *)output_cbuf;
    double d1s, d2s;
    int i;

    /* With no groups to process even doubles 0 and 4 lie outside the data,
       so return before touching any buffer. */
    if (loop_counter <= 0) {
        return;
    }

    /* Compute the two special elements from the pre-loop buffer contents;
       the loop below would otherwise fold them into a complex product. */
    d1s = ((double *)d)[0] + ((const double *)b)[0] * ((const double *)c)[0];
    d2s = ((double *)d)[4] + ((const double *)b)[4] * ((const double *)c)[4];

    for (i = 0; i < loop_counter; i++) {
        int n = i << 2;

        /* real += b_re * c_re - b_im * c_im (two vectors of two doubles) */
        d[n+0] = _mm_add_pd(d[n+0],
                            _mm_sub_pd(_mm_mul_pd(b[n+0], c[n+0]),
                                       _mm_mul_pd(b[n+2], c[n+2])));
        d[n+1] = _mm_add_pd(d[n+1],
                            _mm_sub_pd(_mm_mul_pd(b[n+1], c[n+1]),
                                       _mm_mul_pd(b[n+3], c[n+3])));
        /* imag += b_re * c_im + b_im * c_re (two vectors of two doubles) */
        d[n+2] = _mm_add_pd(d[n+2],
                            _mm_add_pd(_mm_mul_pd(b[n+0], c[n+2]),
                                       _mm_mul_pd(b[n+2], c[n+0])));
        d[n+3] = _mm_add_pd(d[n+3],
                            _mm_add_pd(_mm_mul_pd(b[n+1], c[n+3]),
                                       _mm_mul_pd(b[n+3], c[n+1])));
    }

    /* Replace what the loop stored in the two special slots. */
    ((double *)d)[0] = d1s;
    ((double *)d)[4] = d2s;
}
#endif