code.delx.au - pulseaudio/blob - src/pulsecore/sconv_sse.c
Merge remote-tracking branch 'mkbosmans/mingw32-build'
[pulseaudio] / src / pulsecore / sconv_sse.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include <pulsecore/g711.h>
31 #include <pulsecore/macro.h>
32 #include <pulsecore/endianmacros.h>
33
34 #include "cpu-x86.h"
35 #include "sconv.h"
36
37 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
38
39 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
40 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
41 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
42
43 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
44 pa_reg_x86 temp, i;
45
46 __asm__ __volatile__ (
47 " movaps %5, %%xmm5 \n\t"
48 " movaps %6, %%xmm6 \n\t"
49 " movaps %7, %%xmm7 \n\t"
50 " xor %0, %0 \n\t"
51
52 " mov %4, %1 \n\t"
53 " sar $3, %1 \n\t" /* 8 floats at a time */
54 " cmp $0, %1 \n\t"
55 " je 2f \n\t"
56
57 "1: \n\t"
58 " movups (%2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
59 " movups 16(%2, %0, 2), %%xmm2 \n\t"
60 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
61 " minps %%xmm5, %%xmm2 \n\t"
62 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
63 " maxps %%xmm6, %%xmm2 \n\t"
64 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
65 " mulps %%xmm7, %%xmm2 \n\t"
66
67 " cvtps2pi %%xmm0, %%mm0 \n\t" /* low part to int */
68 " cvtps2pi %%xmm2, %%mm2 \n\t"
69 " movhlps %%xmm0, %%xmm0 \n\t" /* bring high part in position */
70 " movhlps %%xmm2, %%xmm2 \n\t"
71 " cvtps2pi %%xmm0, %%mm1 \n\t" /* high part to int */
72 " cvtps2pi %%xmm2, %%mm3 \n\t"
73
74 " packssdw %%mm1, %%mm0 \n\t" /* pack parts */
75 " packssdw %%mm3, %%mm2 \n\t"
76 " movq %%mm0, (%3, %0) \n\t"
77 " movq %%mm2, 8(%3, %0) \n\t"
78
79 " add $16, %0 \n\t"
80 " dec %1 \n\t"
81 " jne 1b \n\t"
82
83 "2: \n\t"
84 " mov %4, %1 \n\t" /* prepare for leftovers */
85 " and $7, %1 \n\t"
86 " je 4f \n\t"
87
88 "3: \n\t"
89 " movss (%2, %0, 2), %%xmm0 \n\t"
90 " minss %%xmm5, %%xmm0 \n\t"
91 " maxss %%xmm6, %%xmm0 \n\t"
92 " mulss %%xmm7, %%xmm0 \n\t"
93 " cvtss2si %%xmm0, %4 \n\t"
94 " movw %w4, (%3, %0) \n\t"
95 " add $2, %0 \n\t"
96 " dec %1 \n\t"
97 " jne 3b \n\t"
98
99 "4: \n\t"
100 " emms \n\t"
101
102 : "=&r" (i), "=&r" (temp)
103 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
104 : "cc", "memory"
105 );
106 }
107
108 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
109 pa_reg_x86 temp, i;
110
111 __asm__ __volatile__ (
112 " movaps %5, %%xmm5 \n\t"
113 " movaps %6, %%xmm6 \n\t"
114 " movaps %7, %%xmm7 \n\t"
115 " xor %0, %0 \n\t"
116
117 " mov %4, %1 \n\t"
118 " sar $3, %1 \n\t" /* 8 floats at a time */
119 " cmp $0, %1 \n\t"
120 " je 2f \n\t"
121
122 "1: \n\t"
123 " movups (%2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
124 " movups 16(%2, %0, 2), %%xmm2 \n\t"
125 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
126 " minps %%xmm5, %%xmm2 \n\t"
127 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
128 " maxps %%xmm6, %%xmm2 \n\t"
129 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
130 " mulps %%xmm7, %%xmm2 \n\t"
131
132 " cvtps2dq %%xmm0, %%xmm0 \n\t"
133 " cvtps2dq %%xmm2, %%xmm2 \n\t"
134
135 " packssdw %%xmm2, %%xmm0 \n\t"
136 " movdqu %%xmm0, (%3, %0) \n\t"
137
138 " add $16, %0 \n\t"
139 " dec %1 \n\t"
140 " jne 1b \n\t"
141
142 "2: \n\t"
143 " mov %4, %1 \n\t" /* prepare for leftovers */
144 " and $7, %1 \n\t"
145 " je 4f \n\t"
146
147 "3: \n\t"
148 " movss (%2, %0, 2), %%xmm0 \n\t"
149 " minss %%xmm5, %%xmm0 \n\t"
150 " maxss %%xmm6, %%xmm0 \n\t"
151 " mulss %%xmm7, %%xmm0 \n\t"
152 " cvtss2si %%xmm0, %4 \n\t"
153 " movw %w4, (%3, %0) \n\t"
154 " add $2, %0 \n\t"
155 " dec %1 \n\t"
156 " jne 3b \n\t"
157
158 "4: \n\t"
159
160 : "=&r" (i), "=&r" (temp)
161 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
162 : "cc", "memory"
163 );
164 }
165
166 #undef RUN_TEST
167
168 #ifdef RUN_TEST
169 #define SAMPLES 1019
170 #define TIMES 1000
171
172 static void run_test(void) {
173 int16_t samples[SAMPLES];
174 int16_t samples_ref[SAMPLES];
175 float floats[SAMPLES];
176 int i;
177 pa_usec_t start, stop;
178 pa_convert_func_t func;
179
180 printf("checking SSE %zd\n", sizeof(samples));
181
182 memset(samples_ref, 0, sizeof(samples_ref));
183 memset(samples, 0, sizeof(samples));
184
185 for (i = 0; i < SAMPLES; i++) {
186 floats[i] = (rand()/(RAND_MAX+2.2)) - 1.1;
187 }
188
189 func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
190 func(SAMPLES, floats, samples_ref);
191 pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
192
193 for (i = 0; i < SAMPLES; i++) {
194 if (samples[i] != samples_ref[i]) {
195 printf ("%d: %04x != %04x (%f)\n", i, samples[i], samples_ref[i],
196 floats[i]);
197 }
198 }
199
200 start = pa_rtclock_now();
201 for (i = 0; i < TIMES; i++) {
202 pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
203 }
204 stop = pa_rtclock_now();
205 pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
206
207 start = pa_rtclock_now();
208 for (i = 0; i < TIMES; i++) {
209 func(SAMPLES, floats, samples_ref);
210 }
211 stop = pa_rtclock_now();
212 pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
213 }
214 #endif
215 #endif /* defined (__i386__) || defined (__amd64__) */
216
217
218 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
219 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
220
221 #ifdef RUN_TEST
222 run_test();
223 #endif
224
225 if (flags & PA_CPU_X86_SSE2) {
226 pa_log_info("Initialising SSE2 optimized conversions.");
227 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
228 } else {
229 pa_log_info("Initialising SSE optimized conversions.");
230 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
231 }
232
233 #endif /* defined (__i386__) || defined (__amd64__) */
234 }