]> code.delx.au - pulseaudio/blob - src/pulsecore/svolume_sse.c
main: hook up cpu detection code
[pulseaudio] / src / pulsecore / svolume_sse.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <alloca.h>
28
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
33
34 #include "cpu-x86.h"
35
36 #include "sample-util.h"
37 #include "endianmacros.h"
38
39 #if 0
/* Plain C reference: scale unsigned 8-bit samples in place.
 * Each volume is a 16.16 fixed-point factor; `length` counts samples
 * (one byte each) and the volume index cycles through `channels`. */
static void
pa_volume_u8_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    for (; length > 0; length--) {
        const int32_t vol_hi = volumes[ch] >> 16;    /* integer part */
        const int32_t vol_lo = volumes[ch] & 0xFFFF; /* fractional part */
        int32_t v;

        /* remove the 0x80 bias so the sample is signed around zero */
        v = (int32_t) *samples - 0x80;
        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x80, 0x7F);
        *samples = (uint8_t) (v + 0x80);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
60
/* Plain C reference: scale A-law samples in place.
 * Every byte is expanded with st_alaw2linear16(), multiplied by the
 * 16.16 fixed-point volume of its channel, clamped to the signed 16-bit
 * range and re-encoded with st_13linear2alaw(). */
static void
pa_volume_alaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    for (; length > 0; length--) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v;

        v = (int32_t) st_alaw2linear16(*samples);
        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x8000, 0x7FFF);
        /* the cast applies before the shift: truncate to 16 bit, then drop 3 bits */
        *samples = (uint8_t) st_13linear2alaw((int16_t) v >> 3);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
81
/* Plain C reference: scale mu-law samples in place.
 * Every byte is expanded with st_ulaw2linear16(), multiplied by the
 * 16.16 fixed-point volume of its channel, clamped to the signed 16-bit
 * range and re-encoded with st_14linear2ulaw(). */
static void
pa_volume_ulaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    for (; length > 0; length--) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v;

        v = (int32_t) st_ulaw2linear16(*samples);
        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x8000, 0x7FFF);
        /* the cast applies before the shift: truncate to 16 bit, then drop 2 bits */
        *samples = (uint8_t) st_14linear2ulaw((int16_t) v >> 2);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
102 #endif
103
/* Multiply packed signed 16-bit samples by packed 32-bit volumes.
 *
 * s: xmm register holding the samples in its low 16-bit words.
 * v: xmm register holding one 32-bit volume (high word vh, low word vl)
 *    per sample; on exit it holds the results packed as signed 16-bit
 *    words with saturation (packssdw).
 *
 * Per sample p the sequence computes ((p * vl) >> 16) + p * vh:
 *  - punpcklwd against zero spreads every sample into its own 32-bit lane;
 *  - pmulhuw yields the high half of the *unsigned* product p * vl, so for
 *    negative samples vl is subtracted afterwards (mask built by pcmpgtw,
 *    applied by pand/psubd) to obtain the signed result;
 *  - v is shifted down so that only vh remains and pmaddwd adds p * vh.
 *
 * Scratch registers xmm4 and xmm5 are overwritten, and s is modified. */
#define VOLUME_32x16(s,v) /* .. | vh | vl | */ \
" pxor %%xmm4, %%xmm4 \n\t" /* .. | 0 | 0 | */ \
" punpcklwd %%xmm4, "#s" \n\t" /* .. | 0 | p0 | */ \
" pcmpgtw "#s", %%xmm4 \n\t" /* .. | 0 | s(p0) | */ \
" pand "#v", %%xmm4 \n\t" /* .. | 0 | (vl) | */ \
" movdqa "#s", %%xmm5 \n\t" \
" pmulhuw "#v", "#s" \n\t" /* .. | 0 | vl*p0 | */ \
" psubd %%xmm4, "#s" \n\t" /* .. | 0 | vl*p0 | + sign correct */ \
" psrld $16, "#v" \n\t" /* .. | p0 | 0 | */ \
" pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \
" paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \
" packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */
116
/* Advance the channel index with wrap-around.
 *
 * a: amount to add (the callers in this file pass an immediate).
 * b: wrap limit (the callers in this file pass asm operand %5).
 *
 * Hard-wired to the operand numbering of the enclosing asm statement:
 * %3 is the channel index and %4 a scratch register. After the addition
 * the limit is subtracted once if the index is >= b (unsigned compare,
 * cmovae), so the index must have been below b on entry and a must not
 * exceed b. */
#define MOD_ADD(a,b) \
" add "#a", %3 \n\t" /* channel += inc */ \
" mov %3, %4 \n\t" \
" sub "#b", %4 \n\t" /* tmp = channel - channels */ \
" cmp "#b", %3 \n\t" /* if (channel >= channels) */ \
" cmovae %4, %3 \n\t" /* channel = tmp */
123
/* swap 16 bits */
/* Exchanges the two bytes of every 16-bit word in xmm register s:
 * a copy shifted right by 8 is OR-ed with the original shifted left by 8.
 * Overwrites xmm4 as scratch. */
#define SWAP_16(s) \
" movdqa "#s", %%xmm4 \n\t" /* .. | h l | */ \
" psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \
" psllw $8, "#s" \n\t" /* .. | l 0 | */ \
" por %%xmm4, "#s" \n\t" /* .. | l h | */
130
/* swap 2 registers 16 bits for better pairing */
/* Same byte swap as the single-register variant, applied to xmm registers
 * s1 and s2 with the two instruction streams interleaved.
 * Overwrites xmm4 (scratch for s1) and xmm5 (scratch for s2). */
#define SWAP_16_2(s1,s2) \
" movdqa "#s1", %%xmm4 \n\t" /* .. | h l | */ \
" movdqa "#s2", %%xmm5 \n\t" \
" psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \
" psrlw $8, %%xmm5 \n\t" \
" psllw $8, "#s1" \n\t" /* .. | l 0 | */ \
" psllw $8, "#s2" \n\t" \
" por %%xmm4, "#s1" \n\t" /* .. | l h | */ \
" por %%xmm5, "#s2" \n\t"
141
142 static void
143 pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
144 {
145 pa_reg_x86 channel, temp;
146
147 /* the max number of samples we process at a time, this is also the max amount
148 * we overread the volume array, which should have enough padding. */
149 channels = MAX (8, channels);
150
151 __asm__ __volatile__ (
152 " xor %3, %3 \n\t"
153 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
154
155 " test $1, %2 \n\t" /* check for odd samples */
156 " je 2f \n\t"
157
158 " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
159 " movw (%0), %4 \n\t" /* .. | p0 | */
160 " movd %4, %%xmm1 \n\t"
161 VOLUME_32x16 (%%xmm1, %%xmm0)
162 " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
163 " movw %4, (%0) \n\t"
164 " add $2, %0 \n\t"
165 MOD_ADD ($1, %5)
166
167 "2: \n\t"
168 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
169 " test $1, %2 \n\t"
170 " je 4f \n\t"
171
172 "3: \n\t" /* do samples in groups of 2 */
173 " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
174 " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
175 VOLUME_32x16 (%%xmm1, %%xmm0)
176 " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
177 " add $4, %0 \n\t"
178 MOD_ADD ($2, %5)
179
180 "4: \n\t"
181 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
182 " test $1, %2 \n\t"
183 " je 6f \n\t"
184
185 "5: \n\t" /* do samples in groups of 4 */
186 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
187 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
188 VOLUME_32x16 (%%xmm1, %%xmm0)
189 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
190 " add $8, %0 \n\t"
191 MOD_ADD ($4, %5)
192
193 "6: \n\t"
194 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
195 " cmp $0, %2 \n\t"
196 " je 8f \n\t"
197
198 "7: \n\t" /* do samples in groups of 8 */
199 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
200 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
201 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
202 " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
203 VOLUME_32x16 (%%xmm1, %%xmm0)
204 VOLUME_32x16 (%%xmm3, %%xmm2)
205 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
206 " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
207 " add $16, %0 \n\t"
208 MOD_ADD ($8, %5)
209 " dec %2 \n\t"
210 " jne 7b \n\t"
211 "8: \n\t"
212
213 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
214 : "r" ((pa_reg_x86)channels)
215 : "cc"
216 );
217 }
218
219 static void
220 pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
221 {
222 pa_reg_x86 channel, temp;
223
224 /* the max number of samples we process at a time, this is also the max amount
225 * we overread the volume array, which should have enough padding. */
226 channels = MAX (8, channels);
227
228 __asm__ __volatile__ (
229 " xor %3, %3 \n\t"
230 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
231
232 " test $1, %2 \n\t" /* check for odd samples */
233 " je 2f \n\t"
234
235 " movd (%1, %3, 4), %%xmm0 \n\t" /* do odd sample */
236 " movw (%0), %4 \n\t"
237 " rorw $8, %4 \n\t"
238 " movd %4, %%xmm1 \n\t"
239 VOLUME_32x16 (%%xmm1, %%xmm0)
240 " movd %%xmm0, %4 \n\t"
241 " rorw $8, %4 \n\t"
242 " movw %4, (%0) \n\t"
243 " add $2, %0 \n\t"
244 MOD_ADD ($1, %5)
245
246 "2: \n\t"
247 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
248 " test $1, %2 \n\t" /* check for odd samples */
249 " je 4f \n\t"
250
251 "3: \n\t" /* do samples in pairs of 2 */
252 " movq (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
253 " movd (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
254 SWAP_16 (%%xmm1)
255 VOLUME_32x16 (%%xmm1, %%xmm0)
256 SWAP_16 (%%xmm0)
257 " movd %%xmm0, (%0) \n\t"
258 " add $4, %0 \n\t"
259 MOD_ADD ($2, %5)
260
261 "4: \n\t"
262 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
263 " test $1, %2 \n\t" /* check for odd samples */
264 " je 6f \n\t"
265
266 "5: \n\t" /* do samples in pairs of 4 */
267 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
268 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
269 SWAP_16 (%%xmm1)
270 VOLUME_32x16 (%%xmm1, %%xmm0)
271 SWAP_16 (%%xmm0)
272 " movq %%xmm0, (%0) \n\t"
273 " add $8, %0 \n\t"
274 MOD_ADD ($4, %5)
275
276 "6: \n\t"
277 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
278 " cmp $0, %2 \n\t"
279 " je 8f \n\t"
280
281 "7: \n\t" /* do samples in pairs of 8 */
282 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
283 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* v3_h | v3_l | v2_h | v2_l */
284 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
285 " movq 8(%0), %%xmm3 \n\t" /* X | X | p3 | p2 */
286 SWAP_16_2 (%%xmm1, %%xmm3)
287 VOLUME_32x16 (%%xmm1, %%xmm0)
288 VOLUME_32x16 (%%xmm3, %%xmm2)
289 SWAP_16_2 (%%xmm0, %%xmm2)
290 " movq %%xmm0, (%0) \n\t"
291 " movq %%xmm2, 8(%0) \n\t"
292 " add $16, %0 \n\t"
293 MOD_ADD ($8, %5)
294 " dec %2 \n\t"
295 " jne 7b \n\t"
296 "8: \n\t"
297
298 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
299 : "r" ((pa_reg_x86)channels)
300 : "cc"
301 );
302 }
303
304 #if 0
/* Plain C reference: multiply native-endian float samples in place by the
 * per-channel float volume. `length` is the buffer size in bytes. */
static void
pa_volume_float32ne_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (float);

    while (remaining > 0) {
        *samples++ *= volumes[ch];

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
319
/* Plain C reference: multiply reverse-endian float samples in place by the
 * per-channel float volume. Each sample is swapped with PA_FLOAT32_SWAP
 * before scaling and swapped back before it is stored. `length` is the
 * buffer size in bytes. */
static void
pa_volume_float32re_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (float);

    while (remaining > 0) {
        float v = PA_FLOAT32_SWAP(*samples);

        v *= volumes[ch];
        *samples++ = PA_FLOAT32_SWAP(v);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
338
/* Plain C reference: scale native-endian signed 32-bit samples in place.
 * The product with the 16.16 fixed-point volume is formed in 64 bit,
 * shifted back by 16 and clamped to the int32_t range. `length` is the
 * buffer size in bytes. */
static void
pa_volume_s32ne_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (int32_t);

    while (remaining > 0) {
        int64_t v = (int64_t)(*samples);

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = (int32_t) v;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
358
/* Plain C reference: scale reverse-endian signed 32-bit samples in place.
 * Samples are swapped with PA_INT32_SWAP on load and on store; the product
 * with the 16.16 fixed-point volume is formed in 64 bit, shifted back by 16
 * and clamped to the int32_t range. `length` is the buffer size in bytes. */
static void
pa_volume_s32re_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (int32_t);

    while (remaining > 0) {
        int64_t v = (int64_t) PA_INT32_SWAP(*samples);

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_INT32_SWAP((int32_t) v);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
378
/* Plain C reference: scale packed native-endian 24-bit samples in place.
 * Each 3-byte sample is read with PA_READ24NE, moved into the upper 24 bits
 * of an int32_t, scaled by the 16.16 fixed-point volume in 64 bit, clamped
 * and written back with PA_WRITE24NE. `length` is the buffer size in bytes. */
static void
pa_volume_s24ne_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t v = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) v) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
399
/* Plain C reference: scale packed reverse-endian 24-bit samples in place.
 * Each 3-byte sample is read with PA_READ24RE, moved into the upper 24 bits
 * of an int32_t, scaled by the 16.16 fixed-point volume in 64 bit, clamped
 * and written back with PA_WRITE24RE. `length` is the buffer size in bytes. */
static void
pa_volume_s24re_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t v = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) v) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
420
/* Plain C reference: scale native-endian 24-bit samples stored in 32-bit
 * words, in place. The word is shifted left by 8 so the sample occupies the
 * upper 24 bits, scaled by the 16.16 fixed-point volume in 64 bit, clamped
 * and shifted back down by 8. `length` is the buffer size in bytes. */
static void
pa_volume_s24_32ne_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (uint32_t);

    while (remaining > 0) {
        int64_t v = (int64_t) ((int32_t) (*samples << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = ((uint32_t) ((int32_t) v)) >> 8;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
440
/* Plain C reference: scale reverse-endian 24-bit samples stored in 32-bit
 * words, in place. Words are swapped with PA_UINT32_SWAP on load and on
 * store; in between the sample is shifted into the upper 24 bits, scaled by
 * the 16.16 fixed-point volume in 64 bit, clamped and shifted back down.
 * `length` is the buffer size in bytes. */
static void
pa_volume_s24_32re_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (uint32_t);

    while (remaining > 0) {
        int64_t v = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) v)) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        remaining--;
    }
}
460 #endif
461
462 #undef RUN_TEST
463
#ifdef RUN_TEST
#define CHANNELS 2
#define SAMPLES 1021
#define TIMES 1000
#define PADDING 16

/* Self test: runs the reverse-endian SSE kernel and the volume function
 * currently registered for PA_SAMPLE_S16RE over identical random input
 * TIMES times and prints every sample where the two results differ.
 * The volume array carries PADDING extra entries that repeat the first
 * ones, because the SSE kernel reads past the per-channel volumes. */
static void run_test (void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    int16_t samples_orig[SAMPLES];
    int32_t volumes[CHANNELS + PADDING];
    int i, j, padding;
    pa_do_volume_func_t func;

    func = pa_get_volume_func (PA_SAMPLE_S16RE);

    /* sizeof yields a size_t, which needs %zu */
    printf ("checking SSE %zu\n", sizeof (samples));

    for (j = 0; j < TIMES; j++) {
        pa_random (samples, sizeof (samples));
        memcpy (samples_ref, samples, sizeof (samples));
        memcpy (samples_orig, samples, sizeof (samples));

        for (i = 0; i < CHANNELS; i++)
            volumes[i] = rand() >> 1;
        for (padding = 0; padding < PADDING; padding++, i++)
            volumes[i] = volumes[padding];

        pa_volume_s16re_sse (samples, volumes, CHANNELS, SAMPLES * sizeof (int16_t));
        func (samples_ref, volumes, CHANNELS, SAMPLES * sizeof (int16_t));

        for (i = 0; i < SAMPLES; i++) {
            if (samples[i] != samples_ref[i]) {
                /* %x expects unsigned; go through uint16_t so that negative
                 * samples are not printed sign-extended */
                printf ("%d: %04x != %04x (%04x * %04x)\n", i,
                        (unsigned) (uint16_t) samples[i],
                        (unsigned) (uint16_t) samples_ref[i],
                        (unsigned) (uint16_t) samples_orig[i],
                        (unsigned) volumes[i % CHANNELS]);
            }
        }
    }
}
#endif
504
505 void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {
506 pa_log_info("Initialising SSE optimized functions.");
507
508 #ifdef RUN_TEST
509 run_test ();
510 #endif
511
512 pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
513 pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);
514 }