]> code.delx.au - pulseaudio/blob - src/pulsecore/svolume_mmx.c
main: hook up cpu detection code
[pulseaudio] / src / pulsecore / svolume_mmx.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <alloca.h>
28
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
33
34 #include "cpu-x86.h"
35
36 #include "sample-util.h"
37 #include "endianmacros.h"
38
39 #if 0
/*
 * Plain C volume scaler for unsigned 8-bit samples (no MMX despite the
 * name).  Works in place.
 *
 * samples:  buffer of unsigned 8-bit samples, channels interleaved.
 * volumes:  one 32-bit factor per channel; the upper 16 bits and the lower
 *           16 bits are applied separately (see the arithmetic below).
 * channels: number of entries of `volumes` to cycle through.
 * length:   number of samples to process (one byte each).
 */
static void
pa_volume_u8_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    while (length) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t value;

        /* Re-centre around zero, scale, clamp, then restore the 0x80 bias. */
        value = (int32_t) *samples - 0x80;
        value = ((value * vol_lo) >> 16) + (value * vol_hi);
        value = PA_CLAMP_UNLIKELY(value, -0x80, 0x7F);
        *samples = (uint8_t) (value + 0x80);

        samples++;
        length--;

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
60
/*
 * Plain C volume scaler for A-law bytes (no MMX despite the name).  Works
 * in place: each byte goes through st_alaw2linear16(), is scaled by the
 * channel's factor, clamped to [-0x8000, 0x7FFF] and handed back to
 * st_13linear2alaw() after dropping the three low bits.
 *
 * samples:  buffer of A-law bytes, channels interleaved.
 * volumes:  one 32-bit factor per channel (upper and lower 16 bits are
 *           applied separately).
 * channels: number of entries of `volumes` to cycle through.
 * length:   number of samples to process (one byte each).
 */
static void
pa_volume_alaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    while (length) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t linear;

        linear = (int32_t) st_alaw2linear16(*samples);
        linear = ((linear * vol_lo) >> 16) + (linear * vol_hi);
        linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_13linear2alaw((int16_t) linear >> 3);

        samples++;
        length--;

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
81
/*
 * Plain C volume scaler for mu-law bytes (no MMX despite the name).  Works
 * in place: each byte goes through st_ulaw2linear16(), is scaled by the
 * channel's factor, clamped to [-0x8000, 0x7FFF] and handed back to
 * st_14linear2ulaw() after dropping the two low bits.
 *
 * samples:  buffer of mu-law bytes, channels interleaved.
 * volumes:  one 32-bit factor per channel (upper and lower 16 bits are
 *           applied separately).
 * channels: number of entries of `volumes` to cycle through.
 * length:   number of samples to process (one byte each).
 */
static void
pa_volume_ulaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    while (length) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t linear;

        linear = (int32_t) st_ulaw2linear16(*samples);
        linear = ((linear * vol_lo) >> 16) + (linear * vol_hi);
        linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_14linear2ulaw((int16_t) linear >> 2);

        samples++;
        length--;

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
102 #endif
103
/*
 * Asm fragment: scale 16-bit samples by 32-bit volume factors.
 *
 * smp: MMX register whose low 32 bits hold the packed samples | p1 | p0 |.
 *      It is overwritten.
 * vol: MMX register holding one 32-bit factor per sample,
 *      | v1h | v1l | v0h | v0l |.  On exit its low 32 bits hold the packed,
 *      signed-saturated results | p1*v1 | p0*v0 |.
 *
 * Per sample the fragment computes p * vh + ((p * vl) >> 16), with vl taken
 * as an unsigned 16-bit value: pmulhw multiplies signed, so p is added back
 * to the low product whenever vl has its top bit set.
 *
 * Uses %mm4 and %mm5 as scratch.  Expects %mm6 to hold | ffff | 0 | and
 * %mm7 to hold | 0 | 1 | in every 32-bit lane.
 *
 * The lane diagrams on the right show one 32-bit lane, high word first.
 */
#define VOLUME_32x16(smp, vol)                /* .. |   vh  |   vl  | */                    \
    " pxor %%mm4, %%mm4 \n\t"                 /* .. |    0  |    0  | */                    \
    " punpcklwd %%mm4, " #smp " \n\t"         /* .. |    0  |   p0  | */                    \
    " pcmpgtw " #vol ", %%mm4 \n\t"           /* .. |    0  | s(vl) | */                    \
    " pand " #smp ", %%mm4 \n\t"              /* .. |    0  |  (p0) |  (vl >> 15) & p */    \
    " movq %%mm6, %%mm5 \n\t"                 /* .. |  ffff |    0  | */                    \
    " pand " #vol ", %%mm5 \n\t"              /* .. |   vh  |    0  | */                    \
    " por %%mm5, %%mm4 \n\t"                  /* .. |   vh  |  (p0) | */                    \
    " pmulhw " #smp ", " #vol " \n\t"         /* .. |    0  | vl*p0 | */                    \
    " paddw %%mm4, " #vol " \n\t"             /* .. |   vh  | vl*p0 |  vh + sign correct */ \
    " pslld $16, " #smp " \n\t"               /* .. |   p0  |    0  | */                    \
    " por %%mm7, " #smp " \n\t"               /* .. |   p0  |    1  | */                    \
    " pmaddwd " #smp ", " #vol " \n\t"        /* .. |    p0 * v0    | */                    \
    " packssdw " #vol ", " #vol " \n\t"       /* .. | p1*v1 | p0*v0 | */
118
/*
 * Asm fragment: advance the channel index with wrap-around.
 *
 * Computes %3 = %3 + step, then subtracts `limit` once if the sum is
 * greater than or equal to it (unsigned compare).  Asm operand %3 is the
 * index and operand %4 is used as scratch, so this only makes sense inside
 * an asm statement that numbers its operands that way.
 *
 * step:  asm source operand added to the index (e.g. $4).
 * limit: asm source operand holding the wrap point (e.g. %5).
 */
#define MOD_ADD(step, limit)          \
    " add " #step ", %3 \n\t"         \
    " mov %3, %4 \n\t"                \
    " sub " #limit ", %4 \n\t"        \
    " cmp " #limit ", %3 \n\t"        \
    " cmovae %4, %3 \n\t"
125
/*
 * Asm fragment: exchange the two bytes of every 16-bit word in MMX
 * register `reg`.  Uses %mm4 as scratch.
 */
#define SWAP_16(reg)                  \
    " movq " #reg ", %%mm4 \n\t"      /* .. |  h  l  | */ \
    " psrlw $8, %%mm4 \n\t"           /* .. |  0  h  | */ \
    " psllw $8, " #reg " \n\t"        /* .. |  l  0  | */ \
    " por %%mm4, " #reg " \n\t"       /* .. |  l  h  | */
132
/*
 * Asm fragment: exchange the two bytes of every 16-bit word in two MMX
 * registers at once.  Same operation as a byte swap on each register, with
 * the instructions for `first` and `second` interleaved for better pairing.
 * Uses %mm4 (for `first`) and %mm5 (for `second`) as scratch.
 */
#define SWAP_16_2(first, second)      \
    " movq " #first ", %%mm4 \n\t"    /* .. |  h  l  | */ \
    " movq " #second ", %%mm5 \n\t"   \
    " psrlw $8, %%mm4 \n\t"           /* .. |  0  h  | */ \
    " psrlw $8, %%mm5 \n\t"           \
    " psllw $8, " #first " \n\t"      /* .. |  l  0  | */ \
    " psllw $8, " #second " \n\t"     \
    " por %%mm4, " #first " \n\t"     /* .. |  l  h  | */ \
    " por %%mm5, " #second " \n\t"
143
144 static void
145 pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
146 {
147 pa_reg_x86 channel, temp;
148
149 /* the max number of samples we process at a time, this is also the max amount
150 * we overread the volume array, which should have enough padding. */
151 channels = MAX (4, channels);
152
153 __asm__ __volatile__ (
154 " xor %3, %3 \n\t"
155 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
156 " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
157 " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
158 " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
159 " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
160
161 " test $1, %2 \n\t" /* check for odd samples */
162 " je 2f \n\t"
163
164 " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
165 " movw (%0), %4 \n\t" /* .. | p0 | */
166 " movd %4, %%mm1 \n\t"
167 VOLUME_32x16 (%%mm1, %%mm0)
168 " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
169 " movw %4, (%0) \n\t"
170 " add $2, %0 \n\t"
171 MOD_ADD ($1, %5)
172
173 "2: \n\t"
174 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
175 " test $1, %2 \n\t" /* check for odd samples */
176 " je 4f \n\t"
177
178 "3: \n\t" /* do samples in groups of 2 */
179 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
180 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
181 VOLUME_32x16 (%%mm1, %%mm0)
182 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
183 " add $4, %0 \n\t"
184 MOD_ADD ($2, %5)
185
186 "4: \n\t"
187 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
188 " cmp $0, %2 \n\t"
189 " je 6f \n\t"
190
191 "5: \n\t" /* do samples in groups of 4 */
192 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
193 " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
194 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
195 " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
196 VOLUME_32x16 (%%mm1, %%mm0)
197 VOLUME_32x16 (%%mm3, %%mm2)
198 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
199 " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
200 " add $8, %0 \n\t"
201 MOD_ADD ($4, %5)
202 " dec %2 \n\t"
203 " jne 5b \n\t"
204
205 "6: \n\t"
206 " emms \n\t"
207
208 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
209 : "r" ((pa_reg_x86)channels)
210 : "cc"
211 );
212 }
213
214 static void
215 pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
216 {
217 pa_reg_x86 channel, temp;
218
219 /* the max number of samples we process at a time, this is also the max amount
220 * we overread the volume array, which should have enough padding. */
221 channels = MAX (4, channels);
222
223 __asm__ __volatile__ (
224 " xor %3, %3 \n\t"
225 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
226 " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
227 " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
228 " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
229 " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
230
231 " test $1, %2 \n\t" /* check for odd samples */
232 " je 2f \n\t"
233
234 " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
235 " movw (%0), %4 \n\t" /* .. | p0 | */
236 " rorw $8, %4 \n\t"
237 " movd %4, %%mm1 \n\t"
238 VOLUME_32x16 (%%mm1, %%mm0)
239 " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
240 " rorw $8, %4 \n\t"
241 " movw %4, (%0) \n\t"
242 " add $2, %0 \n\t"
243 MOD_ADD ($1, %5)
244
245 "2: \n\t"
246 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
247 " test $1, %2 \n\t" /* check for odd samples */
248 " je 4f \n\t"
249
250 "3: \n\t" /* do samples in groups of 2 */
251 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
252 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
253 SWAP_16 (%%mm1)
254 VOLUME_32x16 (%%mm1, %%mm0)
255 SWAP_16 (%%mm0)
256 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
257 " add $4, %0 \n\t"
258 MOD_ADD ($2, %5)
259
260 "4: \n\t"
261 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
262 " cmp $0, %2 \n\t"
263 " je 6f \n\t"
264
265 "5: \n\t" /* do samples in groups of 4 */
266 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
267 " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
268 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
269 " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
270 SWAP_16_2 (%%mm1, %%mm3)
271 VOLUME_32x16 (%%mm1, %%mm0)
272 VOLUME_32x16 (%%mm3, %%mm2)
273 SWAP_16_2 (%%mm0, %%mm2)
274 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
275 " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
276 " add $8, %0 \n\t"
277 MOD_ADD ($4, %5)
278 " dec %2 \n\t"
279 " jne 5b \n\t"
280
281 "6: \n\t"
282 " emms \n\t"
283
284 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
285 : "r" ((pa_reg_x86)channels)
286 : "cc"
287 );
288 }
289
290 #if 0
/*
 * Plain C volume scaler for native-endian floats (no MMX despite the
 * name).  Multiplies every sample in place by its channel's factor.
 *
 * samples:  buffer of floats, channels interleaved.
 * volumes:  one float factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_float32ne_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (float);

    for (; remaining; remaining--, samples++) {
        *samples *= volumes[ch];

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
305
/*
 * Plain C volume scaler for reverse-endian floats (no MMX despite the
 * name).  Each sample is passed through PA_FLOAT32_SWAP, multiplied by its
 * channel's factor and passed through PA_FLOAT32_SWAP again before being
 * stored back.
 *
 * samples:  buffer of floats, channels interleaved.
 * volumes:  one float factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_float32re_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (float);

    for (; remaining; remaining--, samples++) {
        float value = PA_FLOAT32_SWAP(*samples);

        value *= volumes[ch];
        *samples = PA_FLOAT32_SWAP(value);

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
324
/*
 * Plain C volume scaler for native-endian signed 32-bit samples (no MMX
 * despite the name).  Each sample is widened to 64 bits, multiplied by its
 * channel's factor, shifted right by 16 and clamped to the int32_t range.
 *
 * samples:  buffer of samples, channels interleaved; overwritten in place.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s32ne_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (int32_t);

    for (; remaining; remaining--, samples++) {
        int64_t wide = (int64_t)(*samples);

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = (int32_t) wide;

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
344
/*
 * Plain C volume scaler for reverse-endian signed 32-bit samples (no MMX
 * despite the name).  Each sample is passed through PA_INT32_SWAP, widened
 * to 64 bits, multiplied by its channel's factor, shifted right by 16,
 * clamped to the int32_t range and passed through PA_INT32_SWAP again.
 *
 * samples:  buffer of samples, channels interleaved; overwritten in place.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s32re_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (int32_t);

    for (; remaining; remaining--, samples++) {
        int64_t wide = (int64_t) PA_INT32_SWAP(*samples);

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_INT32_SWAP((int32_t) wide);

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
364
/*
 * Plain C volume scaler for packed 3-byte samples read and written with
 * PA_READ24NE / PA_WRITE24NE (no MMX despite the name).  The value read is
 * shifted up by 8 bits into a signed 32-bit value, multiplied by its
 * channel's factor in 64 bits, shifted right by 16, clamped to the int32_t
 * range, and the upper 24 bits are written back.
 *
 * samples:  byte buffer, three bytes per sample, channels interleaved.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s24ne_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    uint8_t *end = samples + length;

    while (samples < end) {
        int64_t wide = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) wide) >> 8);

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;

        samples += 3;
    }
}
385
/*
 * Plain C volume scaler for packed 3-byte samples read and written with
 * PA_READ24RE / PA_WRITE24RE (no MMX despite the name).  The value read is
 * shifted up by 8 bits into a signed 32-bit value, multiplied by its
 * channel's factor in 64 bits, shifted right by 16, clamped to the int32_t
 * range, and the upper 24 bits are written back.
 *
 * samples:  byte buffer, three bytes per sample, channels interleaved.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s24re_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    uint8_t *end = samples + length;

    while (samples < end) {
        int64_t wide = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) wide) >> 8);

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;

        samples += 3;
    }
}
406
/*
 * Plain C volume scaler for native-endian 32-bit words whose value is
 * shifted up by 8 bits before scaling (no MMX despite the name).  The
 * shifted value is treated as signed 32-bit, multiplied by its channel's
 * factor in 64 bits, shifted right by 16, clamped to the int32_t range and
 * shifted back down by 8 bits before being stored.
 *
 * samples:  buffer of 32-bit words, channels interleaved; overwritten.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s24_32ne_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (uint32_t);

    for (; remaining; remaining--, samples++) {
        int64_t wide = (int64_t) ((int32_t) (*samples << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = ((uint32_t) ((int32_t) wide)) >> 8;

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
426
/*
 * Plain C volume scaler for reverse-endian 32-bit words whose value is
 * shifted up by 8 bits before scaling (no MMX despite the name).  Each
 * word is passed through PA_UINT32_SWAP, shifted up by 8 bits, treated as
 * signed 32-bit, multiplied by its channel's factor in 64 bits, shifted
 * right by 16, clamped to the int32_t range, shifted back down by 8 bits
 * and passed through PA_UINT32_SWAP again before being stored.
 *
 * samples:  buffer of 32-bit words, channels interleaved; overwritten.
 * volumes:  one 32-bit factor per channel.
 * channels: number of entries of `volumes` to cycle through.
 * length:   size of the sample buffer in bytes.
 */
static void
pa_volume_s24_32re_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;
    unsigned remaining = length / sizeof (uint32_t);

    for (; remaining; remaining--, samples++) {
        int64_t wide = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_UINT32_SWAP(((uint32_t) ((int32_t) wide)) >> 8);

        /* Advance to the next channel's factor, wrapping after the last. */
        if (PA_UNLIKELY(++ch >= channels))
            ch = 0;
    }
}
446 #endif
447
448 #undef RUN_TEST
449
450 #ifdef RUN_TEST
451 #define CHANNELS 2
452 #define SAMPLES 1021
453 #define TIMES 1000
454 #define PADDING 16
455
456 static void run_test (void) {
457 int16_t samples[SAMPLES];
458 int16_t samples_ref[SAMPLES];
459 int16_t samples_orig[SAMPLES];
460 int32_t volumes[CHANNELS + PADDING];
461 int i, j, padding;
462 pa_do_volume_func_t func;
463
464 func = pa_get_volume_func (PA_SAMPLE_S16RE);
465
466 printf ("checking MMX %d\n", sizeof (samples));
467
468 for (j = 0; j < TIMES; j++) {
469 pa_random (samples, sizeof (samples));
470 memcpy (samples_ref, samples, sizeof (samples));
471 memcpy (samples_orig, samples, sizeof (samples));
472
473 for (i = 0; i < CHANNELS; i++)
474 volumes[i] = rand() >> 1;
475 for (padding = 0; padding < PADDING; padding++, i++)
476 volumes[i] = volumes[padding];
477
478 pa_volume_s16re_mmx (samples, volumes, CHANNELS, sizeof (samples));
479 func (samples_ref, volumes, CHANNELS, sizeof (samples));
480
481 for (i = 0; i < SAMPLES; i++) {
482 if (samples[i] != samples_ref[i]) {
483 printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
484 samples_orig[i], volumes[i % CHANNELS]);
485 }
486 }
487 }
488 }
489 #endif
490
491 void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {
492 pa_log_info("Initialising MMX optimized functions.");
493
494 #ifdef RUN_TEST
495 run_test ();
496 #endif
497
498 pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
499 pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
500 }