/* src/pulsecore/svolume_mmx.c (PulseAudio), as published on code.delx.au */
/***
  This file is part of PulseAudio.

  Copyright 2004-2006 Lennart Poettering
  Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License,
  or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with PulseAudio; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  USA.
***/
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
36 #include "sample-util.h"
37 #include "endianmacros.h"
/**
 * Scale unsigned 8-bit samples in place by per-channel volumes.
 *
 * Each volume is a 16.16 fixed-point factor (0x10000 leaves a sample
 * unchanged). Samples are treated as signed values around the 0x80 bias
 * and the result is clamped to the signed 8-bit range before the bias is
 * added back.
 *
 * @param samples  interleaved u8 samples, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   number of samples (bytes) to process
 */
static void
pa_volume_u8_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        /* split the factor so the partial products stay within 32 bits */
        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) *samples - 0x80;
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
        *samples++ = (uint8_t) (t + 0x80);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale A-law encoded samples in place by per-channel volumes.
 *
 * Every byte is expanded with st_alaw2linear16(), multiplied by the
 * channel's 16.16 fixed-point factor, clamped to the signed 16-bit range
 * and re-encoded with st_13linear2alaw() after dropping the 3 low bits.
 *
 * @param samples  interleaved A-law bytes, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   number of samples (bytes) to process
 */
static void
pa_volume_alaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        /* split the factor so the partial products stay within 32 bits */
        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) st_alaw2linear16(*samples);
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
        *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale mu-law encoded samples in place by per-channel volumes.
 *
 * Every byte is expanded with st_ulaw2linear16(), multiplied by the
 * channel's 16.16 fixed-point factor, clamped to the signed 16-bit range
 * and re-encoded with st_14linear2ulaw() after dropping the 2 low bits.
 *
 * @param samples  interleaved mu-law bytes, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   number of samples (bytes) to process
 */
static void
pa_volume_ulaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        /* split the factor so the partial products stay within 32 bits */
        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) st_ulaw2linear16(*samples);
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
        *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
104 #define VOLUME_32x16(s,v) /* .. | vh | vl | */ \
105 " pxor %%mm4, %%mm4 \n\t" /* .. | 0 | 0 | */ \
106 " punpcklwd %%mm4, "#s" \n\t" /* .. | 0 | p0 | */ \
107 " pcmpgtw "#v", %%mm4 \n\t" /* .. | 0 | s(vl) | */ \
108 " pand "#s", %%mm4 \n\t" /* .. | 0 | (p0) | (vl >> 15) & p */ \
109 " movq %%mm6, %%mm5 \n\t" /* .. | ffff | 0 | */ \
110 " pand "#v", %%mm5 \n\t" /* .. | vh | 0 | */ \
111 " por %%mm5, %%mm4 \n\t" /* .. | vh | (p0) | */ \
112 " pmulhw "#s", "#v" \n\t" /* .. | 0 | vl*p0 | */ \
113 " paddw %%mm4, "#v" \n\t" /* .. | vh | vl*p0 | vh + sign correct */ \
114 " pslld $16, "#s" \n\t" /* .. | p0 | 0 | */ \
115 " por %%mm7, "#s" \n\t" /* .. | p0 | 1 | */ \
116 " pmaddwd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \
117 " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */
119 #define MOD_ADD(a,b) \
120 " add "#a", %3 \n\t" \
122 " sub "#b", %4 \n\t" \
123 " cmp "#b", %3 \n\t" \
124 " cmovae %4, %3 \n\t"
128 " movq "#s", %%mm4 \n\t" /* .. | h l | */ \
129 " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \
130 " psllw $8, "#s" \n\t" /* .. | l 0 | */ \
131 " por %%mm4, "#s" \n\t" /* .. | l h | */
133 /* swap 2 registers 16 bits for better pairing */
134 #define SWAP_16_2(s1,s2) \
135 " movq "#s1", %%mm4 \n\t" /* .. | h l | */ \
136 " movq "#s2", %%mm5 \n\t" \
137 " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \
138 " psrlw $8, %%mm5 \n\t" \
139 " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \
140 " psllw $8, "#s2" \n\t" \
141 " por %%mm4, "#s1" \n\t" /* .. | l h | */ \
142 " por %%mm5, "#s2" \n\t"
145 pa_volume_s16ne_mmx (int16_t *samples
, int32_t *volumes
, unsigned channels
, unsigned length
)
147 pa_reg_x86 channel
, temp
;
149 /* the max number of samples we process at a time, this is also the max amount
150 * we overread the volume array, which should have enough padding. */
151 channels
= MAX (4, channels
);
153 __asm__
__volatile__ (
155 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
156 " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
157 " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
158 " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
159 " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
161 " test $1, %2 \n\t" /* check for odd samples */
164 " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
165 " movw (%0), %4 \n\t" /* .. | p0 | */
166 " movd %4, %%mm1 \n\t"
167 VOLUME_32x16 (%%mm1
, %%mm0
)
168 " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
169 " movw %4, (%0) \n\t"
174 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
175 " test $1, %2 \n\t" /* check for odd samples */
178 "3: \n\t" /* do samples in groups of 2 */
179 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
180 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
181 VOLUME_32x16 (%%mm1
, %%mm0
)
182 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
187 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
191 "5: \n\t" /* do samples in groups of 4 */
192 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
193 " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
194 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
195 " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
196 VOLUME_32x16 (%%mm1
, %%mm0
)
197 VOLUME_32x16 (%%mm3
, %%mm2
)
198 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
199 " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
208 : "+r" (samples
), "+r" (volumes
), "+r" (length
), "=D" ((pa_reg_x86
)channel
), "=&r" (temp
)
209 : "r" ((pa_reg_x86
)channels
)
215 pa_volume_s16re_mmx (int16_t *samples
, int32_t *volumes
, unsigned channels
, unsigned length
)
217 pa_reg_x86 channel
, temp
;
219 /* the max number of samples we process at a time, this is also the max amount
220 * we overread the volume array, which should have enough padding. */
221 channels
= MAX (4, channels
);
223 __asm__
__volatile__ (
225 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
226 " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
227 " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
228 " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
229 " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
231 " test $1, %2 \n\t" /* check for odd samples */
234 " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
235 " movw (%0), %4 \n\t" /* .. | p0 | */
237 " movd %4, %%mm1 \n\t"
238 VOLUME_32x16 (%%mm1
, %%mm0
)
239 " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
241 " movw %4, (%0) \n\t"
246 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
247 " test $1, %2 \n\t" /* check for odd samples */
250 "3: \n\t" /* do samples in groups of 2 */
251 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
252 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
254 VOLUME_32x16 (%%mm1
, %%mm0
)
256 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
261 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
265 "5: \n\t" /* do samples in groups of 4 */
266 " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
267 " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
268 " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
269 " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
270 SWAP_16_2 (%%mm1
, %%mm3
)
271 VOLUME_32x16 (%%mm1
, %%mm0
)
272 VOLUME_32x16 (%%mm3
, %%mm2
)
273 SWAP_16_2 (%%mm0
, %%mm2
)
274 " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
275 " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
284 : "+r" (samples
), "+r" (volumes
), "+r" (length
), "=D" ((pa_reg_x86
)channel
), "=&r" (temp
)
285 : "r" ((pa_reg_x86
)channels
)
/**
 * Scale native-endian 32-bit float samples in place.
 *
 * @param samples  interleaved float samples, modified in place
 * @param volumes  one linear factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_float32ne_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (float);

    for (channel = 0; length; length--) {
        *samples++ *= volumes[channel];

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale reverse-endian 32-bit float samples in place.
 *
 * Each sample is converted with PA_FLOAT32_SWAP(), multiplied by the
 * channel's factor and converted back before it is stored.
 *
 * @param samples  interleaved byte-swapped float samples, modified in place
 * @param volumes  one linear factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_float32re_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (float);

    for (channel = 0; length; length--) {
        float t;

        t = PA_FLOAT32_SWAP(*samples);
        t *= volumes[channel];
        *samples++ = PA_FLOAT32_SWAP(t);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale native-endian signed 32-bit samples in place.
 *
 * The multiplication is carried out in 64 bits and the result is clamped
 * to the int32_t range.
 *
 * @param samples  interleaved int32_t samples, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s32ne_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (int32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t)(*samples);
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = (int32_t) t;

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale reverse-endian signed 32-bit samples in place.
 *
 * Each sample is byte-swapped with PA_INT32_SWAP(), scaled in 64 bits,
 * clamped to the int32_t range and swapped back before it is stored.
 *
 * @param samples  interleaved byte-swapped int32_t samples, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s32re_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (int32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) PA_INT32_SWAP(*samples);
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_INT32_SWAP((int32_t) t);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale packed native-endian signed 24-bit samples in place.
 *
 * Every 3-byte sample is read with PA_READ24NE() and shifted into the top
 * of a 32-bit word so its sign is preserved, scaled in 64 bits, clamped to
 * the int32_t range and written back with PA_WRITE24NE() without the low
 * byte.
 *
 * @param samples  interleaved 3-byte samples, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s24ne_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;
    uint8_t *e;

    e = samples + length;

    for (channel = 0; samples < e; samples += 3) {
        int64_t t;

        t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale packed reverse-endian signed 24-bit samples in place.
 *
 * Every 3-byte sample is read with PA_READ24RE() and shifted into the top
 * of a 32-bit word so its sign is preserved, scaled in 64 bits, clamped to
 * the int32_t range and written back with PA_WRITE24RE() without the low
 * byte.
 *
 * @param samples  interleaved 3-byte samples, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s24re_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;
    uint8_t *e;

    e = samples + length;

    for (channel = 0; samples < e; samples += 3) {
        int64_t t;

        t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale native-endian 24-bit samples stored in 32-bit words in place.
 *
 * The value is shifted left by 8 so its sign ends up in the top bit,
 * scaled in 64 bits, clamped to the int32_t range and shifted back down,
 * which leaves the top byte of the stored word zero.
 *
 * @param samples  interleaved 32-bit words, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s24_32ne_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (uint32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) ((int32_t) (*samples << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = ((uint32_t) ((int32_t) t)) >> 8;

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale reverse-endian 24-bit samples stored in 32-bit words in place.
 *
 * Each word is byte-swapped with PA_UINT32_SWAP(), shifted left by 8 so
 * its sign ends up in the top bit, scaled in 64 bits, clamped to the
 * int32_t range, shifted back down and swapped again before it is stored.
 *
 * @param samples  interleaved byte-swapped 32-bit words, modified in place
 * @param volumes  one 16.16 fixed-point factor per channel
 * @param channels number of entries in @volumes that are cycled through
 * @param length   size of @samples in bytes
 */
static void
pa_volume_s24_32re_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (uint32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);

        /* samples are interleaved: start over after the last channel */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
456 static void run_test (void) {
457 int16_t samples
[SAMPLES
];
458 int16_t samples_ref
[SAMPLES
];
459 int16_t samples_orig
[SAMPLES
];
460 int32_t volumes
[CHANNELS
+ PADDING
];
462 pa_do_volume_func_t func
;
464 func
= pa_get_volume_func (PA_SAMPLE_S16RE
);
466 printf ("checking MMX %d\n", sizeof (samples
));
468 for (j
= 0; j
< TIMES
; j
++) {
469 pa_random (samples
, sizeof (samples
));
470 memcpy (samples_ref
, samples
, sizeof (samples
));
471 memcpy (samples_orig
, samples
, sizeof (samples
));
473 for (i
= 0; i
< CHANNELS
; i
++)
474 volumes
[i
] = rand() >> 1;
475 for (padding
= 0; padding
< PADDING
; padding
++, i
++)
476 volumes
[i
] = volumes
[padding
];
478 pa_volume_s16re_mmx (samples
, volumes
, CHANNELS
, sizeof (samples
));
479 func (samples_ref
, volumes
, CHANNELS
, sizeof (samples
));
481 for (i
= 0; i
< SAMPLES
; i
++) {
482 if (samples
[i
] != samples_ref
[i
]) {
483 printf ("%d: %04x != %04x (%04x * %04x)\n", i
, samples
[i
], samples_ref
[i
],
484 samples_orig
[i
], volumes
[i
% CHANNELS
]);
491 void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags
) {
492 pa_log_info("Initialising MMX optimized functions.");
498 pa_set_volume_func (PA_SAMPLE_S16NE
, (pa_do_volume_func_t
) pa_volume_s16ne_mmx
);
499 pa_set_volume_func (PA_SAMPLE_S16RE
, (pa_do_volume_func_t
) pa_volume_s16re_mmx
);