]>
code.delx.au - pulseaudio/blob - src/pulsecore/svolume_sse.c
/*
  This file is part of PulseAudio.

  Copyright 2004-2006 Lennart Poettering
  Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License,
  or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with PulseAudio; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
*/
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
36 #include "sample-util.h"
37 #include "endianmacros.h"
/**
 * Scale a buffer of unsigned 8-bit samples in place by per-channel volumes.
 *
 * Each volume is split into its upper 16 bits (hi) and lower 16 bits (lo),
 * so it acts as a fixed-point factor with 16 fractional bits. A sample is
 * re-centred around zero by subtracting 0x80, scaled as
 * ((t * lo) >> 16) + (t * hi), clamped to [-0x80, 0x7F] and written back
 * with the 0x80 bias restored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   number of samples to process (used directly as loop count)
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_u8_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) *samples - 0x80;
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
        *samples++ = (uint8_t) (t + 0x80);

        /* volumes are interleaved per channel: wrap after the last one */
        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of A-law encoded samples in place by per-channel volumes.
 *
 * Every byte is decoded with st_alaw2linear16(), scaled with the volume
 * split into its upper (hi) and lower (lo) 16 bits as
 * ((t * lo) >> 16) + (t * hi), clamped to [-0x8000, 0x7FFF] and re-encoded
 * with st_13linear2alaw() after shifting the 16-bit value right by 3.
 *
 * @param samples  A-law sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   number of samples to process (used directly as loop count)
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_alaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) st_alaw2linear16(*samples);
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
        /* the cast binds tighter than the shift: (int16_t) t is shifted */
        *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of mu-law encoded samples in place by per-channel volumes.
 *
 * Every byte is decoded with st_ulaw2linear16(), scaled with the volume
 * split into its upper (hi) and lower (lo) 16 bits as
 * ((t * lo) >> 16) + (t * hi), clamped to [-0x8000, 0x7FFF] and re-encoded
 * with st_14linear2ulaw() after shifting the 16-bit value right by 2.
 *
 * @param samples  mu-law sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   number of samples to process (used directly as loop count)
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_ulaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    for (channel = 0; length; length--) {
        int32_t t, hi, lo;

        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) st_ulaw2linear16(*samples);
        t = ((t * lo) >> 16) + (t * hi);
        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
        /* the cast binds tighter than the shift: (int16_t) t is shifted */
        *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/*
 * VOLUME_32x16(s, v): asm fragment that scales the 16-bit samples held in
 * the low half of register s by the 32-bit volumes held in register v.
 * Steps, in the order emitted below:
 *   - xmm4 is zeroed and interleaved into s, so every 32-bit lane carries
 *     one sample in its low word and zero in its high word;
 *   - pcmpgtw builds a mask of the negative samples, which is ANDed with v
 *     so only the low volume word of negative samples survives in xmm4;
 *   - the widened samples are copied to xmm5;
 *   - pmulhuw yields the high 16 bits of the unsigned product sample * vl,
 *     and psubd subtracts the masked value to correct for negative samples;
 *   - v is shifted right by 16 to leave vh, multiplied with the samples by
 *     pmaddwd, and the two partial products are added;
 *   - packssdw packs the 32-bit results to 16 bits with signed saturation.
 * The result is left in v; s, xmm4 and xmm5 are overwritten.
 */
104 #define VOLUME_32x16(s,v) /* .. | vh | vl | */ \
105 " pxor %%xmm4, %%xmm4 \n\t" /* .. | 0 | 0 | */ \
106 " punpcklwd %%xmm4, "#s" \n\t" /* .. | 0 | p0 | */ \
107 " pcmpgtw "#s", %%xmm4 \n\t" /* .. | 0 | s(p0) | */ \
108 " pand "#v", %%xmm4 \n\t" /* .. | 0 | (vl) | */ \
109 " movdqa "#s", %%xmm5 \n\t" \
110 " pmulhuw "#v", "#s" \n\t" /* .. | 0 | vl*p0 | */ \
111 " psubd %%xmm4, "#s" \n\t" /* .. | 0 | vl*p0 | + sign correct */ \
112 " psrld $16, "#v" \n\t" /* .. | p0 | 0 | */ \
113 " pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \
114 " paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \
115 " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */
/*
 * MOD_ADD(a, b): asm fragment that advances the channel index in operand %3
 * by a and wraps it once it reaches b, using operand %4 as scratch.
 * NOTE(review): no instruction copying %3 into %4 ahead of the "sub" is
 * visible in this listing; one is presumably needed for the
 * "tmp = channel - channels" comment to hold -- confirm.
 */
117 #define MOD_ADD(a,b) \
118 " add "#a", %3 \n\t" /* channel += inc */ \
120 " sub "#b", %4 \n\t" /* tmp = channel - channels */ \
121 " cmp "#b", %3 \n\t" /* if (channel >= channels) */ \
122 " cmovae %4, %3 \n\t" /* channel = tmp */
/*
 * NOTE(review): the four instructions below look like the body of a
 * single-register byte-swap macro (copy s to xmm4, shift every 16-bit word
 * right and left by 8, OR the halves back together), but no #define line
 * for it is visible in this listing -- confirm.
 */
126 " movdqa "#s", %%xmm4 \n\t" /* .. | h l | */ \
127 " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \
128 " psllw $8, "#s" \n\t" /* .. | l 0 | */ \
129 " por %%xmm4, "#s" \n\t" /* .. | l h | */
/*
 * SWAP_16_2(s1, s2): swaps the two bytes of every 16-bit word in both
 * registers, interleaving the instructions for the two registers and using
 * xmm4 and xmm5 as scratch.
 */
131 /* swap 2 registers 16 bits for better pairing */
132 #define SWAP_16_2(s1,s2) \
133 " movdqa "#s1", %%xmm4 \n\t" /* .. | h l | */ \
134 " movdqa "#s2", %%xmm5 \n\t" \
135 " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \
136 " psrlw $8, %%xmm5 \n\t" \
137 " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \
138 " psllw $8, "#s2" \n\t" \
139 " por %%xmm4, "#s1" \n\t" /* .. | l h | */ \
140 " por %%xmm5, "#s2" \n\t"
/*
 * pa_volume_s16ne_sse: scales a buffer of 16-bit samples in place by
 * per-channel 32-bit volumes using SSE inline assembly ("ne" presumably
 * stands for native endian -- confirm).
 *
 * channels is first raised to at least 8 with MAX (8, channels); according
 * to the comment below, 8 is the largest number of samples handled per step
 * and also how far the volume array may be over-read, so the caller has to
 * supply a padded volume array.
 *
 * Asm operands: %0 samples, %1 volumes, %2 length, %3 channel (pinned to
 * the "D" register), %4 temp, %5 channels.
 * Visible stages: length is halved to turn bytes into a sample count, a
 * single odd sample is handled first, then groups of 2 (label 3), 4
 * (label 5) and 8 (label 7). Every stage loads volumes from index %3, loads
 * samples from (%0), runs VOLUME_32x16 and stores the result back to (%0).
 *
 * NOTE(review): this listing does not show the branch instructions, the
 * sample-pointer advances, the channel updates, the clobber list or the
 * function's return type and braces -- verify against the complete file.
 * NOTE(review): channel is an output-only operand ("=D") but is used as an
 * index by the first visible load, so it must be initialised inside the asm
 * in a line not shown here. If that happens before input operand %5 is last
 * read, the constraint presumably needs the early-clobber modifier ("=&D")
 * so the compiler cannot place %5 in the same register -- confirm.
 */
143 pa_volume_s16ne_sse (int16_t *samples
, int32_t *volumes
, unsigned channels
, unsigned length
)
145 pa_reg_x86 channel
, temp
;
147 /* the max number of samples we process at a time, this is also the max amount
148 * we overread the volume array, which should have enough padding. */
149 channels
= MAX (8, channels
);
151 __asm__
__volatile__ (
153 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
155 " test $1, %2 \n\t" /* check for odd samples */
158 " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
159 " movw (%0), %4 \n\t" /* .. | p0 | */
160 " movd %4, %%xmm1 \n\t"
161 VOLUME_32x16 (%%xmm1
, %%xmm0
)
162 " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
163 " movw %4, (%0) \n\t"
168 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
172 "3: \n\t" /* do samples in groups of 2 */
173 " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
174 " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
175 VOLUME_32x16 (%%xmm1
, %%xmm0
)
176 " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
181 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
185 "5: \n\t" /* do samples in groups of 4 */
186 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
187 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
188 VOLUME_32x16 (%%xmm1
, %%xmm0
)
189 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
194 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
198 "7: \n\t" /* do samples in groups of 8 */
199 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
200 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
201 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
202 " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
203 VOLUME_32x16 (%%xmm1
, %%xmm0
)
204 VOLUME_32x16 (%%xmm3
, %%xmm2
)
205 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
206 " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
213 : "+r" (samples
), "+r" (volumes
), "+r" (length
), "=D" (channel
), "=&r" (temp
)
214 : "r" ((pa_reg_x86
)channels
)
/*
 * pa_volume_s16re_sse: scales a buffer of byte-swapped 16-bit samples in
 * place by per-channel 32-bit volumes using SSE inline assembly ("re"
 * presumably stands for reverse endian -- confirm).
 *
 * channels is first raised to at least 8 with MAX (8, channels); according
 * to the comment below, 8 is the largest number of samples handled per step
 * and also how far the volume array may be over-read, so the caller has to
 * supply a padded volume array.
 *
 * Asm operands: %0 samples, %1 volumes, %2 length, %3 channel (pinned to
 * the "D" register), %4 temp, %5 channels.
 * Visible stages: length is halved to turn bytes into a sample count, then
 * the count is tested bit by bit to handle a single odd sample, a group of
 * 2 (label 3) and a group of 4 (label 5) before the 8-sample stage at
 * label 7. Every stage loads volumes from index %3, loads samples from
 * (%0), runs VOLUME_32x16 and stores the result back to (%0). The 8-sample
 * stage byte-swaps the samples with SWAP_16_2 before scaling and swaps the
 * results back before storing.
 *
 * NOTE(review): in this listing only the 8-sample stage shows a byte swap;
 * the smaller stages presumably swap as well in lines that are not shown --
 * confirm.
 * NOTE(review): this listing does not show the branch instructions, the
 * sample-pointer advances, the channel updates, the clobber list or the
 * function's return type and braces -- verify against the complete file.
 * NOTE(review): channel is an output-only operand ("=D") but is used as an
 * index by the first visible load, so it must be initialised inside the asm
 * in a line not shown here. If that happens before input operand %5 is last
 * read, the constraint presumably needs the early-clobber modifier ("=&D")
 * so the compiler cannot place %5 in the same register -- confirm.
 */
220 pa_volume_s16re_sse (int16_t *samples
, int32_t *volumes
, unsigned channels
, unsigned length
)
222 pa_reg_x86 channel
, temp
;
224 /* the max number of samples we process at a time, this is also the max amount
225 * we overread the volume array, which should have enough padding. */
226 channels
= MAX (8, channels
);
228 __asm__
__volatile__ (
230 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
232 " test $1, %2 \n\t" /* check for odd samples */
235 " movd (%1, %3, 4), %%xmm0 \n\t" /* do odd sample */
236 " movw (%0), %4 \n\t"
238 " movd %4, %%xmm1 \n\t"
239 VOLUME_32x16 (%%xmm1
, %%xmm0
)
240 " movd %%xmm0, %4 \n\t"
242 " movw %4, (%0) \n\t"
247 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
248 " test $1, %2 \n\t" /* check for odd samples */
251 "3: \n\t" /* do samples in pairs of 2 */
252 " movq (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
253 " movd (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
255 VOLUME_32x16 (%%xmm1
, %%xmm0
)
257 " movd %%xmm0, (%0) \n\t"
262 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
263 " test $1, %2 \n\t" /* check for odd samples */
266 "5: \n\t" /* do samples in pairs of 4 */
267 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
268 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
270 VOLUME_32x16 (%%xmm1
, %%xmm0
)
272 " movq %%xmm0, (%0) \n\t"
277 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
281 "7: \n\t" /* do samples in pairs of 8 */
282 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
283 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* v3_h | v3_l | v2_h | v2_l */
284 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
285 " movq 8(%0), %%xmm3 \n\t" /* X | X | p3 | p2 */
286 SWAP_16_2 (%%xmm1
, %%xmm3
)
287 VOLUME_32x16 (%%xmm1
, %%xmm0
)
288 VOLUME_32x16 (%%xmm3
, %%xmm2
)
289 SWAP_16_2 (%%xmm0
, %%xmm2
)
290 " movq %%xmm0, (%0) \n\t"
291 " movq %%xmm2, 8(%0) \n\t"
298 : "+r" (samples
), "+r" (volumes
), "+r" (length
), "=D" (channel
), "=&r" (temp
)
299 : "r" ((pa_reg_x86
)channels
)
/**
 * Multiply a buffer of float samples in place by per-channel float volumes.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (float) to obtain
 *                 the number of samples
 *
 * NOTE(review): the return type, the declaration of `channel` and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_float32ne_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (float);

    for (channel = 0; length; length--) {
        *samples++ *= volumes[channel];

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Multiply a buffer of byte-swapped float samples in place by per-channel
 * float volumes.
 *
 * Each sample is converted with PA_FLOAT32_SWAP(), multiplied by the volume
 * of the current channel and converted back with PA_FLOAT32_SWAP() before
 * it is stored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (float) to obtain
 *                 the number of samples
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_float32re_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (float);

    for (channel = 0; length; length--) {
        float t;

        t = PA_FLOAT32_SWAP(*samples);
        t *= volumes[channel];
        *samples++ = PA_FLOAT32_SWAP(t);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of signed 32-bit samples in place by per-channel volumes.
 *
 * Each sample is widened to 64 bits, multiplied by the volume of the
 * current channel, shifted right by 16 (the volume carries 16 fractional
 * bits) and clamped to [-0x80000000, 0x7FFFFFFF] before it is stored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (int32_t) to
 *                 obtain the number of samples
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s32ne_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (int32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t)(*samples);
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = (int32_t) t;

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of byte-swapped signed 32-bit samples in place by
 * per-channel volumes.
 *
 * Each sample is converted with PA_INT32_SWAP(), widened to 64 bits,
 * multiplied by the volume of the current channel, shifted right by 16,
 * clamped to [-0x80000000, 0x7FFFFFFF] and converted back with
 * PA_INT32_SWAP() before it is stored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (int32_t) to
 *                 obtain the number of samples
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s32re_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (int32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) PA_INT32_SWAP(*samples);
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_INT32_SWAP((int32_t) t);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of packed 24-bit samples (3 bytes each) in place by
 * per-channel volumes.
 *
 * Each sample is read with PA_READ24NE(), shifted left by 8 so its sign bit
 * lands in bit 31, widened to 64 bits, multiplied by the volume of the
 * current channel, shifted right by 16 and clamped to
 * [-0x80000000, 0x7FFFFFFF]. The top 24 bits of the result are written back
 * with PA_WRITE24NE().
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; the loop advances 3 bytes at a time
 *                 until the pointer reaches samples + length
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s24ne_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;
    uint8_t *e;

    e = samples + length;

    for (channel = 0; samples < e; samples += 3) {
        int64_t t;

        t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of packed 24-bit samples (3 bytes each) in place by
 * per-channel volumes, accessing the samples through the PA_READ24RE() /
 * PA_WRITE24RE() macros.
 *
 * Each sample is read, shifted left by 8 so its sign bit lands in bit 31,
 * widened to 64 bits, multiplied by the volume of the current channel,
 * shifted right by 16 and clamped to [-0x80000000, 0x7FFFFFFF]. The top
 * 24 bits of the result are written back.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; the loop advances 3 bytes at a time
 *                 until the pointer reaches samples + length
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s24re_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;
    uint8_t *e;

    e = samples + length;

    for (channel = 0; samples < e; samples += 3) {
        int64_t t;

        t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of 24-bit samples stored in 32-bit words in place by
 * per-channel volumes.
 *
 * Each word is shifted left by 8 so the sample's sign bit lands in bit 31,
 * widened to 64 bits, multiplied by the volume of the current channel,
 * shifted right by 16 and clamped to [-0x80000000, 0x7FFFFFFF]. The result
 * is shifted right by 8 again (as an unsigned value) before it is stored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (uint32_t) to
 *                 obtain the number of samples
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s24_32ne_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (uint32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) ((int32_t) (*samples << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = ((uint32_t) ((int32_t) t)) >> 8;

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
/**
 * Scale a buffer of byte-swapped 24-bit samples stored in 32-bit words in
 * place by per-channel volumes.
 *
 * Each word is converted with PA_UINT32_SWAP(), shifted left by 8 so the
 * sample's sign bit lands in bit 31, widened to 64 bits, multiplied by the
 * volume of the current channel, shifted right by 16 and clamped to
 * [-0x80000000, 0x7FFFFFFF]. The result is shifted right by 8 again (as an
 * unsigned value) and converted back with PA_UINT32_SWAP() before it is
 * stored.
 *
 * @param samples  sample buffer, modified in place
 * @param volumes  per-channel volumes, indexed by the running channel counter
 * @param channels number of volume entries to cycle through
 * @param length   buffer size in bytes; divided by sizeof (uint32_t) to
 *                 obtain the number of samples
 *
 * NOTE(review): the return type, the local declarations and the
 * `channel = 0` wrap-around were absent from the reviewed listing and were
 * restored from context -- confirm against the upstream file.
 */
static void
pa_volume_s24_32re_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned channel;

    length /= sizeof (uint32_t);

    for (channel = 0; length; length--) {
        int64_t t;

        t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
        t = (t * volumes[channel]) >> 16;
        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
        *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);

        if (PA_UNLIKELY(++channel >= channels))
            channel = 0;
    }
}
470 static void run_test (void) {
471 int16_t samples
[SAMPLES
];
472 int16_t samples_ref
[SAMPLES
];
473 int16_t samples_orig
[SAMPLES
];
474 int32_t volumes
[CHANNELS
+ PADDING
];
476 pa_do_volume_func_t func
;
478 func
= pa_get_volume_func (PA_SAMPLE_S16RE
);
480 printf ("checking SSE %d\n", sizeof (samples
));
482 for (j
= 0; j
< TIMES
; j
++) {
483 pa_random (samples
, sizeof (samples
));
484 memcpy (samples_ref
, samples
, sizeof (samples
));
485 memcpy (samples_orig
, samples
, sizeof (samples
));
487 for (i
= 0; i
< CHANNELS
; i
++)
488 volumes
[i
] = rand() >> 1;
489 for (padding
= 0; padding
< PADDING
; padding
++, i
++)
490 volumes
[i
] = volumes
[padding
];
492 pa_volume_s16re_sse (samples
, volumes
, CHANNELS
, SAMPLES
* sizeof (int16_t));
493 func (samples_ref
, volumes
, CHANNELS
, SAMPLES
* sizeof (int16_t));
495 for (i
= 0; i
< SAMPLES
; i
++) {
496 if (samples
[i
] != samples_ref
[i
]) {
497 printf ("%d: %04x != %04x (%04x * %04x)\n", i
, samples
[i
], samples_ref
[i
],
498 samples_orig
[i
], volumes
[i
% CHANNELS
]);
505 void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags
) {
506 pa_log_info("Initialising SSE optimized functions.");
512 pa_set_volume_func (PA_SAMPLE_S16NE
, (pa_do_volume_func_t
) pa_volume_s16ne_sse
);
513 pa_set_volume_func (PA_SAMPLE_S16RE
, (pa_do_volume_func_t
) pa_volume_s16re_sse
);