]> code.delx.au - pulseaudio/blob - src/pulsecore/svolume_mmx.c
volume: add first mmx optimized function
[pulseaudio] / src / pulsecore / svolume_mmx.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <alloca.h>
28
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
33
34 #include "sample-util.h"
35 #include "endianmacros.h"
36
37 #if 0
/*
 * Apply per-channel volumes to unsigned 8-bit samples in place.
 *
 * Each sample is re-centred around zero (0x80 is subtracted), multiplied by
 * the volume of its channel (handled as separate high and low 16-bit
 * halves), clamped to [-0x80, 0x7F] and shifted back by 0x80.
 * `length` is the number of samples; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_u8_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v = (int32_t) *samples - 0x80;

        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x80, 0x7F);
        *samples = (uint8_t) (v + 0x80);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
58
/*
 * Apply per-channel volumes to A-law samples in place.
 *
 * Each byte is expanded with st_alaw2linear16(), multiplied by the volume of
 * its channel (handled as separate high and low 16-bit halves), clamped to
 * [-0x8000, 0x7FFF] and re-encoded with st_13linear2alaw() after shifting
 * the 16-bit value right by 3.
 * `length` is the number of samples; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_alaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v = (int32_t) st_alaw2linear16(*samples);

        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_13linear2alaw((int16_t) v >> 3);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
79
/*
 * Apply per-channel volumes to mu-law samples in place.
 *
 * Each byte is expanded with st_ulaw2linear16(), multiplied by the volume of
 * its channel (handled as separate high and low 16-bit halves), clamped to
 * [-0x8000, 0x7FFF] and re-encoded with st_14linear2ulaw() after shifting
 * the 16-bit value right by 2.
 * `length` is the number of samples; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_ulaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v = (int32_t) st_ulaw2linear16(*samples);

        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_14linear2ulaw((int16_t) v >> 2);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
100 #endif
101
/*
 * Scale native-endian signed 16-bit samples in place by per-channel volumes,
 * saturating every result to the int16_t range.
 *
 * samples:  interleaved samples, overwritten with the scaled values
 * volumes:  one int32_t factor per channel; each factor is split into a low
 *           16-bit half (used unsigned) and a high 16-bit half (used signed)
 *           and the result is ((s * lo) >> 16) + (s * hi)
 * channels: number of interleaved channels
 * length:   size of the sample buffer in bytes
 *
 * On x86-64 the work is done with MMX: one leading sample if the sample
 * count is odd, then one pair if the remaining count is not a multiple of
 * four, then groups of four.  Other targets use a plain C loop computing
 * the same formula.
 *
 * NOTE(review): the MMX path loads two or four consecutive volume entries
 * starting at the current channel index, which can be anything below the
 * wrap point (4 for one or two channels, 6 for three, `channels` otherwise).
 * volumes[] therefore has to be readable, and repeat with period `channels`,
 * for wrap + 3 entries -- confirm that every caller pads it accordingly.
 */
static void
pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
#if defined (__x86_64__)
    int64_t channel, temp;
    int64_t wrap;

    /* The channel index advances by up to 4 per step, so it must wrap at a
     * point that is at least 4 and a multiple of the real channel count;
     * otherwise the volume pattern would shift after the first wrap. */
    if (channels == 3)
        wrap = 6;
    else if (channels < 4)
        wrap = 4;
    else
        wrap = channels;

/* Multiply the two samples in the low half of s by the two 32-bit volumes in
 * v; the saturated 16-bit results end up in the low half of v.
 * Uses mm4 and mm5 as scratch registers. */
#define VOLUME_32x16(s,v)                  /* v1_h    | v1_l    | v0_h    | v0_l      */ \
      " pxor %%mm4, %%mm4            \n\t"                                                \
      " punpcklwd %%mm4, "#s"        \n\t" /* 0       |  p1     | 0       | p0        */ \
      " pcmpgtw "#s", %%mm4          \n\t" /* select sign from sample                 */ \
      " pand "#v", %%mm4             \n\t" /* extract correction factors              */ \
      " movq "#s", %%mm5             \n\t"                                                \
      " pmulhuw "#v", "#s"           \n\t" /* 0       | p1*v1lh | 0       | p0*v0lh   */ \
      " psubd %%mm4, "#s"            \n\t" /* sign correction                         */ \
      " psrld $16, "#v"              \n\t" /* 0       | v1h     | 0       | v0h       */ \
      " pmaddwd %%mm5, "#v"          \n\t" /* p1 * v1h          | p0 * v0h            */ \
      " paddd "#s", "#v"             \n\t" /* p1 * v1           | p0 * v0             */ \
      " packssdw "#v", "#v"          \n\t" /* p0*v0   | p1*v1   | p0*v0   | p1*v1     */

/* channel = (channel + a) mod b.  The subtraction itself provides the
 * carry flag: it is clear exactly when channel >= b, which is when the
 * wrapped value has to be taken. */
#define MOD_ADD(a,b)                                                          \
      " add "#a", %3                 \n\t" /* channel += a                 */ \
      " mov %3, %4                   \n\t"                                    \
      " sub "#b", %4                 \n\t" /* temp = channel - b           */ \
      " cmovae %4, %3                \n\t" /* if (channel >= b) channel = temp */

    __asm__ __volatile__ (
        " xor %3, %3                    \n\t" /* channel = 0 */
        " shr $1, %2                    \n\t" /* length /= sizeof (int16_t) */

        " test $1, %2                   \n\t" /* check for odd samples */
        " je 2f                         \n\t"

        " movd (%1, %3, 4), %%mm0       \n\t" /* do odd sample */
        " movw (%0), %%ax               \n\t"
        " movd %%eax, %%mm1             \n\t"
        VOLUME_32x16 (%%mm1, %%mm0)
        " movd %%mm0, %%eax             \n\t"
        " movw %%ax, (%0)               \n\t"
        " add $2, %0                    \n\t"
        MOD_ADD ($1, %5)

        "2:                             \n\t"
        " shr $1, %2                    \n\t" /* prepare for processing 2 samples at a time */
        " test $1, %2                   \n\t" /* check for an odd number of pairs */
        " je 4f                         \n\t"

        "3:                             \n\t" /* do one pair of samples */
        " movq (%1, %3, 4), %%mm0       \n\t" /* v1_h  | v1_l  | v0_h  | v0_l */
        " movd (%0), %%mm1              \n\t" /*  X    |  X    |  p1   |  p0  */
        VOLUME_32x16 (%%mm1, %%mm0)
        " movd %%mm0, (%0)              \n\t"
        " add $4, %0                    \n\t"
        MOD_ADD ($2, %5)

        "4:                             \n\t"
        " shr $1, %2                    \n\t" /* prepare for processing 4 samples at a time */
        " cmp $0, %2                    \n\t"
        " je 6f                         \n\t"

        "5:                             \n\t" /* do samples in groups of 4 */
        " movq (%1, %3, 4), %%mm0       \n\t" /* v1_h  | v1_l  | v0_h  | v0_l */
        " movq 8(%1, %3, 4), %%mm2      \n\t" /* v3_h  | v3_l  | v2_h  | v2_l */
        " movd (%0), %%mm1              \n\t" /*  X    |  X    |  p1   |  p0  */
        " movd 4(%0), %%mm3             \n\t" /*  X    |  X    |  p3   |  p2  */
        VOLUME_32x16 (%%mm1, %%mm0)
        VOLUME_32x16 (%%mm3, %%mm2)
        " movd %%mm0, (%0)              \n\t"
        " movd %%mm2, 4(%0)             \n\t"
        " add $8, %0                    \n\t"
        MOD_ADD ($4, %5)
        " dec %2                        \n\t"
        " jne 5b                        \n\t"

        "6:                             \n\t"
        " emms                          \n\t"

        /* channel and temp are written while the wrap input is still needed,
         * hence the early-clobber markers; "memory" because the sample
         * buffer is rewritten behind the compiler's back. */
        : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp)
        : "r" (wrap)
        : "rax", "cc", "memory"
    );
#else
    unsigned channel = 0;

    for (length /= sizeof (int16_t); length; length--) {
        int32_t t, hi, lo;

        hi = volumes[channel] >> 16;
        lo = volumes[channel] & 0xFFFF;

        t = (int32_t) *samples;
        t = ((t * lo) >> 16) + (t * hi);
        if (t > 0x7FFF)
            t = 0x7FFF;
        else if (t < -0x8000)
            t = -0x8000;
        *samples++ = (int16_t) t;

        if (++channel >= channels)
            channel = 0;
    }
#endif
}
188
189 #if 0
/*
 * Apply per-channel volumes to reverse-endian signed 16-bit samples in place.
 *
 * Each sample is byte-swapped with PA_INT16_SWAP, multiplied by the volume
 * of its channel (handled as separate high and low 16-bit halves), clamped
 * to [-0x8000, 0x7FFF] and swapped back before it is stored.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    int16_t *end;
    unsigned ch = 0;

    length /= sizeof (int16_t);
    end = samples + length;

    while (samples < end) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t v = (int32_t) PA_INT16_SWAP(*samples);

        v = ((v * vol_lo) >> 16) + (v * vol_hi);
        v = PA_CLAMP_UNLIKELY(v, -0x8000, 0x7FFF);
        *samples = PA_INT16_SWAP((int16_t) v);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
212
/*
 * Multiply native-endian float samples in place by per-channel float
 * volumes.  `length` is the buffer size in bytes; the channel index cycles
 * through `channels` entries of `volumes`.
 */
static void
pa_volume_float32ne_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    float *end;
    unsigned ch = 0;

    length /= sizeof (float);
    end = samples + length;

    while (samples < end) {
        *samples *= volumes[ch];
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
227
/*
 * Multiply reverse-endian float samples in place by per-channel float
 * volumes.  Each sample is converted with PA_FLOAT32_SWAP, scaled, and
 * converted back with the same macro before it is stored.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_float32re_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
{
    float *end;
    unsigned ch = 0;

    length /= sizeof (float);
    end = samples + length;

    while (samples < end) {
        float v = PA_FLOAT32_SWAP(*samples);

        v *= volumes[ch];
        *samples = PA_FLOAT32_SWAP(v);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
246
/*
 * Apply per-channel volumes to native-endian signed 32-bit samples in place.
 *
 * Each sample is widened to 64 bits, multiplied by the volume of its
 * channel, shifted right by 16 and clamped to the int32_t range.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s32ne_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    int32_t *end;
    unsigned ch = 0;

    length /= sizeof (int32_t);
    end = samples + length;

    while (samples < end) {
        int64_t v = (int64_t)(*samples);

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = (int32_t) v;
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
266
/*
 * Apply per-channel volumes to reverse-endian signed 32-bit samples in place.
 *
 * Each sample is byte-swapped with PA_INT32_SWAP, widened to 64 bits,
 * multiplied by the volume of its channel, shifted right by 16, clamped to
 * the int32_t range and swapped back before it is stored.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s32re_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    int32_t *end;
    unsigned ch = 0;

    length /= sizeof (int32_t);
    end = samples + length;

    while (samples < end) {
        int64_t v = (int64_t) PA_INT32_SWAP(*samples);

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_INT32_SWAP((int32_t) v);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
286
/*
 * Apply per-channel volumes to packed native-endian 24-bit samples in place.
 *
 * Each 3-byte sample is read with PA_READ24NE and shifted left by 8 so its
 * sign lands in the top bit of an int32_t.  It is then multiplied by the
 * volume of its channel, shifted right by 16, clamped to the int32_t range,
 * and the upper 24 bits are written back with PA_WRITE24NE.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s24ne_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t v = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) v) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
307
/*
 * Apply per-channel volumes to packed reverse-endian 24-bit samples in place.
 *
 * Each 3-byte sample is read with PA_READ24RE and shifted left by 8 so its
 * sign lands in the top bit of an int32_t.  It is then multiplied by the
 * volume of its channel, shifted right by 16, clamped to the int32_t range,
 * and the upper 24 bits are written back with PA_WRITE24RE.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s24re_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t v = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) v) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
328
/*
 * Apply per-channel volumes to native-endian 24-bit samples stored in
 * 32-bit words, in place.
 *
 * Each word is shifted left by 8 so the sample's sign lands in the top bit
 * of an int32_t.  It is then multiplied by the volume of its channel,
 * shifted right by 16, clamped to the int32_t range and shifted back right
 * by 8 before it is stored.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s24_32ne_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint32_t *end;
    unsigned ch = 0;

    length /= sizeof (uint32_t);
    end = samples + length;

    while (samples < end) {
        int64_t v = (int64_t) ((int32_t) (*samples << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = ((uint32_t) ((int32_t) v)) >> 8;
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
348
/*
 * Apply per-channel volumes to reverse-endian 24-bit samples stored in
 * 32-bit words, in place.
 *
 * Each word is byte-swapped with PA_UINT32_SWAP and shifted left by 8 so the
 * sample's sign lands in the top bit of an int32_t.  It is then multiplied
 * by the volume of its channel, shifted right by 16, clamped to the int32_t
 * range, shifted back right by 8 and swapped again before it is stored.
 * `length` is the buffer size in bytes; the channel index cycles through
 * `channels` entries of `volumes`.
 */
static void
pa_volume_s24_32re_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint32_t *end;
    unsigned ch = 0;

    length /= sizeof (uint32_t);
    end = samples + length;

    while (samples < end) {
        int64_t v = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));

        v = (v * volumes[ch]) >> 16;
        v = PA_CLAMP_UNLIKELY(v, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_UINT32_SWAP(((uint32_t) ((int32_t) v)) >> 8);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
368 #endif
369
370 #undef RUN_TEST
371
372 #ifdef RUN_TEST
373 #define CHANNELS 1
374 #define SAMPLES 1021
375 #define TIMES 1000
376
377 static void run_test (void) {
378 int16_t samples[SAMPLES];
379 int16_t samples_ref[SAMPLES];
380 int16_t samples_orig[SAMPLES];
381 int32_t volumes[CHANNELS];
382 int i, j;
383 pa_do_volume_func_t func;
384
385 func = pa_get_volume_func (PA_SAMPLE_S16NE);
386
387 printf ("checking\n");
388
389 for (j = 0; j < TIMES; j++) {
390 pa_random (samples, sizeof (samples));
391 memcpy (samples_ref, samples, sizeof (samples));
392 memcpy (samples_orig, samples, sizeof (samples));
393
394 for (i = 0; i < CHANNELS; i++) {
395 volumes[i] = rand() >> 15;
396 }
397
398 pa_volume_s16ne_mmx (samples, volumes, CHANNELS, SAMPLES * sizeof (int16_t));
399 func (samples_ref, volumes, CHANNELS, SAMPLES * sizeof (int16_t));
400
401 for (i = 0; i < SAMPLES; i++) {
402 if (samples[i] != samples_ref[i]) {
403 printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
404 samples_orig[i], volumes[i % CHANNELS]);
405 }
406 #if 0
407 else
408 printf ("%d: %04x == %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
409 samples_orig[i], volumes[i % CHANNELS]);
410 #endif
411 }
412 }
413 }
414 #endif
415
416 void pa_volume_func_init_mmx (void) {
417 pa_log_info("Initialising MMX optimized functions.");
418
419 #ifdef RUN_TEST
420 run_test ();
421 #endif
422
423 pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
424 }