]> code.delx.au - pulseaudio/blob - src/pulsecore/remap_neon.c
remap: Add ARM NEON optimized remapping and rearrange code
[pulseaudio] / src / pulsecore / remap_neon.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>
5
6 PulseAudio is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 2.1 of the License,
9 or (at your option) any later version.
10
11 PulseAudio is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15 ***/
16
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <pulse/sample.h>
22 #include <pulse/xmalloc.h>
23 #include <pulsecore/log.h>
24 #include <pulsecore/macro.h>
25
26 #include "cpu-arm.h"
27 #include "remap.h"
28
29 #include <arm_neon.h>
30
31 static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
32 for (; n >= 4; n -= 4) {
33 __asm__ __volatile__ (
34 "vld1.32 {q0}, [%[src]]! \n\t"
35 "vmov q1, q0 \n\t"
36 "vst2.32 {q0,q1}, [%[dst]]! \n\t"
37 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
38 : /* input operands */
39 : "memory", "q0", "q1" /* clobber list */
40 );
41 }
42
43 for (; n > 0; n--) {
44 dst[0] = dst[1] = src[0];
45 src++;
46 dst += 2;
47 }
48 }
49
50 static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
51 for (; n >= 2; n -= 2) {
52 __asm__ __volatile__ (
53 "ldm %[src]!, {r4,r6} \n\t"
54 "mov r5, r4 \n\t"
55 "mov r7, r6 \n\t"
56 "stm %[dst]!, {r4-r7} \n\t"
57 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
58 : /* input operands */
59 : "memory", "r4", "r5", "r6", "r7" /* clobber list */
60 );
61 }
62
63 if (n > 0)
64 dst[0] = dst[1] = src[0];
65 }
66
67 static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
68 for (; n >= 8; n -= 8) {
69 __asm__ __volatile__ (
70 "vld1.16 {q0}, [%[src]]! \n\t"
71 "vmov q1, q0 \n\t"
72 "vst2.16 {q0,q1}, [%[dst]]! \n\t"
73 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
74 : /* input operands */
75 : "memory", "q0", "q1" /* clobber list */
76 );
77 }
78
79 for (; n > 0; n--) {
80 dst[0] = dst[1] = src[0];
81 src++;
82 dst += 2;
83 }
84 }
85
86 static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
87 for (; n >= 2; n -= 2) {
88 __asm__ __volatile__ (
89 "vld1.32 {d0}, [%[src]]! \n\t"
90 "vdup.f32 q1, d0[0] \n\t"
91 "vdup.f32 q2, d0[1] \n\t"
92 "vst1.32 {q1,q2}, [%[dst]]! \n\t"
93 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
94 : /* input operands */
95 : "memory", "q0", "q1", "q2" /* clobber list */
96 );
97 }
98
99 if (n--)
100 dst[0] = dst[1] = dst[2] = dst[3] = src[0];
101 }
102
103 static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
104 for (; n >= 4; n -= 4) {
105 __asm__ __volatile__ (
106 "vld1.16 {d0}, [%[src]]! \n\t"
107 "vdup.s16 d1, d0[1] \n\t"
108 "vdup.s16 d2, d0[2] \n\t"
109 "vdup.s16 d3, d0[3] \n\t"
110 "vdup.s16 d0, d0[0] \n\t"
111 "vst1.16 {d0,d1,d2,d3}, [%[dst]]!\n\t"
112 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
113 : /* input operands */
114 : "memory", "d0", "d1", "d2", "d3" /* clobber list */
115 );
116 }
117
118 for (; n > 0; n--) {
119 dst[0] = dst[1] = dst[2] = dst[3] = src[0];
120 src++;
121 dst += 4;
122 }
123 }
124
125 static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
126 const float32x4_t halve = vdupq_n_f32(0.5f);
127 for (; n >= 4; n -= 4) {
128 __asm__ __volatile__ (
129 "vld2.32 {q0,q1}, [%[src]]! \n\t"
130 "vadd.f32 q0, q0, q1 \n\t"
131 "vmul.f32 q0, q0, %q[halve] \n\t"
132 "vst1.32 {q0}, [%[dst]]! \n\t"
133 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
134 : [halve] "w" (halve) /* input operands */
135 : "memory", "q0", "q1" /* clobber list */
136 );
137 }
138
139 for (; n > 0; n--) {
140 dst[0] = (src[0] + src[1])*0.5f;
141 src += 2;
142 dst++;
143 }
144 }
145
146 static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
147 for (; n >= 8; n -= 8) {
148 __asm__ __volatile__ (
149 "vld2.16 {q0,q1}, [%[src]]! \n\t"
150 "vrhadd.s16 q0, q0, q1 \n\t"
151 "vst1.16 {q0}, [%[dst]]! \n\t"
152 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
153 : /* input operands */
154 : "memory", "q0", "q1" /* clobber list */
155 );
156 }
157
158 for (; n > 0; n--) {
159 dst[0] = (src[0] + src[1])/2;
160 src += 2;
161 dst++;
162 }
163 }
164
165 static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
166 const float32x2_t quart = vdup_n_f32(0.25f);
167 for (; n >= 2; n -= 2) {
168 __asm__ __volatile__ (
169 "vld4.32 {d0,d1,d2,d3}, [%[src]]!\n\t"
170 "vadd.f32 d0, d0, d1 \n\t"
171 "vadd.f32 d2, d2, d3 \n\t"
172 "vadd.f32 d0, d0, d2 \n\t"
173 "vmul.f32 d0, d0, %[quart] \n\t"
174 "vst1.32 {d0}, [%[dst]]! \n\t"
175 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
176 : [quart] "w" (quart) /* input operands */
177 : "memory", "d0", "d1", "d2", "d3" /* clobber list */
178 );
179 }
180
181 if (n > 0)
182 dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
183 }
184
185 static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
186 for (; n >= 4; n -= 4) {
187 __asm__ __volatile__ (
188 "vld4.16 {d0,d1,d2,d3}, [%[src]]!\n\t"
189 "vrhadd.s16 d0, d0, d1 \n\t"
190 "vrhadd.s16 d2, d2, d3 \n\t"
191 "vrhadd.s16 d0, d0, d2 \n\t"
192 "vst1.16 {d0}, [%[dst]]! \n\t"
193 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
194 : /* input operands */
195 : "memory", "d0", "d1", "d2", "d3" /* clobber list */
196 );
197 }
198
199 for (; n > 0; n--) {
200 dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
201 src += 4;
202 dst++;
203 }
204 }
205
206 static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
207 int32x4_t *f = m->state;
208 const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
209
210 for (; n > 0; n--) {
211 __asm__ __volatile__ (
212 "vld1.16 {d0}, [%[src]]! \n\t"
213 "vmovl.s16 q0, d0 \n\t"
214 "vdup.s32 q1, d0[0] \n\t"
215 "vmul.s32 q1, q1, %q[f0] \n\t"
216 "vdup.s32 q2, d0[1] \n\t"
217 "vmla.s32 q1, q2, %q[f1] \n\t"
218 "vdup.s32 q2, d1[0] \n\t"
219 "vmla.s32 q1, q2, %q[f2] \n\t"
220 "vdup.s32 q2, d1[1] \n\t"
221 "vmla.s32 q1, q2, %q[f3] \n\t"
222 "vqshrn.s32 d2, q1, #16 \n\t"
223 "vst1.32 {d2}, [%[dst]]! \n\t"
224 : [dst] "+r" (dst), [src] "+r" (src)
225 : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
226 : "memory", "q0", "q1", "q2"
227 );
228 }
229 }
230
231 static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
232 float32x4_t *f = m->state;
233 const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];
234
235 for (; n > 0; n--) {
236 __asm__ __volatile__ (
237 "vld1.32 {d0,d1}, [%[src]]! \n\t"
238 "vdup.f32 q1, d0[0] \n\t"
239 "vmul.f32 q1, q1, %q[f0] \n\t"
240 "vdup.f32 q2, d0[1] \n\t"
241 "vmla.f32 q1, q2, %q[f1] \n\t"
242 "vdup.f32 q2, d1[0] \n\t"
243 "vmla.f32 q1, q2, %q[f2] \n\t"
244 "vdup.f32 q2, d1[1] \n\t"
245 "vmla.f32 q1, q2, %q[f3] \n\t"
246 "vst1.32 {d2,d3}, [%[dst]]! \n\t"
247 : [dst] "+r" (dst), [src] "+r" (src)
248 : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
249 : "memory", "q0", "q1", "q2"
250 );
251 }
252 }
253
254 static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
255 const uint8x8_t t = ((uint8x8_t *) m->state)[0];
256
257 for (; n >= 2; n -= 2) {
258 __asm__ __volatile__ (
259 "vld1.s16 d0, [%[src]]! \n\t"
260 "vtbl.8 d0, {d0}, %[t] \n\t"
261 "vst1.s16 d0, [%[dst]]! \n\t"
262 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
263 : [t] "w" (t) /* input operands */
264 : "memory", "d0" /* clobber list */
265 );
266 }
267
268 if (n > 0) {
269 __asm__ __volatile__ (
270 "vld1.32 d0[0], [%[src]]! \n\t"
271 "vtbl.8 d0, {d0}, %[t] \n\t"
272 "vst1.32 d0[0], [%[dst]]! \n\t"
273 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
274 : [t] "w" (t) /* input operands */
275 : "memory", "d0" /* clobber list */
276 );
277 }
278 }
279
280 static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
281 const uint8x8_t t = ((uint8x8_t *) m->state)[0];
282
283 for (; n > 0; n--) {
284 __asm__ __volatile__ (
285 "vld1.32 d0[0], [%[src]]! \n\t"
286 "vtbl.8 d0, {d0}, %[t] \n\t"
287 "vst1.s16 d0, [%[dst]]! \n\t"
288 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
289 : [t] "w" (t) /* input operands */
290 : "memory", "d0" /* clobber list */
291 );
292 }
293 }
294
295 static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
296 const uint8x8_t t = ((uint8x8_t *) m->state)[0];
297
298 for (; n > 0; n--) {
299 __asm__ __volatile__ (
300 "vld1.s16 d0, [%[src]]! \n\t"
301 "vtbl.8 d0, {d0}, %[t] \n\t"
302 "vst1.s16 d0, [%[dst]]! \n\t"
303 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
304 : [t] "w" (t) /* input operands */
305 : "memory", "d0" /* clobber list */
306 );
307 }
308 }
309
310 static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
311 const uint8x8_t t = ((uint8x8_t *)m->state)[0];
312
313 for (; n > 0; n--) {
314 __asm__ __volatile__ (
315 "vld1.f32 d0, [%[src]]! \n\t"
316 "vtbl.8 d0, {d0}, %[t] \n\t"
317 "vst1.s16 {d0}, [%[dst]]! \n\t"
318 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
319 : [t] "w" (t) /* input operands */
320 : "memory", "d0" /* clobber list */
321 );
322 }
323 }
324
325 static void remap_arrange_ch2_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
326 const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
327 const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
328
329 for (; n > 0; n--) {
330 __asm__ __volatile__ (
331 "vld1.f32 d0, [%[src]]! \n\t"
332 "vtbl.8 d1, {d0}, %[t0] \n\t"
333 "vtbl.8 d2, {d0}, %[t1] \n\t"
334 "vst1.s16 {d1,d2}, [%[dst]]! \n\t"
335 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
336 : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
337 : "memory", "d0", "d1", "d2" /* clobber list */
338 );
339 }
340 }
341
342 static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
343 const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
344 const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
345
346 for (; n > 0; n--) {
347 __asm__ __volatile__ (
348 "vld1.f32 {d0,d1}, [%[src]]! \n\t"
349 "vtbl.8 d2, {d0,d1}, %[t0] \n\t"
350 "vtbl.8 d3, {d0,d1}, %[t1] \n\t"
351 "vst1.s16 {d2,d3}, [%[dst]]! \n\t"
352 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
353 : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
354 : "memory", "d0", "d1", "d2", "d3" /* clobber list */
355 );
356 }
357 }
358
359 static pa_cpu_arm_flag_t arm_flags;
360
361 static void init_remap_neon(pa_remap_t *m) {
362 unsigned n_oc, n_ic;
363 int8_t arrange[PA_CHANNELS_MAX];
364
365 n_oc = m->o_ss.channels;
366 n_ic = m->i_ss.channels;
367
368 if (n_ic == 1 && n_oc == 2 &&
369 m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
370 if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
371
372 pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
373 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
374 (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
375 }
376 else {
377 pa_log_info("Using ARM NEON mono to stereo remapping");
378 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
379 (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
380 }
381 } else if (n_ic == 1 && n_oc == 4 &&
382 m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
383 m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
384
385 pa_log_info("Using ARM NEON mono to 4-channel remapping");
386 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
387 (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
388 } else if (n_ic == 2 && n_oc == 1 &&
389 m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
390
391 pa_log_info("Using ARM NEON stereo to mono remapping");
392 pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
393 (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
394 } else if (n_ic == 4 && n_oc == 1 &&
395 m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
396 m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
397
398 pa_log_info("Using ARM NEON 4-channel to mono remapping");
399 pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
400 (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
401 } else if (pa_setup_remap_arrange(m, arrange) &&
402 ((n_ic == 2 && n_oc == 2) ||
403 (n_ic == 2 && n_oc == 4) ||
404 (n_ic == 4 && n_oc == 4))) {
405 unsigned o;
406
407 if (n_ic == 2 && n_oc == 2) {
408 pa_log_info("Using NEON stereo arrange remapping");
409 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
410 (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
411 } else if (n_ic == 2 && n_oc == 4) {
412 pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
413 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
414 (pa_do_remap_func_t) remap_arrange_ch2_ch4_float32ne_neon);
415 } else if (n_ic == 4 && n_oc == 4) {
416 pa_log_info("Using NEON 4-channel arrange remapping");
417 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
418 (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
419 }
420
421 /* setup state */
422 switch (m->format) {
423 case PA_SAMPLE_S16NE: {
424 uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
425 for (o = 0; o < 4; o++) {
426 if (arrange[o % n_oc] >= 0) {
427 /* convert channel index to vtbl indices */
428 unsigned frame = o / n_oc;
429 ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
430 ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
431 } else {
432 /* use invalid table indices to map to 0 */
433 ((uint8_t *) t)[o * 2 + 0] = 0xff;
434 ((uint8_t *) t)[o * 2 + 1] = 0xff;
435 }
436 }
437 break;
438 }
439 case PA_SAMPLE_FLOAT32NE: {
440 uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
441 for (o = 0; o < n_oc; o++) {
442 if (arrange[o] >= 0) {
443 /* convert channel index to vtbl indices */
444 ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
445 ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
446 ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
447 ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
448 } else {
449 /* use invalid table indices to map to 0 */
450 ((uint8_t *) t)[o * 4 + 0] = 0xff;
451 ((uint8_t *) t)[o * 4 + 1] = 0xff;
452 ((uint8_t *) t)[o * 4 + 2] = 0xff;
453 ((uint8_t *) t)[o * 4 + 3] = 0xff;
454 }
455 }
456 break;
457 }
458 default:
459 pa_assert_not_reached();
460 }
461 } else if (n_ic == 4 && n_oc == 4) {
462 unsigned i, o;
463
464 pa_log_info("Using ARM NEON 4-channel remapping");
465 pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
466 (pa_do_remap_func_t) remap_ch4_float32ne_neon);
467
468 /* setup state */
469 switch (m->format) {
470 case PA_SAMPLE_S16NE: {
471 int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
472 for (o = 0; o < 4; o++) {
473 for (i = 0; i < 4; i++) {
474 ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
475 }
476 }
477 break;
478 }
479 case PA_SAMPLE_FLOAT32NE: {
480 float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
481 for (o = 0; o < 4; o++) {
482 for (i = 0; i < 4; i++) {
483 ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
484 }
485 }
486 break;
487 }
488 default:
489 pa_assert_not_reached();
490 }
491 }
492 }
493
494 void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
495 pa_log_info("Initialising ARM NEON optimized remappers.");
496 arm_flags = flags;
497 pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
498 }