code.delx.au - pulseaudio/blob - src/pulsecore/sconv_sse.c
alsa-mixer: When figuring out the max_dB of a path, use only channels that are used...
[pulseaudio] / src / pulsecore / sconv_sse.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include <pulsecore/g711.h>
31 #include <pulsecore/macro.h>
32
33 #include "endianmacros.h"
34
35 #include "cpu-x86.h"
36 #include "sconv.h"
37
38 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
39
40 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
41 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
42 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
43
44 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
45 pa_reg_x86 temp, i;
46
47 __asm__ __volatile__ (
48 " movaps %5, %%xmm5 \n\t"
49 " movaps %6, %%xmm6 \n\t"
50 " movaps %7, %%xmm7 \n\t"
51 " xor %0, %0 \n\t"
52
53 " mov %4, %1 \n\t"
54 " sar $3, %1 \n\t" /* 8 floats at a time */
55 " cmp $0, %1 \n\t"
56 " je 2f \n\t"
57
58 "1: \n\t"
59 " movups (%2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
60 " movups 16(%2, %0, 2), %%xmm2 \n\t"
61 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
62 " minps %%xmm5, %%xmm2 \n\t"
63 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
64 " maxps %%xmm6, %%xmm2 \n\t"
65 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
66 " mulps %%xmm7, %%xmm2 \n\t"
67
68 " cvtps2pi %%xmm0, %%mm0 \n\t" /* low part to int */
69 " cvtps2pi %%xmm2, %%mm2 \n\t"
70 " movhlps %%xmm0, %%xmm0 \n\t" /* bring high part in position */
71 " movhlps %%xmm2, %%xmm2 \n\t"
72 " cvtps2pi %%xmm0, %%mm1 \n\t" /* high part to int */
73 " cvtps2pi %%xmm2, %%mm3 \n\t"
74
75 " packssdw %%mm1, %%mm0 \n\t" /* pack parts */
76 " packssdw %%mm3, %%mm2 \n\t"
77 " movq %%mm0, (%3, %0) \n\t"
78 " movq %%mm2, 8(%3, %0) \n\t"
79
80 " add $16, %0 \n\t"
81 " dec %1 \n\t"
82 " jne 1b \n\t"
83
84 "2: \n\t"
85 " mov %4, %1 \n\t" /* prepare for leftovers */
86 " and $7, %1 \n\t"
87 " je 4f \n\t"
88
89 "3: \n\t"
90 " movss (%2, %0, 2), %%xmm0 \n\t"
91 " minss %%xmm5, %%xmm0 \n\t"
92 " maxss %%xmm6, %%xmm0 \n\t"
93 " mulss %%xmm7, %%xmm0 \n\t"
94 " cvtss2si %%xmm0, %4 \n\t"
95 " movw %w4, (%3, %0) \n\t"
96 " add $2, %0 \n\t"
97 " dec %1 \n\t"
98 " jne 3b \n\t"
99
100 "4: \n\t"
101 " emms \n\t"
102
103 : "=&r" (i), "=&r" (temp)
104 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
105 : "cc", "memory"
106 );
107 }
108
109 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
110 pa_reg_x86 temp, i;
111
112 __asm__ __volatile__ (
113 " movaps %5, %%xmm5 \n\t"
114 " movaps %6, %%xmm6 \n\t"
115 " movaps %7, %%xmm7 \n\t"
116 " xor %0, %0 \n\t"
117
118 " mov %4, %1 \n\t"
119 " sar $3, %1 \n\t" /* 8 floats at a time */
120 " cmp $0, %1 \n\t"
121 " je 2f \n\t"
122
123 "1: \n\t"
124 " movups (%2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
125 " movups 16(%2, %0, 2), %%xmm2 \n\t"
126 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
127 " minps %%xmm5, %%xmm2 \n\t"
128 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
129 " maxps %%xmm6, %%xmm2 \n\t"
130 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
131 " mulps %%xmm7, %%xmm2 \n\t"
132
133 " cvtps2dq %%xmm0, %%xmm0 \n\t"
134 " cvtps2dq %%xmm2, %%xmm2 \n\t"
135
136 " packssdw %%xmm2, %%xmm0 \n\t"
137 " movdqu %%xmm0, (%3, %0) \n\t"
138
139 " add $16, %0 \n\t"
140 " dec %1 \n\t"
141 " jne 1b \n\t"
142
143 "2: \n\t"
144 " mov %4, %1 \n\t" /* prepare for leftovers */
145 " and $7, %1 \n\t"
146 " je 4f \n\t"
147
148 "3: \n\t"
149 " movss (%2, %0, 2), %%xmm0 \n\t"
150 " minss %%xmm5, %%xmm0 \n\t"
151 " maxss %%xmm6, %%xmm0 \n\t"
152 " mulss %%xmm7, %%xmm0 \n\t"
153 " cvtss2si %%xmm0, %4 \n\t"
154 " movw %w4, (%3, %0) \n\t"
155 " add $2, %0 \n\t"
156 " dec %1 \n\t"
157 " jne 3b \n\t"
158
159 "4: \n\t"
160
161 : "=&r" (i), "=&r" (temp)
162 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
163 : "cc", "memory"
164 );
165 }
166
/* Self-test and micro-benchmark for the SSE2 converter. It is compiled out
 * by default: the #undef below has to be replaced by a #define to enable it. */
#undef RUN_TEST

#ifdef RUN_TEST
/* Not a multiple of 8, so both the 8-samples-at-a-time loop and the leftover
 * loop of the converter are exercised. */
#define SAMPLES 1019
#define TIMES 1000

/*
 * Converts SAMPLES pseudo-random floats with the currently registered
 * float32ne -> S16LE function and with the SSE2 routine, prints every sample
 * on which the two disagree, then logs how long TIMES runs of each take.
 */
static void run_test (void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    float floats[SAMPLES];
    int i;
    pa_usec_t start, stop;
    pa_convert_func_t func;

    printf ("checking SSE %zu\n", sizeof (samples));

    memset (samples_ref, 0, sizeof (samples_ref));
    memset (samples, 0, sizeof (samples));

    /* Spread the input over [-1.1, 1.1] so that in-range values of both
     * signs as well as both clamping bounds are covered. */
    for (i = 0; i < SAMPLES; i++) {
        floats[i] = (float) (2.2 * (rand() / (double) RAND_MAX) - 1.1);
    }

    func = pa_get_convert_from_float32ne_function (PA_SAMPLE_S16LE);
    func (SAMPLES, floats, samples_ref);
    pa_sconv_s16le_from_f32ne_sse2 (SAMPLES, floats, samples);

    for (i = 0; i < SAMPLES; i++) {
        if (samples[i] != samples_ref[i]) {
            /* Go through uint16_t so that negative samples are printed as
             * four hex digits instead of a sign-extended int. */
            printf ("%d: %04x != %04x (%f)\n", i,
                    (unsigned) (uint16_t) samples[i],
                    (unsigned) (uint16_t) samples_ref[i],
                    floats[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_from_f32ne_sse2 (SAMPLES, floats, samples);
    }
    stop = pa_rtclock_now();
    pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        func (SAMPLES, floats, samples_ref);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}
#endif
216 #endif /* defined (__i386__) || defined (__amd64__) */
217
218
219 void pa_convert_func_init_sse (pa_cpu_x86_flag_t flags) {
220 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
221
222 #ifdef RUN_TEST
223 run_test ();
224 #endif
225
226 if (flags & PA_CPU_X86_SSE2) {
227 pa_log_info("Initialising SSE2 optimized conversions.");
228 pa_set_convert_from_float32ne_function (PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
229 } else {
230 pa_log_info("Initialising SSE optimized conversions.");
231 pa_set_convert_from_float32ne_function (PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
232 }
233
234 #endif /* defined (__i386__) || defined (__amd64__) */
235 }