]> code.delx.au - pulseaudio/blob - src/pulsecore/sconv_sse.c
3c9a809f8097f4bcb9a8ae9759a018f7db0b066c
[pulseaudio] / src / pulsecore / sconv_sse.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include <pulsecore/macro.h>
31 #include <pulsecore/endianmacros.h>
32
33 #include "cpu-x86.h"
34 #include "sconv.h"
35
36 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
37
38 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
39 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
40 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
41
42 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
43 pa_reg_x86 temp, i;
44
45 __asm__ __volatile__ (
46 " movaps %5, %%xmm5 \n\t"
47 " movaps %6, %%xmm6 \n\t"
48 " movaps %7, %%xmm7 \n\t"
49 " xor %0, %0 \n\t"
50
51 " mov %4, %1 \n\t"
52 " sar $3, %1 \n\t" /* 8 floats at a time */
53 " cmp $0, %1 \n\t"
54 " je 2f \n\t"
55
56 "1: \n\t"
57 " movups (%q2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
58 " movups 16(%q2, %0, 2), %%xmm2 \n\t"
59 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
60 " minps %%xmm5, %%xmm2 \n\t"
61 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
62 " maxps %%xmm6, %%xmm2 \n\t"
63 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
64 " mulps %%xmm7, %%xmm2 \n\t"
65
66 " cvtps2pi %%xmm0, %%mm0 \n\t" /* low part to int */
67 " cvtps2pi %%xmm2, %%mm2 \n\t"
68 " movhlps %%xmm0, %%xmm0 \n\t" /* bring high part in position */
69 " movhlps %%xmm2, %%xmm2 \n\t"
70 " cvtps2pi %%xmm0, %%mm1 \n\t" /* high part to int */
71 " cvtps2pi %%xmm2, %%mm3 \n\t"
72
73 " packssdw %%mm1, %%mm0 \n\t" /* pack parts */
74 " packssdw %%mm3, %%mm2 \n\t"
75 " movq %%mm0, (%q3, %0) \n\t"
76 " movq %%mm2, 8(%q3, %0) \n\t"
77
78 " add $16, %0 \n\t"
79 " dec %1 \n\t"
80 " jne 1b \n\t"
81
82 "2: \n\t"
83 " mov %4, %1 \n\t" /* prepare for leftovers */
84 " and $7, %1 \n\t"
85 " je 4f \n\t"
86
87 "3: \n\t"
88 " movss (%q2, %0, 2), %%xmm0 \n\t"
89 " minss %%xmm5, %%xmm0 \n\t"
90 " maxss %%xmm6, %%xmm0 \n\t"
91 " mulss %%xmm7, %%xmm0 \n\t"
92 " cvtss2si %%xmm0, %4 \n\t"
93 " movw %w4, (%q3, %0) \n\t"
94 " add $2, %0 \n\t"
95 " dec %1 \n\t"
96 " jne 3b \n\t"
97
98 "4: \n\t"
99 " emms \n\t"
100
101 : "=&r" (i), "=&r" (temp)
102 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
103 : "cc", "memory"
104 );
105 }
106
107 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
108 pa_reg_x86 temp, i;
109
110 __asm__ __volatile__ (
111 " movaps %5, %%xmm5 \n\t"
112 " movaps %6, %%xmm6 \n\t"
113 " movaps %7, %%xmm7 \n\t"
114 " xor %0, %0 \n\t"
115
116 " mov %4, %1 \n\t"
117 " sar $3, %1 \n\t" /* 8 floats at a time */
118 " cmp $0, %1 \n\t"
119 " je 2f \n\t"
120
121 "1: \n\t"
122 " movups (%q2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
123 " movups 16(%q2, %0, 2), %%xmm2 \n\t"
124 " minps %%xmm5, %%xmm0 \n\t" /* clamp to 1.0 */
125 " minps %%xmm5, %%xmm2 \n\t"
126 " maxps %%xmm6, %%xmm0 \n\t" /* clamp to -1.0 */
127 " maxps %%xmm6, %%xmm2 \n\t"
128 " mulps %%xmm7, %%xmm0 \n\t" /* *= 0x7fff */
129 " mulps %%xmm7, %%xmm2 \n\t"
130
131 " cvtps2dq %%xmm0, %%xmm0 \n\t"
132 " cvtps2dq %%xmm2, %%xmm2 \n\t"
133
134 " packssdw %%xmm2, %%xmm0 \n\t"
135 " movdqu %%xmm0, (%q3, %0) \n\t"
136
137 " add $16, %0 \n\t"
138 " dec %1 \n\t"
139 " jne 1b \n\t"
140
141 "2: \n\t"
142 " mov %4, %1 \n\t" /* prepare for leftovers */
143 " and $7, %1 \n\t"
144 " je 4f \n\t"
145
146 "3: \n\t"
147 " movss (%q2, %0, 2), %%xmm0 \n\t"
148 " minss %%xmm5, %%xmm0 \n\t"
149 " maxss %%xmm6, %%xmm0 \n\t"
150 " mulss %%xmm7, %%xmm0 \n\t"
151 " cvtss2si %%xmm0, %4 \n\t"
152 " movw %w4, (%q3, %0) \n\t"
153 " add $2, %0 \n\t"
154 " dec %1 \n\t"
155 " jne 3b \n\t"
156
157 "4: \n\t"
158
159 : "=&r" (i), "=&r" (temp)
160 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
161 : "cc", "memory"
162 );
163 }
164
165 #undef RUN_TEST
166
#ifdef RUN_TEST
/* 1019 is not a multiple of 8, so both the vector loop and the scalar
 * leftover loop of the conversion routine are exercised. */
#define SAMPLES 1019
#define TIMES 1000

/*
 * Developer self-test, only compiled when RUN_TEST is defined.
 *
 * Converts a buffer of random floats once with the currently registered
 * float32ne -> S16LE conversion function and once with the SSE2 routine,
 * prints every sample on which the two disagree, then logs how long TIMES
 * runs of each take.
 *
 * NOTE: the SSE2 routine is called unconditionally, whatever CPU flags were
 * detected.
 */
static void run_test(void) {
    int16_t samples[SAMPLES];
    int16_t samples_ref[SAMPLES];
    float floats[SAMPLES];
    int i;
    pa_usec_t start, stop;
    pa_convert_func_t func;

    /* sizeof yields a size_t, which is printed with %zu */
    printf("checking SSE %zu\n", sizeof(samples));

    memset(samples_ref, 0, sizeof(samples_ref));
    memset(samples, 0, sizeof(samples));

    /* values in [-1.05, 1.05], so some inputs fall outside the clamp range */
    for (i = 0; i < SAMPLES; i++) {
        floats[i] = 2.1f * (rand()/(float) RAND_MAX - 0.5f);
    }

    func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
    func(SAMPLES, floats, samples_ref);
    pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);

    for (i = 0; i < SAMPLES; i++) {
        if (samples[i] != samples_ref[i]) {
            /* go through uint16_t so negative samples print as four hex
             * digits instead of a sign-extended int */
            printf ("%d: %04x != %04x (%f)\n", i,
                    (unsigned) (uint16_t) samples[i],
                    (unsigned) (uint16_t) samples_ref[i],
                    floats[i]);
        }
    }

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
    }
    stop = pa_rtclock_now();
    pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));

    start = pa_rtclock_now();
    for (i = 0; i < TIMES; i++) {
        func(SAMPLES, floats, samples_ref);
    }
    stop = pa_rtclock_now();
    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
}
#endif
214 #endif /* defined (__i386__) || defined (__amd64__) */
215
216
217 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
218 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
219
220 #ifdef RUN_TEST
221 run_test();
222 #endif
223
224 if (flags & PA_CPU_X86_SSE2) {
225 pa_log_info("Initialising SSE2 optimized conversions.");
226 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
227 } else {
228 pa_log_info("Initialising SSE optimized conversions.");
229 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
230 }
231
232 #endif /* defined (__i386__) || defined (__amd64__) */
233 }