code.delx.au - pulseaudio/blob - src/pulsecore/sconv_sse.c
sconv, svolume: Fix compilation on 32-bit FreeBSD
[pulseaudio] / src / pulsecore / sconv_sse.c
1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include <pulsecore/macro.h>
31 #include <pulsecore/endianmacros.h>
32
33 #include "cpu-x86.h"
34 #include "sconv.h"
35
36 #if (!defined(__APPLE__) && !defined(__FreeBSD__) && defined (__i386__)) || defined (__amd64__)
37
38 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x8000, 0x8000, 0x8000, 0x8000 };
39
40 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
41 pa_reg_x86 temp, i;
42
43 __asm__ __volatile__ (
44 " movaps %5, %%xmm5 \n\t"
45 " xor %0, %0 \n\t"
46
47 " mov %4, %1 \n\t"
48 " sar $3, %1 \n\t" /* 8 floats at a time */
49 " cmp $0, %1 \n\t"
50 " je 2f \n\t"
51
52 "1: \n\t"
53 " movups (%q2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
54 " movups 16(%q2, %0, 2), %%xmm2 \n\t"
55 " mulps %%xmm5, %%xmm0 \n\t" /* *= 0x8000 */
56 " mulps %%xmm5, %%xmm2 \n\t"
57
58 " cvtps2pi %%xmm0, %%mm0 \n\t" /* low part to int */
59 " cvtps2pi %%xmm2, %%mm2 \n\t"
60 " movhlps %%xmm0, %%xmm0 \n\t" /* bring high part in position */
61 " movhlps %%xmm2, %%xmm2 \n\t"
62 " cvtps2pi %%xmm0, %%mm1 \n\t" /* high part to int */
63 " cvtps2pi %%xmm2, %%mm3 \n\t"
64
65 " packssdw %%mm1, %%mm0 \n\t" /* pack parts */
66 " packssdw %%mm3, %%mm2 \n\t"
67 " movq %%mm0, (%q3, %0) \n\t"
68 " movq %%mm2, 8(%q3, %0) \n\t"
69
70 " add $16, %0 \n\t"
71 " dec %1 \n\t"
72 " jne 1b \n\t"
73
74 "2: \n\t"
75 " mov %4, %1 \n\t" /* prepare for leftovers */
76 " and $7, %1 \n\t"
77 " je 5f \n\t"
78
79 "3: \n\t"
80 " movss (%q2, %0, 2), %%xmm0 \n\t"
81 " mulss %%xmm5, %%xmm0 \n\t"
82 " cvtss2si %%xmm0, %4 \n\t"
83 " add $0x8000, %4 \n\t" /* check for saturation */
84 " and $~0xffff, %4 \n\t"
85 " cvtss2si %%xmm0, %4 \n\t"
86 " je 4f \n\t"
87 " sar $31, %4 \n\t"
88 " xor $0x7fff, %4 \n\t"
89
90 "4: \n\t"
91 " movw %w4, (%q3, %0) \n\t" /* store leftover */
92 " add $2, %0 \n\t"
93 " dec %1 \n\t"
94 " jne 3b \n\t"
95
96 "5: \n\t"
97 " emms \n\t"
98
99 : "=&r" (i), "=&r" (temp)
100 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*scale)
101 : "cc", "memory"
102 );
103 }
104
105 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
106 pa_reg_x86 temp, i;
107
108 __asm__ __volatile__ (
109 " movaps %5, %%xmm5 \n\t"
110 " xor %0, %0 \n\t"
111
112 " mov %4, %1 \n\t"
113 " sar $3, %1 \n\t" /* 8 floats at a time */
114 " cmp $0, %1 \n\t"
115 " je 2f \n\t"
116
117 "1: \n\t"
118 " movups (%q2, %0, 2), %%xmm0 \n\t" /* read 8 floats */
119 " movups 16(%q2, %0, 2), %%xmm2 \n\t"
120 " mulps %%xmm5, %%xmm0 \n\t" /* *= 0x8000 */
121 " mulps %%xmm5, %%xmm2 \n\t"
122
123 " cvtps2dq %%xmm0, %%xmm0 \n\t"
124 " cvtps2dq %%xmm2, %%xmm2 \n\t"
125
126 " packssdw %%xmm2, %%xmm0 \n\t"
127 " movdqu %%xmm0, (%q3, %0) \n\t"
128
129 " add $16, %0 \n\t"
130 " dec %1 \n\t"
131 " jne 1b \n\t"
132
133 "2: \n\t"
134 " mov %4, %1 \n\t" /* prepare for leftovers */
135 " and $7, %1 \n\t"
136 " je 5f \n\t"
137
138 "3: \n\t"
139 " movss (%q2, %0, 2), %%xmm0 \n\t"
140 " mulss %%xmm5, %%xmm0 \n\t"
141 " cvtss2si %%xmm0, %4 \n\t"
142 " add $0x8000, %4 \n\t"
143 " and $~0xffff, %4 \n\t" /* check for saturation */
144 " cvtss2si %%xmm0, %4 \n\t"
145 " je 4f \n\t"
146 " sar $31, %4 \n\t"
147 " xor $0x7fff, %4 \n\t"
148
149 "4: \n\t"
150 " movw %w4, (%q3, %0) \n\t" /* store leftover */
151 " add $2, %0 \n\t"
152 " dec %1 \n\t"
153 " jne 3b \n\t"
154
155 "5: \n\t"
156
157 : "=&r" (i), "=&r" (temp)
158 : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*scale)
159 : "cc", "memory"
160 );
161 }
162
#endif /* (!defined(__APPLE__) && !defined(__FreeBSD__) && defined (__i386__)) || defined (__amd64__) */
164
165 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
166 #if (!defined(__APPLE__) && !defined(__FreeBSD__) && defined (__i386__)) || defined (__amd64__)
167
168 if (flags & PA_CPU_X86_SSE2) {
169 pa_log_info("Initialising SSE2 optimized conversions.");
170 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
171
172 } else if (flags & PA_CPU_X86_SSE) {
173 pa_log_info("Initialising SSE optimized conversions.");
174 pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
175 }
176
177 #endif /* defined (__i386__) || defined (__amd64__) */
178 }