code.delx.au - pulseaudio/blob - src/pulsecore/sconv_sse.c

   1 /***
   2   This file is part of PulseAudio.
   3
   4   Copyright 2004-2006 Lennart Poettering
   5   Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
   6
   7   PulseAudio is free software; you can redistribute it and/or modify
   8   it under the terms of the GNU Lesser General Public License as published
   9   by the Free Software Foundation; either version 2.1 of the License,
  10   or (at your option) any later version.
  11
  12   PulseAudio is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with PulseAudio; if not, write to the Free Software
  19   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  20   USA.
  21 ***/
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29
  30 #include <pulsecore/g711.h>
  31 #include <pulsecore/macro.h>
  32 #include <pulsecore/endianmacros.h>
  33
  34 #include "cpu-x86.h"
  35 #include "sconv.h"
  36
  37 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
  38
  39 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
  40 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
  41 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
  42
  43 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
  44     pa_reg_x86 temp, i;
  45
  46     __asm__ __volatile__ (
  47         " movaps %5, %%xmm5             \n\t"
  48         " movaps %6, %%xmm6             \n\t"
  49         " movaps %7, %%xmm7             \n\t"
  50         " xor %0, %0                    \n\t"
  51
  52         " mov %4, %1                    \n\t"
  53         " sar $3, %1                    \n\t" /* 8 floats at a time */
  54         " cmp $0, %1                    \n\t"
  55         " je 2f                         \n\t"
  56
  57         "1:                             \n\t"
  58         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
  59         " movups 16(%2, %0, 2), %%xmm2  \n\t"
  60         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
  61         " minps  %%xmm5, %%xmm2         \n\t"
  62         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
  63         " maxps  %%xmm6, %%xmm2         \n\t"
  64         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
  65         " mulps  %%xmm7, %%xmm2         \n\t"
  66
  67         " cvtps2pi %%xmm0, %%mm0        \n\t" /* low part to int */
  68         " cvtps2pi %%xmm2, %%mm2        \n\t"
  69         " movhlps  %%xmm0, %%xmm0       \n\t" /* bring high part in position */
  70         " movhlps  %%xmm2, %%xmm2       \n\t"
  71         " cvtps2pi %%xmm0, %%mm1        \n\t" /* high part to int */
  72         " cvtps2pi %%xmm2, %%mm3        \n\t"
  73
  74         " packssdw %%mm1, %%mm0         \n\t" /* pack parts */
  75         " packssdw %%mm3, %%mm2         \n\t"
  76         " movq     %%mm0, (%3, %0)      \n\t"
  77         " movq    %%mm2, 8(%3, %0)     \n\t"
  78
  79         " add $16, %0                   \n\t"
  80         " dec %1                        \n\t"
  81         " jne 1b                        \n\t"
  82
  83         "2:                             \n\t"
  84         " mov %4, %1                    \n\t" /* prepare for leftovers */
  85         " and $7, %1                    \n\t"
  86         " je 4f                         \n\t"
  87
  88         "3:                             \n\t"
  89         " movss (%2, %0, 2), %%xmm0     \n\t"
  90         " minss  %%xmm5, %%xmm0         \n\t"
  91         " maxss  %%xmm6, %%xmm0         \n\t"
  92         " mulss  %%xmm7, %%xmm0         \n\t"
  93         " cvtss2si %%xmm0, %4           \n\t"
  94         " movw  %w4, (%3, %0)           \n\t"
  95         " add $2, %0                    \n\t"
  96         " dec %1                        \n\t"
  97         " jne 3b                        \n\t"
  98
  99         "4:                             \n\t"
 100         " emms                          \n\t"
 101
 102         : "=&r" (i), "=&r" (temp)
 103         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 104         : "cc", "memory"
 105     );
 106 }
 107
 108 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
 109     pa_reg_x86 temp, i;
 110
 111     __asm__ __volatile__ (
 112         " movaps %5, %%xmm5             \n\t"
 113         " movaps %6, %%xmm6             \n\t"
 114         " movaps %7, %%xmm7             \n\t"
 115         " xor %0, %0                    \n\t"
 116
 117         " mov %4, %1                    \n\t"
 118         " sar $3, %1                    \n\t" /* 8 floats at a time */
 119         " cmp $0, %1                    \n\t"
 120         " je 2f                         \n\t"
 121
 122         "1:                             \n\t"
 123         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
 124         " movups 16(%2, %0, 2), %%xmm2  \n\t"
 125         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
 126         " minps  %%xmm5, %%xmm2         \n\t"
 127         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
 128         " maxps  %%xmm6, %%xmm2         \n\t"
 129         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
 130         " mulps  %%xmm7, %%xmm2         \n\t"
 131
 132         " cvtps2dq %%xmm0, %%xmm0       \n\t"
 133         " cvtps2dq %%xmm2, %%xmm2       \n\t"
 134
 135         " packssdw %%xmm2, %%xmm0       \n\t"
 136         " movdqu   %%xmm0, (%3, %0)     \n\t"
 137
 138         " add $16, %0                   \n\t"
 139         " dec %1                        \n\t"
 140         " jne 1b                        \n\t"
 141
 142         "2:                             \n\t"
 143         " mov %4, %1                    \n\t" /* prepare for leftovers */
 144         " and $7, %1                    \n\t"
 145         " je 4f                         \n\t"
 146
 147         "3:                             \n\t"
 148         " movss (%2, %0, 2), %%xmm0     \n\t"
 149         " minss  %%xmm5, %%xmm0         \n\t"
 150         " maxss  %%xmm6, %%xmm0         \n\t"
 151         " mulss  %%xmm7, %%xmm0         \n\t"
 152         " cvtss2si %%xmm0, %4           \n\t"
 153         " movw  %w4, (%3, %0)           \n\t"
 154         " add $2, %0                    \n\t"
 155         " dec %1                        \n\t"
 156         " jne 3b                        \n\t"
 157
 158         "4:                             \n\t"
 159
 160         : "=&r" (i), "=&r" (temp)
 161         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 162         : "cc", "memory"
 163     );
 164 }
 165
 166 #undef RUN_TEST
 167
 168 #ifdef RUN_TEST
 169 #define SAMPLES 1019
 170 #define TIMES 1000
 171
 172 static void run_test(void) {
 173     int16_t samples[SAMPLES];
 174     int16_t samples_ref[SAMPLES];
 175     float floats[SAMPLES];
 176     int i;
 177     pa_usec_t start, stop;
 178     pa_convert_func_t func;
 179
 180     printf("checking SSE %zd\n", sizeof(samples));
 181
 182     memset(samples_ref, 0, sizeof(samples_ref));
 183     memset(samples, 0, sizeof(samples));
 184
 185     for (i = 0; i < SAMPLES; i++) {
 186         floats[i] = (rand()/(RAND_MAX+2.2)) - 1.1;
 187     }
 188
 189     func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
 190     func(SAMPLES, floats, samples_ref);
 191     pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 192
 193     for (i = 0; i < SAMPLES; i++) {
 194         if (samples[i] != samples_ref[i]) {
 195             printf ("%d: %04x != %04x (%f)\n", i, samples[i], samples_ref[i],
 196                       floats[i]);
 197         }
 198     }
 199
 200     start = pa_rtclock_now();
 201     for (i = 0; i < TIMES; i++) {
 202         pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 203     }
 204     stop = pa_rtclock_now();
 205     pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
 206
 207     start = pa_rtclock_now();
 208     for (i = 0; i < TIMES; i++) {
 209         func(SAMPLES, floats, samples_ref);
 210     }
 211     stop = pa_rtclock_now();
 212     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 213 }
 214 #endif
 215 #endif /* defined (__i386__) || defined (__amd64__) */
 216
 217
 218 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
 219 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
 220
 221 #ifdef RUN_TEST
 222     run_test();
 223 #endif
 224
 225     if (flags & PA_CPU_X86_SSE2) {
 226         pa_log_info("Initialising SSE2 optimized conversions.");
 227         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
 228     } else {
 229         pa_log_info("Initialising SSE optimized conversions.");
 230         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
 231     }
 232
 233 #endif /* defined (__i386__) || defined (__amd64__) */
 234 }