" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
VOLUME_32x16 (%%mm1, %%mm0)
- " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
" movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
- " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
- " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
SWAP_16 (%%mm1)
VOLUME_32x16 (%%mm1, %%mm0)
SWAP_16 (%%mm0)
- " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
SWAP_16_2 (%%mm0, %%mm2)
- " movd %%mm0, (%0) \n\t" /* | p1*v1 | p0*v0 | */
- " movd %%mm2, 4(%0) \n\t" /* | p3*v3 | p2*v2 | */
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
- " movd (%1, %3, 4), %%xmm0 \n\t" /* do odd sample */
- " movw (%0), %4 \n\t"
+ " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
+ " movw (%0), %4 \n\t" /* .. | p0 | */
" rorw $8, %4 \n\t"
" movd %4, %%xmm1 \n\t"
VOLUME_32x16 (%%xmm1, %%xmm0)
- " movd %%xmm0, %4 \n\t"
+ " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
" rorw $8, %4 \n\t"
" movw %4, (%0) \n\t"
" add $2, %0 \n\t"
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
- " test $1, %2 \n\t" /* check for odd samples */
+ " test $1, %2 \n\t"
" je 4f \n\t"
- "3: \n\t" /* do samples in pairs of 2 */
- " movq (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
- " movd (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
+ "3: \n\t" /* do samples in groups of 2 */
+ " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
- " movd %%xmm0, (%0) \n\t"
+ " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
- " test $1, %2 \n\t" /* check for odd samples */
+ " test $1, %2 \n\t"
" je 6f \n\t"
- "5: \n\t" /* do samples in pairs of 4 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
- " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
+ /* FIXME: we can do aligned access of the volume values if we can guarantee
+ * that the array is 16-byte aligned; we would then probably have to process
+ * the odd values after this. */
+ "5: \n\t" /* do samples in groups of 4 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
- " movq %%xmm0, (%0) \n\t"
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" cmp $0, %2 \n\t"
" je 8f \n\t"
- "7: \n\t" /* do samples in pairs of 8 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
- " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* v3_h | v3_l | v2_h | v2_l */
- " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
- " movq 8(%0), %%xmm3 \n\t" /* X | X | p3 | p2 */
+ "7: \n\t" /* do samples in groups of 8 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
+ " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
SWAP_16_2 (%%xmm1, %%xmm3)
VOLUME_32x16 (%%xmm1, %%xmm0)
VOLUME_32x16 (%%xmm3, %%xmm2)
SWAP_16_2 (%%xmm0, %%xmm2)
- " movq %%xmm0, (%0) \n\t"
- " movq %%xmm2, 8(%0) \n\t"
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
+ " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
" add $16, %0 \n\t"
MOD_ADD ($8, %5)
" dec %2 \n\t"
}
#endif
-#define RUN_TEST
+#undef RUN_TEST
#ifdef RUN_TEST
#define CHANNELS 2