Commit 5cc6d524 authored by Justin Ruggles
Browse files

lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX

The current SSE version is slower than the MMX version on Athlon64 and Sandy
Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
parent 0b45334a
......@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
mova m3, [srcq+src3q]
mova m4, [srcq+src4q]
mova m5, [srcq+src5q]
%if cpuflag(sse)
%if cpuflag(sse4)
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
movaps m6, m4
shufps m4, m0, q3210
blendps m6, m4, m0, 1100b
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
movaps m6, m5
shufps m5, m1, q3210
movhlps m4, m2
blendps m2, m5, m1, 1100b
movlhps m1, m3
movhlps m6, m3
movhlps m5, m3
movaps [dstq ], m0
movaps [dstq+16], m6
movaps [dstq+32], m4
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
movaps [dstq+64], m2
movaps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
......@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif
......@@ -22,8 +22,9 @@
#include "libavutil/cpu.h"
#include "libavresample/audio_convert.h"
extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{
......@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
}
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse);
6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
}
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
}
#endif
}
......@@ -42,10 +42,9 @@
%endmacro
; SBUTTERFLYPS: macro taking three register-index arguments (%1, %2, %3) and
; combining m%1 and m%2 with unpcklps/unpckhps, using m%3 as an extra register.
; NOTE(review): this listing looks like a diff rendered without +/- markers, so
; two versions of the body appear back to back below; presumably only one of
; them exists in the real file -- confirm against the repository.
%macro SBUTTERFLYPS 3
; Presumably the earlier body: m%1 is first copied into m%3 because the
; two-operand unpcklps/unpckhps overwrite their first operand.
movaps m%3, m%1
unpcklps m%1, m%2
unpckhps m%3, m%2
SWAP %2, %3
; Presumably the replacement body: the three-operand forms name the
; destination explicitly, so the movaps copy above is not needed.
unpcklps m%3, m%1, m%2
unpckhps m%1, m%1, m%2
SWAP %1, %3, %2
%endmacro
%macro TRANSPOSE4x4B 5
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment