Commit b75726cb authored by Justin Ruggles
lavr: add x86-optimized function for mixing 2 to 1 s16p with q8 coeffs

parent c140fb2c
......@@ -109,3 +109,44 @@ INIT_XMM sse2
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
; int out_ch, int in_ch);
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
mov src1q, [srcq+gprsize]
mov srcq, [srcq]
sub src1q, srcq
mov matrixq, [matrixq]
movd m4, [matrixq]
movd m5, [matrixq]
SPLATW m4, m4, 0
SPLATW m5, m5, 1
pxor m0, m0
punpcklwd m4, m0
punpcklwd m5, m0
mova m0, [srcq ]
mova m2, [srcq+src1q]
punpckhwd m1, m0, m0
punpcklwd m0, m0
punpckhwd m3, m2, m2
punpcklwd m2, m2
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m2
paddd m1, m3
psrad m0, 8
psrad m1, 8
packssdw m0, m1
mova [srcq], m0
add srcq, mmsize
sub lend, mmsize/2
jg .loop
......@@ -32,6 +32,9 @@ extern void ff_mix_2_to_1_s16p_flt_sse2(int16_t **src, float **matrix, int len,
extern void ff_mix_2_to_1_s16p_flt_sse4(int16_t **src, float **matrix, int len,
int out_ch, int in_ch);
/* SSE2 2-to-1 channel mixer for planar int16 samples with int16 coefficients;
 * registered below for AV_SAMPLE_FMT_S16P with AV_MIX_COEFF_TYPE_Q8. */
extern void ff_mix_2_to_1_s16p_q8_sse2(int16_t **src, int16_t **matrix,
int len, int out_ch, int in_ch);
av_cold void ff_audio_mix_init_x86(AudioMix *am)
......@@ -44,6 +47,8 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_flt_sse2);
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_Q8,
2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_q8_sse2);
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
