Commit 91851a7b authored by Justin Ruggles
Browse files

lavr: x86: optimized 2-channel s16 to fltp conversion

parent 205ace88
......@@ -34,6 +34,7 @@ pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1,
; Byte-index mask: in every dword the two low bytes are -1 and the two high
; bytes select one 16-bit source word (words 2, 3, 6 and 7, in that order).
; NOTE(review): presumably used as a pshufb control, where -1 zeroes the
; destination byte -- the consumer is not visible in this excerpt.
pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
; Word-order masks built with SHUFFLE_MASK_W (macro defined outside this
; excerpt). The index lists alternate words from the lower and upper halves
; (interleave) and gather even-indexed then odd-indexed words (deinterleave).
pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
; Eight words: every even-indexed word is 0x0000 and every odd-indexed word is
; 0xffff, i.e. each little-endian dword has its low 16 bits cleared and its
; high 16 bits set. CONV_S16_TO_FLTP_2CH ANDs this with the input to keep only
; the second sample of each stereo pair.
pw_zero_even: times 4 dw 0x0000, 0xffff
SECTION_TEXT
......@@ -923,3 +924,41 @@ CONV_S16_TO_S16P_6CH
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
; int channels);
;
; Splits interleaved 2-channel int16 audio into two float planes.
;   dst      - array of two plane pointers: dst[0] = channel 0, dst[1] = channel 1
;   src      - interleaved int16 samples (ch0, ch1, ch0, ch1, ...)
;   len      - scaled by 4 to get the byte length walked over src and over each
;              destination plane (one stereo int16 pair and one float are both
;              4 bytes)
;   channels - never read: the fourth register is named dst1 and is
;              overwritten with dst[1] before any use
;
; Each loop iteration consumes mmsize bytes of src and writes mmsize bytes to
; each plane. There is no tail handling and the loop body runs once before
; the first length check.
; NOTE(review): this assumes len > 0, len is a multiple of mmsize/4, and that
; src and both planes satisfy mova's alignment requirement -- confirm against
; the caller.
;------------------------------------------------------------------------------
%macro CONV_S16_TO_FLTP_2CH 0
; x86inc cglobal: 3 arguments loaded, 4 general-purpose registers, 5 vector
; registers (m2 is not referenced in the body).
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
; lenq = len * 4 = number of bytes to process in src and in each plane.
lea lenq, [4*lend]
; Fetch both plane pointers; dst[1] must be read first because dst0q is
; reused to hold dst[0] and the array base is lost afterwards.
mov dst1q, [dst0q+gprsize]
mov dst0q, [dst0q ]
; Point all three pointers at the END of their buffers, then negate the
; length so that [ptr+lenq] starts at the beginning and lenq counts up to 0.
add srcq, lenq
add dst0q, lenq
add dst1q, lenq
neg lenq
; Loop-invariant constants.
; m3: float scale factor applied after the int32 -> float conversion.
; NOTE(review): presumably 1/2^31 in every lane -- pf_s32_inv_scale is defined
; outside this excerpt.
mova m3, [pf_s32_inv_scale]
; m4: per-dword mask 0xffff0000 (keeps the high word, clears the low word).
mova m4, [pw_zero_even]
.loop:
; Each dword of m1 holds one stereo pair: low word = ch0, high word = ch1.
mova m1, [srcq+lenq]
; m0 = ch0 moved into the high half of each dword (low half zero-filled),
; i.e. the sample as a signed 32-bit value scaled by 2^16. This has to read
; m1 before the pand below modifies it.
pslld m0, m1, 16
; m1 = ch1 left in the high half with the low half cleared: same scaling.
pand m1, m4
; Signed int32 -> float in every lane.
cvtdq2ps m0, m0
cvtdq2ps m1, m1
; Apply the scale factor held in m3.
mulps m0, m0, m3
mulps m1, m1, m3
; Write one vector of floats to each plane.
mova [dst0q+lenq], m0
mova [dst1q+lenq], m1
; Advance by one vector; lenq is negative until the end is reached, so the
; signed "less than" branch keeps looping while bytes remain.
add lenq, mmsize
jl .loop
; x86inc return macro.
REP_RET
%endmacro
; Expand the kernel once per instruction set. The C side declares the
; resulting symbols as ff_conv_s16_to_fltp_2ch_sse2 and
; ff_conv_s16_to_fltp_2ch_avx.
INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
; The AVX variant is only assembled when the build enables HAVE_AVX.
%if HAVE_AVX
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif
......@@ -106,6 +106,11 @@ extern void ff_conv_s16_to_s16p_6ch_ssse3(int16_t *const *dst, int16_t *src,
extern void ff_conv_s16_to_s16p_6ch_avx (int16_t *const *dst, int16_t *src,
int len, int channels);
/* 2-channel interleaved s16 -> planar float converters (SSE2 and AVX
 * variants). dst is an array of per-channel float plane pointers and src is
 * the interleaved int16 input. */
extern void ff_conv_s16_to_fltp_2ch_sse2(float *const *dst, int16_t *src,
int len, int channels);
extern void ff_conv_s16_to_fltp_2ch_avx (float *const *dst, int16_t *src,
int len, int channels);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{
#if HAVE_YASM
......@@ -157,6 +162,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
2, 16, 8, "SSE2", ff_conv_s16_to_s16p_2ch_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16,
6, 16, 4, "SSE2", ff_conv_s16_to_s16p_6ch_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16,
2, 16, 8, "SSE2", ff_conv_s16_to_fltp_2ch_sse2);
}
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16P,
......@@ -195,6 +202,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
2, 16, 8, "AVX", ff_conv_s16_to_s16p_2ch_avx);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16P, AV_SAMPLE_FMT_S16,
6, 16, 4, "AVX", ff_conv_s16_to_s16p_6ch_avx);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_S16,
2, 16, 8, "AVX", ff_conv_s16_to_fltp_2ch_avx);
}
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment