lavr: Add x86-optimized function for flt to s16 conversion

......@@ -28,6 +28,7 @@ SECTION_RODATA 32
; Scale factors stored as raw IEEE-754 single-precision bit patterns.
; 0x30000000 = 2^-31; 8 dwords = 32 bytes.
pf_s32_inv_scale: times 8 dd 0x30000000
; 0x38000000 = 2^-15; 4 dwords = 16 bytes.
pf_s16_inv_scale: times 4 dd 0x38000000
; 0x47000000 = 2^15 = 32768.0; 4 dwords = 16 bytes.
; Loaded by conv_flt_to_s16 as the float -> s16 multiplier.
pf_s16_scale: times 4 dd 0x47000000
......@@ -158,6 +159,38 @@ INIT_YMM avx
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;
; Converts packed floats to packed signed 16-bit samples. Every input value is
; multiplied by pf_s16_scale (32768.0), converted to int32 with cvtps2dq
; (rounding follows the current MXCSR mode) and narrowed to int16 with signed
; saturation by packssdw.
;
; Each loop iteration reads 4*mmsize bytes of src and writes 2*mmsize bytes of
; dst, i.e. 16 samples with mmsize == 16. All memory accesses use mova, so dst
; and src must be mmsize-aligned. The loop body runs before len is tested, so
; len is assumed to be a positive multiple of 16.
;------------------------------------------------------------------------------

; Build as XMM/SSE2: the C side references ff_conv_flt_to_s16_sse2, and
; pf_s16_scale is only 16 bytes wide.
INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea      lenq, [2*lend]             ; lenq = output size in bytes (2 per sample)
    lea      srcq, [srcq+2*lenq]        ; srcq = end of input (4 bytes per sample)
    add      dstq, lenq                 ; dstq = end of output
    neg      lenq                       ; lenq = negative byte offset, counts up to 0
    mova       m4, [pf_s16_scale]       ; m4 = 32768.0 in every lane
.loop:
    ; load four vectors of floats (input offset is twice the output offset)
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+1*mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    ; scale to the int16 range
    mulps      m0, m4
    mulps      m1, m4
    mulps      m2, m4
    mulps      m3, m4
    ; float -> int32
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    ; int32 -> int16 with signed saturation; pairs of vectors merge into one
    packssdw   m0, m1
    packssdw   m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2             ; advance by the bytes just written
    jl .loop                            ; continue while offset is still negative
    REP_RET
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
; int channels);
......@@ -33,6 +33,8 @@ extern void ff_conv_s32_to_s16_sse2(int16_t *dst, const int32_t *src, int len);
/* Prototypes for the x86 conversion routines registered by
 * ff_audio_convert_init_x86(). The suffix names the instruction set each
 * variant is built for. */
extern void ff_conv_s32_to_flt_sse2(float *dst, const int32_t *src, int len);
extern void ff_conv_s32_to_flt_avx (float *dst, const int32_t *src, int len);
/* float -> int16_t; defined in assembly via cglobal conv_flt_to_s16. */
extern void ff_conv_flt_to_s16_sse2(int16_t *dst, const float *src, int len);
/* NOTE(review): the assembly-side prototype comment for
 * ff_conv_fltp_to_flt_6ch lists a fourth "int channels" parameter that these
 * declarations omit -- confirm which signature is intended. */
extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
......@@ -67,6 +69,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
0, 16, 8, "SSE2", ff_conv_s16_to_flt_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32,
0, 16, 8, "SSE2", ff_conv_s32_to_flt_sse2);
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_FLT,
0, 16, 16, "SSE2", ff_conv_flt_to_s16_sse2);
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16,
