ac3enc: Add x86-optimized function to speed up log2_tab().

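AC3DSPContext.ac3_max_msb_abs_int16() returns a value whose most significant
set bit is the same as that of the largest absolute value in an array of
int16_t; log2_tab() now calls it instead of looping over the array itself.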
......@@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
/**
 * C reference implementation of AC3DSPContext.ac3_max_msb_abs_int16().
 *
 * ORs together the absolute values of all elements, so the result has the
 * same most significant set bit as max(abs(src[])) without being equal to
 * that maximum in general.
 *
 * @param src input array
 * @param len number of values in the array
 * @return    bitwise OR of abs(src[i]) for all i in [0, len); 0 when len <= 0
 */
static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
{
    int i, v = 0;

    /* Only the position of the highest set bit matters to the caller,
     * so accumulating with OR is sufficient and avoids a comparison. */
    for (i = 0; i < len; i++)
        v |= abs(src[i]);

    return v;
}
av_cold void ff_ac3dsp_init(AC3DSPContext *c)
c->ac3_exponent_min = ac3_exponent_min_c;
c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
......@@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
* @param nb_coefs number of frequency coefficients.
void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
* Calculate the maximum MSB of the absolute value of each element in an
* array of int16_t.
* @param src input array
* constraints: 16-byte aligned; values must be in the range [-32767,32767]
* @param len number of values in the array
* constraints: multiple of 16 greater than 0
* @return a value with the same MSB as max(abs(src[]))
int (*ac3_max_msb_abs_int16)(const int16_t *src, int len);
} AC3DSPContext;
void ff_ac3dsp_init (AC3DSPContext *c);
......@@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
* @param len number of values in the array
* @return log2(max(abs(src[])))
static int log2_tab(int16_t *tab, int n)
static int log2_tab(AC3EncodeContext *s, int16_t *src, int len)
int i, v;
v = 0;
for (i = 0; i < n; i++)
v |= abs(tab[i]);
int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len);
return av_log2(v);
......@@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift)
static int normalize_samples(AC3EncodeContext *s)
int v = 14 - log2_tab(s->windowed_samples, AC3_WINDOW_SIZE);
int v = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);
lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, v);
return v - 9;
......@@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2
%undef PMINUB
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
; This is used for ssse3 because of the pabsw instruction.
; It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
pxor m2, m2
pxor m3, m3
%ifidn %2, min_max
mova m0, [srcq]
mova m1, [srcq+mmsize]
pminsw m2, m0
pminsw m2, m1
pmaxsw m3, m0
pmaxsw m3, m1
%else ; or_abs
%ifidn %1, mmx
mova m0, [srcq]
mova m1, [srcq+mmsize]
ABS2 m0, m1, m3, m4
%else ; ssse3
; using memory args is faster for ssse3
pabsw m0, [srcq]
pabsw m1, [srcq+mmsize]
por m2, m0
por m2, m1
add srcq, mmsize*2
sub lend, mmsize
ja .loop
%ifidn %2, min_max
ABS2 m2, m3, m0, m1
por m2, m3
%ifidn mmsize, 16
mova m0, m2
punpckhqdq m0, m0
por m2, m0
PSHUFLW m0, m2, 0xe
por m2, m0
PSHUFLW m0, m2, 0x1
por m2, m0
movd eax, m2
and eax, 0xFFFF
%define ABS2 ABS2_MMX
%define PSHUFLW pshufw
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
......@@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int n
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
int mm_flags = av_get_cpu_flags();
......@@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
