Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
BC
public
external
ffmpeg
Commits
9d35fa52
Commit
9d35fa52
authored
Apr 25, 2011
by
Vitor Sessak
Committed by
Reinhard Tartler
Apr 26, 2011
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add AVX FFT implementation.
Signed-off-by:
Reinhard Tartler
<
siretart@tauware.de
>
parent
13dfce3d
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
23 changed files
with
450 additions
and
207 deletions
+450
-207
Changelog
Changelog
+1
-1
libavcodec/aac.h
libavcodec/aac.h
+5
-5
libavcodec/aacenc.h
libavcodec/aacenc.h
+1
-1
libavcodec/ac3dec.h
libavcodec/ac3dec.h
+5
-5
libavcodec/ac3enc.c
libavcodec/ac3enc.c
+1
-1
libavcodec/atrac1.c
libavcodec/atrac1.c
+10
-10
libavcodec/atrac3.c
libavcodec/atrac3.c
+3
-3
libavcodec/binkaudio.c
libavcodec/binkaudio.c
+1
-1
libavcodec/cook.c
libavcodec/cook.c
+1
-1
libavcodec/dca.c
libavcodec/dca.c
+5
-5
libavcodec/fft.c
libavcodec/fft.c
+48
-5
libavcodec/fft.h
libavcodec/fft.h
+2
-1
libavcodec/imc.c
libavcodec/imc.c
+1
-1
libavcodec/nellymoserdec.c
libavcodec/nellymoserdec.c
+2
-2
libavcodec/nellymoserenc.c
libavcodec/nellymoserenc.c
+3
-3
libavcodec/qdm2.c
libavcodec/qdm2.c
+1
-1
libavcodec/wma.h
libavcodec/wma.h
+4
-4
libavcodec/wmaprodec.c
libavcodec/wmaprodec.c
+2
-2
libavcodec/wmavoice.c
libavcodec/wmavoice.c
+3
-3
libavcodec/x86/fft.c
libavcodec/x86/fft.c
+8
-1
libavcodec/x86/fft.h
libavcodec/x86/fft.h
+2
-0
libavcodec/x86/fft_mmx.asm
libavcodec/x86/fft_mmx.asm
+334
-150
libavcodec/x86/fft_sse.c
libavcodec/x86/fft_sse.c
+7
-1
No files found.
Changelog
View file @
9d35fa52
...
...
@@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
version <next>:
- Lots of deprecated API cruft removed
- fft and imdct optimizations for AVX (Sandy Bridge) processors
version 0.7_beta1:
...
...
libavcodec/aac.h
View file @
9d35fa52
...
...
@@ -223,9 +223,9 @@ typedef struct {
float
sf
[
120
];
///< scalefactors
int
sf_idx
[
128
];
///< scalefactor indices (used by encoder)
uint8_t
zeroes
[
128
];
///< band is not coded (used by encoder)
DECLARE_ALIGNED
(
16
,
float
,
coeffs
)[
1024
];
///< coefficients for IMDCT
DECLARE_ALIGNED
(
16
,
float
,
saved
)[
1024
];
///< overlap
DECLARE_ALIGNED
(
16
,
float
,
ret
)[
2048
];
///< PCM output
DECLARE_ALIGNED
(
32
,
float
,
coeffs
)[
1024
];
///< coefficients for IMDCT
DECLARE_ALIGNED
(
32
,
float
,
saved
)[
1024
];
///< overlap
DECLARE_ALIGNED
(
32
,
float
,
ret
)[
2048
];
///< PCM output
DECLARE_ALIGNED
(
16
,
int16_t
,
ltp_state
)[
3072
];
///< time signal for LTP
PredictorState
predictor_state
[
MAX_PREDICTORS
];
}
SingleChannelElement
;
...
...
@@ -272,7 +272,7 @@ typedef struct {
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @{
*/
DECLARE_ALIGNED
(
16
,
float
,
buf_mdct
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
buf_mdct
)[
1024
];
/** @} */
/**
...
...
@@ -296,7 +296,7 @@ typedef struct {
int
sf_offset
;
///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */
DECLARE_ALIGNED
(
16
,
float
,
temp
)[
128
];
DECLARE_ALIGNED
(
32
,
float
,
temp
)[
128
];
enum
OCStatus
output_configured
;
}
AACContext
;
...
...
libavcodec/aacenc.h
View file @
9d35fa52
...
...
@@ -64,7 +64,7 @@ typedef struct AACEncContext {
int
last_frame
;
float
lambda
;
DECLARE_ALIGNED
(
16
,
int
,
qcoefs
)[
96
];
///< quantized coefficients
DECLARE_ALIGNED
(
16
,
float
,
scoefs
)[
1024
];
///< scaled coefficients
DECLARE_ALIGNED
(
32
,
float
,
scoefs
)[
1024
];
///< scaled coefficients
}
AACEncContext
;
#endif
/* AVCODEC_AACENC_H */
libavcodec/ac3dec.h
View file @
9d35fa52
...
...
@@ -200,11 +200,11 @@ typedef struct {
///@defgroup arrays aligned arrays
DECLARE_ALIGNED
(
16
,
int
,
fixed_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///< fixed-point transform coefficients
DECLARE_ALIGNED
(
16
,
float
,
transform_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///< transform coefficients
DECLARE_ALIGNED
(
16
,
float
,
delay
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< delay - added to the next block
DECLARE_ALIGNED
(
16
,
float
,
window
)[
AC3_BLOCK_SIZE
];
///< window coefficients
DECLARE_ALIGNED
(
16
,
float
,
tmp_output
)[
AC3_BLOCK_SIZE
];
///< temporary storage for output before windowing
DECLARE_ALIGNED
(
16
,
float
,
output
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< output after imdct transform and windowing
DECLARE_ALIGNED
(
32
,
float
,
transform_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///< transform coefficients
DECLARE_ALIGNED
(
32
,
float
,
delay
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< delay - added to the next block
DECLARE_ALIGNED
(
32
,
float
,
window
)[
AC3_BLOCK_SIZE
];
///< window coefficients
DECLARE_ALIGNED
(
32
,
float
,
tmp_output
)[
AC3_BLOCK_SIZE
];
///< temporary storage for output before windowing
DECLARE_ALIGNED
(
32
,
float
,
output
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< output after imdct transform and windowing
///@}
}
AC3DecodeContext
;
...
...
libavcodec/ac3enc.c
View file @
9d35fa52
...
...
@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
uint8_t
exp_strategy
[
AC3_MAX_CHANNELS
][
AC3_MAX_BLOCKS
];
///< exponent strategies
DECLARE_ALIGNED
(
16
,
SampleType
,
windowed_samples
)[
AC3_WINDOW_SIZE
];
DECLARE_ALIGNED
(
32
,
SampleType
,
windowed_samples
)[
AC3_WINDOW_SIZE
];
}
AC3EncodeContext
;
typedef
struct
AC3Mant
{
...
...
libavcodec/atrac1.c
View file @
9d35fa52
...
...
@@ -60,11 +60,11 @@ typedef struct {
int
log2_block_count
[
AT1_QMF_BANDS
];
///< log2 number of blocks in a band
int
num_bfus
;
///< number of Block Floating Units
float
*
spectrum
[
2
];
DECLARE_ALIGNED
(
16
,
float
,
spec1
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
16
,
float
,
spec2
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
16
,
float
,
fst_qmf_delay
)[
46
];
///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED
(
16
,
float
,
snd_qmf_delay
)[
46
];
///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED
(
16
,
float
,
last_qmf_delay
)[
256
+
23
];
///< delay line for the last stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
spec1
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
32
,
float
,
spec2
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
32
,
float
,
fst_qmf_delay
)[
46
];
///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
snd_qmf_delay
)[
46
];
///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
last_qmf_delay
)[
256
+
23
];
///< delay line for the last stacked QMF filter
}
AT1SUCtx
;
/**
...
...
@@ -72,13 +72,13 @@ typedef struct {
*/
typedef
struct
{
AT1SUCtx
SUs
[
AT1_MAX_CHANNELS
];
///< channel sound unit
DECLARE_ALIGNED
(
16
,
float
,
spec
)[
AT1_SU_SAMPLES
];
///< the mdct spectrum buffer
DECLARE_ALIGNED
(
32
,
float
,
spec
)[
AT1_SU_SAMPLES
];
///< the mdct spectrum buffer
DECLARE_ALIGNED
(
16
,
float
,
low
)[
256
];
DECLARE_ALIGNED
(
16
,
float
,
mid
)[
256
];
DECLARE_ALIGNED
(
16
,
float
,
high
)[
512
];
DECLARE_ALIGNED
(
32
,
float
,
low
)[
256
];
DECLARE_ALIGNED
(
32
,
float
,
mid
)[
256
];
DECLARE_ALIGNED
(
32
,
float
,
high
)[
512
];
float
*
bands
[
3
];
DECLARE_ALIGNED
(
16
,
float
,
out_samples
)[
AT1_MAX_CHANNELS
][
AT1_SU_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
out_samples
)[
AT1_MAX_CHANNELS
][
AT1_SU_SAMPLES
];
FFTContext
mdct_ctx
[
3
];
int
channels
;
DSPContext
dsp
;
...
...
libavcodec/atrac3.c
View file @
9d35fa52
...
...
@@ -74,8 +74,8 @@ typedef struct {
int
gcBlkSwitch
;
gain_block
gainBlock
[
2
];
DECLARE_ALIGNED
(
16
,
float
,
spectrum
)[
1024
];
DECLARE_ALIGNED
(
16
,
float
,
IMDCT_buf
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
spectrum
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
IMDCT_buf
)[
1024
];
float
delayBuf1
[
46
];
///<qmf delay buffers
float
delayBuf2
[
46
];
...
...
@@ -122,7 +122,7 @@ typedef struct {
FFTContext
mdct_ctx
;
}
ATRAC3Context
;
static
DECLARE_ALIGNED
(
16
,
float
,
mdct_window
)[
512
];
static
DECLARE_ALIGNED
(
32
,
float
,
mdct_window
)[
512
];
static
VLC
spectral_coeff_tab
[
7
];
static
float
gain_tab1
[
16
];
static
float
gain_tab2
[
31
];
...
...
libavcodec/binkaudio.c
View file @
9d35fa52
...
...
@@ -55,7 +55,7 @@ typedef struct {
int
num_bands
;
unsigned
int
*
bands
;
float
root
;
DECLARE_ALIGNED
(
16
,
FFTSample
,
coeffs
)[
BINK_BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
FFTSample
,
coeffs
)[
BINK_BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
short
,
previous
)[
BINK_BLOCK_MAX_SIZE
/
16
];
///< coeffs from previous audio block
float
*
coeffs_ptr
[
MAX_CHANNELS
];
///< pointers to the coeffs arrays for float_to_int16_interleave
union
{
...
...
libavcodec/cook.c
View file @
9d35fa52
...
...
@@ -153,7 +153,7 @@ typedef struct cook {
/* data buffers */
uint8_t
*
decoded_bytes_buffer
;
DECLARE_ALIGNED
(
16
,
float
,
mono_mdct_output
)[
2048
];
DECLARE_ALIGNED
(
32
,
float
,
mono_mdct_output
)[
2048
];
float
decode_buffer_1
[
1024
];
float
decode_buffer_2
[
1024
];
float
decode_buffer_0
[
1060
];
/* static allocation for joint decode */
...
...
libavcodec/dca.c
View file @
9d35fa52
...
...
@@ -321,16 +321,16 @@ typedef struct {
/* Subband samples history (for ADPCM) */
float
subband_samples_hist
[
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
4
];
DECLARE_ALIGNED
(
16
,
float
,
subband_fir_hist
)[
DCA_PRIM_CHANNELS_MAX
][
512
];
DECLARE_ALIGNED
(
16
,
float
,
subband_fir_noidea
)[
DCA_PRIM_CHANNELS_MAX
][
32
];
DECLARE_ALIGNED
(
32
,
float
,
subband_fir_hist
)[
DCA_PRIM_CHANNELS_MAX
][
512
];
DECLARE_ALIGNED
(
32
,
float
,
subband_fir_noidea
)[
DCA_PRIM_CHANNELS_MAX
][
32
];
int
hist_index
[
DCA_PRIM_CHANNELS_MAX
];
DECLARE_ALIGNED
(
16
,
float
,
raXin
)[
32
];
DECLARE_ALIGNED
(
32
,
float
,
raXin
)[
32
];
int
output
;
///< type of output
float
scale_bias
;
///< output scale
DECLARE_ALIGNED
(
16
,
float
,
subband_samples
)[
DCA_BLOCKS_MAX
][
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
8
];
DECLARE_ALIGNED
(
16
,
float
,
samples
)[(
DCA_PRIM_CHANNELS_MAX
+
1
)
*
256
];
DECLARE_ALIGNED
(
32
,
float
,
subband_samples
)[
DCA_BLOCKS_MAX
][
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
8
];
DECLARE_ALIGNED
(
32
,
float
,
samples
)[(
DCA_PRIM_CHANNELS_MAX
+
1
)
*
256
];
const
float
*
samples_chanptr
[
DCA_PRIM_CHANNELS_MAX
+
1
];
uint8_t
dca_buffer
[
DCA_MAX_FRAME_SIZE
+
DCA_MAX_EXSS_HEADER_SIZE
+
DCA_BUFFER_PADDING_SIZE
];
...
...
libavcodec/fft.c
View file @
9d35fa52
...
...
@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
#endif
}
/*
 * Per-group offset table for the AVX FFT input permutation.
 * fft_perm_avx() stores (group base + avx_tab[k]) for the k-th element of
 * every 16-entry group that is_second_half_of_fft32() reports as true.
 */
static const int avx_tab[] = {
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
/**
 * Tell whether index i falls into the upper 16 entries of the block it
 * ends up in after repeatedly narrowing a transform of size n.
 *
 * While n is larger than 32 the range [0, n) is split into three parts:
 * the first half (size n/2) and two quarters (size n/4 each). The index is
 * rebased to the start of the part that contains it and n is reduced to
 * that part's size. Once n <= 32 the answer is simply i >= 16.
 *
 * @param i index inside the current transform
 * @param n size of the current transform
 * @return non-zero if the rebased index is >= 16, zero otherwise
 */
static int is_second_half_of_fft32(int i, int n)
{
    while (n > 32) {
        if (i < n / 2) {
            /* first half: index unchanged, continue in a size-n/2 block */
            n = n / 2;
        } else if (i < 3 * n / 4) {
            /* third quarter: rebase to its start, continue in size n/4 */
            i = i - n / 2;
            n = n / 4;
        } else {
            /* last quarter: rebase to its start, continue in size n/4 */
            i = i - 3 * n / 4;
            n = n / 4;
        }
    }

    return i >= 16;
}
/**
 * Build s->revtab for the AVX FFT input layout.
 *
 * Indices 0 .. (1 << s->nbits) - 1 are processed in groups of 16. For each
 * group, is_second_half_of_fft32() selects how the stored value is formed:
 *  - true:  group base + avx_tab[k]
 *  - false: the index with its three low bits rearranged
 *           (bits 2..1 move down to 1..0, bit 0 moves up to bit 2)
 * The value is written at position
 * -split_radix_permutation(index, n, s->inverse) & (n - 1).
 *
 * @param s FFT context; nbits and inverse are read, revtab is written
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    const int n = 1 << s->nbits;
    int base, k;

    for (base = 0; base < n; base += 16) {
        /* the test depends only on the group base, so evaluate it once */
        const int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int idx = base + k;
            int dst;

            if (second_half)
                dst = base + avx_tab[k];
            else
                dst = (idx & ~7) | ((idx >> 1) & 3) | ((idx << 2) & 4);

            s->revtab[-split_radix_permutation(idx, n, s->inverse) & (n - 1)] = dst;
        }
    }
}
av_cold
int
ff_fft_init
(
FFTContext
*
s
,
int
nbits
,
int
inverse
)
{
int
i
,
j
,
n
;
...
...
@@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
for
(
j
=
4
;
j
<=
nbits
;
j
++
)
{
ff_init_ff_cos_tabs
(
j
);
}
for
(
i
=
0
;
i
<
n
;
i
++
)
{
int
j
=
i
;
if
(
s
->
fft_permutation
==
FF_FFT_PERM_SWAP_LSBS
)
j
=
(
j
&~
3
)
|
((
j
>>
1
)
&
1
)
|
((
j
<<
1
)
&
2
);
s
->
revtab
[
-
split_radix_permutation
(
i
,
n
,
s
->
inverse
)
&
(
n
-
1
)]
=
j
;
if
(
s
->
fft_permutation
==
FF_FFT_PERM_AVX
)
{
fft_perm_avx
(
s
);
}
else
{
for
(
i
=
0
;
i
<
n
;
i
++
)
{
int
j
=
i
;
if
(
s
->
fft_permutation
==
FF_FFT_PERM_SWAP_LSBS
)
j
=
(
j
&~
3
)
|
((
j
>>
1
)
&
1
)
|
((
j
<<
1
)
&
2
);
s
->
revtab
[
-
split_radix_permutation
(
i
,
n
,
s
->
inverse
)
&
(
n
-
1
)]
=
j
;
}
}
return
0
;
...
...
libavcodec/fft.h
View file @
9d35fa52
...
...
@@ -85,6 +85,7 @@ struct FFTContext {
int
fft_permutation
;
#define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_AVX 2
int
mdct_permutation
;
#define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_INTERLEAVE 1
...
...
@@ -97,7 +98,7 @@ struct FFTContext {
#endif
#define COSTABLE(size) \
COSTABLE_CONST DECLARE_ALIGNED(
16
, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
COSTABLE_CONST DECLARE_ALIGNED(
32
, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern
COSTABLE
(
16
);
extern
COSTABLE
(
32
);
...
...
libavcodec/imc.c
View file @
9d35fa52
...
...
@@ -88,7 +88,7 @@ typedef struct {
DSPContext
dsp
;
FFTContext
fft
;
DECLARE_ALIGNED
(
16
,
FFTComplex
,
samples
)[
COEFFS
/
2
];
DECLARE_ALIGNED
(
32
,
FFTComplex
,
samples
)[
COEFFS
/
2
];
float
*
out_samples
;
}
IMCContext
;
...
...
libavcodec/nellymoserdec.c
View file @
9d35fa52
...
...
@@ -47,7 +47,7 @@
typedef
struct
NellyMoserDecodeContext
{
AVCodecContext
*
avctx
;
DECLARE_ALIGNED
(
16
,
float
,
float_buf
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
float_buf
)[
NELLY_SAMPLES
];
float
state
[
128
];
AVLFG
random_state
;
GetBitContext
gb
;
...
...
@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
DSPContext
dsp
;
FFTContext
imdct_ctx
;
FmtConvertContext
fmt_conv
;
DECLARE_ALIGNED
(
16
,
float
,
imdct_out
)[
NELLY_BUF_LEN
*
2
];
DECLARE_ALIGNED
(
32
,
float
,
imdct_out
)[
NELLY_BUF_LEN
*
2
];
}
NellyMoserDecodeContext
;
static
void
overlap_and_window
(
NellyMoserDecodeContext
*
s
,
float
*
state
,
float
*
audio
,
float
*
a_in
)
...
...
libavcodec/nellymoserenc.c
View file @
9d35fa52
...
...
@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
int
have_saved
;
DSPContext
dsp
;
FFTContext
mdct_ctx
;
DECLARE_ALIGNED
(
16
,
float
,
mdct_out
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
16
,
float
,
in_buff
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
16
,
float
,
buf
)[
2
][
3
*
NELLY_BUF_LEN
];
///< sample buffer
DECLARE_ALIGNED
(
32
,
float
,
mdct_out
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
in_buff
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
buf
)[
2
][
3
*
NELLY_BUF_LEN
];
///< sample buffer
float
(
*
opt
)[
NELLY_BANDS
];
uint8_t
(
*
path
)[
NELLY_BANDS
];
}
NellyMoserEncodeContext
;
...
...
libavcodec/qdm2.c
View file @
9d35fa52
...
...
@@ -120,7 +120,7 @@ typedef struct {
}
FFTCoefficient
;
typedef
struct
{
DECLARE_ALIGNED
(
16
,
QDM2Complex
,
complex
)[
MPA_MAX_CHANNELS
][
256
];
DECLARE_ALIGNED
(
32
,
QDM2Complex
,
complex
)[
MPA_MAX_CHANNELS
][
256
];
}
QDM2FFT
;
/**
...
...
libavcodec/wma.h
View file @
9d35fa52
...
...
@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
uint8_t
ms_stereo
;
///< true if mid/side stereo mode
uint8_t
channel_coded
[
MAX_CHANNELS
];
///< true if channel is coded
int
exponents_bsize
[
MAX_CHANNELS
];
///< log2 ratio frame/exp. length
DECLARE_ALIGNED
(
16
,
float
,
exponents
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
float
,
exponents
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
float
max_exponent
[
MAX_CHANNELS
];
WMACoef
coefs1
[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
float
,
coefs
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
FFTSample
,
output
)[
BLOCK_MAX_SIZE
*
2
];
DECLARE_ALIGNED
(
32
,
float
,
coefs
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
FFTSample
,
output
)[
BLOCK_MAX_SIZE
*
2
];
FFTContext
mdct_ctx
[
BLOCK_NB_SIZES
];
float
*
windows
[
BLOCK_NB_SIZES
];
/* output buffer for one frame and the last for IMDCT windowing */
DECLARE_ALIGNED
(
16
,
float
,
frame_out
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
*
2
];
DECLARE_ALIGNED
(
32
,
float
,
frame_out
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
*
2
];
/* last frame info */
uint8_t
last_superframe
[
MAX_CODED_SUPERFRAME_SIZE
+
4
];
/* padding added */
int
last_bitoffset
;
...
...
libavcodec/wmaprodec.c
View file @
9d35fa52
...
...
@@ -145,7 +145,7 @@ typedef struct {
uint8_t
table_idx
;
///< index in sf_offsets for the scale factor reference block
float
*
coeffs
;
///< pointer to the subframe decode buffer
uint16_t
num_vec_coeffs
;
///< number of vector coded coefficients
DECLARE_ALIGNED
(
16
,
float
,
out
)[
WMAPRO_BLOCK_MAX_SIZE
+
WMAPRO_BLOCK_MAX_SIZE
/
2
];
///< output buffer
DECLARE_ALIGNED
(
32
,
float
,
out
)[
WMAPRO_BLOCK_MAX_SIZE
+
WMAPRO_BLOCK_MAX_SIZE
/
2
];
///< output buffer
}
WMAProChannelCtx
;
/**
...
...
@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
FF_INPUT_BUFFER_PADDING_SIZE
];
///< compressed frame data
PutBitContext
pb
;
///< context for filling the frame_data buffer
FFTContext
mdct_ctx
[
WMAPRO_BLOCK_SIZES
];
///< MDCT context per block size
DECLARE_ALIGNED
(
16
,
float
,
tmp
)[
WMAPRO_BLOCK_MAX_SIZE
];
///< IMDCT output buffer
DECLARE_ALIGNED
(
32
,
float
,
tmp
)[
WMAPRO_BLOCK_MAX_SIZE
];
///< IMDCT output buffer
float
*
windows
[
WMAPRO_BLOCK_SIZES
];
///< windows for the different block sizes
/* frame size dependent frame information (set during initialization) */
...
...
libavcodec/wmavoice.c
View file @
9d35fa52
...
...
@@ -275,11 +275,11 @@ typedef struct {
///< by postfilter
float
denoise_filter_cache
[
MAX_FRAMESIZE
];
int
denoise_filter_cache_size
;
///< samples in #denoise_filter_cache
DECLARE_ALIGNED
(
16
,
float
,
tilted_lpcs_pf
)[
0x80
];
DECLARE_ALIGNED
(
32
,
float
,
tilted_lpcs_pf
)[
0x80
];
///< aligned buffer for LPC tilting
DECLARE_ALIGNED
(
16
,
float
,
denoise_coeffs_pf
)[
0x80
];
DECLARE_ALIGNED
(
32
,
float
,
denoise_coeffs_pf
)[
0x80
];
///< aligned buffer for denoise coefficients
DECLARE_ALIGNED
(
16
,
float
,
synth_filter_out_buf
)[
0x80
+
MAX_LSPS_ALIGN16
];
DECLARE_ALIGNED
(
32
,
float
,
synth_filter_out_buf
)[
0x80
+
MAX_LSPS_ALIGN16
];
///< aligned buffer for postfilter speech
///< synthesis
/**
...
...
libavcodec/x86/fft.c
View file @
9d35fa52
...
...
@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
#if HAVE_YASM
int
has_vectors
=
av_get_cpu_flags
();
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
{
if
(
has_vectors
&
AV_CPU_FLAG_AVX
&&
HAVE_AVX
&&
s
->
nbits
>=
5
)
{
/* AVX for SB */
s
->
imdct_calc
=
ff_imdct_calc_sse
;
s
->
imdct_half
=
ff_imdct_half_avx
;
s
->
fft_permute
=
ff_fft_permute_sse
;
s
->
fft_calc
=
ff_fft_calc_avx
;
s
->
fft_permutation
=
FF_FFT_PERM_AVX
;
}
else
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
{
/* SSE for P3/P4/K8 */
s
->
imdct_calc
=
ff_imdct_calc_sse
;
s
->
imdct_half
=
ff_imdct_half_sse
;
...
...
libavcodec/x86/fft.h
View file @
9d35fa52
...
...
@@ -22,6 +22,7 @@
#include "libavcodec/fft.h"
void
ff_fft_permute_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_avx
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn2
(
FFTContext
*
s
,
FFTComplex
*
z
);
...
...
@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void
ff_imdct_half_3dn2
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_avx
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_dct32_float_sse
(
FFTSample
*
out
,
const
FFTSample
*
in
);
#endif
libavcodec/x86/fft_mmx.asm
View file @
9d35fa52
;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
...
...
@@ -49,9 +50,21 @@ endstruc
SECTION
_RODATA
%define M_SQRT1_2 0.70710678118654752440
ps_root2:
times
4
dd
M_SQRT1_2
ps_root2mppm:
dd
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
ps_p1p1m1p1:
dd
0
,
0
,
1
<<
31
,
0
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
align
32
ps_cos16_1:
dd
1.0
,
M_COS_PI_1_8
,
M_SQRT1_2
,
M_COS_PI_3_8
,
1.0
,
M_COS_PI_1_8
,
M_SQRT1_2
,
M_COS_PI_3_8
ps_cos16_2:
dd
0
,
M_COS_PI_3_8
,
M_SQRT1_2
,
M_COS_PI_1_8
,
0
,
-
M_COS_PI_3_8
,
-
M_SQRT1_2
,
-
M_COS_PI_1_8
ps_root2:
times
8
dd
M_SQRT1_2
ps_root2mppm:
dd
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
,
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
ps_p1p1m1p1:
dd
0
,
0
,
1
<<
31
,
0
,
0
,
0
,
1
<<
31
,
0
perm1:
dd
0x00
,
0x02
,
0x03
,
0x01
,
0x03
,
0x00
,
0x02
,
0x01
perm2:
dd
0x00
,
0x01
,
0x02
,
0x03
,
0x01
,
0x00
,
0x02
,
0x03
ps_p1p1m1p1root2:
dd
1.0
,
1.0
,
-
1.0
,
1.0
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
ps_m1m1p1m1p1m1m1m1:
dd
1
<<
31
,
1
<<
31
,
0
,
1
<<
31
,
0
,
1
<<
31
,
1
<<
31
,
1
<<
31
ps_m1p1:
dd
1
<<
31
,
0
%assign i 16
...
...
@@ -96,51 +109,80 @@ section .text align=16
SWAP
%
3
,
%
6
%endmacro
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
vsubps
%
5
,
%
1
,
%
2
; v = %1 - %2
vaddps
%
3
,
%
1
,
%
2
; w = %1 + %2
vmulps
%
2
,
%
5
,
[
ps_p1p1m1p1root2
]
; v *= vals1
vpermilps
%
2
,
%
2
,
[
perm1
]
vblendps
%
1
,
%
2
,
%
3
,
0x33
; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps
%
5
,
%
3
,
%
2
,
0x4e
; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps
%
4
,
%
5
,
%
1
; s = r - q
vaddps
%
1
,
%
5
,
%
1
; u = r + q
vpermilps
%
1
,
%
1
,
[
perm2
]
; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps
%
5
,
%
4
,
%
1
,
0xbb
vshufps
%
3
,
%
4
,
%
1
,
0xee
vperm2f128
%
3
,
%
3
,
%
5
,
0x13
vxorps
%
4
,
%
4
,
[
ps_m1m1p1m1p1m1m1m1
]
; s *= {1,1,-1,-1,1,-1,-1,-1}
vshufps
%
2
,
%
1
,
%
4
,
0xdd
vshufps
%
1
,
%
1
,
%
4
,
0x88
vperm2f128
%
4
,
%
2
,
%
1
,
0x02
; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128
%
1
,
%
1
,
%
2
,
0x13
; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps
%
5
,
%
1
,
%
3
vblendps
%
1
,
%
5
,
%
1
,
0x55
; w -= {0,s7,0,k7,0,s8,0,k8}
vsubps
%
2
,
%
4
,
%
1
; %2 = v - w
vaddps
%
1
,
%
4
,
%
1
; %1 = v + w
%endmacro
; In SSE mode do one fft4 transform
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
mova
%
3
,
%
1
addps
%
1
,
%
2
; {t1,t2,t6,t5}
subps
%
3
,
%
2
; {t3,t4,-t8,t7}
xorps
%
3
,
[
ps_p1p1m1p1
]
mova
%
2
,
%
1
shufps
%
1
,
%
3
,
0x44
; {t1,t2,t3,t4}
shufps
%
2
,
%
3
,
0xbe
; {t6,t5,t7,t8}
mova
%
3
,
%
1
addps
%
1
,
%
2
; {r0,i0,r1,i1}
subps
%
3
,
%
2
; {r2,i2,r3,i3}
mova
%
2
,
%
1
shufps
%
1
,
%
3
,
0x88
; {r0,r1,r2,r3}
shufps
%
2
,
%
3
,
0xdd
; {i0,i1,i2,i3}
subps
%
3
,
%
1
,
%
2
; {t3,t4,-t8,t7}
addps
%
1
,
%
1
,
%
2
; {t1,t2,t6,t5}
xorps
%
3
,
%
3
,
[
ps_p1p1m1p1
]
shufps
%
2
,
%
1
,
%
3
,
0xbe
; {t6,t5,t7,t8}
shufps
%
1
,
%
1
,
%
3
,
0x44
; {t1,t2,t3,t4}
subps
%
3
,
%
1
,
%
2
; {r2,i2,r3,i3}
addps
%
1
,
%
1
,
%
2
; {r0,i0,r1,i1}
shufps
%
2
,
%
1
,
%
3
,
0xdd
; {i0,i1,i2,i3}
shufps
%
1
,
%
1
,
%
3
,
0x88
; {r0,r1,r2,r3}
%endmacro
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
mova
%
6
,
%
3
subps
%
3
,
%
4
; {r5,i5,r7,i7}
addps
%
6
,
%
4
; {t1,t2,t3,t4}
mova
%
4
,
%
3
shufps
%
4
,
%
4
,
0xb1
; {i5,r5,i7,r7}
mulps
%
3
,
[
ps_root2mppm
]
; {-r5,i5,r7,-i7}
mulps
%
4
,
[
ps_root2
]
addps
%
3
,
%
4
; {t8,t7,ta,t9}
mova
%
4
,
%
6
shufps
%
6
,
%
3
,
0x36
; {t3,t2,t9,t8}
shufps
%
4
,
%
3
,
0x9c
; {t1,t4,t7,ta}
mova
%
3
,
%
6
addps
%
6
,
%
4
; {t1,t2,t9,ta}
subps
%
3
,
%
4
; {t6,t5,tc,tb}
mova
%
4
,
%
6
shufps
%
6
,
%
3
,
0xd8
; {t1,t9,t5,tb}
shufps
%
4
,
%
3
,
0x8d
; {t2,ta,t6,tc}
mova
%
3
,
%
1
mova
%
5
,
%
2
addps
%
1
,
%
6
; {r0,r1,r2,r3}
addps
%
2
,
%
4
; {i0,i1,i2,i3}
subps
%
3
,
%
6
; {r4,r5,r6,r7}
subps
%
5
,
%
4
; {i4,i5,i6,i7}
SWAP
%
4
,
%
5
addps
%
6
,
%
3
,
%
4
; {t1,t2,t3,t4}
subps
%
3
,
%
3
,
%
4
; {r5,i5,r7,i7}
shufps
%
4
,
%
3
,
%
3
,
0xb1
; {i5,r5,i7,r7}
mulps
%
3
,
%
3
,
[
ps_root2mppm
]
; {-r5,i5,r7,-i7}
mulps
%
4
,
%
4
,
[
ps_root2
]
addps
%
3
,
%
3
,
%
4
; {t8,t7,ta,t9}
shufps
%
4
,
%
6
,
%
3
,
0x9c
; {t1,t4,t7,ta}
shufps
%
6
,
%
6
,
%
3
,
0x36
; {t3,t2,t9,t8}
subps
%
3
,
%
6
,
%
4
; {t6,t5,tc,tb}
addps
%
6
,
%
6
,
%
4
; {t1,t2,t9,ta}
shufps
%
5
,
%
6
,
%
3
,
0x8d
; {t2,ta,t6,tc}
shufps
%
6
,
%
6
,
%
3
,
0xd8
; {t1,t9,t5,tb}
subps
%
3
,
%
1
,
%
6
; {r4,r5,r6,r7}
addps
%
1
,
%
1
,
%
6
; {r0,r1,r2,r3}
subps
%
4
,
%
2
,
%
5
; {i4,i5,i6,i7}
addps
%
2
,
%
2
,
%
5
; {i0,i1,i2,i3}
%endmacro
; scheduled for cpu-bound sizes
...
...
@@ -148,52 +190,44 @@ section .text align=16
IF
%
1
mova
m4
,
Z
(
4
)
IF
%
1
mova
m5
,
Z
(
5
)
mova
m0
,
%
2
; wre
mova
m2
,
m4
mova
m1
,
%
3
; wim
mova
m3
,
m5
mulps
m2
,
m0
; r2*wre
mulps
m2
,
m4
,
m0
; r2*wre
IF
%
1
mova
m6
,
Z2
(
6
)
mulps
m3
,
m1
; i2*wim
mulps
m3
,
m5
,
m1
; i2*wim
IF
%
1
mova
m7
,
Z2
(
7
)
mulps
m4
,
m1
; r2*wim
mulps
m5
,
m0
; i2*wre
addps
m2
,
m3
; r2*wre + i2*wim
mova
m3
,
m1
mulps
m1
,
m6
; r3*wim
subps
m5
,
m4
; i2*wre - r2*wim
mova
m4
,
m0
mulps
m3
,
m7
; i3*wim
mulps
m4
,
m6
; r3*wre
mulps
m0
,
m7
; i3*wre
subps
m4
,
m3
; r3*wre - i3*wim
mulps
m4
,
m4
,
m1
; r2*wim
mulps
m5
,
m5
,
m0
; i2*wre
addps
m2
,
m2
,
m3
; r2*wre + i2*wim
mulps
m3
,
m1
,
m7
; i3*wim
subps
m5
,
m5
,
m4
; i2*wre - r2*wim
mulps
m1
,
m1
,
m6
; r3*wim
mulps
m4
,
m0
,
m6
; r3*wre
mulps
m0
,
m0
,
m7
; i3*wre
subps
m4
,
m4
,
m3
; r3*wre - i3*wim
mova
m3
,
Z
(
0
)
addps
m0
,
m1
; i3*wre + r3*wim
mova
m1
,
m4
addps
m4
,
m2
; t5
subps
m1
,
m2
; t3
subps
m3
,
m4
; r2