Commit 9d35fa52, authored by Vitor Sessak, committed by Reinhard Tartler

Add AVX FFT implementation.

Signed-off-by: Reinhard Tartler <siretart@tauware.de>
parent 13dfce3d
......@@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
version <next>:
- Lots of deprecated API cruft removed
- fft and imdct optimizations for AVX (Sandy Bridge) processors
version 0.7_beta1:
......
......@@ -223,9 +223,9 @@ typedef struct {
float sf[120]; ///< scalefactors
int sf_idx[128]; ///< scalefactor indices (used by encoder)
uint8_t zeroes[128]; ///< band is not coded (used by encoder)
DECLARE_ALIGNED(16, float, coeffs)[1024]; ///< coefficients for IMDCT
DECLARE_ALIGNED(16, float, saved)[1024]; ///< overlap
DECLARE_ALIGNED(16, float, ret)[2048]; ///< PCM output
DECLARE_ALIGNED(32, float, coeffs)[1024]; ///< coefficients for IMDCT
DECLARE_ALIGNED(32, float, saved)[1024]; ///< overlap
DECLARE_ALIGNED(32, float, ret)[2048]; ///< PCM output
DECLARE_ALIGNED(16, int16_t, ltp_state)[3072]; ///< time signal for LTP
PredictorState predictor_state[MAX_PREDICTORS];
} SingleChannelElement;
......@@ -272,7 +272,7 @@ typedef struct {
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @{
*/
DECLARE_ALIGNED(16, float, buf_mdct)[1024];
DECLARE_ALIGNED(32, float, buf_mdct)[1024];
/** @} */
/**
......@@ -296,7 +296,7 @@ typedef struct {
int sf_offset; ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */
DECLARE_ALIGNED(16, float, temp)[128];
DECLARE_ALIGNED(32, float, temp)[128];
enum OCStatus output_configured;
} AACContext;
......
......@@ -64,7 +64,7 @@ typedef struct AACEncContext {
int last_frame;
float lambda;
DECLARE_ALIGNED(16, int, qcoefs)[96]; ///< quantized coefficients
DECLARE_ALIGNED(16, float, scoefs)[1024]; ///< scaled coefficients
DECLARE_ALIGNED(32, float, scoefs)[1024]; ///< scaled coefficients
} AACEncContext;
#endif /* AVCODEC_AACENC_H */
......@@ -200,11 +200,11 @@ typedef struct {
///@defgroup arrays aligned arrays
DECLARE_ALIGNED(16, int, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///> fixed-point transform coefficients
DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients
DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block
DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients
DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing
DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing
DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients
DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block
DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients
DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing
DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing
///@}
} AC3DecodeContext;
......
......@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
} AC3EncodeContext;
typedef struct AC3Mant {
......
......@@ -60,11 +60,11 @@ typedef struct {
int log2_block_count[AT1_QMF_BANDS]; ///< log2 number of blocks in a band
int num_bfus; ///< number of Block Floating Units
float* spectrum[2];
DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED(16, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter
DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(32, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED(32, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter
} AT1SUCtx;
/**
......@@ -72,13 +72,13 @@ typedef struct {
*/
typedef struct {
AT1SUCtx SUs[AT1_MAX_CHANNELS]; ///< channel sound unit
DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer
DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer
DECLARE_ALIGNED(16, float, low)[256];
DECLARE_ALIGNED(16, float, mid)[256];
DECLARE_ALIGNED(16, float, high)[512];
DECLARE_ALIGNED(32, float, low)[256];
DECLARE_ALIGNED(32, float, mid)[256];
DECLARE_ALIGNED(32, float, high)[512];
float* bands[3];
DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
FFTContext mdct_ctx[3];
int channels;
DSPContext dsp;
......
......@@ -74,8 +74,8 @@ typedef struct {
int gcBlkSwitch;
gain_block gainBlock[2];
DECLARE_ALIGNED(16, float, spectrum)[1024];
DECLARE_ALIGNED(16, float, IMDCT_buf)[1024];
DECLARE_ALIGNED(32, float, spectrum)[1024];
DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
float delayBuf1[46]; ///<qmf delay buffers
float delayBuf2[46];
......@@ -122,7 +122,7 @@ typedef struct {
FFTContext mdct_ctx;
} ATRAC3Context;
static DECLARE_ALIGNED(16, float,mdct_window)[512];
static DECLARE_ALIGNED(32, float, mdct_window)[512];
static VLC spectral_coeff_tab[7];
static float gain_tab1[16];
static float gain_tab2[31];
......
......@@ -55,7 +55,7 @@ typedef struct {
int num_bands;
unsigned int *bands;
float root;
DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16]; ///< coeffs from previous audio block
float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
union {
......
......@@ -153,7 +153,7 @@ typedef struct cook {
/* data buffers */
uint8_t* decoded_bytes_buffer;
DECLARE_ALIGNED(16, float,mono_mdct_output)[2048];
DECLARE_ALIGNED(32, float, mono_mdct_output)[2048];
float decode_buffer_1[1024];
float decode_buffer_2[1024];
float decode_buffer_0[1060]; /* static allocation for joint decode */
......
......@@ -321,16 +321,16 @@ typedef struct {
/* Subband samples history (for ADPCM) */
float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
int hist_index[DCA_PRIM_CHANNELS_MAX];
DECLARE_ALIGNED(16, float, raXin)[32];
DECLARE_ALIGNED(32, float, raXin)[32];
int output; ///< type of output
float scale_bias; ///< output scale
DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
......
......@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
#endif
}
/*
 * Offset remap used by fft_perm_avx(): for a group of 16 consecutive
 * indices that is_second_half_of_fft32() flags, entry k is stored as
 * group_base + avx_tab[k].
 */
static const int avx_tab[] = {
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
static int is_second_half_of_fft32(int i, int n)
{
if (n <= 32)
return i >= 16;
else if (i < n/2)
return is_second_half_of_fft32(i, n/2);
else if (i < 3*n/4)
return is_second_half_of_fft32(i - n/2, n/4);
else
return is_second_half_of_fft32(i - 3*n/4, n/4);
}
/**
 * Fill s->revtab for the FF_FFT_PERM_AVX layout.
 *
 * Indices are processed in groups of 16. Each source index is placed at
 * slot -split_radix_permutation(idx, n, s->inverse) & (n - 1); the value
 * stored there depends on is_second_half_of_fft32() for the group:
 *  - flagged groups store group_base + avx_tab[k];
 *  - other groups store the index with its low three bits rearranged
 *    (bits 1..2 move down to 0..1, bit 0 moves up to bit 2).
 *
 * @param s FFT context; reads nbits and inverse, writes revtab
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    const int n = 1 << s->nbits;
    int base, k;

    for (base = 0; base < n; base += 16) {
        /* the classification is per 16-entry group, so evaluate it once */
        const int use_avx_tab = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int src  = base + k;
            const int slot = -split_radix_permutation(src, n, s->inverse) & (n - 1);
            int dst;

            if (use_avx_tab)
                dst = base + avx_tab[k];
            else
                dst = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[slot] = dst;
        }
    }
}
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
{
int i, j, n;
......@@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
for(j=4; j<=nbits; j++) {
ff_init_ff_cos_tabs(j);
}
for(i=0; i<n; i++) {
int j = i;
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
if (s->fft_permutation == FF_FFT_PERM_AVX) {
fft_perm_avx(s);
} else {
for(i=0; i<n; i++) {
int j = i;
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
}
}
return 0;
......
......@@ -85,6 +85,7 @@ struct FFTContext {
int fft_permutation;
#define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_AVX 2
int mdct_permutation;
#define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_INTERLEAVE 1
......@@ -97,7 +98,7 @@ struct FFTContext {
#endif
#define COSTABLE(size) \
COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern COSTABLE(16);
extern COSTABLE(32);
......
......@@ -88,7 +88,7 @@ typedef struct {
DSPContext dsp;
FFTContext fft;
DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2];
DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
float *out_samples;
} IMCContext;
......
......@@ -47,7 +47,7 @@
typedef struct NellyMoserDecodeContext {
AVCodecContext* avctx;
DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, float_buf)[NELLY_SAMPLES];
float state[128];
AVLFG random_state;
GetBitContext gb;
......@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
DSPContext dsp;
FFTContext imdct_ctx;
FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
DECLARE_ALIGNED(32, float, imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext;
static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)
......
......@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
int have_saved;
DSPContext dsp;
FFTContext mdct_ctx;
DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
float (*opt )[NELLY_BANDS];
uint8_t (*path)[NELLY_BANDS];
} NellyMoserEncodeContext;
......
......@@ -120,7 +120,7 @@ typedef struct {
} FFTCoefficient;
typedef struct {
DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
} QDM2FFT;
/**
......
......@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
uint8_t ms_stereo; ///< true if mid/side stereo mode
uint8_t channel_coded[MAX_CHANNELS]; ///< true if channel is coded
int exponents_bsize[MAX_CHANNELS]; ///< log2 ratio frame/exp. length
DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
float max_exponent[MAX_CHANNELS];
WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2];
DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
FFTContext mdct_ctx[BLOCK_NB_SIZES];
float *windows[BLOCK_NB_SIZES];
/* output buffer for one frame and the last for IMDCT windowing */
DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
/* last frame info */
uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
int last_bitoffset;
......
......@@ -145,7 +145,7 @@ typedef struct {
uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block
float* coeffs; ///< pointer to the subframe decode buffer
uint16_t num_vec_coeffs; ///< number of vector coded coefficients
DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
} WMAProChannelCtx;
/**
......@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
PutBitContext pb; ///< context for filling the frame_data buffer
FFTContext mdct_ctx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size
DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
float* windows[WMAPRO_BLOCK_SIZES]; ///< windows for the different block sizes
/* frame size dependent frame information (set during initialization) */
......
......@@ -275,11 +275,11 @@ typedef struct {
///< by postfilter
float denoise_filter_cache[MAX_FRAMESIZE];
int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
///< aligned buffer for LPC tilting
DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
///< aligned buffer for denoise coefficients
DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
///< aligned buffer for postfilter speech
///< synthesis
/**
......
......@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
#if HAVE_YASM
int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
/* AVX for SB */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_avx;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
} else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
/* SSE for P3/P4/K8 */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
......
......@@ -22,6 +22,7 @@
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
......@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
#endif
;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
......@@ -49,9 +50,21 @@ endstruc
SECTION_RODATA
%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0
%assign i 16
......@@ -96,51 +109,80 @@ section .text align=16
SWAP %3, %6
%endmacro
; T8_AVX: five-argument macro built from AVX three-operand instructions.
; NOTE(review): the name and the in/out layouts below suggest a single
; 8-point complex FFT held in one register pair — confirm against callers.
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
vsubps %5, %1, %2 ; v = %1 - %2
vaddps %3, %1, %2 ; w = %1 + %2
; ps_p1p1m1p1root2 = {1,1,-1,1, sqrt(1/2) x4}: sign fix in the low half,
; scaling by sqrt(1/2) in the high half
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
; reorder elements using the index table perm1
vpermilps %2, %2, [perm1]
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps %4, %5, %1 ; s = r - q
vaddps %1, %5, %1 ; u = r + q
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps %5, %4, %1, 0xbb
vshufps %3, %4, %1, 0xee
vperm2f128 %3, %3, %5, 0x13
; ps_m1m1p1m1p1m1m1m1 has the sign bit set in elements 0,1,3,5,6,7,
; so the xor negates exactly those elements of s
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1}
vshufps %2, %1, %4, 0xdd
vshufps %1, %1, %4, 0x88
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps %5, %1, %3
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
; final butterfly: difference goes to %2, sum to %1
vsubps %2, %4, %1 ; %2 = v - w
vaddps %1, %4, %1 ; %1 = v + w
%endmacro
; In SSE mode do one fft4 transform
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
mova %3, %1
addps %1, %2 ; {t1,t2,t6,t5}
subps %3, %2 ; {t3,t4,-t8,t7}
xorps %3, [ps_p1p1m1p1]
mova %2, %1
shufps %1, %3, 0x44 ; {t1,t2,t3,t4}
shufps %2, %3, 0xbe ; {t6,t5,t7,t8}
mova %3, %1
addps %1, %2 ; {r0,i0,r1,i1}
subps %3, %2 ; {r2,i2,r3,i3}
mova %2, %1
shufps %1, %3, 0x88 ; {r0,r1,r2,r3}
shufps %2, %3, 0xdd ; {i0,i1,i2,i3}
subps %3, %1, %2 ; {t3,t4,-t8,t7}
addps %1, %1, %2 ; {t1,t2,t6,t5}
xorps %3, %3, [ps_p1p1m1p1]
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
subps %3, %1, %2 ; {r2,i2,r3,i3}
addps %1, %1, %2 ; {r0,i0,r1,i1}
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
mova %6, %3
subps %3, %4 ; {r5,i5,r7,i7}
addps %6, %4 ; {t1,t2,t3,t4}
mova %4, %3
shufps %4, %4, 0xb1 ; {i5,r5,i7,r7}
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %4, [ps_root2]
addps %3, %4 ; {t8,t7,ta,t9}
mova %4, %6
shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
shufps %4, %3, 0x9c ; {t1,t4,t7,ta}
mova %3, %6
addps %6, %4 ; {t1,t2,t9,ta}
subps %3, %4 ; {t6,t5,tc,tb}
mova %4, %6
shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
shufps %4, %3, 0x8d ; {t2,ta,t6,tc}
mova %3, %1
mova %5, %2
addps %1, %6 ; {r0,r1,r2,r3}
addps %2, %4 ; {i0,i1,i2,i3}
subps %3, %6 ; {r4,r5,r6,r7}
subps %5, %4 ; {i4,i5,i6,i7}
SWAP %4, %5
addps %6, %3, %4 ; {t1,t2,t3,t4}
subps %3, %3, %4 ; {r5,i5,r7,i7}
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %4, %4, [ps_root2]
addps %3, %3, %4 ; {t8,t7,ta,t9}
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
subps %3, %6, %4 ; {t6,t5,tc,tb}
addps %6, %6, %4 ; {t1,t2,t9,ta}
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
subps %3, %1, %6 ; {r4,r5,r6,r7}
addps %1, %1, %6 ; {r0,r1,r2,r3}
subps %4, %2, %5 ; {i4,i5,i6,i7}
addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro
; scheduled for cpu-bound sizes
......@@ -148,52 +190,44 @@ section .text align=16
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
mova m0, %2 ; wre
mova m2, m4
mova m1, %3 ; wim
mova m3, m5
mulps m2, m0 ; r2*wre
mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
mulps m3, m1 ; i2*wim
mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
mulps m4, m1 ; r2*wim
mulps m5, m0 ; i2*wre
addps m2, m3 ; r2*wre + i2*wim
mova m3, m1
mulps m1, m6 ; r3*wim
subps m5, m4 ; i2*wre - r2*wim
mova m4, m0
mulps m3, m7 ; i3*wim
mulps m4, m6 ; r3*wre
mulps m0, m7 ; i3*wre
subps m4, m3 ; r3*wre - i3*wim
mulps m4, m4, m1 ; r2*wim
mulps m5, m5, m0 ; i2*wre
addps m2, m2, m3 ; r2*wre + i2*wim
mulps m3, m1, m7 ; i3*wim
subps m5, m5, m4 ; i2*wre - r2*wim
mulps m1, m1, m6 ; r3*wim
mulps m4, m0, m6 ; r3*wre
mulps m0, m0, m7 ; i3*wre
subps m4, m4, m3 ; r3*wre - i3*wim
mova m3, Z(0)
addps m0, m1 ; i3*wre + r3*wim
mova m1, m4
addps m4, m2 ; t5
subps m1, m2 ; t3
subps m3, m4 ; r2