Commit 82992604
Authored Jun 23, 2012 by Mans Rullgard

x86: fft: convert sse inline asm to yasm

Parent 8123e090
Changes: 3 files
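The external C interface is meant to stay the same across the conversion: the wrappers that fft_sse.c defined with GCC inline asm are re-emitted as cglobal entry points in fft_mmx.asm, which (assuming x86inc's usual ff_ prefix and per-CPU suffix mangling) yields the same ff_*_sse symbols. Signatures as they appear in the deleted file:

    #include "fft.h"   /* FFTContext, FFTComplex, FFTSample */

    void ff_fft_calc_sse   (FFTContext *s, FFTComplex *z);
    void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
    void ff_imdct_calc_sse (FFTContext *s, FFTSample *output,
                            const FFTSample *input);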
libavcodec/x86/Makefile
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT)                += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW)         += x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT)      += x86/fft_3dn2.o
YASM-OBJS-FFT-$(HAVE_SSE)              += x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                \
                                          $(YASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o          \
libavcodec/x86/fft_mmx.asm
@@ -45,6 +45,10 @@ struc FFTContext
.mdctbits:  resd 1
.tcos:      pointer 1
.tsin:      pointer 1
.fftperm:   pointer 1
.fftcalc:   pointer 1
.imdctcalc: pointer 1
.imdcthalf: pointer 1
endstruc

SECTION_RODATA
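The struc above mirrors the layout of the C-side FFTContext (declared in libavcodec/fft.h) so the assembly can address struct members by offset; the four new "pointer 1" entries cover the function-pointer slots the new entry points rely on (imdct_calc below loads FFTContext.imdcthalf, for instance). A partial, assumed sketch of the C layout being mirrored, restricted to members this file references:

    /* Hedged sketch only: names come from the asm struc and the deleted
     * fft_sse.c; the authoritative definition and any omitted members are
     * in libavcodec/fft.h. */
    typedef struct FFTContext {
        int          nbits;
        /* ... */
        uint16_t    *revtab;
        FFTComplex  *tmp_buf;
        int          mdct_size;
        int          mdct_bits;
        FFTSample   *tcos;
        FFTSample   *tsin;
        void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
        void (*fft_calc)   (struct FFTContext *s, FFTComplex *z);
        void (*imdct_calc) (struct FFTContext *s, FFTSample *output,
                            const FFTSample *input);
        void (*imdct_half) (struct FFTContext *s, FFTSample *output,
                            const FFTSample *input);
        /* ... */
    } FFTContext;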
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0

%assign i 16
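Every ps_m1m1... constant above is built from 1<<31, i.e. a float word with only the IEEE-754 sign bit set; xorps against such a mask negates the selected lanes (the new imdct_calc uses ps_m1m1m1m1 this way, just as the deleted fft_sse.c used its ff_m1m1m1m1 copy). A minimal scalar illustration in C:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Flipping the sign bit of an IEEE-754 single negates it; this is what
     * "xorps xmm0, [ps_m1m1m1m1]" does, four lanes at a time. */
    static float negate_via_sign_bit(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits));
        bits ^= 1u << 31;            /* same mask as the "dd 1<<31" words */
        memcpy(&x, &bits, sizeof(x));
        return x;
    }

    int main(void)
    {
        printf("%f\n", negate_via_sign_bit(0.5f));   /* prints -0.500000 */
        return 0;
    }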
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea      r2, [dispatch_tab%1]
    mov      r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea      r3, [$$]
    add      r2, r3
%endif
    call     r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX
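FFT_DISPATCH is a size dispatcher: it loads dispatch_tab<suffix>[nbits - 2] (the (%2q-2)*gprsize indexing) and calls the selected transform indirectly; under PIC the table entries appear to be section-relative, hence the lea of $$ and the add. A hedged C rendering of the same pattern, using stub names that are not in the tree:

    #include <stdio.h>

    typedef void (*fft_pass_fn)(float *z);

    static void fft4_stub(float *z)  { (void)z; puts("4-point transform");  }
    static void fft8_stub(float *z)  { (void)z; puts("8-point transform");  }
    static void fft16_stub(float *z) { (void)z; puts("16-point transform"); }

    /* Table indexed by nbits - 2, mirroring "(%2q-2)*gprsize": the smallest
     * entry corresponds to a 4-point transform (nbits == 2). */
    static const fft_pass_fn dispatch_tab_stub[] = {
        fft4_stub, fft8_stub, fft16_stub,
    };

    static void fft_dispatch_stub(float *z, int nbits)
    {
        dispatch_tab_stub[nbits - 2](z);
    }

    int main(void)
    {
        float z[32] = { 0 };
        fft_dispatch_stub(z, 3);   /* picks the 8-point stub */
        return 0;
    }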
@@ -548,6 +563,14 @@ INIT_YMM avx
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse
@@ -565,6 +588,112 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 4
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, 32
    jl       .loop
.end:
    REP_RET

cextern_naked memcpy

cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
%if ARCH_X86_64
    mov     r0, r1
    mov     r1, r5
%else
    push    r2
    push    r5
    push    r1
%endif
%if ARCH_X86_64 && WIN64 == 0
    jmp     memcpy
%else
    call    memcpy
%if ARCH_X86_32
    add     esp, 12
%endif
    REP_RET
%endif

cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, 16
    neg     r2
    movaps  xmm2, [ps_m1m1m1m1]
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop
    REP_RET

INIT_MMX 3dnow

%define mulps pfmul
%define addps pfadd
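The new fft_permute entry point is the yasm counterpart of the ff_fft_permute_sse wrapper removed below: each iteration loads a 16-byte pair of complex samples and scatters the two halves through the revtab bit-reversal table into tmp_buf, and the buffer is then copied back over z with memcpy. A scalar C equivalent of that loop (FFTComplex re-declared locally only to keep the sketch standalone):

    #include <stdint.h>
    #include <string.h>

    typedef struct { float re, im; } FFTComplex;

    static void fft_permute_ref(FFTComplex *z, FFTComplex *tmp_buf,
                                const uint16_t *revtab, int nbits)
    {
        int n = 1 << nbits;
        for (int i = 0; i < n; i += 2) {
            tmp_buf[revtab[i]]     = z[i];       /* the movlps half of xmm0 */
            tmp_buf[revtab[i + 1]] = z[i + 1];   /* the movhps half of xmm0 */
        }
        memcpy(z, tmp_buf, n * sizeof(*z));
    }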
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define SECTION_REL
%endif

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea      r2, [dispatch_tab%1]
    mov      r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea      r3, [$$]
    add      r2, r3
%endif
    call     r2
%endmacro ; FFT_DISPATCH

%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
libavcodec/x86/fft_sse.c (deleted, 100644 → 0)

/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"

DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };

void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);

#if HAVE_AVX
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    ff_fft_dispatch_interleave_avx(z, s->nbits);
}
#endif

void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (n <= 16) {
        x86_reg i = -8 * n;
        __asm__ volatile (
            "1:                            \n"
            "movaps     (%0,%1), %%xmm0    \n"
            "movaps      %%xmm0, %%xmm1    \n"
            "unpcklps 16(%0,%1), %%xmm0    \n"
            "unpckhps 16(%0,%1), %%xmm1    \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0                   \n"
            "jl 1b                         \n"
            : "+r"(i)
            : "r"(z + n)
            : "memory"
        );
    }
}

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for (i = 0; i < n; i += 2) {
        __asm__ volatile (
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            : "=m"(s->tmp_buf[s->revtab[i]]),
              "=m"(s->tmp_buf[s->revtab[i + 1]])
            : "m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n * sizeof(FFTComplex));
}

void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n  = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    j = -n;
    k = n - 16;
    __asm__ volatile (
        "movaps " MANGLE(ff_m1m1m1m1) ", %%xmm7 \n"
        "1:                                     \n"
        "movaps       (%2,%1), %%xmm0           \n"
        "movaps       (%3,%0), %%xmm1           \n"
        "shufps $0x1b, %%xmm0, %%xmm0           \n"
        "shufps $0x1b, %%xmm1, %%xmm1           \n"
        "xorps         %%xmm7, %%xmm0           \n"
        "movaps        %%xmm1, (%3,%1)          \n"
        "movaps        %%xmm0, (%2,%0)          \n"
        "sub $16, %1                            \n"
        "add $16, %0                            \n"
        "jl 1b                                  \n"
        : "+r"(j), "+r"(k)
        : "r"(output + n4), "r"(output + n4 * 3)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
}
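For reference, the post-processing loop in ff_imdct_calc_sse above (and in the new yasm imdct_calc) expands the imdct_half() result, which fills output[n4..3*n4), to the full window: the first quarter becomes the negated mirror of the second and the last quarter the plain mirror of the third, with the xorps against ff_m1m1m1m1 / ps_m1m1m1m1 providing the negation. A scalar sketch under that reading of the index arithmetic:

    /* Hedged scalar rendering of the mirror/negate loop; n is s->mdct_size. */
    static void imdct_mirror_ref(float *output, long n)
    {
        long n2 = n >> 1, n4 = n >> 2;
        for (long i = 0; i < n4; i++) {
            output[i]         = -output[n2 - 1 - i];  /* first quarter */
            output[n - 1 - i] =  output[n2 + i];      /* last quarter  */
        }
    }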