Commit acf91215 authored Jul 08, 2014 by Diego Biurrun
x86: dsputil: Avoid pointless CONFIG_ENCODERS indirection
The remaining dsputil bits are encoding-specific anyway.
parent a8552ee3
Showing 4 changed files with 855 additions and 888 deletions:

    libavcodec/x86/Makefile            +2    -3
    libavcodec/x86/dsputil_init.c      +853  -4
    libavcodec/x86/dsputil_x86.h       +0    -1
    libavcodec/x86/dsputilenc_mmx.c    +0    -880
libavcodec/x86/Makefile  (view file @ acf91215)

...
@@ -6,8 +6,7 @@ OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
 OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
-OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o
-OBJS-$(CONFIG_ENCODERS)                += x86/dsputilenc_mmx.o          \
+OBJS-$(CONFIG_DSPUTIL)                 += x86/dsputil_init.o            \
                                           x86/motion_est.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
...
@@ -72,7 +71,7 @@ YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
+YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
...
libavcodec/x86/dsputil_init.c  (view file @ acf91215)
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
...
@@ -16,16 +22,859 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
#if HAVE_INLINE_ASM
static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx             \n"
        "shr $1, %%ecx              \n"
        "pxor %%mm0, %%mm0          \n" /* mm0 = 0 */
        "pxor %%mm7, %%mm7          \n" /* mm7 holds the sum */
        "1:                         \n"
        "movq (%0), %%mm1           \n" /* mm1 = pix1[0][0 - 7] */
        "movq (%1), %%mm2           \n" /* mm2 = pix2[0][0 - 7] */
        "movq (%0, %3), %%mm3       \n" /* mm3 = pix1[1][0 - 7] */
        "movq (%1, %3), %%mm4       \n" /* mm4 = pix2[1][0 - 7] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5          \n"
        "movq %%mm3, %%mm6          \n"
        "psubusb %%mm2, %%mm1       \n"
        "psubusb %%mm4, %%mm3       \n"
        "psubusb %%mm5, %%mm2       \n"
        "psubusb %%mm6, %%mm4       \n"

        "por %%mm1, %%mm2           \n"
        "por %%mm3, %%mm4           \n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1          \n"
        "movq %%mm4, %%mm3          \n"

        "punpckhbw %%mm0, %%mm2     \n"
        "punpckhbw %%mm0, %%mm4     \n"
        "punpcklbw %%mm0, %%mm1     \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3     \n" /* mm4 now spread over (mm3, mm4) */

        "pmaddwd %%mm2, %%mm2       \n"
        "pmaddwd %%mm4, %%mm4       \n"
        "pmaddwd %%mm1, %%mm1       \n"
        "pmaddwd %%mm3, %%mm3       \n"

        "lea (%0, %3, 2), %0        \n" /* pix1 += 2 * line_size */
        "lea (%1, %3, 2), %1        \n" /* pix2 += 2 * line_size */

        "paddd %%mm2, %%mm1         \n"
        "paddd %%mm4, %%mm3         \n"
        "paddd %%mm1, %%mm7         \n"
        "paddd %%mm3, %%mm7         \n"

        "decl %%ecx                 \n"
        "jnz 1b                     \n"

        "movq %%mm7, %%mm1          \n"
        "psrlq $32, %%mm7           \n" /* shift hi dword to lo */
        "paddd %%mm7, %%mm1         \n"
        "movd %%mm1, %2             \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
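For reference, the scalar computation that the two sse routines vectorize is the plain sum of squared byte differences; the sketch below is illustrative only and not part of the commit (sse8_mmx additionally processes two rows per iteration, which is why it halves %ecx up front).

/* Scalar model of sse8_mmx (w = 8; w = 16 gives sse16_mmx below).
 * Illustrative sketch, not part of this commit. */
static int sse_w_ref(const uint8_t *pix1, const uint8_t *pix2,
                     int line_size, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;       /* pmaddwd accumulates these squares */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}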
static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx             \n"
        "pxor %%mm0, %%mm0          \n" /* mm0 = 0 */
        "pxor %%mm7, %%mm7          \n" /* mm7 holds the sum */
        "1:                         \n"
        "movq (%0), %%mm1           \n" /* mm1 = pix1[0 - 7] */
        "movq (%1), %%mm2           \n" /* mm2 = pix2[0 - 7] */
        "movq 8(%0), %%mm3          \n" /* mm3 = pix1[8 - 15] */
        "movq 8(%1), %%mm4          \n" /* mm4 = pix2[8 - 15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5          \n"
        "movq %%mm3, %%mm6          \n"
        "psubusb %%mm2, %%mm1       \n"
        "psubusb %%mm4, %%mm3       \n"
        "psubusb %%mm5, %%mm2       \n"
        "psubusb %%mm6, %%mm4       \n"

        "por %%mm1, %%mm2           \n"
        "por %%mm3, %%mm4           \n"

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1          \n"
        "movq %%mm4, %%mm3          \n"

        "punpckhbw %%mm0, %%mm2     \n"
        "punpckhbw %%mm0, %%mm4     \n"
        "punpcklbw %%mm0, %%mm1     \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3     \n" /* mm4 now spread over (mm3, mm4) */

        "pmaddwd %%mm2, %%mm2       \n"
        "pmaddwd %%mm4, %%mm4       \n"
        "pmaddwd %%mm1, %%mm1       \n"
        "pmaddwd %%mm3, %%mm3       \n"

        "add %3, %0                 \n"
        "add %3, %1                 \n"

        "paddd %%mm2, %%mm1         \n"
        "paddd %%mm4, %%mm3         \n"
        "paddd %%mm1, %%mm7         \n"
        "paddd %%mm3, %%mm7         \n"

        "decl %%ecx                 \n"
        "jnz 1b                     \n"

        "movq %%mm7, %%mm1          \n"
        "psrlq $32, %%mm7           \n" /* shift hi dword to lo */
        "paddd %%mm7, %%mm1         \n"
        "movd %%mm1, %2             \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx             \n"
        "pxor %%mm7, %%mm7          \n"
        "pxor %%mm6, %%mm6          \n"

        "movq (%0), %%mm0           \n"
        "movq %%mm0, %%mm1          \n"
        "psllq $8, %%mm0            \n"
        "psrlq $8, %%mm1            \n"
        "psrlq $8, %%mm0            \n"
        "movq %%mm0, %%mm2          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm0     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm2     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm0         \n"
        "psubw %%mm3, %%mm2         \n"

        "add %2, %0                 \n"

        "movq (%0), %%mm4           \n"
        "movq %%mm4, %%mm1          \n"
        "psllq $8, %%mm4            \n"
        "psrlq $8, %%mm1            \n"
        "psrlq $8, %%mm4            \n"
        "movq %%mm4, %%mm5          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm4     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm5     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm4         \n"
        "psubw %%mm3, %%mm5         \n"
        "psubw %%mm4, %%mm0         \n"
        "psubw %%mm5, %%mm2         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm0, %%mm3       \n\t"
        "pcmpgtw %%mm2, %%mm1       \n\t"
        "pxor %%mm3, %%mm0          \n"
        "pxor %%mm1, %%mm2          \n"
        "psubw %%mm3, %%mm0         \n"
        "psubw %%mm1, %%mm2         \n"
        "paddw %%mm0, %%mm2         \n"
        "paddw %%mm2, %%mm6         \n"

        "add %2, %0                 \n"
        "1:                         \n"

        "movq (%0), %%mm0           \n"
        "movq %%mm0, %%mm1          \n"
        "psllq $8, %%mm0            \n"
        "psrlq $8, %%mm1            \n"
        "psrlq $8, %%mm0            \n"
        "movq %%mm0, %%mm2          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm0     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm2     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm0         \n"
        "psubw %%mm3, %%mm2         \n"
        "psubw %%mm0, %%mm4         \n"
        "psubw %%mm2, %%mm5         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm4, %%mm3       \n\t"
        "pcmpgtw %%mm5, %%mm1       \n\t"
        "pxor %%mm3, %%mm4          \n"
        "pxor %%mm1, %%mm5          \n"
        "psubw %%mm3, %%mm4         \n"
        "psubw %%mm1, %%mm5         \n"
        "paddw %%mm4, %%mm5         \n"
        "paddw %%mm5, %%mm6         \n"

        "add %2, %0                 \n"

        "movq (%0), %%mm4           \n"
        "movq %%mm4, %%mm1          \n"
        "psllq $8, %%mm4            \n"
        "psrlq $8, %%mm1            \n"
        "psrlq $8, %%mm4            \n"
        "movq %%mm4, %%mm5          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm4     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm5     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm4         \n"
        "psubw %%mm3, %%mm5         \n"
        "psubw %%mm4, %%mm0         \n"
        "psubw %%mm5, %%mm2         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm0, %%mm3       \n\t"
        "pcmpgtw %%mm2, %%mm1       \n\t"
        "pxor %%mm3, %%mm0          \n"
        "pxor %%mm1, %%mm2          \n"
        "psubw %%mm3, %%mm0         \n"
        "psubw %%mm1, %%mm2         \n"
        "paddw %%mm0, %%mm2         \n"
        "paddw %%mm2, %%mm6         \n"

        "add %2, %0                 \n"
        "subl $2, %%ecx             \n"
        " jnz 1b                    \n"

        "movq %%mm6, %%mm0          \n"
        "punpcklwd %%mm7, %%mm0     \n"
        "punpckhwd %%mm7, %%mm6     \n"
        "paddd %%mm0, %%mm6         \n"

        "movq %%mm6, %%mm0          \n"
        "psrlq $32, %%mm6           \n"
        "paddd %%mm6, %%mm0         \n"
        "movd %%mm0, %1             \n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp;
}
static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx             \n"
        "pxor %%mm7, %%mm7          \n"
        "pxor %%mm6, %%mm6          \n"

        "movq (%0), %%mm0           \n"
        "movq 1(%0), %%mm1          \n"
        "movq %%mm0, %%mm2          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm0     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm2     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm0         \n"
        "psubw %%mm3, %%mm2         \n"

        "add %2, %0                 \n"

        "movq (%0), %%mm4           \n"
        "movq 1(%0), %%mm1          \n"
        "movq %%mm4, %%mm5          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm4     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm5     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm4         \n"
        "psubw %%mm3, %%mm5         \n"
        "psubw %%mm4, %%mm0         \n"
        "psubw %%mm5, %%mm2         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm0, %%mm3       \n\t"
        "pcmpgtw %%mm2, %%mm1       \n\t"
        "pxor %%mm3, %%mm0          \n"
        "pxor %%mm1, %%mm2          \n"
        "psubw %%mm3, %%mm0         \n"
        "psubw %%mm1, %%mm2         \n"
        "paddw %%mm0, %%mm2         \n"
        "paddw %%mm2, %%mm6         \n"

        "add %2, %0                 \n"
        "1:                         \n"

        "movq (%0), %%mm0           \n"
        "movq 1(%0), %%mm1          \n"
        "movq %%mm0, %%mm2          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm0     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm2     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm0         \n"
        "psubw %%mm3, %%mm2         \n"
        "psubw %%mm0, %%mm4         \n"
        "psubw %%mm2, %%mm5         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm4, %%mm3       \n\t"
        "pcmpgtw %%mm5, %%mm1       \n\t"
        "pxor %%mm3, %%mm4          \n"
        "pxor %%mm1, %%mm5          \n"
        "psubw %%mm3, %%mm4         \n"
        "psubw %%mm1, %%mm5         \n"
        "paddw %%mm4, %%mm5         \n"
        "paddw %%mm5, %%mm6         \n"

        "add %2, %0                 \n"

        "movq (%0), %%mm4           \n"
        "movq 1(%0), %%mm1          \n"
        "movq %%mm4, %%mm5          \n"
        "movq %%mm1, %%mm3          \n"
        "punpcklbw %%mm7, %%mm4     \n"
        "punpcklbw %%mm7, %%mm1     \n"
        "punpckhbw %%mm7, %%mm5     \n"
        "punpckhbw %%mm7, %%mm3     \n"
        "psubw %%mm1, %%mm4         \n"
        "psubw %%mm3, %%mm5         \n"
        "psubw %%mm4, %%mm0         \n"
        "psubw %%mm5, %%mm2         \n"
        "pxor %%mm3, %%mm3          \n"
        "pxor %%mm1, %%mm1          \n"
        "pcmpgtw %%mm0, %%mm3       \n\t"
        "pcmpgtw %%mm2, %%mm1       \n\t"
        "pxor %%mm3, %%mm0          \n"
        "pxor %%mm1, %%mm2          \n"
        "psubw %%mm3, %%mm0         \n"
        "psubw %%mm1, %%mm2         \n"
        "paddw %%mm0, %%mm2         \n"
        "paddw %%mm2, %%mm6         \n"

        "add %2, %0                 \n"
        "subl $2, %%ecx             \n"
        " jnz 1b                    \n"

        "movq %%mm6, %%mm0          \n"
        "punpcklwd %%mm7, %%mm0     \n"
        "punpckhwd %%mm7, %%mm6     \n"
        "paddd %%mm0, %%mm6         \n"

        "movq %%mm6, %%mm0          \n"
        "psrlq $32, %%mm6           \n"
        "paddd %%mm6, %%mm0         \n"
        "movd %%mm0, %1             \n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
}
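What the two hf_noise routines measure is easiest to see in scalar form: the sum of absolute second-order differences between horizontally adjacent pixels on consecutive rows. The sketch below is a rough model, not part of the commit; the psllq/psrlq byte-shift trick in the MMX code handles the block edges slightly differently, and hf_noise16_mmx simply covers the right half of a 16-wide block by tail-calling hf_noise8_mmx(pix + 8, ...), as its return statement shows.

/* Rough scalar model of hf_noise8_mmx: with dh(y, x) = pix[y][x] - pix[y][x+1],
 * accumulate |dh(y, x) - dh(y+1, x)|.  FFABS is the libavutil macro pulled in
 * by the includes above.  Illustrative sketch only. */
static int hf_noise8_ref(const uint8_t *pix, int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y + 1 < h; y++) {
        const uint8_t *r0 = pix + y * line_size;
        const uint8_t *r1 = r0 + line_size;
        for (int x = 0; x + 1 < 8; x++)
            sum += FFABS((r0[x] - r0[x + 1]) - (r1[x] - r1[x + 1]));
    }
    return sum;
}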
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = hf_noise16_mmx(pix1, line_size, h) -
             hf_noise16_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = hf_noise8_mmx(pix1, line_size, h) -
                 hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
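For reference, FFABS here is the usual libavutil/common.h macro,

    #define FFABS(a) ((a) >= 0 ? (a) : (-(a)))

so both nsse routines score a block pair as sse + |hf_noise(pix1) - hf_noise(pix2)| * weight, taking the weight from avctx->nsse_weight when an encoder context is available and falling back to the default of 8 otherwise.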
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq 8(%0), %%mm3\n" \
"add %2,%0\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
    __asm__ volatile (
        "movl %3, %%ecx             \n"
        "pxor %%mm6, %%mm6          \n"
        "pxor %%mm7, %%mm7          \n"
        "movq (%0), %%mm0           \n"
        "movq 8(%0), %%mm1          \n"
        "add %2, %0                 \n"
        "jmp 2f                     \n"
        "1:                         \n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:                         \n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx             \n"
        "jnz 1b                     \n"

        "movq %%mm6, %%mm0          \n"
        "psrlq $32, %%mm6           \n"
        "paddw %%mm6, %%mm0         \n"
        "movq %%mm0, %%mm6          \n"
        "psrlq $16, %%mm0           \n"
        "paddw %%mm6, %%mm0         \n"
        "movd %%mm0, %1             \n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
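In scalar terms, vsad_intra16 is the sum of absolute differences between vertically adjacent rows of a single block; a minimal sketch (illustrative, not part of the commit):

/* Scalar model of vsad_intra16: SAD between each row and the next,
 * over the h - 1 row pairs the loop above walks. */
static int vsad_intra16_ref(const uint8_t *pix, int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y + 1 < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += FFABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return sum;
}

The & 0xFFFF on the MMX return value discards the junk that the word-wise paddw reduction leaves in the upper lanes of the final movd.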
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n" \
"movq 8(%0), " #out1 "\n" \
"add %2, %0\n" \
"psadbw " #out0 ", " #in0 "\n" \
"psadbw " #out1 ", " #in1 "\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
    __asm__ volatile (
        "movl %3, %%ecx             \n"
        "pxor %%mm6, %%mm6          \n"
        "pxor %%mm7, %%mm7          \n"
        "movq (%0), %%mm0           \n"
        "movq 8(%0), %%mm1          \n"
        "add %2, %0                 \n"
        "jmp 2f                     \n"
        "1:                         \n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:                         \n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx             \n"
        "jnz 1b                     \n"

        "movd %%mm6, %1             \n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
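The only substantive difference in the _mmxext variants is that psadbw collapses the whole subtract/abs/widen/accumulate chain of the plain-MMX SUM macro into one instruction. As a scalar model of a single MMX psadbw (illustrative sketch):

/* Scalar model of one 64-bit psadbw: sum of absolute differences of
 * 8 byte lanes, yielding one 16-bit result with the upper words zeroed. */
static uint16_t psadbw_ref(const uint8_t a[8], const uint8_t b[8])
{
    unsigned sum = 0;
    for (int i = 0; i < 8; i++)
        sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
    return (uint16_t) sum;      /* max 8 * 255 = 2040, fits easily */
}

This is also why the mmxext version can return tmp directly, with no masking and no final word-lane reduction.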
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n" \
"movq (%1), " #out0 "\n" \
"movq 8(%0), %%mm3\n" \
"movq 8(%1), " #out1 "\n" \
"add %3, %0\n" \
"add %3, %1\n" \
"psubb " #out0 ", %%mm2\n" \
"psubb " #out1 ", %%mm3\n" \
"pxor %%mm7, %%mm2\n" \
"pxor %%mm7, %%mm3\n" \
"movq %%mm2, " #out0 "\n" \
"movq %%mm3, " #out1 "\n" \
"psubusb " #in0 ", %%mm2\n" \
"psubusb " #in1 ", %%mm3\n" \
"psubusb " #out0 ", " #in0 "\n" \
"psubusb " #out1 ", " #in1 "\n" \
"por %%mm2, " #in0 "\n" \
"por %%mm3, " #in1 "\n" \
"movq " #in0 ", %%mm2\n" \
"movq " #in1 ", %%mm3\n" \
"punpcklbw %%mm7, " #in0 "\n" \
"punpcklbw %%mm7, " #in1 "\n" \
"punpckhbw %%mm7, %%mm2\n" \
"punpckhbw %%mm7, %%mm3\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw %%mm3, %%mm2\n" \
"paddw %%mm2, " #in0 "\n" \
"paddw " #in0 ", %%mm6\n"
    __asm__ volatile (
        "movl %4, %%ecx             \n"
        "pxor %%mm6, %%mm6          \n"
        "pcmpeqw %%mm7, %%mm7       \n"
        "psllw $15, %%mm7           \n"
        "packsswb %%mm7, %%mm7      \n"
        "movq (%0), %%mm0           \n"
        "movq (%1), %%mm2           \n"
        "movq 8(%0), %%mm1          \n"
        "movq 8(%1), %%mm3          \n"
        "add %3, %0                 \n"
        "add %3, %1                 \n"
        "psubb %%mm2, %%mm0         \n"
        "psubb %%mm3, %%mm1         \n"
        "pxor %%mm7, %%mm0          \n"
        "pxor %%mm7, %%mm1          \n"
        "jmp 2f                     \n"
        "1:                         \n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:                         \n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx             \n"
        "jnz 1b                     \n"

        "movq %%mm6, %%mm0          \n"
        "psrlq $32, %%mm6           \n"
        "paddw %%mm6, %%mm0         \n"
        "movq %%mm0, %%mm6          \n"
        "psrlq $16, %%mm0           \n"
        "paddw %%mm6, %%mm0         \n"
        "movd %%mm0, %2             \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
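The inter version compares the vertical gradients of the error signal pix1 - pix2 rather than of the raw rows; a scalar sketch (illustrative, not part of the commit):

/* Scalar model of vsad16: SAD of the inter-row differences of the
 * error signal pix1 - pix2. */
static int vsad16_ref(const uint8_t *pix1, const uint8_t *pix2,
                      int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y + 1 < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += FFABS(pix1[x] - pix2[x] -
                         pix1[x + line_size] + pix2[x + line_size]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}

The pcmpeqw/psllw/packsswb prologue in the MMX code materializes 0x80 in every byte lane; xoring the signed byte differences with that bias maps them into unsigned range so the psubusb-based absolute-value trick of SUM still applies.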
static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n" \
"movq (%1), %%mm2\n" \
"movq 8(%0), " #out1 "\n" \
"movq 8(%1), %%mm3\n" \
"add %3, %0\n" \
"add %3, %1\n" \
"psubb %%mm2, " #out0 "\n" \
"psubb %%mm3, " #out1 "\n" \
"pxor %%mm7, " #out0 "\n" \
"pxor %%mm7, " #out1 "\n" \
"psadbw " #out0 ", " #in0 "\n" \
"psadbw " #out1 ", " #in1 "\n" \
"paddw " #in1 ", " #in0 "\n" \
"paddw " #in0 ", %%mm6\n "
    __asm__ volatile (
        "movl %4, %%ecx             \n"
        "pxor %%mm6, %%mm6          \n"
        "pcmpeqw %%mm7, %%mm7       \n"
        "psllw $15, %%mm7           \n"
        "packsswb %%mm7, %%mm7      \n"
        "movq (%0), %%mm0           \n"
        "movq (%1), %%mm2           \n"
        "movq 8(%0), %%mm1          \n"
        "movq 8(%1), %%mm3          \n"
        "add %3, %0                 \n"
        "add %3, %1                 \n"
        "psubb %%mm2, %%mm0         \n"
        "psubb %%mm3, %%mm1         \n"
        "pxor %%mm7, %%mm0          \n"
        "pxor %%mm7, %%mm1          \n"
        "jmp 2f                     \n"
        "1:                         \n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:                         \n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx             \n"
        "jnz 1b                     \n"

        "movd %%mm6, %2             \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
#define MMABS_MMX(a,z) \
"pxor " #z ", " #z " \n\t" \
"pcmpgtw " #a ", " #z " \n\t" \
"pxor " #z ", " #a " \n\t" \
"psubw " #z ", " #a " \n\t"
#define MMABS_MMXEXT(a, z) \
"pxor " #z ", " #z " \n\t" \
"psubw " #a ", " #z " \n\t" \
"pmaxsw " #z ", " #a " \n\t"
#define MMABS_SSSE3(a,z) \
"pabsw " #a ", " #a " \n\t"
#define MMABS_SUM(a,z, sum) \
MMABS(a,z) \
"paddusw " #a ", " #sum " \n\t"
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
* up to about 100k on extreme inputs. But that's very unlikely to occur in
* natural video, and it's even more unlikely to not have any alternative
* mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst) \
"movq " #a ", " #t " \n\t" \
"psrlq $32, " #a " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"movq " #a ", " #t " \n\t" \
"psrlq $16, " #a " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"movd " #a ", " #dst " \n\t" \
#define HSUM_MMXEXT(a, t, dst) \
"pshufw $0x0E, " #a ", " #t " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"pshufw $0x01, " #a ", " #t " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"movd " #a ", " #dst " \n\t" \
#define HSUM_SSE2(a, t, dst) \
"movhlps " #a ", " #t " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"pshuflw $0x0E, " #a ", " #t " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"pshuflw $0x01, " #a ", " #t " \n\t" \
"paddusw " #t ", " #a " \n\t" \
"movd " #a ", " #dst " \n\t" \