Commit 610e00b3 authored by Daniel Kang's avatar Daniel Kang Committed by Diego Biurrun
Browse files

x86: h264: Convert 8-bit QPEL inline assembly to YASM


Signed-off-by: default avatarDiego Biurrun <diego@biurrun.de>
parent ad01ba6c
......@@ -51,7 +51,8 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_weight_10bit.o
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
......
......@@ -1354,3 +1354,234 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
%macro op_avg 2
pavgb %1, %2
mova %2, %1
%endmacro
%macro op_puth 2-3
movh %2, %1
%endmacro
%macro op_put 2
mova %2, %1
%endmacro
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
movd m0, [r1]
movd m1, [r2]
add r1, r4
add r2, 4
pavgb m0, m1
OP m0, [r0], m3
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+4]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+8]
pavgb m1, [r2+12]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
add r2, 16
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pavgb m0, m1
OP m0, [r0]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
pavgb m0, [r2]
pavgb m1, [r2+8]
add r1, r4
add r2, 16
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
OP m0, [r1]
OP m1, [r1+r2]
OP m2, [r1+r2*2]
OP m3, [r1+r4]
lea r1, [r1+r2*4]
%ifidn %1, avg
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
%endif
OP [r0], m0
OP [r0+r2], m1
OP [r0+r2*2], m2
OP [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal put_pixels16, 4,5,4
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal avg_pixels16, 4,5,4
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
......@@ -56,57 +56,6 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
}
#ifndef SKIP_FOR_3DNOW
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
"1: \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"movd (%2), %%mm2 \n\t"
"movd 4(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGB" %%mm2, %%mm0 \n\t"
PAVGB" %%mm3, %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"movd 8(%2), %%mm2 \n\t"
"movd 12(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGB" %%mm2, %%mm0 \n\t"
PAVGB" %%mm3, %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"add $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
......@@ -227,58 +176,6 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
:"memory");*/
}
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
"1: \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"add %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"add $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
......@@ -876,33 +773,6 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
:"%"REG_a, "memory");
}
#ifndef SKIP_FOR_3DNOW
static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
do {
__asm__ volatile(
"movd (%1), %%mm0 \n\t"
"movd (%1, %2), %%mm1 \n\t"
"movd (%1, %2, 2), %%mm2 \n\t"
"movd (%1, %3), %%mm3 \n\t"
PAVGB" (%0), %%mm0 \n\t"
PAVGB" (%0, %2), %%mm1 \n\t"
PAVGB" (%0, %2, 2), %%mm2 \n\t"
PAVGB" (%0, %3), %%mm3 \n\t"
"movd %%mm0, (%1) \n\t"
"movd %%mm1, (%1, %2) \n\t"
"movd %%mm2, (%1, %2, 2) \n\t"
"movd %%mm3, (%1, %3) \n\t"
::"S"(pixels), "D"(block),
"r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
:"memory");
block += 4*line_size;
pixels += 4*line_size;
h -= 4;
} while(h > 0);
}
#endif /* SKIP_FOR_3DNOW */
//FIXME the following could be optimized too ...
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
......
......@@ -366,33 +366,6 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
} while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movd (%1 ), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movd (%1 ), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
......@@ -455,56 +428,6 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
);
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"1: \n\t"
"movdqu (%1 ), %%xmm0 \n\t"
"movdqu (%1, %3 ), %%xmm1 \n\t"
"movdqu (%1, %3, 2), %%xmm2 \n\t"
"movdqu (%1, %4 ), %%xmm3 \n\t"
"lea (%1, %3, 4), %1 \n\t"
"movdqa %%xmm0, (%2) \n\t"
"movdqa %%xmm1, (%2, %3) \n\t"
"movdqa %%xmm2, (%2, %3, 2) \n\t"
"movdqa %%xmm3, (%2, %4) \n\t"
"subl $4, %0 \n\t"
"lea (%2, %3, 4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
: "memory"
);
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"1: \n\t"
"movdqu (%1 ), %%xmm0 \n\t"
"movdqu (%1, %3 ), %%xmm1 \n\t"
"movdqu (%1, %3, 2), %%xmm2 \n\t"
"movdqu (%1, %4 ), %%xmm3 \n\t"
"lea (%1, %3, 4), %1 \n\t"
"pavgb (%2 ), %%xmm0 \n\t"
"pavgb (%2, %3 ), %%xmm1 \n\t"
"pavgb (%2, %3, 2), %%xmm2 \n\t"
"pavgb (%2, %4), %%xmm3 \n\t"
"movdqa %%xmm0, (%2) \n\t"
"movdqa %%xmm1, (%2, %3) \n\t"
"movdqa %%xmm2, (%2, %3, 2) \n\t"
"movdqa %%xmm3, (%2, %4) \n\t"
"subl $4, %0 \n\t"
"lea (%2, %3, 4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
: "memory"
);
}
#define CLEAR_BLOCKS(name, n) \
static void name(DCTELEM *blocks) \
{ \
......@@ -2381,27 +2304,23 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_MMXEXT_EXTERNAL
if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
#endif /* HAVE_INLINE_ASM */
if (!high_bit_depth) {
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
#endif /* HAVE_INLINE_ASM */
} else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
......@@ -2410,18 +2329,14 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
#endif /* HAVE_YASM */
}
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
#endif /* HAVE_INLINE_ASM */
}
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
......@@ -2447,7 +2362,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
#endif /* HAVE_YASM */
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
......@@ -2546,17 +2461,16 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_SSE2_EXTERNAL
const int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_INLINE_ASM
const int high_bit_depth = bit_depth > 8;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
if (CONFIG_H264QPEL)
H264_QPEL_FUNCS(0, 0, sse2);
}
......@@ -2583,9 +2497,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type = FF_SSE2_IDCT_PERM;
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (bit_depth == 10) {
if (CONFIG_H264QPEL) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
......@@ -2615,16 +2527,16 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
#endif /* HAVE_SSE2_EXTERNAL */
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_SSSE3_INLINE
if (!high_bit_depth && CONFIG_H264QPEL) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
......@@ -2639,9 +2551,6 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
#endif /* HAVE_SSSE3_INLINE */
#if HAVE_SSSE3_EXTERNAL
if (bit_depth == 10 && CONFIG_H264QPEL) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
......
This diff is collapsed.
This diff is collapsed.