Commit 8db00081 authored by Ronald S. Bultje's avatar Ronald S. Bultje Committed by Martin Storsjö
Browse files

x86: hpeldsp: Move half-pel assembly from dsputil to hpeldsp


Signed-off-by: default avatarMartin Storsjö <martin@martin.st>
parent 28bc406c
......@@ -53,4 +53,7 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
hpel_funcs(avg, [2], 4);
hpel_funcs(avg, [3], 2);
hpel_funcs(avg_no_rnd,, 16);
if (ARCH_X86)
ff_hpeldsp_init_x86(c, flags);
}
......@@ -94,4 +94,6 @@ typedef struct HpelDSPContext {
void ff_hpeldsp_init(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
#endif /* AVCODEC_HPELDSP_H */
......@@ -10,6 +10,7 @@ OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o
......@@ -44,7 +45,7 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
x86/hpeldsp.o \
x86/fpel.o \
x86/mpeg4qpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
......@@ -63,7 +64,10 @@ YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
......
......@@ -55,10 +55,6 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#if HAVE_YASM
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
......@@ -66,54 +62,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
......@@ -186,14 +142,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
......@@ -203,20 +151,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
"psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
......@@ -231,22 +165,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
/***********************************/
/* MMX no rounding */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "dsputil_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
/***********************************/
/* MMX rounding */
......@@ -254,6 +172,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "dsputil_rnd_template.c"
......@@ -268,31 +187,21 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
#if HAVE_YASM
/***********************************/
/* 3Dnow specific */
#define DEF(x) x ## _3dnow
#include "dsputil_avg_template.c"
#undef DEF
/***********************************/
/* MMXEXT specific */
#define DEF(x) x ## _mmxext
#include "dsputil_avg_template.c"
#undef DEF
//FIXME the following could be optimized too ...
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
ff_avg_pixels8_mmxext(block, pixels, line_size, h);
ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
/***********************************/
/* standard MMX */
......@@ -1369,14 +1278,6 @@ void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
......@@ -1392,14 +1293,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->clear_blocks = clear_blocks_mmx;
c->draw_edges = draw_edges_mmx;
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
SET_HPEL_FUNCS(avg, [1], 8, mmx);
switch (avctx->idct_algo) {
case FF_IDCT_AUTO:
case FF_IDCT_SIMPLEMMX:
......@@ -1445,34 +1338,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
if (!high_bit_depth) {
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
}
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
if (!high_bit_depth) {
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
}
}
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
......@@ -1484,12 +1349,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_INLINE_ASM */
#if HAVE_MMXEXT_EXTERNAL
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
/* slower than cmov version on AMD */
if (!(mm_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
......@@ -1505,46 +1364,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_YASM
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth) {
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
}
}
if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
avctx->codec_id == AV_CODEC_ID_THEORA)) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
#endif /* HAVE_YASM */
}
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
......@@ -1578,15 +1397,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSE2_EXTERNAL
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
}
}
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (mm_flags & AV_CPU_FLAG_ATOM) {
......@@ -1644,9 +1454,6 @@ av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
if (mm_flags & AV_CPU_FLAG_MMXEXT)
dsputil_init_mmxext(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_3DNOW)
dsputil_init_3dnow(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_SSE)
dsputil_init_sse(c, avctx, mm_flags);
......
......@@ -25,212 +25,6 @@
*/
// put_pixels
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"add $8, %2 \n\t"
PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
"movq %%mm4, (%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"movq (%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"add %5, %3 \n\t"
"movq %%mm5, (%3) \n\t"
"add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 16(%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"movq (%1), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
"add $32, %2 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"add %5, %3 \n\t"
"movq %%mm5, (%3) \n\t"
"add %5, %3 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
"add $16, %2 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t"
"add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 16(%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t"
"add %5, %3 \n\t"
"add $32, %2 \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
......@@ -297,27 +91,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff
:REG_a, "memory");
}
// avg_pixels
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
"movd %%mm2, %0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
#ifndef NO_RND
// in case more speed is needed - unroling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
......@@ -337,7 +110,6 @@ static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t l