Commit 437525c4 authored by Michael Niedermayer
Browse files

h264 luma motion compensation in mmx2/3dnow

Originally committed as revision 3437 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 0c62d343
This diff is collapsed.
......@@ -53,6 +53,53 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
:"%eax", "memory");
}
/**
 * Store h rows of 4 bytes to dst, each row being PAVGB(src1 row, src2 row).
 *
 * @param dst        destination; advanced by dstStride bytes after every row
 * @param src1       first source; advanced by src1Stride bytes after every row
 * @param src2       second source; read as tightly packed 4-byte rows
 *                   (offsets 0/4/8/12, then advanced by 16 per four rows)
 * @param dstStride  byte step between destination rows
 * @param src1Stride byte step between src1 rows
 * @param h          number of rows to produce
 *
 * PAVGB is a string macro defined outside this view; presumably it expands to
 * the packed-byte-average mnemonic of the target instruction set -- confirm.
 *
 * NOTE(review): the 4-row loop is entered unconditionally after the optional
 * odd row and only exits when the counter reaches exactly zero, so this looks
 * like it requires h == 4k or h == 4k+1 with k >= 1 (h == 1 would fall into
 * the loop with a zero counter) -- confirm against callers.
 * NOTE(review): pointer arithmetic uses 32-bit "addl", so this appears to be
 * written for 32-bit x86 only -- confirm.
 */
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
/* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
__asm __volatile(
/* If h is odd, handle a single row first so the remaining count is even. */
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"addl %4, %1 \n\t"
"addl $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
"decl %0 \n\t"
/* Main loop: four rows per pass, processed as two pairs. */
"1: \n\t"
/* Rows 0 and 1: load from src1, average with src2 at offsets 0 and 4, store. */
"movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t"
/* NOTE(review): if PAVGB operates on full 64-bit MMX operands, these
   memory-operand forms read 8 bytes from src2 although only the low 4 bytes
   of the result are stored -- confirm the over-read is acceptable. */
PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t"
/* Rows 2 and 3: same pattern with src2 at offsets 8 and 12. */
"movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t"
/* Step src2 past the four packed rows and loop until the counter hits zero. */
"addl $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
/* Under PIC the row counter is kept in memory instead of the "b" register. */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"(src1Stride), "D"(dstStride)
:"memory");
}
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm __volatile(
......@@ -173,6 +220,58 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
:"memory");*/
}
/**
 * For h rows of 4 bytes, compute PAVGB(src1 row, src2 row), then PAVGB that
 * result with the bytes already in dst, and store it back to dst.
 *
 * @param dst        destination, read and written; advanced by dstStride bytes
 *                   after every row
 * @param src1       first source; advanced by src1Stride bytes after every row
 * @param src2       second source; read as tightly packed 4-byte rows
 *                   (offsets 0/4/8/12, then advanced by 16 per four rows)
 * @param dstStride  byte step between destination rows
 * @param src1Stride byte step between src1 rows
 * @param h          number of rows to produce
 *
 * PAVGB is a string macro defined outside this view; presumably it expands to
 * the packed-byte-average mnemonic of the target instruction set -- confirm.
 *
 * NOTE(review): the 4-row loop is entered unconditionally after the optional
 * odd row and only exits when the counter reaches exactly zero, so this looks
 * like it requires h == 4k or h == 4k+1 with k >= 1 (h == 1 would fall into
 * the loop with a zero counter) -- confirm against callers.
 * NOTE(review): pointer arithmetic uses 32-bit "addl", so this appears to be
 * written for 32-bit x86 only -- confirm.
 */
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
/* Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
__asm __volatile(
/* If h is odd, handle a single row first so the remaining count is even. */
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"addl %4, %1 \n\t"
"addl $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
/* Blend the two-source average with the existing destination bytes. */
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
"decl %0 \n\t"
/* Main loop: four rows per pass, processed as two pairs. */
"1: \n\t"
/* Rows 0 and 1: load from src1, average with src2 at offsets 0 and 4. */
"movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t"
/* NOTE(review): if PAVGB operates on full 64-bit MMX operands, the
   memory-operand forms read 8 bytes from src2 / dst although only the low
   4 bytes of the result are stored -- confirm the over-read is acceptable. */
PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t"
/* Each row is averaged with dst immediately before it is stored, because
   %3 must point at that row's destination when dst is read. */
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t"
/* Rows 2 and 3: same pattern with src2 at offsets 8 and 12. */
"movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t"
/* Step src2 past the four packed rows and loop until the counter hits zero. */
"addl $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
/* Under PIC the row counter is kept in memory instead of the "b" register. */
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"(src1Stride), "D"(dstStride)
:"memory");
}
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm __volatile(
......
......@@ -296,6 +296,25 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
}
// avg_pixels
/**
 * Combine h rows of 4 bytes from pixels into block using the PAVGB macro,
 * writing the result back to block.
 *
 * @param block     destination, read and written; advanced by line_size per row
 * @param pixels    source; advanced by line_size per row
 * @param line_size byte step between rows, shared by both buffers
 * @param h         number of rows; the do/while form runs the body before
 *                  testing --h, so h must be at least 1
 *
 * MOVQ_BFE, JUMPALIGN and the four-argument PAVGB are macros defined outside
 * this view. Presumably MOVQ_BFE(mm6) loads a constant into mm6 that PAVGB
 * uses, and PAVGB leaves the average of its first two operands in the third
 * -- confirm against the macro definitions.
 */
static void DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
/* mm6 is set up once here and passed to PAVGB on every row below. */
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm __volatile(
/* mm0 = 4 bytes of the current block row, mm1 = 4 bytes of the pixels row */
"movd %0, %%mm0 \n\t"
"movd %1, %%mm1 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
/* write the 4 result bytes from mm2 back over the block row */
"movd %%mm2, %0 \n\t"
/* The operands name only the first byte of each row; the "memory" clobber
   covers the rest of the 4-byte accesses. */
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
// in case more speed is needed - unrolling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment