Commit 0331f092 authored by Loren Merritt
Browse files

15% faster put_h264_qpel16_hv_lowpass_mmx2

Originally committed as revision 4949 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 1a44a8b6
...@@ -630,9 +630,9 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i ...@@ -630,9 +630,9 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
dst += 4-8*dstStride;\ dst += 4-8*dstStride;\
}\ }\
}\ }\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
int h=8;\ int h = size;\
int w=4;\ int w = (size+8)>>2;\
src -= 2*srcStride+2;\ src -= 2*srcStride+2;\
while(w--){\ while(w--){\
asm volatile(\ asm volatile(\
...@@ -652,23 +652,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, ...@@ -652,23 +652,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
\ : "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
if(size==16){\
asm volatile(\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
: "+a"(src)\ : "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\ : "memory"\
);\ );\
}\
tmp += 4;\ tmp += 4;\
src += 4 - 13*srcStride;\ src += 4 - (size+5)*srcStride;\
}\ }\
tmp -= 4*4;\ tmp -= size+8;\
w = size>>4;\
do{\
h = size;\
asm volatile(\ asm volatile(\
"movq %4, %%mm6 \n\t"\ "movq %4, %%mm6 \n\t"\
"1: \n\t"\ "1: \n\t"\
...@@ -702,7 +719,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, ...@@ -702,7 +719,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"psraw $6, %%mm3 \n\t"\ "psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\ "packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\ OP(%%mm0, (%1),%%mm7, q)\
"add $32, %0 \n\t"\ "add $48, %0 \n\t"\
"add %3, %1 \n\t"\ "add %3, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
...@@ -710,6 +727,9 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, ...@@ -710,6 +727,9 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
: "S"((long)dstStride), "m"(ff_pw_32)\ : "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\ : "memory"\
);\ );\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\ }\
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
...@@ -729,13 +749,12 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, ...@@ -729,13 +749,12 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\ }\
\ \
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
}\ }\
#define H264_MC(OPNAME, SIZE, MMX) \ #define H264_MC(OPNAME, SIZE, MMX) \
...@@ -816,13 +835,13 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t * ...@@ -816,13 +835,13 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *
}\ }\
\ \
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE+8)/4];\ uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
int16_t * const tmp= (int16_t*)temp;\ int16_t * const tmp= (int16_t*)temp;\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\ }\
\ \
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfH= (uint8_t*)temp;\ uint8_t * const halfH= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
...@@ -832,7 +851,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t * ...@@ -832,7 +851,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
}\ }\
\ \
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfH= (uint8_t*)temp;\ uint8_t * const halfH= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
...@@ -842,7 +861,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t * ...@@ -842,7 +861,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
}\ }\
\ \
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfV= (uint8_t*)temp;\ uint8_t * const halfV= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
...@@ -852,7 +871,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t * ...@@ -852,7 +871,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
}\ }\
\ \
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfV= (uint8_t*)temp;\ uint8_t * const halfV= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment