From 58c983d1093b941e32ef6460e94bed32998eb8d2 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rbultje@google.com>
Date: Tue, 5 Feb 2013 15:21:47 -0800
Subject: [PATCH] Add SSE3 versions for sad{32x32,64x64}x4d functions.

Overall encoding about 15% faster.

Change-Id: I176a775c704317509e32eee83739721804120ff2
---
Reviewer notes (free-form area after the three-dash line; not part of
the commit message and not part of the patch content):

* PROCESS_16X2X4 now takes 8 or 9 arguments. The optional 9th (default
  0) is added to every load address, so the same macro can be pointed
  at another 16-byte column of a wider block. The callers below pass
  16, 32 and 48.

* The first argument of PROCESS_16X2X4 selects the mode:
    0  start the xmm4..xmm7 accumulators, then advance the pointers
    1  accumulate into xmm4..xmm7, then advance the pointers
    2  accumulate, no advance
    3  (new) start the accumulators, no advance
  "Advance" is the "%if %1==0 || %1==1" lea group. Only the lea on %2
  and on %6 is visible here; the lines in between fall in the gap
  between the first two hunks (presumably the lea for %3..%5).

* paddw -> paddd. psadbw leaves one sum of 8 absolute byte differences
  (at most 8*255 = 2040) in each 64-bit half of its destination.
  Per half, 16x16 accumulates 16 such sums (<= 32640), 32x32
  accumulates 64 (<= 130560) and 64x64 accumulates 256 (<= 522240).
  The last two do not fit in 16 bits, so the accumulation and the
  final fold have to use dword adds.

* PROCESS_64X8X4 and PROCESS_32X4X4 are declared "2-3+": the third
  parameter is greedy and swallows the remaining seven arguments
  (src/ref pointers and strides), which are forwarded as %3.
    PROCESS_64X8X4 = 4 row pairs x 4 column offsets (0/16/32/48)
    PROCESS_32X4X4 = 2 row pairs x 2 column offsets (0/16)
  Their own %1 is the mode of the first PROCESS_16X2X4 call (3 only
  for the very first call of a function) and %2 is the mode of the
  last one (1 = advance to the next rows, 2 = final call, no advance).
  Each function invokes its macro 8 times: 8*8 = 64 and 8*4 = 32 rows.

* STORE_4D_RESULTS wraps the former 16x16 epilogue in a macro. For
  each of xmm4..xmm7 it adds the upper 64-bit half to the lower one
  and stores one dword at [rcx], [rcx+4], [rcx+8], [rcx+12]. The
  instruction that loads rcx sits between the last two hunks and is
  not shown; presumably it is the results pointer - confirm against
  the full file.

* NOTE(review): this copy was rebuilt from a whitespace-collapsed
  source. Blank context lines were placed so that every hunk matches
  its @@ line counts, but column alignment inside lines is a best
  guess. Apply with "git apply --ignore-whitespace" or regenerate the
  patch from the commit if it is rejected.

 vp9/common/vp9_rtcd_defs.sh      |   4 +-
 vp9/encoder/x86/vp9_sad_sse3.asm | 155 +++++++++++++++++++++++--------
 2 files changed, 117 insertions(+), 42 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 39af2080ac..0d1a285e74 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -601,10 +601,10 @@ prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint
 specialize vp9_sad4x4x8 sse4
 
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x64x4d
+specialize vp9_sad64x64x4d sse3
 
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d
+specialize vp9_sad32x32x4d sse3
 
 prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x16x4d sse3
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm
index 2c409cbe54..e0c5c8c2fc 100644
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -258,49 +258,49 @@
         mov             %5,         [%1+REG_SZ_BYTES*3]
 %endmacro
 
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
+%macro PROCESS_16X2X4 8-9 0
+%if %1==0 || %1==3
+        movdqa          xmm0,       XMMWORD PTR [%2+%9]
+        lddqu           xmm4,       XMMWORD PTR [%3+%9]
+        lddqu           xmm5,       XMMWORD PTR [%4+%9]
+        lddqu           xmm6,       XMMWORD PTR [%5+%9]
+        lddqu           xmm7,       XMMWORD PTR [%6+%9]
 
         psadbw          xmm4,       xmm0
         psadbw          xmm5,       xmm0
         psadbw          xmm6,       xmm0
         psadbw          xmm7,       xmm0
 %else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
+        movdqa          xmm0,       XMMWORD PTR [%2+%9]
+        lddqu           xmm1,       XMMWORD PTR [%3+%9]
+        lddqu           xmm2,       XMMWORD PTR [%4+%9]
+        lddqu           xmm3,       XMMWORD PTR [%5+%9]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
         psadbw          xmm3,       xmm0
 
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
+        paddd           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6+%9]
+        paddd           xmm5,       xmm2
+        paddd           xmm6,       xmm3
 
         psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
+        paddd           xmm7,       xmm1
 %endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
+        movdqa          xmm0,       XMMWORD PTR [%2+%7+%9]
+        lddqu           xmm1,       XMMWORD PTR [%3+%8+%9]
+        lddqu           xmm2,       XMMWORD PTR [%4+%8+%9]
+        lddqu           xmm3,       XMMWORD PTR [%5+%8+%9]
 
         psadbw          xmm1,       xmm0
         psadbw          xmm2,       xmm0
         psadbw          xmm3,       xmm0
 
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
+        paddd           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6+%8+%9]
+        paddd           xmm5,       xmm2
+        paddd           xmm6,       xmm3
 
 %if %1==0 || %1==1
         lea             %2,         [%2+%7*2]
@@ -312,7 +312,7 @@
         lea             %6,         [%6+%8*2]
 %endif
         psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
+        paddd           xmm7,       xmm1
 
 %endmacro
 
@@ -697,26 +697,46 @@ sym(vp9_copy32xn_sse3):
 .copy_is_done:
     STACK_FRAME_DESTROY_X3
 
-;void vp9_sad16x16x4d_sse3(
+;void vp9_sad64x64x4d_sse3(
 ;    unsigned char *src_ptr,
 ;    int  src_stride,
 ;    unsigned char *ref_ptr_base,
 ;    int  ref_stride,
 ;    int  *results)
-global sym(vp9_sad16x16x4d_sse3) PRIVATE
-sym(vp9_sad16x16x4d_sse3):
+global sym(vp9_sad64x64x4d_sse3) PRIVATE
+sym(vp9_sad64x64x4d_sse3):
 
     STACK_FRAME_CREATE_X4
 
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+%macro PROCESS_64X8X4 2-3+
+        PROCESS_16X2X4 %1, %3
+        PROCESS_16X2X4 2, %3, 16
+        PROCESS_16X2X4 2, %3, 32
+        PROCESS_16X2X4 1, %3, 48
+        PROCESS_16X2X4 2, %3
+        PROCESS_16X2X4 2, %3, 16
+        PROCESS_16X2X4 2, %3, 32
+        PROCESS_16X2X4 1, %3, 48
+        PROCESS_16X2X4 2, %3
+        PROCESS_16X2X4 2, %3, 16
+        PROCESS_16X2X4 2, %3, 32
+        PROCESS_16X2X4 1, %3, 48
+        PROCESS_16X2X4 2, %3
+        PROCESS_16X2X4 2, %3, 16
+        PROCESS_16X2X4 2, %3, 32
+        PROCESS_16X2X4 %2, %3, 48
+%endmacro
+
+        PROCESS_64X8X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_64X8X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
 
+%macro STORE_4D_RESULTS 0
 %if ABI_IS_32BIT
     pop         rbp
 %endif
@@ -725,27 +745,82 @@ sym(vp9_sad16x16x4d_sse3):
         movq            xmm0,       xmm4
         psrldq          xmm4,       8
 
-        paddw           xmm0,       xmm4
+        paddd           xmm0,       xmm4
         movd            [rcx],      xmm0
 ;-
         movq            xmm0,       xmm5
         psrldq          xmm5,       8
 
-        paddw           xmm0,       xmm5
+        paddd           xmm0,       xmm5
         movd            [rcx+4],    xmm0
 ;-
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
 
-        paddw           xmm0,       xmm6
+        paddd           xmm0,       xmm6
         movd            [rcx+8],    xmm0
 ;-
         movq            xmm0,       xmm7
         psrldq          xmm7,       8
 
-        paddw           xmm0,       xmm7
+        paddd           xmm0,       xmm7
         movd            [rcx+12],   xmm0
+%endmacro
+
+    STORE_4D_RESULTS
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad32x32x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad32x32x4d_sse3) PRIVATE
+sym(vp9_sad32x32x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+%macro PROCESS_32X4X4 2-3+
+        PROCESS_16X2X4 %1, %3
+        PROCESS_16X2X4 1, %3, 16
+        PROCESS_16X2X4 2, %3
+        PROCESS_16X2X4 %2, %3, 16
+%endmacro
+
+        PROCESS_32X4X4 3, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_32X4X4 2, 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+    STORE_4D_RESULTS
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad16x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x4d_sse3) PRIVATE
+sym(vp9_sad16x16x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
 
+    STORE_4D_RESULTS
     STACK_FRAME_DESTROY_X4
 
 ;void vp9_sad16x8x4d_sse3(
-- 
GitLab