Commit 58083cb3 authored by Fritz Koenig
Browse files

Revert "Remove stack shadowing for x86-64"

This reverts commit 15acc84f.

Change-Id: Ia640be8cbc134432914849c1750f62575ea084e6
parent 213f7b09
......@@ -8,171 +8,24 @@
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3 (no macro parameters)
;
; Function prologue that binds the symbolic operand names src_ptr,
; src_stride, ref_ptr, ref_stride, end_ptr, ret_var, result_ptr and
; max_err to concrete registers / stack slots for the ABI being built.
; Pair with STACK_FRAME_DESTROY_X3.
;
; ABI_IS_32BIT branch : builds an rbp frame, saves rsi/rdi and loads the
;                       first four arguments from arg(0)..arg(3).
; x64 output format   : emits no instructions; names map onto rcx, rdx,
;                       r8, r9 plus a stack slot for the fifth argument.
; any other format    : emits no instructions; names map onto rdi, rsi,
;                       rdx, rcx, r8.
;
; result_ptr and max_err alias the same fifth-argument location in every
; branch; which meaning applies depends on the function using the macro.
;
; NOTE(review): only the 32-bit branch sign-extends the strides (movsxd).
;   The 64-bit branches use the stride registers exactly as passed --
;   confirm the callers' prototypes guarantee valid upper 32 bits.
; NOTE(review): ret_var is rbx in the 32-bit branch but rbx is not saved
;   here -- confirm users of ret_var preserve it themselves.
; NOTE(review): nothing here saves xmm6/xmm7, which the PROCESS_16X2X3
;   macro in this file writes; those are callee-saved under the
;   Microsoft x64 convention -- confirm for the x64 output format.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define ref_ptr rdi
%define ref_stride rdx
%define end_ptr rcx
%define ret_var rbx
; fifth argument; read through the arg() accessor
%define result_ptr arg(4)
%define max_err arg(4)
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
; rsi/rdi are about to be overwritten with the two pointers; restored by
; STACK_FRAME_DESTROY_X3 in reverse order
push rsi
push rdi
mov rsi, arg(0) ; src_ptr
mov rdi, arg(2) ; ref_ptr
; strides arrive as 32-bit values and are widened to full register width
movsxd rax, dword ptr arg(1) ; src_stride
movsxd rdx, dword ptr arg(3) ; ref_stride
%else
%ifidn __OUTPUT_FORMAT__,x64
; register mapping matches the Microsoft x64 argument order
; (rcx, rdx, r8, r9); r10/r11 serve as scratch names
%define src_ptr rcx
%define src_stride rdx
%define ref_ptr r8
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
; fifth argument on the stack: 8 bytes of return address + 4*8 bytes of
; shadow space above rsp. Only valid while rsp still has its entry value.
%define result_ptr [rsp+8+4*8]
%define max_err [rsp+8+4*8]
%else
; register mapping matches the System V AMD64 argument order
; (rdi, rsi, rdx, rcx, r8); r9/r10 serve as scratch names
%define src_ptr rdi
%define src_stride rsi
%define ref_ptr rdx
%define ref_stride rcx
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
%define max_err r8
%endif
%endif
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3 (no macro parameters)
;
; Function epilogue matching STACK_FRAME_CREATE_X3. Clears the symbolic
; operand names (redefines each to empty), undoes whatever the create
; macro pushed for the current ABI, and ends with `ret`, so it must be
; the last thing in the function body.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
; drop the aliases so they cannot leak into code assembled afterwards
%define src_ptr
%define src_stride
%define ref_ptr
%define ref_stride
%define end_ptr
%define ret_var
%define result_ptr
%define max_err
%if ABI_IS_32BIT
; reverse of the create macro: it pushed rbp, rsi, rdi in that order
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
; nothing to restore: the create macro emits no instructions for x64
%endif
%endif
ret
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4 (no macro parameters)
;
; Function prologue for the four-reference variants. Binds src_ptr,
; src_stride, r0_ptr..r3_ptr, ref_stride and result_ptr to registers /
; stack slots for the ABI being built, and fills r0_ptr..r3_ptr through
; LOAD_X4_ADDRESSES. Pair with STACK_FRAME_DESTROY_X4.
;
; LOAD_X4_ADDRESSES is defined elsewhere in this file; presumably it
; loads four consecutive pointers from the array addressed by its first
; operand into its remaining four operands -- confirm against its
; definition.
;
; NOTE(review): the 32-bit branch pushes rbp a second time and then
;   reuses rbp as ref_stride. STACK_FRAME_DESTROY_X4 pops only rbx, rdi,
;   rsi and rbp once, so the extra push must be balanced by the function
;   body before the destroy macro runs. If arg() is rbp-relative (it is
;   defined in x86_abi_support.asm, not visible here), result_ptr is
;   also unusable until rbp has been restored -- verify in the callers.
; NOTE(review): the 64-bit branches use the stride registers exactly as
;   passed, without the sign extension done in the 32-bit branch.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
%define src_ptr rsi
%define src_stride rax
%define r0_ptr rcx
%define r1_ptr rdx
%define r2_ptr rbx
%define r3_ptr rdi
%define ref_stride rbp
%define result_ptr arg(4)
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
push rbx
; second save of rbp: it is overwritten with ref_stride below
push rbp
mov rdi, arg(2) ; ref_ptr_base
; third destination is rax for now (rbx is still needed as a temporary
; below); rdi is both the base and the last destination
LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
mov rsi, arg(0) ; src_ptr
movsxd rbx, dword ptr arg(1) ; src_stride
movsxd rbp, dword ptr arg(3) ; ref_stride
; swap so the registers match the %defines above:
; rbx = r2_ptr, rax = src_stride
xchg rbx, rax
%else
%ifidn __OUTPUT_FORMAT__,x64
; register mapping matches the Microsoft x64 argument order
; (rcx, rdx, r8, r9)
%define src_ptr rcx
%define src_stride rdx
%define r0_ptr rsi
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
; fifth argument on the stack: 8 bytes of return address + 8 bytes for
; the rsi pushed below + 4*8 bytes of shadow space
%define result_ptr [rsp+16+4*8]
; rsi becomes r0_ptr; it is callee-saved under the Microsoft x64
; convention, so save it (popped again by STACK_FRAME_DESTROY_X4)
push rsi
; r8 is both the pointer-array base and r3_ptr, the last destination
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%else
; register mapping matches the System V AMD64 argument order
; (rdi, rsi, rdx, rcx, r8)
%define src_ptr rdi
%define src_stride rsi
%define r0_ptr r9
%define r1_ptr r10
%define r2_ptr r11
%define r3_ptr rdx
%define ref_stride rcx
%define result_ptr r8
; rdx is both the pointer-array base and r3_ptr, the last destination
LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
%endif
%endif
%endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4 (no macro parameters)
;
; Function epilogue matching STACK_FRAME_CREATE_X4. Clears the symbolic
; operand names (redefines each to empty), restores the registers the
; create macro saved for the current ABI, and ends with `ret`, so it
; must be the last thing in the function body.
;
; NOTE(review): in the 32-bit branch the create macro pushes rbp twice
;   but only one `pop rbp` is emitted here; the other pop has to happen
;   in the function body before this macro -- verify in the callers.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
; drop the aliases so they cannot leak into code assembled afterwards
%define src_ptr
%define src_stride
%define r0_ptr
%define r1_ptr
%define r2_ptr
%define r3_ptr
%define ref_stride
%define result_ptr
%if ABI_IS_32BIT
; reverse of the create macro's rsi, rdi, rbx pushes
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
; restore the rsi saved by the create macro for this output format
pop rsi
%endif
%endif
ret
%endmacro
%include "vpx_ports/x86_abi_support.asm"
%macro PROCESS_16X2X3 5
%if %1==0
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm5, XMMWORD PTR [%3]
lddqu xmm6, XMMWORD PTR [%3+1]
lddqu xmm7, XMMWORD PTR [%3+2]
%macro PROCESS_16X2X3 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm5, XMMWORD PTR [rdi]
lddqu xmm6, XMMWORD PTR [rdi+1]
lddqu xmm7, XMMWORD PTR [rdi+2]
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm1, XMMWORD PTR [%3]
lddqu xmm2, XMMWORD PTR [%3+1]
lddqu xmm3, XMMWORD PTR [%3+2]
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm1, XMMWORD PTR [rdi]
lddqu xmm2, XMMWORD PTR [rdi+1]
lddqu xmm3, XMMWORD PTR [rdi+2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
......@@ -182,15 +35,13 @@
paddw xmm6, xmm2
paddw xmm7, xmm3
%endif
movdqa xmm0, XMMWORD PTR [%2+%4]
lddqu xmm1, XMMWORD PTR [%3+%5]
lddqu xmm2, XMMWORD PTR [%3+%5+1]
lddqu xmm3, XMMWORD PTR [%3+%5+2]
%if %1==0 || %1==1
lea %2, [%2+%4*2]
lea %3, [%3+%5*2]
%endif
movdqa xmm0, XMMWORD PTR [rsi+rax]
lddqu xmm1, XMMWORD PTR [rdi+rdx]
lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
......@@ -201,21 +52,21 @@
paddw xmm7, xmm3
%endmacro
%macro PROCESS_8X2X3 5
%if %1==0
movq mm0, QWORD PTR [%2]
movq mm5, QWORD PTR [%3]
movq mm6, QWORD PTR [%3+1]
movq mm7, QWORD PTR [%3+2]
%macro PROCESS_8X2X3 1
%if %1
movq mm0, QWORD PTR [rsi]
movq mm5, QWORD PTR [rdi]
movq mm6, QWORD PTR [rdi+1]
movq mm7, QWORD PTR [rdi+2]
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
movq mm0, QWORD PTR [%2]
movq mm1, QWORD PTR [%3]
movq mm2, QWORD PTR [%3+1]
movq mm3, QWORD PTR [%3+2]
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
movq mm2, QWORD PTR [rdi+1]
movq mm3, QWORD PTR [rdi+2]
psadbw mm1, mm0
psadbw mm2, mm0
......@@ -225,15 +76,13 @@
paddw mm6, mm2
paddw mm7, mm3
%endif
movq mm0, QWORD PTR [%2+%4]
movq mm1, QWORD PTR [%3+%5]
movq mm2, QWORD PTR [%3+%5+1]
movq mm3, QWORD PTR [%3+%5+2]
%if %1==0 || %1==1
lea %2, [%2+%4*2]
lea %3, [%3+%5*2]
%endif
movq mm0, QWORD PTR [rsi+rax]
movq mm1, QWORD PTR [rdi+rdx]
movq mm2, QWORD PTR [rdi+rdx+1]
movq mm3, QWORD PTR [rdi+rdx+2]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
psadbw mm1, mm0
psadbw mm2, mm0
......@@ -252,117 +101,115 @@
mov %5, [%1+REG_SZ_BYTES*3]
%endmacro
%macro PROCESS_16X2X4 8
%if %1==0
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm4, XMMWORD PTR [%3]
lddqu xmm5, XMMWORD PTR [%4]
lddqu xmm6, XMMWORD PTR [%5]
lddqu xmm7, XMMWORD PTR [%6]
%macro PROCESS_16X2X4 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm4, XMMWORD PTR [rcx]
lddqu xmm5, XMMWORD PTR [rdx]
lddqu xmm6, XMMWORD PTR [rbx]
lddqu xmm7, XMMWORD PTR [rdi]
psadbw xmm4, xmm0
psadbw xmm5, xmm0
psadbw xmm6, xmm0
psadbw xmm7, xmm0
%else
movdqa xmm0, XMMWORD PTR [%2]
lddqu xmm1, XMMWORD PTR [%3]
lddqu xmm2, XMMWORD PTR [%4]
lddqu xmm3, XMMWORD PTR [%5]
movdqa xmm0, XMMWORD PTR [rsi]
lddqu xmm1, XMMWORD PTR [rcx]
lddqu xmm2, XMMWORD PTR [rdx]
lddqu xmm3, XMMWORD PTR [rbx]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
lddqu xmm1, XMMWORD PTR [%6]
lddqu xmm1, XMMWORD PTR [rdi]
paddw xmm5, xmm2
paddw xmm6, xmm3
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endif
movdqa xmm0, XMMWORD PTR [%2+%7]
lddqu xmm1, XMMWORD PTR [%3+%8]
lddqu xmm2, XMMWORD PTR [%4+%8]
lddqu xmm3, XMMWORD PTR [%5+%8]
movdqa xmm0, XMMWORD PTR [rsi+rax]
lddqu xmm1, XMMWORD PTR [rcx+rbp]
lddqu xmm2, XMMWORD PTR [rdx+rbp]
lddqu xmm3, XMMWORD PTR [rbx+rbp]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
paddw xmm4, xmm1
lddqu xmm1, XMMWORD PTR [%6+%8]
lddqu xmm1, XMMWORD PTR [rdi+rbp]
paddw xmm5, xmm2
paddw xmm6, xmm3
%if %1==0 || %1==1
lea %2, [%2+%7*2]
lea %3, [%3+%8*2]
lea rsi, [rsi+rax*2]
lea rcx, [rcx+rbp*2]
lea %4, [%4+%8*2]
lea %5, [%5+%8*2]
lea rdx, [rdx+rbp*2]
lea rbx, [rbx+rbp*2]
lea rdi, [rdi+rbp*2]
lea %6, [%6+%8*2]
%endif
psadbw xmm1, xmm0
paddw xmm7, xmm1
%endmacro
%macro PROCESS_8X2X4 8
%if %1==0
movq mm0, QWORD PTR [%2]
movq mm4, QWORD PTR [%3]
movq mm5, QWORD PTR [%4]
movq mm6, QWORD PTR [%5]
movq mm7, QWORD PTR [%6]
%macro PROCESS_8X2X4 1
%if %1
movq mm0, QWORD PTR [rsi]
movq mm4, QWORD PTR [rcx]
movq mm5, QWORD PTR [rdx]
movq mm6, QWORD PTR [rbx]
movq mm7, QWORD PTR [rdi]
psadbw mm4, mm0
psadbw mm5, mm0
psadbw mm6, mm0
psadbw mm7, mm0
%else
movq mm0, QWORD PTR [%2]
movq mm1, QWORD PTR [%3]
movq mm2, QWORD PTR [%4]
movq mm3, QWORD PTR [%5]
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rcx]
movq mm2, QWORD PTR [rdx]
movq mm3, QWORD PTR [rbx]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
movq mm1, QWORD PTR [%6]
movq mm1, QWORD PTR [rdi]
paddw mm5, mm2
paddw mm6, mm3
psadbw mm1, mm0
paddw mm7, mm1
%endif
movq mm0, QWORD PTR [%2+%7]
movq mm1, QWORD PTR [%3+%8]
movq mm2, QWORD PTR [%4+%8]
movq mm3, QWORD PTR [%5+%8]
movq mm0, QWORD PTR [rsi+rax]
movq mm1, QWORD PTR [rcx+rbp]
movq mm2, QWORD PTR [rdx+rbp]
movq mm3, QWORD PTR [rbx+rbp]
psadbw mm1, mm0
psadbw mm2, mm0
psadbw mm3, mm0
paddw mm4, mm1
movq mm1, QWORD PTR [%6+%8]
movq mm1, QWORD PTR [rdi+rbp]
paddw mm5, mm2
paddw mm6, mm3
%if %1==0 || %1==1
lea %2, [%2+%7*2]
lea %3, [%3+%8*2]
lea rsi, [rsi+rax*2]
lea rcx, [rcx+rbp*2]
lea %4, [%4+%8*2]
lea %5, [%5+%8*2]
lea rdx, [rdx+rbp*2]
lea rbx, [rbx+rbp*2]
lea rdi, [rdi+rbp*2]
lea %6, [%6+%8*2]
%endif
psadbw mm1, mm0
paddw mm7, mm1
......@@ -376,39 +223,54 @@
; int *results)
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
STACK_FRAME_CREATE_X3
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
mov rcx, result_ptr
mov rdi, arg(4) ;Results
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx], xmm0
movd [rdi], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+4], xmm0
movd [rdi+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+8], xmm0
movd [rdi+8], xmm0
STACK_FRAME_DESTROY_X3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad16x8x3_sse3(
; unsigned char *src_ptr,
......@@ -418,35 +280,50 @@ sym(vp8_sad16x16x3_sse3):
; int *results)
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
STACK_FRAME_CREATE_X3
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
mov rcx, result_ptr
mov rdi, arg(4) ;Results
movq xmm0, xmm5
psrldq xmm5, 8
paddw xmm0, xmm5
movd [rcx], xmm0
movd [rdi], xmm0
;-
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm6
movd [rcx+4], xmm0
movd [rdi+4], xmm0
;-
movq xmm0, xmm7
psrldq xmm7, 8
paddw xmm0, xmm7
movd [rcx+8], xmm0
movd [rdi+8], xmm0
STACK_FRAME_DESTROY_X3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x16x3_sse3(
; unsigned char *src_ptr,
......@@ -456,26 +333,40 @@ sym(vp8_sad16x8x3_sse3):
; int *results)
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
STACK_FRAME_CREATE_X3
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
mov rcx, result_ptr
PROCESS_8X2X3 1
PROCESS_8X2X3 0
PROCESS_8X2X3 0
PROCESS_8X2X3 0
PROCESS_8X2X3 0
PROCESS_8X2X3 0
PROCESS_8X2X3 0
PROCESS_8X2X3 0
punpckldq mm5, mm6
mov rdi, arg(4) ;Results
movq [rcx], mm5
movd [rcx+8], mm7
movd [rdi], mm5
movd [rdi+4], mm6
movd [rdi+8], mm7
STACK_FRAME_DESTROY_X3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_sad8x8x3_sse3(
; unsigned char *src_ptr,
......@@ -485,22 +376,36 @@ sym(vp8_sad8x16x3_sse3):
; int *results)
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):