Commit 714aa9f3 authored by Jim Bankoski's avatar Jim Bankoski

this commit converts all sad ptrs to uint32

sse4_1 code used uint16_t for returning sad, but that
won't work for 32x32 or 64x64.   This code fixes the
assembly for those and also reenables sse4_1 on linux

Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81
parent b715e371
......@@ -997,17 +997,6 @@ process_common_toolchain() {
#error "not x32"
#endif
EOF
soft_enable runtime_cpu_detect
soft_enable mmx
soft_enable sse
soft_enable sse2
soft_enable sse3
soft_enable ssse3
if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
else
soft_enable sse4_1
fi
case ${tgt_os} in
win*)
......@@ -1061,6 +1050,18 @@ EOF
;;
esac
soft_enable runtime_cpu_detect
soft_enable mmx
soft_enable sse
soft_enable sse2
soft_enable sse3
soft_enable ssse3
if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
else
soft_enable sse4_1
fi
case "${AS}" in
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm
......
......@@ -449,25 +449,25 @@ specialize vp9_sad8x8x3 sse3
prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x3 sse3
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad64x64x8
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad32x32x8
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad16x16x8 sse4
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad16x8x8 sse4
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad8x16x8 sse4
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad8x8x8 sse4
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array"
prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
specialize vp9_sad4x4x8 sse4
prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
......@@ -490,7 +490,6 @@ specialize vp9_sad8x8x4d sse2
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
......
......@@ -1782,7 +1782,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
unsigned int sad_array[3];
int_mv fcenter_mv;
......
This diff is collapsed.
......@@ -29,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned short *sad_array);
unsigned int *sad_array);
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
int source_stride,
......
......@@ -154,6 +154,16 @@
paddw xmm1, xmm5
%endmacro
%macro WRITE_AS_INTS 0
mov rdi, arg(4) ;Results
pxor xmm0, xmm0
movdqa xmm2, xmm1
punpcklwd xmm1, xmm0
punpckhwd xmm2, xmm0
movdqa [rdi], xmm1
movdqa [rdi + 16], xmm2
%endmacro
;void vp9_sad16x16x8_sse4(
; const unsigned char *src_ptr,
......@@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
......@@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 1
PROCESS_16X2X8 0
PROCESS_16X2X8 0
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
......@@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
......@@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 1
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
......@@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
PROCESS_4X2X8 1
PROCESS_4X2X8 0
PROCESS_4X2X8 1
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
movdqa XMMWORD PTR [rdi], xmm1
WRITE_AS_INTS
; begin epilog
pop rdi
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment