Commit da6505ad authored by Ronald S. Bultje's avatar Ronald S. Bultje
Browse files

dsputil: make add_hfyu_left_prediction_sse4() support unaligned src.

This makes add_hfyu_left_prediction_sse4() handle sources that are not
16-byte aligned in its own function rather than by proxying the call to
add_hfyu_left_prediction_ssse3(). This fixes a crash on Win64, since the
sse4 version clobbers xmm6, but the ssse3 version (which uses MMX regs)
does not restore it, thus leading to XMM clobbering and RSP being off.

Fixes bug 342.
parent 9cc74c9f
...@@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to ...@@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
RET RET
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
add srcq, wq add srcq, wq
add dstq, wq add dstq, wq
neg wq neg wq
%%.loop: %%.loop:
%if %2
mova m1, [srcq+wq] mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
mova m2, m1 mova m2, m1
psllw m1, 8 psllw m1, 8
paddb m1, m2 paddb m1, m2
...@@ -435,7 +439,7 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left ...@@ -435,7 +439,7 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
mova m3, [pb_zz11zz55zz99zzdd] mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm movd m0, leftm
psllq m0, 56 psllq m0, 56
ADD_HFYU_LEFT_LOOP 1 ADD_HFYU_LEFT_LOOP 1, 1
INIT_XMM INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
...@@ -446,12 +450,14 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left ...@@ -446,12 +450,14 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
movd m0, leftm movd m0, leftm
pslldq m0, 15 pslldq m0, 15
test srcq, 15 test srcq, 15
jnz add_hfyu_left_prediction_ssse3.skip_prologue jnz .src_unaligned
test dstq, 15 test dstq, 15
jnz .unaligned jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP 1 ADD_HFYU_LEFT_LOOP 1, 1
.unaligned: .dst_unaligned:
ADD_HFYU_LEFT_LOOP 0 ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
; float scalarproduct_float_sse(const float *v1, const float *v2, int len) ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment