diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 57541751640246ac6d112ad83afce995319e0188..f0336ab175f7bda413c7bc0706ec2907fc4bcb21 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -586,52 +586,45 @@ sym(vp8_sad16x16_sse3):
 
     STACK_FRAME_CREATE_X3
 
-        lea             end_ptr,    [src_ptr+src_stride*8]
-
-        lea             end_ptr,    [end_ptr+src_stride*8]
-        pxor            mm7,        mm7
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7
 
 .vp8_sad16x16_sse3_loop:
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
 
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
-
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
-
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
 
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
+        psadbw          xmm0,       xmm1
 
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
 
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
 
         lea             src_ptr,    [src_ptr+src_stride*2]
         lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
 
-        cmp             src_ptr,    end_ptr
+        sub             end_ptr,     1
         jne             .vp8_sad16x16_sse3_loop
 
-        movq            ret_var,    mm7
-
-.vp8_sad16x16_early_exit:
-
-        mov             rax,        ret_var
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0
 
     STACK_FRAME_DESTROY_X3