Merge "Add subtract_block SSE2 version and unit test."

7756e989 · Ronald S. Bultje · Gerrit Code Review · 9a480482 · 25c588b1 · 7756e989
Commit 7756e989 authored 11 years ago by Ronald S. Bultje Committed by Gerrit Code Review 11 years ago
--- a/test/test.mk
+++ b/test/test.mk
@@ -66,6 +66,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc

--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+}
+
+typedef void (*subtract_fn_t)(int rows, int cols,
+                              int16_t *diff_ptr, ptrdiff_t diff_stride,
+                              const uint8_t *src_ptr, ptrdiff_t src_stride,
+                              const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+
+namespace vp9 {
+
+class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  // FIXME(rbultje) split in its own file
+  for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
+       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+    const int block_width  = 4 << b_width_log2(bsize);
+    const int block_height = 4 << b_height_log2(bsize);
+    int16_t *diff = new int16_t[block_width * block_height * 2];
+    uint8_t *pred = new uint8_t[block_width * block_height * 2];
+    uint8_t *src  = new uint8_t[block_width * block_height * 2];
+
+    for (int n = 0; n < 100; n++) {
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width * 2; ++c) {
+          src[r * block_width * 2 + c] = rnd.Rand8();
+          pred[r * block_width * 2 + c] = rnd.Rand8();
+        }
+      }
+
+      GetParam()(block_height, block_width, diff, block_width,
+                 src, block_width, pred, block_width);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width + c],
+                    (src[r * block_width + c] -
+                     pred[r * block_width + c])) << "r = " << r
+                                                 << ", c = " << c
+                                                 << ", bs = " << bsize;
+        }
+      }
+
+      GetParam()(block_height, block_width, diff, block_width * 2,
+                 src, block_width * 2, pred, block_width * 2);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width * 2 + c],
+                    (src[r * block_width * 2 + c] -
+                     pred[r * block_width * 2 + c])) << "r = " << r
+                                                     << ", c = " << c
+                                                     << ", bs = " << bsize;
+        }
+      }
+    }
+    delete[] diff;
+    delete[] pred;
+    delete[] src;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
+                        ::testing::Values(vp9_subtract_block_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
+                        ::testing::Values(vp9_subtract_block_sse2));
+#endif
+
+}  // namespace vp9
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -533,6 +533,9 @@ prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
 specialize vp9_block_error mmx sse2
 vp9_block_error_sse2=vp9_block_error_xmm

+prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
+specialize vp9_subtract_block sse2
+
 #
 # Structured Similarity (SSIM)
 #

--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -22,10 +22,10 @@
 DECLARE_ALIGNED(16, extern const uint8_t,
                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);

-void vp9_subtract_block(int rows, int cols,
-                        int16_t *diff_ptr, int diff_stride,
-                        const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *pred_ptr, int pred_stride) {
+void vp9_subtract_block_c(int rows, int cols,
+                          int16_t *diff_ptr, ptrdiff_t diff_stride,
+                          const uint8_t *src_ptr, ptrdiff_t src_stride,
+                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  int r, c;

  for (r = 0; r < rows; r++) {

--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);

-void vp9_subtract_block(int rows, int cols,
-                        int16_t *diff_ptr, int diff_stride,
-                        const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *pred_ptr, int pred_stride);
 void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);

--- a/vp9/encoder/x86/vp9_subtract_mmx.asm
+++ b/vp9/encoder/x86/vp9_subtract_mmx.asm
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_mmx_impl) PRIVATE
-sym(vp9_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx) PRIVATE
-sym(vp9_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            16
-            pxor        mm0,            mm0
-
-.submby_loop:
-
-            movq        mm1,            [rsi]
-            movq        mm3,            [rax]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi],          mm1
-            movq        [rdi+8],        mm2
-
-
-            movq        mm1,            [rsi+8]
-            movq        mm3,            [rax+8]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi+16],       mm1
-            movq        [rdi+24],       mm2
-
-
-            add         rdi,            32
-            add         rax,            16
-
-            lea         rsi,            [rsi+rdx]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx) PRIVATE
-sym(vp9_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    ;short *udiff = diff + 256;
-    ;short *vdiff = diff + 320;
-    ;unsigned char *upred = pred + 256;
-    ;unsigned char *vpred = pred + 320;
-
-        ;unsigned char  *z    = usrc;
-        ;unsigned short *diff = udiff;
-        ;unsigned char  *Predictor= upred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-        ;unsigned char  *z    = vsrc;
-        ;unsigned short *diff = vdiff;
-        ;unsigned char  *Predictor= vpred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(2) ;z = usrc
-            add     rdi,        320*2  ;diff = diff + 320 (shorts)
-            add     rax,        320    ;Predictor = pred + 320
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vp9/encoder/x86/vp9_subtract_sse2.asm
@@ -8,349 +8,121 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;

-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_sse2_impl) PRIVATE
-sym(vp9_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2) PRIVATE
-sym(vp9_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            8      ; do two lines at one time
-
-.submby_loop:
-            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
-            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
-
-            movdqa      xmm2,           xmm0
-            psubb       xmm0,           xmm1
-
-            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm2,           [GLOBAL(t80)]
-            pcmpgtb     xmm1,           xmm2            ; obtain sign information
-
-            movdqa      xmm2,    xmm0
-            movdqa      xmm3,    xmm1
-            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi],   xmm0
-            movdqa      XMMWORD PTR [rdi +16], xmm2
-
-            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
-            movdqa      xmm5,           XMMWORD PTR [rax + 16]
-
-            movdqa      xmm6,           xmm4
-            psubb       xmm4,           xmm5
-
-            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm6,           [GLOBAL(t80)]
-            pcmpgtb     xmm5,           xmm6            ; obtain sign information
-
-            movdqa      xmm6,    xmm4
-            movdqa      xmm7,    xmm5
-            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
-            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi +32], xmm4
-            movdqa      XMMWORD PTR [rdi +48], xmm6
-
-            add         rdi,            64
-            add         rax,            32
-            lea         rsi,            [rsi+rdx*2]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2) PRIVATE
-sym(vp9_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            lea     rcx,        [rdx + rdx*2]
-
-            ;u
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-            ;v
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        64*2  ;diff = diff + 320 (shorts)
-            add     rax,        64    ;Predictor = pred + 320
-
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_subtract_block(int rows, int cols,
+;                         int16_t *diff, ptrdiff_t diff_stride,
+;                         const uint8_t *src, ptrdiff_t src_stride,
+;                         const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+                        rows, cols, diff, diff_stride, src, src_stride, \
+                        pred, pred_stride
+%define pred_str colsq
+  pxor                  m7, m7         ; dedicated zero register
+  cmp                colsd, 4
+  je .case_4
+  cmp                colsd, 8
+  je .case_8
+  cmp                colsd, 16
+  je .case_16
+  cmp                colsd, 32
+  je .case_32
+
+%macro loop16 6
+  mova                  m0, [srcq+%1]
+  mova                  m4, [srcq+%2]
+  mova                  m1, [predq+%3]
+  mova                  m5, [predq+%4]
+  punpckhbw             m2, m0, m7
+  punpckhbw             m3, m1, m7
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  psubw                 m2, m3
+  psubw                 m0, m1
+  punpckhbw             m1, m4, m7
+  punpckhbw             m3, m5, m7
+  punpcklbw             m4, m7
+  punpcklbw             m5, m7
+  psubw                 m1, m3
+  psubw                 m4, m5
+  mova [diffq+mmsize*0+%5], m0
+  mova [diffq+mmsize*1+%5], m2
+  mova [diffq+mmsize*0+%6], m4
+  mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+  mov             pred_str, pred_stridemp
+.loop_64:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_64
+  RET
+
+.case_32:
+  mov             pred_str, pred_stridemp
+.loop_32:
+  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_32
+  RET
+
+.case_16:
+  mov             pred_str, pred_stridemp
+.loop_16:
+  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                predq, [predq+pred_str*2]
+  lea                 srcq, [srcq+src_strideq*2]
+  sub                rowsd, 2
+  jg .loop_16
+  RET
+
+%macro loop_h 0
+  movh                  m0, [srcq]
+  movh                  m2, [srcq+src_strideq]
+  movh                  m1, [predq]
+  movh                  m3, [predq+pred_str]
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  punpcklbw             m2, m7
+  punpcklbw             m3, m7
+  psubw                 m0, m1
+  psubw                 m2, m3
+  mova             [diffq], m0
+  mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+  mov             pred_str, pred_stridemp
+.loop_8:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_8
+  RET
+
+INIT_MMX
+.case_4:
+  mov             pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  emms
+  RET
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/x86/vp9_dct_mmx.h"
-
-// TODO(jimbankoski) Consider rewriting the c to take the same values rather
-// than going through these pointer conversions
-#if 0 && HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_mmx(input,   output,    pitch);
-  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = *(bd->base_dst) + bd->dst;
-  // TODO(jingning): The prototype function in c has been changed. Need to
-  // modify the mmx and sse versions.
-  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if 0 && HAVE_SSE2
-void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                              short *diff, unsigned char *predictor,
-                              int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = *(bd->base_dst) + bd->dst;
-  // TODO(jingning): The prototype function in c has been changed. Need to
-  // modify the mmx and sse versions.
-  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -73,13 +73,11 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h


 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm