From 7a07eea13fc94036f54cdb6f1233b9af8b094ced Mon Sep 17 00:00:00 2001
From: John Koleszar <jkoleszar@google.com>
Date: Mon, 28 Jan 2013 16:59:03 -0800
Subject: [PATCH] Convert subpixel filters to use convolve framework

Update the code to call the new convolution functions to do subpixel
prediction rather than the existing functions. Remove the old C and
assembly code, since it is unused. This causes a 50% performance
reduction on the decoder, but that will be resolved when the asm for
the new functions is available.

There is no consensus on whether 6-tap or 2-tap predictors will be
supported in the final codec, so for now these filters are implemented
in terms of the 8-tap code (their coefficients are zero-padded to
8 taps), which allows quality testing of these modes to continue.
Implementing the lower-complexity algorithms is a simple exercise,
should it become necessary.

This code produces slightly better results in the EIGHTTAP_SMOOTH
case, since the filter is now applied in only one direction when
the subpel motion is only in one direction. Like the previous code,
the filtering is skipped entirely on full-pel MVs. This combination
seems to give the best quality gains, but this may be indicative of a
bug in the encoder's filter selection, since the encoder could
achieve the result of skipping the filtering on full-pel by selecting
one of the other filters. This should be revisited.

Quality gains on the derf set are positive on almost all clips. The
only clip that seemed to be hurt at all datarates was football
(-0.115% PSNR average, -0.587% min). Overall average gains are
0.375% PSNR and 0.347% SSIM.

Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff
---
 vp9/common/generic/vp9_systemdependent.c |    2 -
 vp9/common/ppc/vp9_systemdependent.c     |    1 -
 vp9/common/vp9_blockd.h                  |   11 +-
 vp9/common/vp9_convolve.c                |   46 +
 vp9/common/vp9_filter.c                  | 1119 +---------------
 vp9/common/vp9_filter.h                  |   11 +-
 vp9/common/vp9_findnearmv.c              |    8 +-
 vp9/common/vp9_reconinter.c              |  324 ++---
 vp9/common/vp9_reconinter.h              |    6 +-
 vp9/common/vp9_rtcd_defs.sh              |  136 +-
 vp9/common/vp9_subpixel.h                |   20 -
 vp9/common/x86/vp9_asm_stubs.c           |  566 --------
 vp9/common/x86/vp9_filter_sse2.c         |  290 -----
 vp9/common/x86/vp9_filter_sse4.c         |  362 ------
 vp9/common/x86/vp9_subpixel_8t_ssse3.asm |  550 --------
 vp9/common/x86/vp9_subpixel_mmx.asm      |  268 ----
 vp9/common/x86/vp9_subpixel_sse2.asm     | 1372 --------------------
 vp9/common/x86/vp9_subpixel_ssse3.asm    | 1515 ----------------------
 vp9/common/x86/vp9_subpixel_x86.h        |  109 --
 vp9/encoder/vp9_onyx_if.c                |    2 +
 vp9/encoder/vp9_rdopt.c                  |    4 +-
 vp9/encoder/vp9_temporal_filter.c        |   31 +-
 vp9/encoder/vp9_variance_c.c             |   28 +-
 vp9/vp9_common.mk                        |   15 -
 24 files changed, 261 insertions(+), 6535 deletions(-)
 delete mode 100644 vp9/common/vp9_subpixel.h
 delete mode 100644 vp9/common/x86/vp9_filter_sse2.c
 delete mode 100644 vp9/common/x86/vp9_filter_sse4.c
 delete mode 100644 vp9/common/x86/vp9_subpixel_8t_ssse3.asm
 delete mode 100644 vp9/common/x86/vp9_subpixel_mmx.asm
 delete mode 100644 vp9/common/x86/vp9_subpixel_sse2.asm
 delete mode 100644 vp9/common/x86/vp9_subpixel_ssse3.asm
 delete mode 100644 vp9/common/x86/vp9_subpixel_x86.h

diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index b02f3f0834..79092cd0eb 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -11,8 +11,6 @@
 
 #include "./vpx_config.h"
 #include "vp9_rtcd.h"
-#include "vp9/common/vp9_subpixel.h"
-#include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 void vp9_machine_specific_config(VP9_COMMON *ctx) {
diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c
index 106a2b763e..02035191f3 100644
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ b/vp9/common/ppc/vp9_systemdependent.c
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_subpixel.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "recon.h"
 #include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b34f308d3a..241cb8a13f 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -16,9 +16,9 @@ void vpx_log(const char *format, ...);
 
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_convolve.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_treecoder.h"
-#include "vp9/common/vp9_subpixel.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
 
@@ -393,15 +393,8 @@ typedef struct macroblockd {
   void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
   void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
 
+  struct subpix_fn_table  subpix;
 
-  vp9_subpix_fn_t  subpixel_predict4x4;
-  vp9_subpix_fn_t  subpixel_predict8x4;
-  vp9_subpix_fn_t  subpixel_predict8x8;
-  vp9_subpix_fn_t  subpixel_predict16x16;
-  vp9_subpix_fn_t  subpixel_predict_avg4x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x8;
-  vp9_subpix_fn_t  subpixel_predict_avg16x16;
   int allow_high_precision_mv;
 
   int corrupted;
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index ed188c3f25..f21f1d84e8 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -297,3 +297,49 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h, 8);
 }
+
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+                       uint8_t *dst, int dst_stride,
+                       const int16_t *filter_x, int filter_x_stride,
+                       const int16_t *filter_y, int filter_y_stride,
+                       int w, int h) {
+  if (h == 16) {
+    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
+  } else if (h == 8) {
+    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
+  } else if (w == 8) {
+    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
+  } else {
+    // 4x4
+    int r;
+
+    for (r = 0; r < 4; ++r) {
+#if !(CONFIG_FAST_UNALIGNED)
+      dst[0]  = src[0];
+      dst[1]  = src[1];
+      dst[2]  = src[2];
+      dst[3]  = src[3];
+#else
+      *(uint32_t *)dst = *(const uint32_t *)src;
+#endif
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+                      uint8_t *dst, int dst_stride,
+                      const int16_t *filter_x, int filter_x_stride,
+                      const int16_t *filter_y, int filter_y_stride,
+                      int w, int h) {
+  int x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      dst[x] = (dst[x] + src[x] + 1) >> 1;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 07d8a169f6..5e425895fd 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -15,23 +15,23 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
-  { 128,   0 },
-  { 120,   8 },
-  { 112,  16 },
-  { 104,  24 },
-  {  96,  32 },
-  {  88,  40 },
-  {  80,  48 },
-  {  72,  56 },
-  {  64,  64 },
-  {  56,  72 },
-  {  48,  80 },
-  {  40,  88 },
-  {  32,  96 },
-  {  24, 104 },
-  {  16, 112 },
-  {   8, 120 }
+DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
+  { 0, 0, 0, 128,   0, 0, 0, 0 },
+  { 0, 0, 0, 120,   8, 0, 0, 0 },
+  { 0, 0, 0, 112,  16, 0, 0, 0 },
+  { 0, 0, 0, 104,  24, 0, 0, 0 },
+  { 0, 0, 0,  96,  32, 0, 0, 0 },
+  { 0, 0, 0,  88,  40, 0, 0, 0 },
+  { 0, 0, 0,  80,  48, 0, 0, 0 },
+  { 0, 0, 0,  72,  56, 0, 0, 0 },
+  { 0, 0, 0,  64,  64, 0, 0, 0 },
+  { 0, 0, 0,  56,  72, 0, 0, 0 },
+  { 0, 0, 0,  48,  80, 0, 0, 0 },
+  { 0, 0, 0,  40,  88, 0, 0, 0 },
+  { 0, 0, 0,  32,  96, 0, 0, 0 },
+  { 0, 0, 0,  24, 104, 0, 0, 0 },
+  { 0, 0, 0,  16, 112, 0, 0, 0 },
+  { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
 #define FILTER_ALPHA       0
@@ -144,1072 +144,21 @@ DECLARE_ALIGNED(16, const int16_t,
   { 1, -2, -7, 37, 80, 28, -8, -1}
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
-  {0,   0, 128,   0,   0, 0},
-  {1,  -5, 125,   8,  -2, 1},
-  {1,  -8, 122,  17,  -5, 1},
-  {2, -11, 116,  27,  -8, 2},
-  {3, -14, 110,  37, -10, 2},
-  {3, -15, 103,  47, -12, 2},
-  {3, -16,  95,  57, -14, 3},
-  {3, -16,  86,  67, -15, 3},
-  {3, -16,  77,  77, -16, 3},
-  {3, -15,  67,  86, -16, 3},
-  {3, -14,  57,  95, -16, 3},
-  {2, -12,  47, 103, -15, 3},
-  {2, -10,  37, 110, -14, 3},
-  {2,  -8,  27, 116, -11, 2},
-  {1,  -5,  17, 122,  -8, 1},
-  {1,  -2,   8, 125,  -5, 1}
+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = {
+  {0, 0,   0, 128,   0,   0, 0,  0},
+  {0, 1,  -5, 125,   8,  -2, 1,  0},
+  {0, 1,  -8, 122,  17,  -5, 1,  0},
+  {0, 2, -11, 116,  27,  -8, 2,  0},
+  {0, 3, -14, 110,  37, -10, 2,  0},
+  {0, 3, -15, 103,  47, -12, 2,  0},
+  {0, 3, -16,  95,  57, -14, 3,  0},
+  {0, 3, -16,  86,  67, -15, 3,  0},
+  {0, 3, -16,  77,  77, -16, 3,  0},
+  {0, 3, -15,  67,  86, -16, 3,  0},
+  {0, 3, -14,  57,  95, -16, 3,  0},
+  {0, 2, -12,  47, 103, -15, 3,  0},
+  {0, 2, -10,  37, 110, -14, 3,  0},
+  {0, 2,  -8,  27, 116, -11, 2,  0},
+  {0, 1,  -5,  17, 122,  -8, 1,  0},
+  {0, 1,  -2,   8, 125,  -5, 1,  0}
 };
-
-static void filter_block2d_first_pass_6(uint8_t *src_ptr,
-                                        int *output_ptr,
-                                        unsigned int src_pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
-                                         uint8_t *output_ptr,
-                                         int output_pitch,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass()
- * and this function is that filter_block2d_second_pass() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,
- * and then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
-                                             uint8_t *output_ptr,
-                                             int output_pitch,
-                                             unsigned int src_pixels_per_line,
-                                             unsigned int pixel_step,
-                                             unsigned int output_height,
-                                             unsigned int output_width,
-                                             const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +
-                       output_ptr[j] + 1) >> 1;
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(uint8_t *src_ptr,
-                             uint8_t *output_ptr,
-                             unsigned int src_pixels_per_line,
-                             int output_pitch,
-                             const int16_t *HFilter,
-                             const int16_t *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                               output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
-                   VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(uint8_t *src_ptr,
-                                 uint8_t *output_ptr,
-                                 unsigned int src_pixels_per_line,
-                                 int output_pitch,
-                                 const int16_t *HFilter,
-                                 const int16_t *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                                   output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
-                       HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 8, 8, 8, 8, VFilter);
-
-}
-
-void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                                   dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             uint8_t *dst_ptr,
-                             int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 8, 8, 4, 8, VFilter);
-}
-
-void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
-                               dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(
-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
-                                   dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
-  VPX_FILTER_4x4 = 0,
-  VPX_FILTER_8x8 = 1,
-  VPX_FILTER_8x4 = 2,
-  VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
-  {4, 4},
-  {8, 8},
-  {8, 4},
-  {16,16},
-};
-
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               const filter_size_t filter_size,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  // Between passes, we use an intermediate buffer whose height is extended to
-  // have enough horizontally filtered values as input for the vertical pass.
-  // This buffer is allocated to be big enough for the largest block type we
-  // support.
-  const int kInterp_Extend = 4;
-  const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
-
-  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
-   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
-   *                                 + kInterp_Extend
-   *                               = 3 + 16 + 4
-   *                               = 23
-   * and filter_max_width = 16
-   */
-  uint8_t intermediate_buffer[23 * 16];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
-
-  // Horizontal pass (src -> transposed intermediate).
-  {
-    uint8_t *output_ptr = intermediate_buffer;
-    const int src_next_row_stride = src_stride - output_width;
-    unsigned int i, j;
-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-    for (i = 0; i < intermediate_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        src_ptr++;
-        output_ptr += intermediate_height;
-      }
-      src_ptr += src_next_row_stride;
-      output_ptr += intermediate_next_stride;
-    }
-  }
-
-  // Vertical pass (transposed intermediate -> dst).
-  {
-    uint8_t *src_ptr = intermediate_buffer;
-    const int dst_next_row_stride = dst_stride - output_width;
-    unsigned int i, j;
-    for (i = 0; i < output_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
-        src_ptr += intermediate_height;
-      }
-      src_ptr += intermediate_next_stride;
-      dst_ptr += dst_next_row_stride;
-    }
-  }
-}
-
-void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,
-                                const unsigned int src_stride,
-                                const int16_t *HFilter_aligned16,
-                                const int16_t *VFilter_aligned16,
-                                uint8_t *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,
-                                  const unsigned int src_stride,
-                                  const int16_t *HFilter_aligned16,
-                                  const int16_t *VFilter_aligned16,
-                                  uint8_t *dst_ptr,
-                                  unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              const filter_size_t filter_size) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  unsigned int i, j;
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
-    }
-    output_ptr += output_stride;
-  }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8[xoffset];
-  VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8s[xoffset];
-  VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,
-                                         int src_pixels_per_line,
-                                         int xoffset,
-                                         int yoffset,
-                                         uint8_t *dst_ptr,
-                                         int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,
-                                          int src_pixels_per_line,
-                                          int xoffset,
-                                          int yoffset,
-                                          uint8_t *dst_ptr,
-                                          int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-  uint8_t tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-
-void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,
-                                         int src_pixels_per_line,
-                                         int xoffset,
-                                         int yoffset,
-                                         uint8_t *dst_ptr,
-                                         int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,
-                                          int src_pixels_per_line,
-                                          int xoffset,
-                                          int yoffset,
-                                          uint8_t *dst_ptr,
-                                          int dst_pitch) {
-  uint8_t tmp[8 * 8];
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
-                           8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,
-                                      int src_pixels_per_line,
-                                      int xoffset,
-                                      int yoffset,
-                                      uint8_t *dst_ptr,
-                                      int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,
-                                       int src_pixels_per_line,
-                                       int xoffset,
-                                       int yoffset,
-                                       uint8_t *dst_ptr,
-                                       int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,
-                                        int src_pixels_per_line,
-                                        int xoffset,
-                                        int yoffset,
-                                        uint8_t *dst_ptr,
-                                        int dst_pitch) {
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,
-                                           int src_pixels_per_line,
-                                           int xoffset,
-                                           int yoffset,
-                                           uint8_t *dst_ptr,
-                                           int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,
-                                            int src_pixels_per_line,
-                                            int xoffset,
-                                            int yoffset,
-                                            uint8_t *dst_ptr,
-                                            int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : uint8_t  *src_ptr    : Pointer to source block.
- *                  uint32_t  src_stride : Stride of source block.
- *                  uint32_t  height     : Block height.
- *                  uint32_t  width      : Block width.
- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : int32_t  *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the horizontal direction to produce the filtered output
- *                  block. Used to implement first-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(uint8_t *src_ptr,
-                                          uint16_t *dst_ptr,
-                                          unsigned int src_stride,
-                                          unsigned int height,
-                                          unsigned int width,
-                                          const int16_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply bilinear filter */
-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
-                    ((int)src_ptr[1] * vp9_filter[1]) +
-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride - width;
-    dst_ptr += width;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : int32_t  *src_ptr    : Pointer to source block.
- *                  uint32_t  dst_pitch  : Destination block pitch.
- *                  uint32_t  height     : Block height.
- *                  uint32_t  width      : Block width.
- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : uint16_t *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the vertical direction to produce the filtered output
- *                  block. Used to implement second-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(uint16_t *src_ptr,
-                                           uint8_t *dst_ptr,
-                                           int dst_pitch,
-                                           unsigned int height,
-                                           unsigned int width,
-                                           const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/*
- * As before for filter_block2d_second_pass_avg(), the functional difference
- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()
- * is that filter_block2d_bil_second_pass() does a bilinear filter on input
- * and stores the result in output; filter_block2d_bil_second_pass_avg(),
- * instead, does a bilinear filter on input, averages the resulting value
- * with the values already present in the output and stores the result of
- * that back into the output ((filter_result + dest + 1) >> 1).
- */
-static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,
-                                               uint8_t *dst_ptr,
-                                               int dst_pitch,
-                                               unsigned int height,
-                                               unsigned int width,
-                                               const int16_t *vp9_filter) {
-  unsigned int i, j;
-  int temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      temp = (((int)src_ptr[0]     * vp9_filter[0]) +
-              ((int)src_ptr[width] * vp9_filter[1]) +
-              (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
- *                  uint32_t  src_pitch        : Stride of source block.
- *                  uint32_t  dst_pitch        : Stride of destination block.
- *                  int32_t  *HFilter          : Array of 2 horizontal filter taps.
- *                  int32_t  *VFilter          : Array of 2 vertical filter taps.
- *                  int32_t  Width             : Block width
- *                  int32_t  Height            : Block height
- *
- *  OUTPUTS       : uint16_t *dst_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size can be handled here is 16x16
- *
- ****************************************************************************/
-static void filter_block2d_bil(uint8_t *src_ptr,
-                               uint8_t *dst_ptr,
-                               unsigned int src_pitch,
-                               unsigned int dst_pitch,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               int Width,
-                               int Height) {
-
-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-static void filter_block2d_bil_avg(uint8_t *src_ptr,
-                                   uint8_t *dst_ptr,
-                                   unsigned int src_pitch,
-                                   unsigned int dst_pitch,
-                                   const int16_t *HFilter,
-                                   const int16_t *VFilter,
-                                   int Width,
-                                   int Height) {
-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   uint8_t *dst_ptr,
-                                   int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               uint8_t *dst_ptr,
-                               int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 uint8_t *dst_ptr,
-                                 int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     uint8_t *dst_ptr,
-                                     int dst_pitch) {
-  const int16_t *HFilter;
-  const int16_t *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 16, 16);
-}
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index cd666578d3..1ccfdaac25 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -21,10 +21,17 @@
 
 #define SUBPEL_SHIFTS 16
 
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
 extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
 
+// vp9_bilinear_filters rows are 8 taps wide. VP9_BILINEAR_FILTERS_2TAP(x)
+// points at the centre pair (taps BF_OFFSET, BF_OFFSET + 1) of row x.
+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
+                   sizeof(vp9_bilinear_filters[0][0]))
+#define BF_OFFSET (BF_LENGTH / 2 - 1)
+#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
+
 #endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 88f2ea9c18..f2c8891081 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
   uint8_t temp2[2 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
                                     src_pixels_per_line, 1, 3, 16, HFilter);
@@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
   uint8_t temp2[2 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3,
                                     src_pixels_per_line, 1, 17, 2, HFilter);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 20de7b7f1d..d4435d872d 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -8,66 +8,58 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since that filter is not interpolating, we need to
+  // choose what to do at full-pel offsets. The current selection, where the
+  // filter is applied only in directions with a subpel offset, and not at all
+  // for 0,0, seems to give the best quality, but it may be worth trying an
+  // additional mode that also filters at full-pel offsets.
+  xd->subpix.predict[0][0][0] = vp9_convolve_copy;
+  xd->subpix.predict[0][0][1] = vp9_convolve_avg;
+  xd->subpix.predict[0][1][0] = vp9_convolve8_vert;
+  xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert;
+  xd->subpix.predict[1][0][0] = vp9_convolve8_horiz;
+  xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz;
+  xd->subpix.predict[1][1][0] = vp9_convolve8;
+  xd->subpix.predict[1][1][1] = vp9_convolve8_avg;
+
+  xd->subpix.x_step_q4 = 16;
+  xd->subpix.y_step_q4 = 16;
+  switch (mcomp_filter_type) {
+    case EIGHTTAP:
+    case SWITCHABLE:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
+      break;
+    case EIGHTTAP_SMOOTH:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
+      break;
+    case EIGHTTAP_SHARP:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
+      break;
+    case BILINEAR:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
+      break;
 #if CONFIG_ENABLE_6TAP
-  if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;
-    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
-    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
-    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
-  } else {
+    case SIXTAP:
+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
+      break;
 #endif
-  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
-  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;
-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
-  } else {
-    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;
-    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
-    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
-    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
-    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
-    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
-    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
-  }
-#if CONFIG_ENABLE_6TAP
   }
-#endif
 }
 
-void vp9_copy_mem16x16_c(uint8_t *src,
+void vp9_copy_mem16x16_c(const uint8_t *src,
                          int src_stride,
                          uint8_t *dst,
                          int dst_stride) {
@@ -93,10 +85,10 @@ void vp9_copy_mem16x16_c(uint8_t *src,
     dst[15] = src[15];
 
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
+    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
+    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
 
 #endif
     src += src_stride;
@@ -104,25 +96,7 @@ void vp9_copy_mem16x16_c(uint8_t *src,
   }
 }
 
-void vp9_avg_mem16x16_c(uint8_t *src,
-                        int src_stride,
-                        uint8_t *dst,
-                        int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-    int n;
-
-    for (n = 0; n < 16; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x8_c(uint8_t *src,
+void vp9_copy_mem8x8_c(const uint8_t *src,
                        int src_stride,
                        uint8_t *dst,
                        int dst_stride) {
@@ -139,33 +113,15 @@ void vp9_copy_mem8x8_c(uint8_t *src,
     dst[6] = src[6];
     dst[7] = src[7];
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 #endif
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void vp9_avg_mem8x8_c(uint8_t *src,
-                      int src_stride,
-                      uint8_t *dst,
-                      int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-    int n;
-
-    for (n = 0; n < 8; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x4_c(uint8_t *src,
+void vp9_copy_mem8x4_c(const uint8_t *src,
                        int src_stride,
                        uint8_t *dst,
                        int dst_stride) {
@@ -182,16 +138,16 @@ void vp9_copy_mem8x4_c(uint8_t *src,
     dst[6] = src[6];
     dst[7] = src[7];
 #else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
 #endif
     src += src_stride;
     dst += dst_stride;
   }
 }
 
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
-  int r;
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                  struct subpix_fn_table *subpix) {
   uint8_t *ptr_base;
   uint8_t *ptr;
   uint8_t *pred_ptr = d->predictor;
@@ -199,30 +155,14 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
 
   ptr_base = *(d->base_pre);
   mv.as_int = d->bmi.as_mv.first.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-      pred_ptr[0]  = ptr[0];
-      pred_ptr[1]  = ptr[1];
-      pred_ptr[2]  = ptr[2];
-      pred_ptr[3]  = ptr[3];
-#else
-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
-      pred_ptr     += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
+  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+      ptr, d->pre_stride, pred_ptr, pitch,
+      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
+      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
+      4, 4);
 }
 
 /*
@@ -232,8 +172,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
  * predictor of the second reference frame / motion vector.
  */
 void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                      vp9_subpix_fn_t sppf) {
-  int r;
+                                      struct subpix_fn_table *subpix) {
   uint8_t *ptr_base;
   uint8_t *ptr;
   uint8_t *pred_ptr = d->predictor;
@@ -241,26 +180,14 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
 
   ptr_base = *(d->base_second_pre);
   mv.as_int = d->bmi.as_mv.second.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
-      pred_ptr    += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
+  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
+      ptr, d->pre_stride, pred_ptr, pitch,
+      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
+      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
+      4, 4);
 }
 
 void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
@@ -274,12 +201,11 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+      ptr, d->pre_stride, pred_ptr, pitch,
+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+      8, 8);
 }
 
 /*
@@ -300,12 +226,11 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
+      ptr, d->pre_stride, pred_ptr, pitch,
+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+      8, 8);
 }
 
 static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
@@ -319,12 +244,11 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
         (mv.as_mv.col >> 3);
 
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
-  }
+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+      ptr, d->pre_stride, pred_ptr, pitch,
+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+      8, 4);
 }
 
 /*encoder only*/
@@ -411,13 +335,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
+      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
+      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
     }
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
+      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
     }
   }
 }
@@ -475,14 +399,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
 
   ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
 
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      xd->subpixel_predict16x16(ptr, pre_stride,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
+  xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0](
+      ptr, pre_stride, dst_y, dst_ystride,
+      xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+      16, 16);
 }
 
 void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
@@ -523,15 +444,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
   uptr = xd->pre.u_buffer + offset;
   vptr = xd->pre.v_buffer + offset;
 
-    if (_o16x16mv.as_int & 0x000f000f) {
-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
+  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
+                    [!!(_o16x16mv.as_mv.row & 15)][0](
+      uptr, pre_stride, dst_u, dst_uvstride,
+      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
+      8, 8);
+
+  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
+                    [!!(_o16x16mv.as_mv.row & 15)][0](
+      vptr, pre_stride, dst_v, dst_uvstride,
+      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
+      8, 8);
 }
 
 
@@ -714,12 +639,11 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
 
   ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
-                                  (mv_row & 7) << 1, dst_y, dst_ystride);
-  } else {
-    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-  }
+  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1](
+      ptr, pre_stride, dst_y, dst_ystride,
+      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
+      16, 16);
 }
 
 void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
@@ -758,15 +682,17 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
   uptr = xd->second_pre.u_buffer + offset;
   vptr = xd->second_pre.v_buffer + offset;
 
-    if ((omv_row | omv_col) & 15) {
-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
+      uptr, pre_stride, dst_u, dst_uvstride,
+      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
+      8, 8);
+
+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
+      vptr, pre_stride, dst_v, dst_uvstride,
+      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
+      8, 8);
 }
 
 void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
@@ -835,13 +761,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
       if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
         build_inter_predictors2b(xd, d0, 16);
       else {
-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
+        vp9_build_inter_predictors_b(d0, 16, &xd->subpix);
+        vp9_build_inter_predictors_b(d1, 16, &xd->subpix);
       }
 
       if (mbmi->second_ref_frame > 0) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
+        vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix);
+        vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix);
       }
     }
   }
@@ -853,13 +779,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
+      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
+      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
     }
 
     if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
+      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
     }
   }
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 89868b95ef..903bd2e86d 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -14,6 +14,8 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+struct subpix_fn_table;
+
 extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
                                                     uint8_t *dst_y,
                                                     int dst_ystride,
@@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
 
 extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         vp9_subpix_fn_t sppf);
+                                         struct subpix_fn_table *subpix);
 
 extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                             vp9_subpix_fn_t sppf);
+                                             struct subpix_fn_table *subpix);
 
 extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
                                          int pitch);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 762dd75c0a..9698172b2a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -23,21 +23,6 @@ EOF
 }
 forward_decls vp9_common_forward_decls
 
-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
-# compiles warning free but a dissassembly of generated code show bugs. To be
-# on the safe side, only enabled when compiled with 'gcc'.
-if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
-    specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
-    specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
-    specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
 #
 # Dequant
 #
@@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16
 #
 # RECON
 #
-prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem16x16 mmx sse2 dspr2
 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 
-prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x8 mmx dspr2
 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx dspr2
-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
-
 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
@@ -287,111 +262,6 @@ specialize vp9_convolve8_avg_horiz
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert
 
-prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16
-
-prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8
-
-prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16
-
-prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8
-
-prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4
-
-prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4
-
-prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4
-
-prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16_sharp
-
-prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8_sharp
-
-prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_sharp
-
-prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_sharp
-
-prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_sharp
-
-prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4_sharp
-
-prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4_sharp
-
-prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict16x16_smooth
-
-prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x8_smooth
-
-prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_smooth
-
-prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_smooth
-
-prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_smooth
-
-prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict8x4_smooth
-
-prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict4x4_smooth
-
-prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict16x16
-
-prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict8x8
-
-prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg16x16
-
-prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg8x8
-
-prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict8x4
-
-prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict4x4
-
-prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg4x4
-
-prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict16x16 sse2
-
-prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x8 sse2
-
-prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg16x16
-
-prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg8x8
-
-prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x4
-
-prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict4x4
-
-prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict_avg4x4
-
 #
 # dct
 #
diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h
deleted file mode 100644
index dc4eadfb19..0000000000
--- a/vp9/common/vp9_subpixel.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
-#define VP9_COMMON_VP9_SUBPIXEL_H_
-
-#define prototype_subpixel_predict(sym) \
-  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
-           uint8_t *dst, int dst_pitch)
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-
-#endif  // VP9_COMMON_VP9_SUBPIXEL_H_
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f09e2d78be..d233247b2b 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -11,88 +11,6 @@
 
 #include "./vpx_config.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_subpixel.h"
-
-extern const short vp9_six_tap_mmx[8][6 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
-                                      unsigned short  *output_ptr,
-                                      unsigned int     src_pixels_per_line,
-                                      unsigned int     pixel_step,
-                                      unsigned int     output_height,
-                                      unsigned int     output_width,
-                                      const short     *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
-                                       unsigned char  *output_ptr,
-                                       int             output_pitch,
-                                       unsigned int    pixels_per_line,
-                                       unsigned int    pixel_step,
-                                       unsigned int    output_height,
-                                       unsigned int    output_width,
-                                       const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
-                                        unsigned short *output_ptr,
-                                        unsigned int    src_pixels_per_line,
-                                        unsigned int    pixel_step,
-                                        unsigned int    output_height,
-                                        unsigned int    output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    pixel_step,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
-                                        unsigned char *output_ptr,
-                                        int dst_ptich,
-                                        unsigned int pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int dst_ptich,
-                                         unsigned int pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
-                                              unsigned int   src_pixels_per_lin,
-                                              unsigned char *output_ptr,
-                                              int            dst_pitch,
-                                              unsigned int   output_height,
-                                              const short   *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
 ///////////////////////////////////////////////////////////////////////////
 // the mmx function that does the bilinear filtering and var calculation //
 // int one pass                                                          //
@@ -115,487 +33,3 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
   {  16, 16, 16, 16, 112, 112, 112, 112 },
   {   8,  8,  8,  8, 120, 120, 120, 120 }
 };
-
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
-  const short *hfilter, *vfilter;
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
-                            src_pixels_per_line, 1, 9, 8, hfilter);
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
-                             8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
-                             32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 8, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 4, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 4, 8, vfilter);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
-                                  int  src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                   src_pixels_per_line, 1, 21, 32, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                   32, 16, 16, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                        dst_ptr, dst_pitch, 16, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                 src_pixels_per_line, 21, 32);
-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                 32, 16, 16, dst_pitch, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 13, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 8, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 8, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 8, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data bufffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 9, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 4, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 4, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 4, vfilter);
-  }
-}
-#endif
-
-#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
-                                          unsigned int    src_pixels_per_line,
-                                          unsigned char  *output_ptr,
-                                          unsigned int    output_pitch,
-                                          unsigned int    output_height,
-                                          unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
-                                          unsigned int   src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int   out_pitch,
-                                          unsigned int   output_height,
-                                          unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                    src_pixels_per_line,
-                                    fdata2, 16, 21, xoffset);
-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
-                                    16, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
-                                    dst_ptr, dst_pitch, 16, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                  src_pixels_per_line,
-                                  dst_ptr, dst_pitch, 16, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
-    } else {
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 8, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 8, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
-                                 int   src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
-                                      const unsigned int src_stride,
-                                      const short *hfilter_aligned16,
-                                      const short *vfilter_aligned16,
-                                      unsigned char *dst_ptr,
-                                      unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                  fdata2, 16, 23, hfilter_aligned16);
-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
-                                  vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
-                                    16, hfilter_aligned16);
-    } else {
-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
-    }
-  }
-}
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 15, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-      vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   fdata2, 16, 11, hfilter_aligned16);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
-                                   vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
-    }
-  }
-}
-#endif
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
deleted file mode 100644
index 8e02ac1975..0000000000
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/emmintrin_compat.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that other can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-//           of positive above 128), or have higher precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
-  {                                                                            \
-  /* Do shifted load to achieve require shuffles through unpacking */          \
-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
-  /* Shit by 4 bytes through suffle to get additional shifted loads */         \
-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
-  /* multiply accumulate them */                                               \
-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-        DECLARE_ALIGNED(16, unsigned char, temp[32]);
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_0);
-        DO_FOUR_PIXELS(col0, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_1);
-        DO_FOUR_PIXELS(col1, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_2);
-        DO_FOUR_PIXELS(col2, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_3);
-        DO_FOUR_PIXELS(col3, temp, 0);
-      }
-      // transpose
-      {
-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
-        col0 = _mm_unpacklo_epi64(T0, T1);
-        col1 = _mm_unpackhi_epi64(T0, T1);
-        col2 = _mm_unpacklo_epi64(T2, T3);
-        col3 = _mm_unpackhi_epi64(T2, T3);
-      }
-      // saturate to 8 bit
-      {
-        col0 = _mm_packs_epi32(col0, col0);
-        col0 = _mm_packus_epi16(col0, col0);
-        col1 = _mm_packs_epi32(col1, col1);
-        col1 = _mm_packus_epi16(col1, col1);
-        col2 = _mm_packs_epi32 (col2, col2);
-        col2 = _mm_packus_epi16(col2, col2);
-        col3 = _mm_packs_epi32 (col3, col3);
-        col3 = _mm_packus_epi16(col3, col3);
-      }
-      // store
-      {
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
-                                  HFilter_aligned16, VFilter_aligned16,
-                                  dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c
deleted file mode 100644
index 52c35b2968..0000000000
--- a/vp9/common/x86/vp9_filter_sse4.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that other can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-//           duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-//           of positive above 128), or have higher precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
-  0x00, 0x01,
-  0x01, 0x02,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x04, 0x05,
-  0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
-  0x04, 0x05,
-  0x05, 0x06,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x08, 0x09,
-  0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset)                                         \
-  {                                                                            \
-  /*load pixels*/                                                              \
-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
-  /* extract the ones used for first column */                                 \
-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
-  /* multiply accumulate them */                                               \
-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(0, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(1, 1, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(2, 2, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(3, 3, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      {
-        //load pixels
-        __m128i src  = transpose3_0;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col0 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_1;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col1 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_2;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col2 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_3;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col3 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
-        //TODO(cd): look into Ronald's comment:
-        //    Future suggestion: I believe here, too, you can merge the
-        //    packs_epi32() and pacus_epi16() for the 4 cols above, so that
-        //    you get the data in a single register, and then use pshufb
-        //    (shuffle_epi8()) instead of the unpacks here. Should be
-        //    2+3+2 instructions faster.
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
-            _mm_extract_epi32(col0123, 0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
-            _mm_extract_epi32(col0123, 1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
-            _mm_extract_epi32(col0123, 2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
-            _mm_extract_epi32(col0123, 3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
deleted file mode 100644
index c6d65e9043..0000000000
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,550 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d8_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d16_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movq        [rdi], xmm0
-
-    movq        xmm0, [rsi + 8]             ;A
-    movq        xmm1, [rsi + rdx + 8]       ;B
-    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
-    movq        xmm3, [rax + rdx * 2 + 8]   ;D
-    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
-    movq        xmm5, [rax + rdx * 4 + 8]   ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-
-    movq        xmm6, [rsi + rbx + 8]       ;G
-    movq        xmm7, [rax + rbx + 8]       ;H
-    punpcklbw   xmm6, xmm7                  ;G H
-
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi+8], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d8_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   xmm5
-    paddsw      xmm0,   xmm4
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    movq        [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d8_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d16_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   krd
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-
-    movq        xmm3,   [rsi +  5]
-;    movq        xmm7,   [rsi + 12]
-    movq        xmm7,   [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3,   xmm7
-    punpcklqdq  xmm3,   xmm7
-
-    movdqa      xmm1,   xmm3
-    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm3,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm3,   xmm1
-    paddsw      xmm3,   xmm2
-    paddsw      xmm3,   krd
-    paddsw      xmm3,   xmm4
-    psraw       xmm3,   7
-    packuswb    xmm3,   xmm3
-    punpcklqdq  xmm0,   xmm3
-
-    lea         rsi,    [rsi + rax]
-    movdqa      [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-shuf_t0t1:
-    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
-    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
-    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
-    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
deleted file mode 100644
index dee29b8fbb..0000000000
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ /dev/null
@@ -1,268 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT  7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-;    unsigned char   *src_ptr,
-;    unsigned short  *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx) PRIVATE
-sym(vp9_filter_block1d_h6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,    arg(6) ;vp9_filter
-
-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
-        movq        mm2,    [rdx + 32]         ;
-        movq        mm6,    [rdx + 48]        ;
-        movq        mm7,    [rdx + 64]        ;
-
-        mov         rdi,    arg(1) ;output_ptr
-        mov         rsi,    arg(0) ;src_ptr
-        movsxd      rcx,    dword ptr arg(4) ;output_height
-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
-        pxor        mm0,    mm0              ; mm0 = 00000000
-
-.nextrow:
-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
-        movq        mm4,    mm3              ; mm4 = p-2..p5
-        psrlq       mm3,    8                ; mm3 = p-1..p5
-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        punpckhbw   mm4,    mm0              ; mm5 = p2..p5
-        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        movq        mm4,    mm5              ; mm4 = p-2..p5;
-        psrlq       mm5,    16               ; mm5 = p0..p5;
-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        psrlq       mm4,    24               ; mm4 = p1..p5
-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
-        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        ; do outer positive taps
-        movd        mm4,    [rsi+3]
-        punpcklbw   mm4,    mm0              ; mm5 = p3..p6
-        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm5
-
-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3,    mm0              ; pack and unpack to saturate
-        punpcklbw   mm3,    mm0              ;
-
-        movq        [rdi],  mm3              ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
-        add         rdi,    rax;
-%else
-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
-        add         rdi,    rax;
-
-        add         rsi,    r8               ; next line
-%endif
-
-        dec         rcx                      ; decrement count
-        jnz         .nextrow                 ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;    int output_pitch,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
-sym(vp9_filter_block1dc_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(7) ;vp9_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(5) ;output_height
-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-.nextrow_cv:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
-        ; recon block should be in cache this shouldn't cost much.  Its obviously
-        ; avoidable!!!.
-        lea         rdi,  [rdi+rax] ;
-        dec         rcx                   ; decrement count
-        jnz         .nextrow_cv           ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-rd:
-    times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 128
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 0
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 123
-    times 8 dw 12
-    times 8 dw -1
-    times 8 dw 0
-
-    times 8 dw 2
-    times 8 dw -11
-    times 8 dw 108
-    times 8 dw 36
-    times 8 dw -8
-    times 8 dw 1
-
-    times 8 dw 0
-    times 8 dw -9
-    times 8 dw 93
-    times 8 dw 50
-    times 8 dw -6
-    times 8 dw 0
-
-    times 8 dw 3
-    times 8 dw -16
-    times 8 dw 77
-    times 8 dw 77
-    times 8 dw -16
-    times 8 dw 3
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 50
-    times 8 dw 93
-    times 8 dw -9
-    times 8 dw 0
-
-    times 8 dw 1
-    times 8 dw -8
-    times 8 dw 36
-    times 8 dw 108
-    times 8 dw -11
-    times 8 dw 2
-
-    times 8 dw 0
-    times 8 dw -1
-    times 8 dw 12
-    times 8 dw 123
-    times 8 dw -6
-    times 8 dw 0
-
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
deleted file mode 100644
index b0c4f12825..0000000000
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi+16],      xmm4
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        pmullw      xmm1,       [rax]
-
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
-        pmullw      xmm3,       [rax + 32]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
-        pmullw      xmm5,       [rax + 64]
-
-        add         rsi,        rdx
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
-
-        pmullw      xmm4,       [rax + 48]
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
-
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-;    unsigned short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    const short    *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
-sym(vp9_filter_block1d16_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
-        pmullw      xmm1,       [rax + 16]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm3,       [rax + 64]
-        pmullw      xmm4,       [rax + 64]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm5,       [rax + 32]
-        pmullw      xmm6,       [rax + 32]
-
-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
-        pmullw      xmm7,       [rax]
-        pmullw      xmm0,       [rax]
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm0
-
-        add         rsi,        rdx
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm3,       [rax + 48]
-        pmullw      xmm4,       [rax + 48]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm5,       [rax + 80]
-        pmullw      xmm6,       [rax + 80]
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm1,       7
-        psraw       xmm2,       7
-
-        packuswb    xmm1,       xmm2              ; pack and saturate
-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_ptich,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_ptich
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-
-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_only_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_ptich,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_ptich
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; lower 8 bytes
-
-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; higher 8 bytes
-
-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int output_height,
-;    const short    *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        mov         rax,        arg(5) ;vp9_filter
-
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
-        movq        xmm1,       MMWORD PTR [rsi]
-        movq        xmm2,       MMWORD PTR [rsi + rdx]
-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
-        add         rsi,        rdx
-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
-
-        punpcklbw   xmm1,       xmm0
-        pmullw      xmm1,       [rax]
-
-        punpcklbw   xmm2,       xmm0
-        pmullw      xmm2,       [rax + 16]
-
-        punpcklbw   xmm3,       xmm0
-        pmullw      xmm3,       [rax + 32]
-
-        punpcklbw   xmm5,       xmm0
-        pmullw      xmm5,       [rax + 64]
-
-        punpcklbw   xmm4,       xmm0
-        pmullw      xmm4,       [rax + 48]
-
-        punpcklbw   xmm6,       xmm0
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(3) ;[dst_ptich]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    output_height,
-;    unsigned int    output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
-sym(vp9_unpack_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(3) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
-        movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
-        movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-        punpcklbw   xmm1,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm1
-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(4) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
-sym(vp9_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
-sym(vp9_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-rd:
-    times 8 dw 0x40
diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm
deleted file mode 100644
index b260480e03..0000000000
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4
-
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    mov         rdi, arg(2)             ;output_ptr
-
-    cmp         esi, DWORD PTR [rax]
-    je          vp9_filter_block1d8_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    pmaddubsw   xmm1,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm2,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD Ptr [rdi], xmm0
-    jnz         .filter_block1d8_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp9_filter_block1d8_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm2,   xmm0
-    pshufb      xmm0,   xmm3
-
-    pshufb      xmm2,   xmm4
-    pmaddubsw   xmm0,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD Ptr [rdi], xmm0
-
-    jnz         .filter_block1d8_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)           ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    mov         rdi, arg(2)                     ;output_ptr
-
-    mov         rsi, arg(0)                     ;src_ptr
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)           ;output_height
-    movsxd      rdx, dword ptr arg(3)           ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    movq        xmm3,   MMWORD PTR [rsi +  6]
-
-    pmaddubsw   xmm1,   xmm5
-    movq        xmm7,   MMWORD PTR [rsi + 11]
-
-    pmaddubsw   xmm2,   xmm6
-    punpcklbw   xmm3,   xmm7
-
-    paddsw      xmm0,   xmm1
-    movdqa      xmm1,   xmm3
-
-    pmaddubsw   xmm3,   xmm4
-    paddsw      xmm0,   xmm2
-
-    movdqa      xmm2,   xmm1
-    paddsw      xmm0,   [GLOBAL(rd)]
-
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-
-    psraw       xmm0,   7
-    pmaddubsw   xmm1,   xmm5
-
-    pmaddubsw   xmm2,   xmm6
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3,   xmm1
-
-    paddsw      xmm3,   xmm2
-
-    paddsw      xmm3,   [GLOBAL(rd)]
-
-    psraw       xmm3,   7
-
-    packuswb    xmm3,   xmm3
-
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      XMMWORD Ptr [rdi], xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [GLOBAL(shuf1b)]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    pxor        xmm1, xmm1
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movd        DWORD PTR [rdi], xmm0
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm1, xmm7
-    paddsw      xmm1, xmm2
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-
-    movd        DWORD PTR [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d16_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-
-.vp9_filter_block1d16_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2          ;store the results
-
-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d16_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    paddsw      xmm2, [GLOBAL(rd)]
-    paddsw      xmm2, xmm3
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    punpcklbw   xmm5, xmm4                  ;B D
-    punpcklbw   xmm1, xmm0                  ;C E
-
-    pmaddubsw   xmm1, xmm6
-    pmaddubsw   xmm5, xmm7
-
-    movdqa      xmm4, [GLOBAL(rd)]
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm5, xmm1
-    paddsw      xmm5, xmm4
-    psraw       xmm5, 7
-    packuswb    xmm5, xmm5
-
-    punpcklqdq  xmm2, xmm5
-
-    movdqa       XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d8_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-    movdqa      xmm4, [GLOBAL(rd)]
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, xmm4
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d8_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm5, [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm5
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_v4_ssse3
-
-    movq        mm5, MMWORD PTR [rax]         ;k0_k5
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
-    movd        mm1, DWORD PTR [rsi]                  ;A
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
-
-    movq        mm4, [GLOBAL(rd)]
-
-    pmaddubsw   mm3, mm6
-    punpcklbw   mm1, mm0                  ;A F
-    pmaddubsw   mm2, mm7
-    pmaddubsw   mm1, mm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm1
-    paddsw      mm2, mm4
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_v4_ssse3:
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-    movq        mm5, MMWORD PTR [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    pmaddubsw   mm3, mm6
-    pmaddubsw   mm2, mm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm5
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
-sym(vp9_bilinear_predict16x16_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
-
-        movdqa      xmm2,       [rax]
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
-%endif
-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
-
-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm6,       xmm5
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm6,       xmm1
-
-        punpcklbw   xmm4,       xmm5
-        pmaddubsw   xmm4,       xmm1
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        packuswb    xmm6,       xmm4
-        movdqa      xmm5,       xmm7
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm2
-
-        punpckhbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm2
-
-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm5,       xmm7
-        movdqa      xmm7,       xmm6
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]               ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-        ; get the first horizontal line done
-        movq        xmm4,       [rsi]               ; load row 0
-        movq        xmm2,       [rsi + 8]           ; load row 0
-
-        lea         rsi,        [rsi + rax]         ; next line
-.next_row_sp:
-        movq        xmm3,       [rsi]               ; load row + 1
-        movq        xmm5,       [rsi + 8]           ; load row + 1
-
-        punpcklbw   xmm4,       xmm3
-        punpcklbw   xmm2,       xmm5
-
-        pmaddubsw   xmm4,       xmm1
-        movq        xmm7,       [rsi + rax]         ; load row + 2
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
-
-        punpcklbw   xmm3,       xmm7
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm3,       xmm1
-        paddw       xmm4,       [GLOBAL(rd)]
-
-        pmaddubsw   xmm5,       xmm1
-        paddw       xmm2,       [GLOBAL(rd)]
-
-        psraw       xmm4,       VP9_FILTER_SHIFT
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        packuswb    xmm4,       xmm2
-        paddw       xmm3,       [GLOBAL(rd)]
-
-        movdqa      [rdi],      xmm4                ; store row 0
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm5
-        movdqa      xmm4,       xmm7
-
-        movdqa      [rdi + rdx],xmm3                ; store row 1
-        lea         rsi,        [rsi + 2*rax]
-
-        movdqa      xmm2,       xmm6
-        lea         rdi,        [rdi + 2*rdx]
-
-        cmp         rdi,        rcx
-        jne         .next_row_sp
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-.next_row_fp:
-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm2,       xmm4
-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rax]         ; next line
-        punpcklbw   xmm3,       xmm4
-
-        pmaddubsw   xmm3,       xmm1
-        movq        xmm5,       [rsi]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        movq        xmm7,       [rsi+1]
-
-        movq        xmm6,       [rsi+8]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        punpcklbw   xmm5,       xmm7
-        movq        xmm7,       [rsi+9]
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        pmaddubsw   xmm5,       xmm1
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        punpcklbw   xmm6,       xmm7
-
-        packuswb    xmm2,       xmm3
-        pmaddubsw   xmm6,       xmm1
-
-        movdqa      [rdi],      xmm2                ; store the results in the destination
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm6,       VP9_FILTER_SHIFT
-
-        packuswb    xmm5,       xmm6
-        lea         rsi,        [rsi + rax]         ; next line
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-.done:
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
-sym(vp9_bilinear_predict8x8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b8x8_sp_only
-
-        shl         rax,        4
-        add         rax,        rcx                 ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b8x8_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm1,       [rax]
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
-        psrldq      xmm5,       1
-        lea         rsp,        [rsp + 16]          ; next line
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        lea         rsp,        [rsp + 16]          ; next line
-
-        movdqa      xmm5,       xmm6
-
-        psrldq      xmm5,       1
-
-        punpcklbw   xmm6,       xmm5
-        pmaddubsw   xmm6,       xmm0
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        packuswb    xmm6,       xmm6
-
-        punpcklbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm1
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm7,       xmm7
-
-        movq        [rdi],      xmm7                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]
-
-        movdqa      xmm7,       xmm6
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done8x8
-
-.b8x8_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]               ; VFilter
-
-        movq        xmm1,       XMMWORD PTR [rsp]
-        movq        xmm2,       XMMWORD PTR [rsp+16]
-
-        movq        xmm3,       XMMWORD PTR [rsp+32]
-        punpcklbw   xmm1,       xmm2
-
-        movq        xmm4,       XMMWORD PTR [rsp+48]
-        punpcklbw   xmm2,       xmm3
-
-        movq        xmm5,       XMMWORD PTR [rsp+64]
-        punpcklbw   xmm3,       xmm4
-
-        movq        xmm6,       XMMWORD PTR [rsp+80]
-        punpcklbw   xmm4,       xmm5
-
-        movq        xmm7,       XMMWORD PTR [rsp+96]
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm1,       xmm0
-        pmaddubsw   xmm2,       xmm0
-
-        pmaddubsw   xmm3,       xmm0
-        pmaddubsw   xmm4,       xmm0
-
-        pmaddubsw   xmm5,       xmm0
-        punpcklbw   xmm6,       xmm7
-
-        pmaddubsw   xmm6,       xmm0
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        psraw       xmm6,       VP9_FILTER_SHIFT
-        packuswb    xmm1,       xmm1
-
-        packuswb    xmm2,       xmm2
-        movq        [rdi],      xmm1
-
-        packuswb    xmm3,       xmm3
-        movq        [rdi+rdx],  xmm2
-
-        packuswb    xmm4,       xmm4
-        movq        xmm1,       XMMWORD PTR [rsp+112]
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        xmm2,       XMMWORD PTR [rsp+128]
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm3
-
-        packuswb    xmm6,       xmm6
-        movq        [rdi+rdx],  xmm4
-
-        lea         rdi,        [rdi + 2*rdx]
-        punpcklbw   xmm7,       xmm1
-
-        movq        [rdi],      xmm5
-        pmaddubsw   xmm7,       xmm0
-
-        movq        [rdi+rdx],  xmm6
-        punpcklbw   xmm1,       xmm2
-
-        pmaddubsw   xmm1,       xmm0
-        paddw       xmm7,       [GLOBAL(rd)]
-
-        psraw       xmm7,       VP9_FILTER_SHIFT
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        psraw       xmm1,       VP9_FILTER_SHIFT
-        packuswb    xmm7,       xmm7
-
-        packuswb    xmm1,       xmm1
-        lea         rdi,        [rdi + 2*rdx]
-
-        movq        [rdi],      xmm7
-
-        movq        [rdi+rdx],  xmm1
-        lea         rsp,        [rsp + 144]
-
-        jmp         .done8x8
-
-.b8x8_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-
-.next_row_fp:
-        movdqa      xmm1,       XMMWORD PTR [rsp]
-        movdqa      xmm3,       XMMWORD PTR [rsp+16]
-
-        movdqa      xmm2,       xmm1
-        movdqa      xmm5,       XMMWORD PTR [rsp+32]
-
-        psrldq      xmm2,       1
-        movdqa      xmm7,       XMMWORD PTR [rsp+48]
-
-        movdqa      xmm4,       xmm3
-        psrldq      xmm4,       1
-
-        movdqa      xmm6,       xmm5
-        psrldq      xmm6,       1
-
-        punpcklbw   xmm1,       xmm2
-        pmaddubsw   xmm1,       xmm0
-
-        punpcklbw   xmm3,       xmm4
-        pmaddubsw   xmm3,       xmm0
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm0
-
-        movdqa      xmm2,       xmm7
-        psrldq      xmm2,       1
-
-        punpcklbw   xmm7,       xmm2
-        pmaddubsw   xmm7,       xmm0
-
-        paddw       xmm1,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm7,       [GLOBAL(rd)]
-        psraw       xmm7,       VP9_FILTER_SHIFT
-
-        packuswb    xmm1,       xmm1
-        packuswb    xmm3,       xmm3
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm1
-
-        packuswb    xmm7,       xmm7
-        movq        [rdi+rdx],  xmm3
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        [rdi],      xmm5
-
-        lea         rsp,        [rsp + 4*16]
-        movq        [rdi+rdx],  xmm7
-
-        lea         rdi,        [rdi + 2*rdx]
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-        lea         rsp,        [rsp + 16]
-
-.done8x8:
-    ;add rsp, 144
-    pop         rsp
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
-    times 8 dw 0x40
-
-align 16
-k0_k5:
-    times 8 db 0, 0             ;placeholder
-    times 8 db 0, 0
-    times 8 db 2, 1
-    times 8 db 0, 0
-    times 8 db 3, 3
-    times 8 db 0, 0
-    times 8 db 1, 2
-    times 8 db 0, 0
-k1_k3:
-    times 8 db  0,    0         ;placeholder
-    times 8 db  -6,  12
-    times 8 db -11,  36
-    times 8 db  -9,  50
-    times 8 db -16,  77
-    times 8 db  -6,  93
-    times 8 db  -8, 108
-    times 8 db  -1, 123
-k2_k4:
-    times 8 db 128,    0        ;placeholder
-    times 8 db 123,   -1
-    times 8 db 108,   -8
-    times 8 db  93,   -6
-    times 8 db  77,  -16
-    times 8 db  50,   -9
-    times 8 db  36,  -11
-    times 8 db  12,   -6
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db 96,  32
-    times 8 db 88,  40
-    times 8 db 80,  48
-    times 8 db 72,  56
-    times 8 db 64,  64
-    times 8 db 56,  72
-    times 8 db 48,  80
-    times 8 db 40,  88
-    times 8 db 32,  96
-    times 8 db 24,  104
-    times 8 db 16,  112
-    times 8 db 8,   120
-
diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h
deleted file mode 100644
index 25bc26d9bd..0000000000
--- a/vp9/common/x86/vp9_subpixel_x86.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4a0794a9b2..1bbf95468f 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_quantize.h"
@@ -3953,6 +3954,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
   cm->new_fb_idx = get_free_fb(cm);
 
+  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
   if (cpi->pass == 1) {
     Pass1Encode(cpi, size, dest, frame_flags);
   } else if (cpi->pass == 2) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 2e9bbcfc1e..200a6a9473 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2222,9 +2222,9 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
       BLOCK *be = &x->block[i];
       int thisdistortion;
 
-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);
+      vp9_build_inter_predictors_b(bd, 16, &xd->subpix);
       if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);
+        vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix);
       vp9_subtract_b(be, bd, 16);
       x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, bd);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 8bbe534860..7bca01e051 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -50,12 +50,11 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   // Y
   yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
 
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict16x16(yptr, stride,
-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
-  } else {
-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
-  }
+  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0](
+      yptr, stride, &pred[0], 16,
+      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
+      16, 16);
 
   // U & V
   omv_row = mv_row;
@@ -67,15 +66,17 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   uptr = u_mb_ptr + offset;
   vptr = v_mb_ptr + offset;
 
-  if ((omv_row | omv_col) & 15) {
-    xd->subpixel_predict8x8(uptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
-    xd->subpixel_predict8x8(vptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
-  } else {
-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
-  }
+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
+      uptr, stride, &pred[256], 8,
+      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
+      8, 8);
+
+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
+      vptr, stride, &pred[320], 8,
+      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
+      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
+      8, 8);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index d03e285c63..d07a65b455 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
   const int16_t *HFilter, *VFilter;
   uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   // First filter 1d Horizontal
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
@@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
@@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
@@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
   uint8_t temp2[68 * 64];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
                                     1, 65, 64, HFilter);
@@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
   uint8_t temp2[36 * 32];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
@@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
@@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
   uint8_t temp2[20 * 16];
   const int16_t *HFilter, *VFilter;
 
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
                                     1, 17, 8, HFilter);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d1805be62b..c7e8acb454 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -56,7 +56,6 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
 VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
-VP9_COMMON_SRCS-yes += common/vp9_subpixel.h
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
@@ -81,7 +80,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
@@ -90,7 +88,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
@@ -98,10 +95,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
@@ -113,19 +107,10 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
-VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c
-ifeq ($(HAVE_SSE4_1),yes)
-vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4
-vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4
-endif
-
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 endif
-- 
GitLab