diff --git a/build/make/Makefile b/build/make/Makefile
index 92113ccb8c24ee29f4169bdfe68a0b896899703e..da7fb03a0bba0244c4f696ce0600740c36eaac8b 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -74,7 +74,7 @@ HOSTCC?=gcc
 TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
 TGT_OS:=$(word 2, $(subst -, ,$(TOOLCHAIN)))
 TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN)))
-quiet:=$(if $(verbose),,yes)
+quiet:=$(if $(or $(verbose), $(V)),, yes)
 qexec=$(if $(quiet),@)
 
 # Cancel built-in implicit rules
@@ -380,6 +380,7 @@ ifneq ($(call enabled,DIST-SRCS),)
     # Include obj_int_extract if we use offsets from asm_*_offsets
     DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
+    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
     DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
 endif
 INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS)
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 81280bf7850c5cc6065112cc64f91fd7650decd9..51e6fbcbdb1e413b249335319498abf1320b909f 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -10,12 +10,12 @@
 ##
 
 
-# ads2gas.pl
+# ads2gas_apple.pl
 # Author: Eric Fung (efung (at) acm.org)
 #
 # Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
 #
-# Usage: cat inputfile | perl ads2gas.pl > outputfile
+# Usage: cat inputfile | perl ads2gas_apple.pl > outputfile
 #
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
diff --git a/build/make/configure.sh b/build/make/configure.sh
index e558ff2b8e4f5d48eba81653d65e529b8572b6ac..1a78f272bd4d5efabe08cc73333607a3545b66eb 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -598,8 +598,13 @@ process_common_toolchain() {
             armv6*)
                 tgt_isa=armv6
                 ;;
+            armv7*-hardfloat*)
+                tgt_isa=armv7
+                float_abi=hard
+                ;;
             armv7*)
                 tgt_isa=armv7
+                float_abi=softfp
                 ;;
             armv5te*)
                 tgt_isa=armv5te
@@ -643,6 +648,9 @@ process_common_toolchain() {
                 tgt_isa=x86_64
                 tgt_os=darwin12
                 ;;
+            x86_64*mingw32*)
+                tgt_os=win64
+                ;;
             *mingw32*|*cygwin*)
                 [ -z "$tgt_isa" ] && tgt_isa=x86
                 tgt_os=win32
@@ -785,8 +793,9 @@ process_common_toolchain() {
             check_add_asflags --defsym ARCHITECTURE=${arch_int}
             tune_cflags="-mtune="
             if [ ${tgt_isa} == "armv7" ]; then
-                check_add_cflags  -march=armv7-a -mfloat-abi=softfp
-                check_add_asflags -march=armv7-a -mfloat-abi=softfp
+                [ -z "${float_abi}" ] && float_abi=softfp
+                check_add_cflags  -march=armv7-a -mfloat-abi=${float_abi}
+                check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
 
                 if enabled neon
                 then
@@ -1038,7 +1047,7 @@ EOF
                 add_ldflags -m${bits}
                 link_with_cc=gcc
                 tune_cflags="-march="
-            setup_gnu_toolchain
+                setup_gnu_toolchain
                 #for 32 bit x86 builds, -O3 did not turn on this flag
                 enabled optimizations && check_add_cflags -fomit-frame-pointer
             ;;
@@ -1056,6 +1065,8 @@ EOF
         soft_enable sse2
         soft_enable sse3
         soft_enable ssse3
+        # We can't use 'check_cflags' until the compiler is configured and CC is
+        # populated.
         if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
             RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
         else
@@ -1082,7 +1093,7 @@ EOF
                 add_asflags -f x64
                 enabled debug && add_asflags -g cv8
             ;;
-            linux*|solaris*)
+            linux*|solaris*|android*)
                 add_asflags -f elf${bits}
                 enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
                 enabled debug && [ "${AS}" = nasm ] && add_asflags -g
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index ddf9e09a4e9674db91cbc79381691a1d7eada0c3..6cc36843b1a6266e8d16e317138e3bb03c525166 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -59,13 +59,13 @@ for f in $defs_file; do [ -f "$f" ] || usage; done
 # Routines for the RTCD DSL to call
 #
 prototype() {
-  local rtyp
+  rtyp=""
   case "$1" in
     unsigned) rtyp="$1 "; shift;;
   esac
   rtyp="${rtyp}$1"
-  local fn="$2"
-  local args="$3"
+  fn="$2"
+  args="$3"
 
   eval "${2}_rtyp='$rtyp'"
   eval "${2}_args='$3'"
@@ -74,7 +74,7 @@ prototype() {
 }
 
 specialize() {
-  local fn="$1"
+  fn="$1"
   shift
   for opt in "$@"; do
     eval "${fn}_${opt}=${fn}_${opt}"
@@ -84,13 +84,13 @@ specialize() {
 require() {
   for fn in $ALL_FUNCS; do
     for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
+      ofn=$(eval "echo \$${fn}_${opt}")
       [ -z "$ofn" ] && continue
 
       # if we already have a default, then we can disable it, as we know
       # we can do better.
-      local best=$(eval "echo \$${fn}_default")
-      local best_ofn=$(eval "echo \$${best}")
+      best=$(eval "echo \$${fn}_default")
+      best_ofn=$(eval "echo \$${best}")
       [ -n "$best" ] && [ "$best_ofn" != "$ofn" ] && eval "${best}_link=false"
       eval "${fn}_default=${fn}_${opt}"
       eval "${fn}_${opt}_link=true"
@@ -121,15 +121,15 @@ process_forward_decls() {
 determine_indirection() {
   [ "$CONFIG_RUNTIME_CPU_DETECT" = "yes" ] || require $ALL_ARCHS
   for fn in $ALL_FUNCS; do
-    local n=""
-    local rtyp="$(eval "echo \$${fn}_rtyp")"
-    local args="$(eval "echo \"\$${fn}_args\"")"
-    local dfn="$(eval "echo \$${fn}_default")"
+    n=""
+    rtyp="$(eval "echo \$${fn}_rtyp")"
+    args="$(eval "echo \"\$${fn}_args\"")"
+    dfn="$(eval "echo \$${fn}_default")"
     dfn=$(eval "echo \$${dfn}")
     for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
+      ofn=$(eval "echo \$${fn}_${opt}")
       [ -z "$ofn" ] && continue
-      local link=$(eval "echo \$${fn}_${opt}_link")
+      link=$(eval "echo \$${fn}_${opt}_link")
       [ "$link" = "false" ] && continue
       n="${n}x"
     done
@@ -143,12 +143,12 @@ determine_indirection() {
 
 declare_function_pointers() {
   for fn in $ALL_FUNCS; do
-    local rtyp="$(eval "echo \$${fn}_rtyp")"
-    local args="$(eval "echo \"\$${fn}_args\"")"
-    local dfn="$(eval "echo \$${fn}_default")"
+    rtyp="$(eval "echo \$${fn}_rtyp")"
+    args="$(eval "echo \"\$${fn}_args\"")"
+    dfn="$(eval "echo \$${fn}_default")"
     dfn=$(eval "echo \$${dfn}")
     for opt in "$@"; do
-      local ofn=$(eval "echo \$${fn}_${opt}")
+      ofn=$(eval "echo \$${fn}_${opt}")
       [ -z "$ofn" ] && continue
       echo "$rtyp ${ofn}($args);"
     done
@@ -163,20 +163,20 @@ declare_function_pointers() {
 
 set_function_pointers() {
   for fn in $ALL_FUNCS; do
-    local n=""
-    local rtyp="$(eval "echo \$${fn}_rtyp")"
-    local args="$(eval "echo \"\$${fn}_args\"")"
-    local dfn="$(eval "echo \$${fn}_default")"
+    n=""
+    rtyp="$(eval "echo \$${fn}_rtyp")"
+    args="$(eval "echo \"\$${fn}_args\"")"
+    dfn="$(eval "echo \$${fn}_default")"
     dfn=$(eval "echo \$${dfn}")
     if $(eval "echo \$${fn}_indirect"); then
       echo "    $fn = $dfn;"
       for opt in "$@"; do
-        local ofn=$(eval "echo \$${fn}_${opt}")
+        ofn=$(eval "echo \$${fn}_${opt}")
         [ -z "$ofn" ] && continue
         [ "$ofn" = "$dfn" ] && continue;
-        local link=$(eval "echo \$${fn}_${opt}_link")
+        link=$(eval "echo \$${fn}_${opt}_link")
         [ "$link" = "false" ] && continue
-        local cond="$(eval "echo \$have_${opt}")"
+        cond="$(eval "echo \$have_${opt}")"
         echo "    if (${cond}) $fn = $ofn;"
       done
     fi
@@ -185,7 +185,7 @@ set_function_pointers() {
 }
 
 filter() {
-  local filtered
+  filtered=""
   for opt in "$@"; do
     [ -z $(eval "echo \$disable_${opt}") ] && filtered="$filtered $opt"
   done
@@ -196,8 +196,9 @@ filter() {
 # Helper functions for generating the arch specific RTCD files
 #
 common_top() {
-  local outfile_basename=$(basename ${symbol:-rtcd.h})
-  local include_guard=$(echo $outfile_basename | tr '[a-z]' '[A-Z]' | tr -c '[A-Z]' _)
+  outfile_basename=$(basename ${symbol:-rtcd})
+  include_guard=$(echo $outfile_basename | tr '[a-z]' '[A-Z]' | \
+    tr -c '[A-Z0-9]' _)H_
   cat <<EOF
 #ifndef ${include_guard}
 #define ${include_guard}
@@ -227,7 +228,7 @@ x86() {
 
   # Assign the helper variable for each enabled extension
   for opt in $ALL_ARCHS; do
-    local uc=$(echo $opt | tr '[a-z]' '[A-Z]')
+    uc=$(echo $opt | tr '[a-z]' '[A-Z]')
     eval "have_${opt}=\"flags & HAS_${uc}\""
   done
 
@@ -254,7 +255,7 @@ arm() {
 
   # Assign the helper variable for each enabled extension
   for opt in $ALL_ARCHS; do
-    local uc=$(echo $opt | tr '[a-z]' '[A-Z]')
+    uc=$(echo $opt | tr '[a-z]' '[A-Z]')
     eval "have_${opt}=\"flags & HAS_${uc}\""
   done
 
diff --git a/configure b/configure
index f5b45d03591593de1a63a6cf217aed70967f7ba3..f55f798635b2d147a16caa18b7490e65af9c2670 100755
--- a/configure
+++ b/configure
@@ -106,6 +106,7 @@ all_platforms="${all_platforms} ppc64-darwin8-gcc"
 all_platforms="${all_platforms} ppc64-darwin9-gcc"
 all_platforms="${all_platforms} ppc64-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
+all_platforms="${all_platforms} x86-android-gcc"
 all_platforms="${all_platforms} x86-darwin8-gcc"
 all_platforms="${all_platforms} x86-darwin8-icc"
 all_platforms="${all_platforms} x86-darwin9-gcc"
@@ -650,7 +651,7 @@ process_toolchain() {
         enabled postproc || die "postproc_visualizer requires postproc to be enabled"
     fi
 
-    # Enable unit tests if we have a working C++ compiler
+    # Enable unit tests by default if we have a working C++ compiler.
     case "$toolchain" in
         *-vs*)
             soft_enable unit_tests
@@ -663,7 +664,7 @@ process_toolchain() {
             # x86 targets.
         ;;
         *)
-            check_cxx "$@" <<EOF && soft_enable unit_tests
+            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
 EOF
         ;;
diff --git a/libs.mk b/libs.mk
index 1f0ade34d42fbd6613112ea54f8ac4f42ea56ae9..872a16bae0bd22c87dddb3ce22817b974a9ca3bc 100644
--- a/libs.mk
+++ b/libs.mk
@@ -17,7 +17,6 @@ else
   ASM:=.asm
 endif
 
-
 #
 # Calculate platform- and compiler-specific offsets for hand coded assembly
 #
@@ -167,7 +166,9 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 CODEC_SRCS-$(BUILD_LIBVPX) += third_party/x86inc/x86inc.asm
+endif
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -221,7 +222,6 @@ obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
     -I"$(SRC_PATH_BARE)" \
 
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
-PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
 
 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
@@ -325,7 +325,11 @@ vpx.pc: config.mk libs.mk
 	$(qexec)echo 'Requires:' >> $@
 	$(qexec)echo 'Conflicts:' >> $@
 	$(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@
+ifeq ($(HAVE_PTHREAD_H),yes)
 	$(qexec)echo 'Libs.private: -lm -lpthread' >> $@
+else
+	$(qexec)echo 'Libs.private: -lm' >> $@
+endif
 	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
 INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
@@ -373,7 +377,7 @@ LIBVPX_TEST_DATA_PATH ?= .
 
 include $(SRC_PATH_BARE)/test/test.mk
 LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
-LIBVPX_TEST_BINS=./test_libvpx
+LIBVPX_TEST_BINS=./test_libvpx$(EXE_SFX)
 LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                      $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 72741a901e02406ad804809383e3224d249983b1..165e2c8f0c58182513ba2a63105f70848c5ad63f 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -15,8 +15,13 @@
 
 extern "C" {
 #include "./vpx_config.h"
+#if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
-#include "vp8/common/blockd.h"
+//#include "vp8/common/blockd.h"
+#endif
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
 #include "vpx_mem/vpx_mem.h"
 }
 
@@ -32,14 +37,22 @@ typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
                                         int reference_stride,
                                         unsigned int max_sad);
 
+typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr,
+                                     int src_stride,
+                                     const unsigned char * const ref_ptr[],
+                                     int ref_stride,
+                                     unsigned int *sad_array);
+
 using libvpx_test::ACMRandom;
 
 namespace {
-class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
+class SADTestBase : public ::testing::Test {
  public:
+  SADTestBase(int width, int height) : width_(width), height_(height) {}
+
   static void SetUpTestCase() {
     source_data_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBufferSize));
+        vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
   }
@@ -52,36 +65,31 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
   }
 
  protected:
+  // Handle up to 4 blocks of 64x64, each with a stride of up to 128
   static const int kDataAlignment = 16;
-  static const int kDataBufferSize = 16 * 32;
+  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
-    sad_fn_ = GET_PARAM(2);
-    height_ = GET_PARAM(1);
-    width_ = GET_PARAM(0);
-    source_stride_ = width_ * 2;
+    source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  sad_m_by_n_fn_t sad_fn_;
-  virtual unsigned int SAD(unsigned int max_sad) {
-    unsigned int ret;
-    REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_,
-                                       reference_data_, reference_stride_,
-                                       max_sad));
-    return ret;
+  virtual uint8_t* GetReference(int block_idx) {
+    return reference_data_ + block_idx * kDataBlockSize;
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
-  unsigned int ReferenceSAD(unsigned int max_sad) {
+  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx = 0) {
     unsigned int sad = 0;
+    const uint8_t* const reference = GetReference(block_idx);
 
     for (int h = 0; h < height_; ++h) {
       for (int w = 0; w < width_; ++w) {
         sad += abs(source_data_[h * source_stride_ + w]
-               - reference_data_[h * reference_stride_ + w]);
+               - reference[h * reference_stride_ + w]);
       }
       if (sad > max_sad) {
         break;
@@ -106,6 +114,32 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
     }
   }
 
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+class SADTest : public SADTestBase,
+    public ::testing::WithParamInterface<
+        std::tr1::tuple<int, int, sad_m_by_n_fn_t> > {
+ public:
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  unsigned int SAD(unsigned int max_sad, int block_idx = 0) {
+    unsigned int ret;
+    const uint8_t* const reference = GetReference(block_idx);
+
+    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                            reference, reference_stride_,
+                                            max_sad));
+    return ret;
+  }
+
   void CheckSad(unsigned int max_sad) {
     unsigned int reference_sad, exp_sad;
 
@@ -119,19 +153,38 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
       ASSERT_GE(exp_sad, reference_sad);
     }
   }
+};
 
-  // Handle blocks up to 16x16 with stride up to 32
-  int height_, width_;
-  static uint8_t* source_data_;
-  int source_stride_;
-  static uint8_t* reference_data_;
-  int reference_stride_;
+class SADx4Test : public SADTestBase,
+    public ::testing::WithParamInterface<
+        std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> > {
+ public:
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
-  ACMRandom rnd_;
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t* refs[] = {GetReference(0), GetReference(1),
+                             GetReference(2), GetReference(3)};
+
+    REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+                                      refs, reference_stride_,
+                                      results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; block++) {
+      reference_sad = ReferenceSAD(UINT_MAX, block);
+
+      EXPECT_EQ(exp_sad[block], reference_sad) << "block " << block;
+    }
+  }
 };
 
-uint8_t* SADTest::source_data_ = NULL;
-uint8_t* SADTest::reference_data_ = NULL;
+uint8_t* SADTestBase::source_data_ = NULL;
+uint8_t* SADTestBase::reference_data_ = NULL;
 
 TEST_P(SADTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -139,12 +192,30 @@ TEST_P(SADTest, MaxRef) {
   CheckSad(UINT_MAX);
 }
 
+TEST_P(SADx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, 255);
+  FillConstant(GetReference(1), reference_stride_, 255);
+  FillConstant(GetReference(2), reference_stride_, 255);
+  FillConstant(GetReference(3), reference_stride_, 255);
+  CheckSADs();
+}
+
 TEST_P(SADTest, MaxSrc) {
   FillConstant(source_data_, source_stride_, 255);
   FillConstant(reference_data_, reference_stride_, 0);
   CheckSad(UINT_MAX);
 }
 
+TEST_P(SADx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
 TEST_P(SADTest, ShortRef) {
   int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
@@ -154,6 +225,18 @@ TEST_P(SADTest, ShortRef) {
   reference_stride_ = tmp_stride;
 }
 
+TEST_P(SADx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
 TEST_P(SADTest, UnalignedRef) {
   // The reference frame, but not the source frame, may be unaligned for
   // certain types of searches.
@@ -165,6 +248,20 @@ TEST_P(SADTest, UnalignedRef) {
   reference_stride_ = tmp_stride;
 }
 
+TEST_P(SADx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
 TEST_P(SADTest, ShortSrc) {
   int tmp_stride = source_stride_;
   source_stride_ >>= 1;
@@ -174,6 +271,18 @@ TEST_P(SADTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_stride_ = tmp_stride;
+}
+
 TEST_P(SADTest, MaxSAD) {
   // Verify that, when max_sad is set, the implementation does not return a
   // value lower than the reference.
@@ -184,17 +293,61 @@ TEST_P(SADTest, MaxSAD) {
 
 using std::tr1::make_tuple;
 
+#if CONFIG_VP8_ENCODER && CONFIG_VP9_ENCODER
+#define VP8_VP9_SEPARATOR ,
+#else
+#define VP8_VP9_SEPARATOR
+#endif
+
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
 const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
 const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;
 const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;
 const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;
+#endif
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_64x64_c_vp9 = vp9_sad64x64_c;
+const sad_m_by_n_fn_t sad_32x32_c_vp9 = vp9_sad32x32_c;
+const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c;
+const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c;
+const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c;
+const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c;
+const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c;
+#endif
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values(
+#if CONFIG_VP8_ENCODER
                         make_tuple(16, 16, sad_16x16_c),
                         make_tuple(8, 16, sad_8x16_c),
                         make_tuple(16, 8, sad_16x8_c),
                         make_tuple(8, 8, sad_8x8_c),
-                        make_tuple(4, 4, sad_4x4_c)));
+                        make_tuple(4, 4, sad_4x4_c)
+#endif
+                        VP8_VP9_SEPARATOR
+#if CONFIG_VP9_ENCODER
+                        make_tuple(64, 64, sad_64x64_c_vp9),
+                        make_tuple(32, 32, sad_32x32_c_vp9),
+                        make_tuple(16, 16, sad_16x16_c_vp9),
+                        make_tuple(8, 16, sad_8x16_c_vp9),
+                        make_tuple(16, 8, sad_16x8_c_vp9),
+                        make_tuple(8, 8, sad_8x8_c_vp9),
+                        make_tuple(4, 4, sad_4x4_c_vp9)
+#endif
+                        ));
+
+#if CONFIG_VP9_ENCODER
+const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c;
+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, sad_64x64x4d_c),
+                        make_tuple(32, 32, sad_32x32x4d_c),
+                        make_tuple(16, 16, sad_16x16x4d_c),
+                        make_tuple(8, 8, sad_8x8x4d_c),
+                        make_tuple(4, 4, sad_4x4x4d_c)));
+#endif
 
 // ARM tests
 #if HAVE_MEDIA
@@ -219,31 +372,120 @@ INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
 
 // X86 tests
 #if HAVE_MMX
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;
 const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx;
 const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;
 const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;
 const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;
+#endif
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
+const sad_m_by_n_fn_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
+const sad_m_by_n_fn_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
+const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
+const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
+#endif
+
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values(
+#if CONFIG_VP8_ENCODER
                         make_tuple(16, 16, sad_16x16_mmx),
                         make_tuple(8, 16, sad_8x16_mmx),
                         make_tuple(16, 8, sad_16x8_mmx),
                         make_tuple(8, 8, sad_8x8_mmx),
-                        make_tuple(4, 4, sad_4x4_mmx)));
+                        make_tuple(4, 4, sad_4x4_mmx)
+#endif
+                        VP8_VP9_SEPARATOR
+#if CONFIG_VP9_ENCODER
+                        make_tuple(16, 16, sad_16x16_mmx_vp9),
+                        make_tuple(8, 16, sad_8x16_mmx_vp9),
+                        make_tuple(16, 8, sad_16x8_mmx_vp9),
+                        make_tuple(8, 8, sad_8x8_mmx_vp9),
+                        make_tuple(4, 4, sad_4x4_mmx_vp9)
+#endif
+                        ));
+#endif
+
+#if HAVE_SSE
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
+INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
+                        make_tuple(4, 4, sad_4x4_sse_vp9)));
+
+const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
+INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
+                        make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
+#endif
+
 #if HAVE_SSE2
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt;
 const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt;
 const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;
 const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
+#endif
+#if CONFIG_VP9_ENCODER
+const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;
+const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;
+const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;
+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
+const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;
+const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
+#endif
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values(
+#if CONFIG_VP8_ENCODER
                         make_tuple(16, 16, sad_16x16_wmt),
                         make_tuple(8, 16, sad_8x16_wmt),
                         make_tuple(16, 8, sad_16x8_wmt),
                         make_tuple(8, 8, sad_8x8_wmt),
-                        make_tuple(4, 4, sad_4x4_wmt)));
+                        make_tuple(4, 4, sad_4x4_wmt)
+#endif
+                        VP8_VP9_SEPARATOR
+#if CONFIG_VP9_ENCODER
+                        make_tuple(64, 64, sad_64x64_sse2_vp9),
+                        make_tuple(32, 32, sad_32x32_sse2_vp9),
+                        make_tuple(16, 16, sad_16x16_sse2_vp9),
+                        make_tuple(8, 16, sad_8x16_sse2_vp9),
+                        make_tuple(16, 8, sad_16x8_sse2_vp9),
+                        make_tuple(8, 8, sad_8x8_sse2_vp9)
+#endif
+                        ));
+
+#if CONFIG_VP9_ENCODER
+const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
+                        make_tuple(64, 64, sad_64x64x4d_sse2),
+                        make_tuple(32, 32, sad_32x32x4d_sse2),
+                        make_tuple(16, 16, sad_16x16x4d_sse2),
+                        make_tuple(16, 8, sad_16x8x4d_sse2),
+                        make_tuple(8, 16, sad_8x16x4d_sse2),
+                        make_tuple(8, 8, sad_8x8x4d_sse2)));
 #endif
+#endif
+
+#if HAVE_SSE3
+#if CONFIG_VP8_ENCODER
+const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3;
+const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3;
+const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3;
+const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;
+const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;
+INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16x4d_sse3),
+                        make_tuple(16, 8, sad_16x8x4d_sse3),
+                        make_tuple(8, 16, sad_8x16x4d_sse3),
+                        make_tuple(8, 8, sad_8x8x4d_sse3),
+                        make_tuple(4, 4, sad_4x4x4d_sse3)));
+#endif
+#endif
+
 #if HAVE_SSSE3
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 2d4581dc07fb0b4413d2a8642ba3938df207c1d5..9ab7a73479d1e184a8e9dc1208321b7cb122c185 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -61,9 +61,9 @@ class SixtapPredictTest : public PARAMS(int, int, sixtap_predict_fn_t) {
     width_ = GET_PARAM(0);
     height_ = GET_PARAM(1);
     sixtap_predict_ = GET_PARAM(2);
-    memset(src_, 0, sizeof(src_));
-    memset(dst_, 0, sizeof(dst_));
-    memset(dst_c_, 0, sizeof(dst_c_));
+    memset(src_, 0, kSrcSize);
+    memset(dst_, 0, kDstSize);
+    memset(dst_c_, 0, kDstSize);
   }
 
   int width_;
diff --git a/test/test.mk b/test/test.mk
index 3d56bd838c4a1b032db8b6f32aa8ce69d4c0b4a6..f7a1462673ce7a3f0793a8ceaa65a2ec7451a42c 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -50,10 +50,11 @@ endif
 LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
-LIBVPX_TEST_SRCS-yes                   += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 
 endif # VP8
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 83f1139949dfb981fc7e25b17e414a7de429e8c2..120df316e0d3cdcf2ab976ae98f57b9ed021f3cd 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -12,11 +12,17 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "vpx/vpx_integer.h"
 #include "vpx_config.h"
 extern "C" {
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx/vpx_integer.h"
-#include "vp9_rtcd.h"
+#if CONFIG_VP8_ENCODER
+# include "vp8/common/variance.h"
+# include "vp8_rtcd.h"
+#endif
+#if CONFIG_VP9_ENCODER
+# include "vp9/encoder/vp9_variance.h"
+# include "vp9_rtcd.h"
+#endif
 }
 
 namespace {
@@ -25,11 +31,12 @@ using ::std::tr1::get;
 using ::std::tr1::make_tuple;
 using ::std::tr1::tuple;
 
-class VP9VarianceTest :
-    public ::testing::TestWithParam<tuple<int, int, vp9_variance_fn_t> > {
+template<typename VarianceFunctionType>
+class VarianceTest :
+    public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
  public:
   virtual void SetUp() {
-    const tuple<int, int, vp9_variance_fn_t>& params = GetParam();
+    const tuple<int, int, VarianceFunctionType>& params = this->GetParam();
     width_  = get<0>(params);
     height_ = get<1>(params);
     variance_ = get<2>(params);
@@ -47,15 +54,20 @@ class VP9VarianceTest :
   }
 
  protected:
+  void ZeroTest();
+  void OneQuarterTest();
+
   uint8_t* src_;
   uint8_t* ref_;
   int width_;
   int height_;
   int block_size_;
-  vp9_variance_fn_t variance_;
+  VarianceFunctionType variance_;
+
 };
 
-TEST_P(VP9VarianceTest, Zero) {
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::ZeroTest() {
   for (int i = 0; i <= 255; ++i) {
     memset(src_, i, block_size_);
     for (int j = 0; j <= 255; ++j) {
@@ -67,7 +79,8 @@ TEST_P(VP9VarianceTest, Zero) {
   }
 }
 
-TEST_P(VP9VarianceTest, OneQuarter) {
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
   memset(src_, 255, block_size_);
   const int half = block_size_ / 2;
   memset(ref_, 255, half);
@@ -78,6 +91,74 @@ TEST_P(VP9VarianceTest, OneQuarter) {
   EXPECT_EQ(expected, var);
 }
 
+// -----------------------------------------------------------------------------
+// VP8 test cases.
+
+namespace vp8 {
+
+#if CONFIG_VP8_ENCODER
+typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
+
+TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
+TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
+
+const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
+const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c;
+const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c;
+const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
+const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP8VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance4x4_c),
+                      make_tuple(8, 8, variance8x8_c),
+                      make_tuple(8, 16, variance8x16_c),
+                      make_tuple(16, 8, variance16x8_c),
+                      make_tuple(16, 16, variance16x16_c)));
+
+#if HAVE_MMX
+const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
+const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
+const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx;
+const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
+const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, VP8VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance4x4_mmx),
+                      make_tuple(8, 8, variance8x8_mmx),
+                      make_tuple(8, 16, variance8x16_mmx),
+                      make_tuple(16, 8, variance16x8_mmx),
+                      make_tuple(16, 16, variance16x16_mmx)));
+#endif
+
+#if HAVE_SSE2
+const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt;
+const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt;
+const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt;
+const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
+const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP8VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance4x4_wmt),
+                      make_tuple(8, 8, variance8x8_wmt),
+                      make_tuple(8, 16, variance8x16_wmt),
+                      make_tuple(16, 8, variance16x8_wmt),
+                      make_tuple(16, 16, variance16x16_wmt)));
+#endif
+#endif  // CONFIG_VP8_ENCODER
+
+}  // namespace vp8
+
+// -----------------------------------------------------------------------------
+// VP9 test cases.
+
+namespace vp9 {
+
+#if CONFIG_VP9_ENCODER
+typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
+
+TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
+TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
+
 const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
 const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
 const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
@@ -120,4 +201,8 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(16, 8, variance16x8_wmt),
                       make_tuple(16, 16, variance16x16_wmt)));
 #endif
+#endif  // CONFIG_VP9_ENCODER
+
+}  // namespace vp9
+
 }  // namespace
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index d9927ecfa9c267fb54c4dfe0f573db12b77c3f69..a66a96bba6273fedb4410360c3acd9abc7082a8f 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -61,6 +61,8 @@
     %define mangle(x) x
 %elifidn __OUTPUT_FORMAT__,x64
     %define mangle(x) x
+%elifidn __OUTPUT_FORMAT__,win64
+    %define mangle(x) x
 %else
     %define mangle(x) _ %+ x
 %endif
@@ -112,7 +114,12 @@
 %endif
 
 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+%ifndef __NASM_VER__
 CPU amdnop
+%else
+%use smartalign
+ALIGNMODE k7
+%endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index 8a8f92f6818bccd325e8ed1a1e99f8f13bf784e9..069332660e3276c4075fde9cf7c1069efdbf3e3a 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -280,7 +280,7 @@ void vp8_multiframe_quality_enhance
 
     FRAME_TYPE frame_type = cm->frame_type;
     /* Point at base of Mb MODE_INFO list has motion vectors etc */
-    const MODE_INFO *mode_info_context = cm->mi;
+    const MODE_INFO *mode_info_context = cm->show_frame_mi;
     int mb_row;
     int mb_col;
     int totmap, map[4];
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index 03c97187a122e4d9aef8aa415404d2585ed432ed..276dd72ead07721a2dc8652a6d7078f7986258e3 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -127,7 +127,8 @@ typedef struct VP8Common
     MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
     MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
 #endif
-
+    MODE_INFO *show_frame_mi;  /* MODE_INFO for the last decoded frame
+                                  to show */
     LOOPFILTERTYPE filter_type;
 
     loop_filter_info_n lf_info;
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
index fd7e051ed69b6714551b60be239b617bb28b5f6e..97c81c130a0adc3443ec945a32fc38a36d9e9d4b 100644
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -34,7 +34,6 @@ extern "C"
         int     postprocess;
         int     max_threads;
         int     error_concealment;
-        int     input_fragments;
     } VP8D_CONFIG;
 
     typedef enum
@@ -56,10 +55,6 @@ extern "C"
     vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
     vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
 
-    struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
-
-    void vp8dx_remove_decompressor(struct VP8D_COMP* comp);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index fd313b6fe768fa236917148323fe7c007210233a..e40fb111cd774850dd8594c4602a9512203276d0 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -334,7 +334,7 @@ void vp8_deblock(VP8_COMMON                 *cm,
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
 
-    const MODE_INFO *mode_info_context = cm->mi;
+    const MODE_INFO *mode_info_context = cm->show_frame_mi;
     int mbr, mbc;
 
     /* The pixel thresholds are adjusted according to if or not the macroblock
diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c
index da08affb81a2237254ff4fb9a7d08903ff0273a1..773b655efc518128e9ac38c71a025f78ef21e842 100644
--- a/vp8/common/variance_c.c
+++ b/vp8/common/variance_c.c
@@ -75,7 +75,7 @@ unsigned int vp8_variance16x16_c(
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 8));
+    return (var - (((unsigned int)avg * avg) >> 8));
 }
 
 unsigned int vp8_variance8x16_c(
@@ -91,7 +91,7 @@ unsigned int vp8_variance8x16_c(
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 }
 
 unsigned int vp8_variance16x8_c(
@@ -107,7 +107,7 @@ unsigned int vp8_variance16x8_c(
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 }
 
 
@@ -124,7 +124,7 @@ unsigned int vp8_variance8x8_c(
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 6));
+    return (var - (((unsigned int)avg * avg) >> 6));
 }
 
 unsigned int vp8_variance4x4_c(
@@ -140,7 +140,7 @@ unsigned int vp8_variance4x4_c(
 
     variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 4));
+    return (var - (((unsigned int)avg * avg) >> 4));
 }
 
 
diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm
index 4aac09484de7251d4e59609a9d416d9c878d29c5..158c3b745838ca406bf31d29c0e004ed7000aaa2 100644
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ b/vp8/common/x86/iwalsh_mmx.asm
@@ -24,7 +24,7 @@ sym(vp8_short_inv_walsh4x4_mmx):
 
     movq        mm0, [rdx + 0]    ;ip[0]
     movq        mm1, [rdx + 8]    ;ip[4]
-    movd        mm7, rax
+    movq        mm7, rax
 
     movq        mm2, [rdx + 16]   ;ip[8]
     movq        mm3, [rdx + 24]   ;ip[12]
diff --git a/vp8/common/x86/loopfilter_block_sse2.asm b/vp8/common/x86/loopfilter_block_sse2.asm
index 3d45c617b7ecc051647029667360ce3933544ff0..6d5aaa19db79045ff9a343ce68eb4843770ddc00 100644
--- a/vp8/common/x86/loopfilter_block_sse2.asm
+++ b/vp8/common/x86/loopfilter_block_sse2.asm
@@ -136,7 +136,7 @@
 global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
 sym(vp8_loop_filter_bh_y_sse2):
 
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
     %define src      rcx ; src_ptr
     %define stride   rdx ; src_pixel_step
     %define blimit   r8
@@ -256,7 +256,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
         movdqa       i12, xmm3
         movdqa       i13, xmm8
 
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
     pop    r13
     pop    r12
     RESTORE_XMM
@@ -278,7 +278,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
 global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
 sym(vp8_loop_filter_bv_y_sse2):
 
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
     %define src      rcx ; src_ptr
     %define stride   rdx ; src_pixel_step
     %define blimit   r8
@@ -779,7 +779,7 @@ LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
     ; un-ALIGN_STACK
     pop          rsp
 
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
     pop    r13
     pop    r12
     RESTORE_XMM
diff --git a/vp8/common/x86/mfqe_sse2.asm b/vp8/common/x86/mfqe_sse2.asm
index c1d21743d95f5633641e0028c2de4f9b70d95cb4..a8a7f568dcdfb09365114b7c02a1e24e37f23588 100644
--- a/vp8/common/x86/mfqe_sse2.asm
+++ b/vp8/common/x86/mfqe_sse2.asm
@@ -271,7 +271,13 @@ sym(vp8_variance_and_sad_16x16_sse2):
 SECTION_RODATA
 align 16
 t128:
+%ifndef __NASM_VER__
     ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128
+%else
+    dq  128, 0
+%endif
 align 16
 tMFQE: ; 1 << MFQE_PRECISION
     times 8 dw 0x10
diff --git a/vp8/common/x86/sad_sse3.asm b/vp8/common/x86/sad_sse3.asm
index f90a589985a4570bd557af0049d641f9fefc8c3e..69c8d376973c424fffa54545e05a93f7649bed22 100644
--- a/vp8/common/x86/sad_sse3.asm
+++ b/vp8/common/x86/sad_sse3.asm
@@ -33,7 +33,7 @@
     movsxd      rax,        dword ptr arg(1)    ; src_stride
     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
@@ -76,7 +76,7 @@
     pop         rsi
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
@@ -111,7 +111,7 @@
 
     xchg        rbx,        rax
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
@@ -156,7 +156,7 @@
     pop         rsi
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     RESTORE_XMM
   %endif
diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c
index 0c4dd4a981e021f555d7d7079284366d4a73e34a..36995db9aa7b1d450338e22cba6fe55e8b107bc0 100644
--- a/vp8/common/x86/variance_mmx.c
+++ b/vp8/common/x86/variance_mmx.c
@@ -91,7 +91,7 @@ unsigned int vp8_variance4x4_mmx(
 
     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 4));
+    return (var - (((unsigned int)avg * avg) >> 4));
 
 }
 
@@ -108,7 +108,7 @@ unsigned int vp8_variance8x8_mmx(
     vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
 
-    return (var - ((unsigned int)(avg * avg) >> 6));
+    return (var - (((unsigned int)avg * avg) >> 6));
 
 }
 
@@ -153,7 +153,7 @@ unsigned int vp8_variance16x16_mmx(
     var = sse0 + sse1 + sse2 + sse3;
     avg = sum0 + sum1 + sum2 + sum3;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 8));
+    return (var - (((unsigned int)avg * avg) >> 8));
 }
 
 unsigned int vp8_variance16x8_mmx(
@@ -172,7 +172,7 @@ unsigned int vp8_variance16x8_mmx(
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 
 }
 
@@ -194,7 +194,7 @@ unsigned int vp8_variance8x16_mmx(
     avg = sum0 + sum1;
     *sse = var;
 
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 
 }
 
@@ -219,7 +219,7 @@ unsigned int vp8_sub_pixel_variance4x4_mmx
         &xsum, &xxsum
     );
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
 }
 
 
@@ -244,7 +244,7 @@ unsigned int vp8_sub_pixel_variance8x8_mmx
         &xsum, &xxsum
     );
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
 }
 
 unsigned int vp8_sub_pixel_variance16x16_mmx
@@ -282,7 +282,7 @@ unsigned int vp8_sub_pixel_variance16x16_mmx
     xxsum0 += xxsum1;
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 
 
 }
@@ -335,7 +335,7 @@ unsigned int vp8_sub_pixel_variance16x8_mmx
     xxsum0 += xxsum1;
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
 }
 
 unsigned int vp8_sub_pixel_variance8x16_mmx
@@ -358,7 +358,7 @@ unsigned int vp8_sub_pixel_variance8x16_mmx
         &xsum, &xxsum
     );
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
 }
 
 
diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index afd642915d235d8258ebd818088c10629116c994..7fa5f53dcbf605e835e4874249aa465d63547fd0 100644
--- a/vp8/common/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -148,7 +148,7 @@ unsigned int vp8_variance4x4_wmt(
 
     vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 4));
+    return (var - (((unsigned int)avg * avg) >> 4));
 
 }
 
@@ -165,7 +165,7 @@ unsigned int vp8_variance8x8_wmt
 
     vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 6));
+    return (var - (((unsigned int)avg * avg) >> 6));
 
 }
 
@@ -184,7 +184,7 @@ unsigned int vp8_variance16x16_wmt
 
     vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
     *sse = sse0;
-    return (sse0 - ((unsigned int)(sum0 * sum0) >> 8));
+    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
 }
 unsigned int vp8_mse16x16_wmt(
     const unsigned char *src_ptr,
@@ -220,7 +220,7 @@ unsigned int vp8_variance16x8_wmt
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 
 }
 
@@ -241,7 +241,7 @@ unsigned int vp8_variance8x16_wmt
     var = sse0 + sse1;
     avg = sum0 + sum1;
     *sse = var;
-    return (var - ((unsigned int)(avg * avg) >> 7));
+    return (var - (((unsigned int)avg * avg) >> 7));
 
 }
 
@@ -265,7 +265,7 @@ unsigned int vp8_sub_pixel_variance4x4_wmt
         &xsum, &xxsum
     );
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
 }
 
 
@@ -314,7 +314,7 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
     }
 
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
 }
 
 unsigned int vp8_sub_pixel_variance16x16_wmt
@@ -376,7 +376,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 }
 
 unsigned int vp8_sub_pixel_mse16x16_wmt(
@@ -447,7 +447,7 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
 }
 
 unsigned int vp8_sub_pixel_variance8x16_wmt
@@ -495,7 +495,7 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
     }
 
     *sse = xxsum;
-    return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
 }
 
 
@@ -515,7 +515,7 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
         &xsum0, &xxsum0);
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 }
 
 
@@ -534,7 +534,7 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
         &xsum0, &xxsum0);
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 }
 
 
@@ -554,5 +554,5 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
         &xsum0, &xxsum0);
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 }
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index ba2055cc90455e181da4c8373a4e9aa15be0ecbf..f90f8117c4dbdb0c1051af196c06d98b44ad94f9 100644
--- a/vp8/common/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -113,7 +113,7 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
 }
 
 unsigned int vp8_sub_pixel_variance16x8_ssse3
@@ -162,5 +162,5 @@ unsigned int vp8_sub_pixel_variance16x8_ssse3
     }
 
     *sse = xxsum0;
-    return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
 }
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 463211810cd6cf9dae894e4c1578263e0823f365..6f8282a6436ea38f2c4fd68ebd2e75056024b538 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -828,8 +828,8 @@ static void setup_token_decoder(VP8D_COMP *pbi,
     unsigned int partition_idx;
     unsigned int fragment_idx;
     unsigned int num_token_partitions;
-    const unsigned char *first_fragment_end = pbi->fragments[0] +
-                                          pbi->fragment_sizes[0];
+    const unsigned char *first_fragment_end = pbi->fragments.ptrs[0] +
+                                          pbi->fragments.sizes[0];
 
     TOKEN_PARTITION multi_token_partition =
             (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
@@ -839,10 +839,10 @@ static void setup_token_decoder(VP8D_COMP *pbi,
 
     /* Check for partitions within the fragments and unpack the fragments
      * so that each fragment pointer points to its corresponding partition. */
-    for (fragment_idx = 0; fragment_idx < pbi->num_fragments; ++fragment_idx)
+    for (fragment_idx = 0; fragment_idx < pbi->fragments.count; ++fragment_idx)
     {
-        unsigned int fragment_size = pbi->fragment_sizes[fragment_idx];
-        const unsigned char *fragment_end = pbi->fragments[fragment_idx] +
+        unsigned int fragment_size = pbi->fragments.sizes[fragment_idx];
+        const unsigned char *fragment_end = pbi->fragments.ptrs[fragment_idx] +
                                             fragment_size;
         /* Special case for handling the first partition since we have already
          * read its size. */
@@ -850,16 +850,16 @@ static void setup_token_decoder(VP8D_COMP *pbi,
         {
             /* Size of first partition + token partition sizes element */
             ptrdiff_t ext_first_part_size = token_part_sizes -
-                pbi->fragments[0] + 3 * (num_token_partitions - 1);
+                pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1);
             fragment_size -= (unsigned int)ext_first_part_size;
             if (fragment_size > 0)
             {
-                pbi->fragment_sizes[0] = (unsigned int)ext_first_part_size;
+                pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size;
                 /* The fragment contains an additional partition. Move to
                  * next. */
                 fragment_idx++;
-                pbi->fragments[fragment_idx] = pbi->fragments[0] +
-                  pbi->fragment_sizes[0];
+                pbi->fragments.ptrs[fragment_idx] = pbi->fragments.ptrs[0] +
+                  pbi->fragments.sizes[0];
             }
         }
         /* Split the chunk into partitions read from the bitstream */
@@ -868,12 +868,12 @@ static void setup_token_decoder(VP8D_COMP *pbi,
             ptrdiff_t partition_size = read_available_partition_size(
                                                  pbi,
                                                  token_part_sizes,
-                                                 pbi->fragments[fragment_idx],
+                                                 pbi->fragments.ptrs[fragment_idx],
                                                  first_fragment_end,
                                                  fragment_end,
                                                  fragment_idx - 1,
                                                  num_token_partitions);
-            pbi->fragment_sizes[fragment_idx] = (unsigned int)partition_size;
+            pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size;
             fragment_size -= (unsigned int)partition_size;
             assert(fragment_idx <= num_token_partitions);
             if (fragment_size > 0)
@@ -881,19 +881,19 @@ static void setup_token_decoder(VP8D_COMP *pbi,
                 /* The fragment contains an additional partition.
                  * Move to next. */
                 fragment_idx++;
-                pbi->fragments[fragment_idx] =
-                    pbi->fragments[fragment_idx - 1] + partition_size;
+                pbi->fragments.ptrs[fragment_idx] =
+                    pbi->fragments.ptrs[fragment_idx - 1] + partition_size;
             }
         }
     }
 
-    pbi->num_fragments = num_token_partitions + 1;
+    pbi->fragments.count = num_token_partitions + 1;
 
-    for (partition_idx = 1; partition_idx < pbi->num_fragments; ++partition_idx)
+    for (partition_idx = 1; partition_idx < pbi->fragments.count; ++partition_idx)
     {
         if (vp8dx_start_decode(bool_decoder,
-                               pbi->fragments[partition_idx],
-                               pbi->fragment_sizes[partition_idx]))
+                               pbi->fragments.ptrs[partition_idx],
+                               pbi->fragments.sizes[partition_idx]))
             vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate bool decoder %d",
                                partition_idx);
@@ -983,8 +983,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
     vp8_reader *const bc = & pbi->mbc[8];
     VP8_COMMON *const pc = & pbi->common;
     MACROBLOCKD *const xd  = & pbi->mb;
-    const unsigned char *data = pbi->fragments[0];
-    const unsigned char *data_end =  data + pbi->fragment_sizes[0];
+    const unsigned char *data = pbi->fragments.ptrs[0];
+    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
     ptrdiff_t first_partition_length_in_bytes;
 
     int i, j, k, l;
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 24fc8783aa9c618e2ba69957bb17d850b8e5b310..2db309658c5b94f3badd6f02fb887032c5769066 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -42,7 +42,16 @@ extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 static int get_free_fb (VP8_COMMON *cm);
 static void ref_cnt_fb (int *buf, int *idx, int new_idx);
 
-struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+static void remove_decompressor(VP8D_COMP *pbi)
+{
+#if CONFIG_ERROR_CONCEALMENT
+    vp8_de_alloc_overlap_lists(pbi);
+#endif
+    vp8_remove_common(&pbi->common);
+    vpx_free(pbi);
+}
+
+static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
 {
     VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
 
@@ -54,7 +63,7 @@ struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     if (setjmp(pbi->common.error.jmp))
     {
         pbi->common.error.setjmp = 0;
-        vp8dx_remove_decompressor(pbi);
+        remove_decompressor(pbi);
         return 0;
     }
 
@@ -65,11 +74,6 @@ struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     pbi->common.current_video_frame = 0;
     pbi->ready_for_new_data = 1;
 
-#if CONFIG_MULTITHREAD
-    pbi->max_threads = oxcf->max_threads;
-    vp8_decoder_create_threads(pbi);
-#endif
-
     /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
      *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
      */
@@ -92,9 +96,6 @@ struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
 
     pbi->decoded_key_frame = 0;
 
-    pbi->input_fragments = oxcf->input_fragments;
-    pbi->num_fragments = 0;
-
     /* Independent partitions is activated when a frame updates the
      * token probability table to have equal probabilities over the
      * PREV_COEF context.
@@ -106,25 +107,6 @@ struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
     return pbi;
 }
 
-
-void vp8dx_remove_decompressor(VP8D_COMP *pbi)
-{
-    if (!pbi)
-        return;
-
-#if CONFIG_MULTITHREAD
-    if (pbi->b_multithreaded_rd)
-        vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
-    vp8_decoder_remove_threads(pbi);
-#endif
-#if CONFIG_ERROR_CONCEALMENT
-    vp8_de_alloc_overlap_lists(pbi);
-#endif
-    vp8_remove_common(&pbi->common);
-    vpx_free(pbi);
-}
-
-
 vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8_COMMON *cm = &pbi->common;
@@ -282,60 +264,13 @@ static int swap_frame_buffers (VP8_COMMON *cm)
     return err;
 }
 
-int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
-                                  const uint8_t *source,
-                                  int64_t time_stamp)
+static int check_fragments_for_errors(VP8D_COMP *pbi)
 {
-#if HAVE_NEON
-    int64_t dx_store_reg[8];
-#endif
-    VP8_COMMON *cm = &pbi->common;
-    int retcode = -1;
-
-    pbi->common.error.error_code = VPX_CODEC_OK;
-
-    if (pbi->num_fragments == 0)
-    {
-        /* New frame, reset fragment pointers and sizes */
-        vpx_memset((void*)pbi->fragments, 0, sizeof(pbi->fragments));
-        vpx_memset(pbi->fragment_sizes, 0, sizeof(pbi->fragment_sizes));
-    }
-    if (pbi->input_fragments && !(source == NULL && size == 0))
-    {
-        /* Store a pointer to this fragment and return. We haven't
-         * received the complete frame yet, so we will wait with decoding.
-         */
-        assert(pbi->num_fragments < MAX_PARTITIONS);
-        pbi->fragments[pbi->num_fragments] = source;
-        pbi->fragment_sizes[pbi->num_fragments] = size;
-        pbi->num_fragments++;
-        if (pbi->num_fragments > (1 << EIGHT_PARTITION) + 1)
-        {
-            pbi->common.error.error_code = VPX_CODEC_UNSUP_BITSTREAM;
-            pbi->common.error.setjmp = 0;
-            pbi->num_fragments = 0;
-            return -1;
-        }
-        return 0;
-    }
-
-    if (!pbi->input_fragments)
-    {
-        pbi->fragments[0] = source;
-        pbi->fragment_sizes[0] = size;
-        pbi->num_fragments = 1;
-    }
-    assert(pbi->common.multi_token_partition <= EIGHT_PARTITION);
-    if (pbi->num_fragments == 0)
-    {
-        pbi->num_fragments = 1;
-        pbi->fragments[0] = NULL;
-        pbi->fragment_sizes[0] = 0;
-    }
-
     if (!pbi->ec_active &&
-        pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
+        pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0)
     {
+        VP8_COMMON *cm = &pbi->common;
+
         /* If error concealment is disabled we won't signal missing frames
          * to the decoder.
          */
@@ -361,12 +296,29 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
         /* Signal that we have no frame to show. */
         cm->show_frame = 0;
 
-        pbi->num_fragments = 0;
-
         /* Nothing more to do. */
         return 0;
     }
 
+    return 1;
+}
+
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
+                                  const uint8_t *source,
+                                  int64_t time_stamp)
+{
+#if HAVE_NEON
+    int64_t dx_store_reg[8];
+#endif
+    VP8_COMMON *cm = &pbi->common;
+    int retcode = -1;
+
+    pbi->common.error.error_code = VPX_CODEC_OK;
+
+    retcode = check_fragments_for_errors(pbi);
+    if (retcode <= 0)
+        return retcode;
+
 #if HAVE_NEON
 #if CONFIG_RUNTIME_CPU_DETECT
     if (cm->cpu_caps & HAS_NEON)
@@ -419,7 +371,13 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
 
     vp8_clear_system_state();
 
-#if CONFIG_ERROR_CONCEALMENT
+    if (cm->show_frame)
+    {
+        cm->current_video_frame++;
+        cm->show_frame_mi = cm->mi;
+    }
+
+#if CONFIG_ERROR_CONCEALMENT
     /* swap the mode infos to storage for future error concealment */
     if (pbi->ec_enabled && pbi->common.prev_mi)
     {
@@ -441,9 +399,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
     }
 #endif
 
-    if (cm->show_frame)
-        cm->current_video_frame++;
-
     pbi->ready_for_new_data = 0;
     pbi->last_time_stamp = time_stamp;
 
@@ -458,7 +413,6 @@ decode_exit:
 #endif
 
     pbi->common.error.setjmp = 0;
-    pbi->num_fragments = 0;
     return retcode;
 }
 int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
@@ -521,3 +475,54 @@ int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame )
     return 0;
 
 }
+
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf)
+{
+    if(!fb->use_frame_threads)
+    {
+        /* decoder instance for single thread mode */
+        fb->pbi[0] = create_decompressor(oxcf);
+        if(!fb->pbi[0])
+            return VPX_CODEC_ERROR;
+
+#if CONFIG_MULTITHREAD
+        /* enable row-based threading only when use_frame_threads
+         * is disabled */
+        fb->pbi[0]->max_threads = oxcf->max_threads;
+        vp8_decoder_create_threads(fb->pbi[0]);
+#endif
+    }
+    else
+    {
+        /* TODO: create frame threads and decoder instances for each
+         * thread here */
+    }
+
+    return VPX_CODEC_OK;
+}
+
+int vp8_remove_decoder_instances(struct frame_buffers *fb)
+{
+    if(!fb->use_frame_threads)
+    {
+        VP8D_COMP *pbi = fb->pbi[0];
+
+        if (!pbi)
+            return VPX_CODEC_ERROR;
+#if CONFIG_MULTITHREAD
+        if (pbi->b_multithreaded_rd)
+            vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+        vp8_decoder_remove_threads(pbi);
+#endif
+
+        /* decoder instance for single thread mode */
+        remove_decompressor(pbi);
+    }
+    else
+    {
+        /* TODO: remove frame threads and decoder instances for each
+         * thread here */
+    }
+
+    return VPX_CODEC_OK;
+}
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index 0063beb638c44687f01508c7867949c001cd2f60..fb2dde8527f7a5895916471c32ee1928d0b895a1 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -33,6 +33,31 @@ typedef struct
     MACROBLOCKD  mbd;
 } MB_ROW_DEC;
 
+
+typedef struct
+{
+    int enabled;
+    unsigned int count;
+    const unsigned char *ptrs[MAX_PARTITIONS];
+    unsigned int sizes[MAX_PARTITIONS];
+} FRAGMENT_DATA;
+
+#define MAX_FB_MT_DEC 32
+
+struct frame_buffers
+{
+    /*
+     * this struct will be populated with frame buffer management
+     * info in future commits. */
+
+    /* enable/disable frame-based threading */
+    int     use_frame_threads;
+
+    /* decoder instances */
+    struct VP8D_COMP *pbi[MAX_FB_MT_DEC];
+
+};
+
 typedef struct VP8D_COMP
 {
     DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -46,10 +71,7 @@ typedef struct VP8D_COMP
 
     VP8D_CONFIG oxcf;
 
-
-    const unsigned char *fragments[MAX_PARTITIONS];
-    unsigned int   fragment_sizes[MAX_PARTITIONS];
-    unsigned int   num_fragments;
+    FRAGMENT_DATA fragments;
 
 #if CONFIG_MULTITHREAD
     /* variable for threading */
@@ -95,7 +117,6 @@ typedef struct VP8D_COMP
 #endif
     int ec_enabled;
     int ec_active;
-    int input_fragments;
     int decoded_key_frame;
     int independent_partitions;
     int frame_corrupt_residual;
@@ -104,6 +125,9 @@ typedef struct VP8D_COMP
 
 int vp8_decode_frame(VP8D_COMP *cpi);
 
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
+int vp8_remove_decoder_instances(struct frame_buffers *fb);
+
 #if CONFIG_DEBUG
 #define CHECK_MEM_ERROR(lval,expr) do {\
         lval = (expr); \
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 68095ca68347749c3fd88794f69d13c13af38b39..433726df604c9732252a6477eb2f4b1b23289fed 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -858,7 +858,9 @@ skip_motion_search:
      */
     if ((cm->current_video_frame > 0) &&
         (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
-        ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
+        ((cpi->twopass.this_frame_stats.intra_error /
+          DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
+         2.0))
     {
         vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
@@ -2116,23 +2118,25 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         (cpi->twopass.kf_group_error_left > 0))
     {
         cpi->twopass.gf_group_bits =
-            (int)((double)cpi->twopass.kf_group_bits *
-                  (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+            (int64_t)(cpi->twopass.kf_group_bits *
+                      (gf_group_err / cpi->twopass.kf_group_error_left));
     }
     else
         cpi->twopass.gf_group_bits = 0;
 
-    cpi->twopass.gf_group_bits = (int)(
+    cpi->twopass.gf_group_bits =
         (cpi->twopass.gf_group_bits < 0)
             ? 0
             : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits);
+                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
 
     /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
      * variability limit (cpi->oxcf.two_pass_vbrmax_section)
      */
-    if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-        cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+    if (cpi->twopass.gf_group_bits >
+        (int64_t)max_bits * cpi->baseline_gf_interval)
+        cpi->twopass.gf_group_bits =
+            (int64_t)max_bits * cpi->baseline_gf_interval;
 
     /* Reset the file position */
     reset_fpf_position(cpi, start_pos);
@@ -2446,7 +2450,7 @@ void vp8_second_pass(VP8_COMP *cpi)
          */
         if (cpi->oxcf.error_resilient_mode)
         {
-            cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits;
+            cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits;
             cpi->twopass.gf_group_error_left =
                                   (int)cpi->twopass.kf_group_error_left;
             cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index b08c7a5897bacc81cf9d71c100c60c028e6bdc4b..a34af64280702b800485f701860420b370e55d43 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -235,13 +235,12 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     MACROBLOCKD *xd = &x->e_mbd;
     unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
     unsigned char *y;
-    int buf_r1, buf_r2, buf_c1, buf_c2;
+    int buf_r1, buf_r2, buf_c1;
 
     /* Clamping to avoid out-of-range data access */
     buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
     buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
     buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
-    buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
     y_stride = 32;
 
     /* Copy to intermediate buffer before searching. */
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7eb7193bf75f8f1323f391dbc25b5d955dc5d711..92f9818577436368e5e665c480e9da8c92256179 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -5362,6 +5362,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
 #endif
 
 #if CONFIG_POSTPROC
+        cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
 #else
 
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index fb8ad357c02b5ae0971d745d44ad1fc7d30154c4..378731d0a7a81c1e813cc43e23763b6b71928406 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -587,7 +587,7 @@ typedef struct VP8_COMP
         /* Error score of frames still to be coded in kf group */
         int64_t kf_group_error_left;
         /* Projected Bits available for a group including 1 GF or ARF */
-        int gf_group_bits;
+        int64_t gf_group_bits;
         /* Bits for the golden frame or ARF */
         int gf_bits;
         int alt_extra_bits;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 673de2b334d09ca58342d398482f5773a9f46bee..4c2527d68ade140a0f2942959d004e77dc77fbe1 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -389,7 +389,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb)
 
 }
 
-static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
 {
     MACROBLOCKD *xd = &x->e_mbd;
     /* Split MV modes currently not supported when RD is nopt enabled,
@@ -1241,7 +1241,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
       != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
         best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
 
-    update_mvcount(cpi, x, &best_ref_mv);
+    update_mvcount(x, &best_ref_mv);
 }
 
 
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index a399a38771dd223c09786bdf6e41d8cdeee0d707..65fd0c5be58b65def0f4271be793c519a3c27a70 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1360,10 +1360,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
          * whichever is smaller.
          */
         int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
-        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+        av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;
 
         if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-            av_key_frame_frequency = cpi->oxcf.key_freq;
+            av_key_frame_frequency = key_freq;
 
         cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
             = av_key_frame_frequency;
@@ -1393,6 +1393,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
         av_key_frame_frequency  /= total_weight;
 
     }
+    /* TODO(marpan): Given the checks above, |av_key_frame_frequency|
+     * should always be above 0. But for now we keep the sanity check in. */
+    if (av_key_frame_frequency == 0)
+        av_key_frame_frequency = 1;
     return av_key_frame_frequency;
 }
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index f0ec7b6e210b980644b3594eefabf635bbd09c34..3d60bebdaf5987b26a7ee0d70027e52e4b9a2f30 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1733,7 +1733,7 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
     }
 }
 
-static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
 {
     if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
     {
@@ -2608,7 +2608,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
         best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
 
-    rd_update_mvcount(cpi, x, &best_ref_mv);
+    rd_update_mvcount(x, &best_ref_mv);
 }
 
 void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index d880ce0c46073949d9a3390ddb4023ef0f1e1112..d06bca5927f2888b5f06b1db9579ca95803b5c60 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -29,7 +29,7 @@
     movsxd      rax, dword ptr arg(2)
     lea         rcx, [rsi + rax*2]
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
@@ -53,7 +53,7 @@
     RESTORE_GOT
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index fe9464b3d995cd2b31805e3dbb27a293432e566f..b41768ce083cedd420ef7d548f3928b1b076d8bc 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -226,7 +226,7 @@ ZIGZAG_LOOP 15
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
     pop         rbp
     ret
 
-; void vp8_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp8_fast_quantize_b_sse2) PRIVATE
-sym(vp8_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_round]
-    mov         rdx, [rdi + vp8_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp8_blockd_qcoeff]
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 inv_zig_zag:
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
new file mode 100644
index 0000000000000000000000000000000000000000..55d57ad62ad6f4590b227a945c1af218499e1ead
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropy.h"
+#include "vp8/encoder/block.h"
+
+#include <mmintrin.h>  /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* x = abs(y) = (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); /* 0b00001110 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); /* 0b00001110 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); /* 0b00000001 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index f21146457e8295032ade7ce565082c62a834f867..dbd171bfcdeef537040d5ba6e8002fbc30e0b9ca 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4):
     %define stack_size 32
     sub         rsp, stack_size
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 8, u
     push        rdi
     push        rsi
@@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
     pop         rbp
 %else
   %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
     RESTORE_XMM
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index 35368894dafde871c3afb8dc2016fafed545e0ce..7b1dc119f080b19b78b3af9ff293404b03e88d07 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3):
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index ce9d9836bdaf5023b69494df9354cf3327dfac3e..bd92b398a0b1c34205d52131ee53ea67ccdd2790 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2):
         ; 0x8000 >> (16 - strength)
         mov         rdx,            16
         sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
         psrlw       xmm5,           xmm4
         movdqa      [rsp + rounding_bit], xmm5
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 01482fcc7471970d3620387c9ac0ae407abc2189..1db61f1614c7a4943ca09e1d4bc73f24f496c468 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -64,7 +64,6 @@ struct vpx_codec_alg_priv
     vp8_stream_info_t       si;
     int                     defer_alloc;
     int                     decoder_init;
-    struct VP8D_COMP       *pbi;
     int                     postproc_cfg_set;
     vp8_postproc_cfg_t      postproc_cfg;
 #if CONFIG_POSTPROC_VISUALIZER
@@ -76,7 +75,9 @@ struct vpx_codec_alg_priv
 #endif
     vpx_image_t             img;
     int                     img_setup;
+    struct frame_buffers    yv12_frame_buffers;
     void                    *user_priv;
+    FRAGMENT_DATA           fragments;
 };
 
 static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t flags)
@@ -215,11 +216,36 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
         {
             vp8_init_ctx(ctx, &mmap);
 
+            /* initialize number of fragments to zero */
+            ctx->priv->alg_priv->fragments.count = 0;
+            /* is input fragments enabled? */
+            ctx->priv->alg_priv->fragments.enabled =
+                    (ctx->priv->alg_priv->base.init_flags &
+                        VPX_CODEC_USE_INPUT_FRAGMENTS);
+
             ctx->priv->alg_priv->defer_alloc = 1;
             /*post processing level initialized to do nothing */
         }
     }
 
+    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
+            (ctx->priv->alg_priv->base.init_flags &
+                    VPX_CODEC_USE_FRAME_THREADING);
+
+    /* for now, disable frame threading */
+    ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0;
+
+    if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads &&
+            (( ctx->priv->alg_priv->base.init_flags &
+                            VPX_CODEC_USE_ERROR_CONCEALMENT)
+                    || ( ctx->priv->alg_priv->base.init_flags &
+                            VPX_CODEC_USE_INPUT_FRAGMENTS) ) )
+    {
+        /* row-based threading, error concealment, and input fragments will
+         * not be supported when using frame-based threading */
+        res = VPX_CODEC_INVALID_PARAM;
+    }
+
     return res;
 }
 
@@ -227,7 +253,7 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
 {
     int i;
 
-    vp8dx_remove_decompressor(ctx->pbi);
+    vp8_remove_decoder_instances(&ctx->yv12_frame_buffers);
 
     for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--)
     {
@@ -343,6 +369,47 @@ static void yuvconfig2image(vpx_image_t               *img,
     img->self_allocd = 0;
 }
 
+static int
+update_fragments(vpx_codec_alg_priv_t  *ctx,
+                 const uint8_t         *data,
+                 unsigned int           data_sz,
+                 vpx_codec_err_t       *res)
+{
+    *res = VPX_CODEC_OK;
+
+    if (ctx->fragments.count == 0)
+    {
+        /* New frame, reset fragment pointers and sizes */
+        vpx_memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+        vpx_memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+    }
+    if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
+    {
+        /* Store a pointer to this fragment and return. We haven't
+         * received the complete frame yet, so we will wait with decoding.
+         */
+        ctx->fragments.ptrs[ctx->fragments.count] = data;
+        ctx->fragments.sizes[ctx->fragments.count] = data_sz;
+        ctx->fragments.count++;
+        if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1)
+        {
+            ctx->fragments.count = 0;
+            *res = VPX_CODEC_INVALID_PARAM;
+            return -1;
+        }
+        return 0;
+    }
+
+    if (!ctx->fragments.enabled)
+    {
+        ctx->fragments.ptrs[0] = data;
+        ctx->fragments.sizes[0] = data_sz;
+        ctx->fragments.count = 1;
+    }
+
+    return 1;
+}
+
 static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                                   const uint8_t         *data,
                                   unsigned int            data_sz,
@@ -353,6 +420,11 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
     unsigned int resolution_change = 0;
     unsigned int w, h;
 
+
+    /* Update the input fragment data */
+    if(update_fragments(ctx, data, data_sz, &res) <= 0)
+        return res;
+
     /* Determine the stream parameters. Note that we rely on peek_si to
      * validate that we have a buffer that does not wrap around the top
      * of the heap.
@@ -360,7 +432,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
     w = ctx->si.w;
     h = ctx->si.h;
 
-    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+    res = ctx->base.iface->dec.peek_si(ctx->fragments.ptrs[0],
+                                       ctx->fragments.sizes[0], &ctx->si);
 
     if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
     {
@@ -412,7 +485,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
         if (!res)
         {
             VP8D_CONFIG oxcf;
-            struct VP8D_COMP* optr;
 
             oxcf.Width = ctx->si.w;
             oxcf.Height = ctx->si.h;
@@ -421,10 +493,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
             oxcf.max_threads = ctx->cfg.threads;
             oxcf.error_concealment =
                     (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
-            oxcf.input_fragments =
-                    (ctx->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
-
-            optr = vp8dx_create_decompressor(&oxcf);
 
             /* If postprocessing was enabled by the application and a
              * configuration has not been provided, default it.
@@ -438,20 +506,17 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
                 ctx->postproc_cfg.noise_level = 0;
             }
 
-            if (!optr)
-                res = VPX_CODEC_ERROR;
-            else
-                ctx->pbi = optr;
+            res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
         }
 
         ctx->decoder_init = 1;
     }
 
-    if (!res && ctx->pbi)
+    if (!res)
     {
+        VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
         if(resolution_change)
         {
-            VP8D_COMP *pbi = ctx->pbi;
             VP8_COMMON *const pc = & pbi->common;
             MACROBLOCKD *const xd  = & pbi->mb;
 #if CONFIG_MULTITHREAD
@@ -541,15 +606,20 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
             pbi->common.error.setjmp = 0;
 
             /* required to get past the first get_free_fb() call */
-            ctx->pbi->common.fb_idx_ref_cnt[0] = 0;
+            pbi->common.fb_idx_ref_cnt[0] = 0;
         }
 
+        /* update the pbi fragment data */
+        pbi->fragments = ctx->fragments;
+
         ctx->user_priv = user_priv;
-        if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
+        if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline))
         {
-            VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
             res = update_error_state(ctx, &pbi->common.error);
         }
+
+        /* get ready for the next series of fragments */
+        ctx->fragments.count = 0;
     }
 
     return res;
@@ -590,7 +660,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
 #endif
         }
 
-        if (0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
+        if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
+                                     &time_stamp, &time_end_stamp, &flags))
         {
             yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
 
@@ -715,14 +786,15 @@ static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx,
 
     vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
-    if (data)
+    if (data && !ctx->yv12_frame_buffers.use_frame_threads)
     {
         vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
         YV12_BUFFER_CONFIG sd;
 
         image2yuvconfig(&frame->img, &sd);
 
-        return vp8dx_set_reference(ctx->pbi, frame->frame_type, &sd);
+        return vp8dx_set_reference(ctx->yv12_frame_buffers.pbi[0],
+                                   frame->frame_type, &sd);
     }
     else
         return VPX_CODEC_INVALID_PARAM;
@@ -736,14 +808,15 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
 
     vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
-    if (data)
+    if (data && !ctx->yv12_frame_buffers.use_frame_threads)
     {
         vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
         YV12_BUFFER_CONFIG sd;
 
         image2yuvconfig(&frame->img, &sd);
 
-        return vp8dx_get_reference(ctx->pbi, frame->frame_type, &sd);
+        return vp8dx_get_reference(ctx->yv12_frame_buffers.pbi[0],
+                                   frame->frame_type, &sd);
     }
     else
         return VPX_CODEC_INVALID_PARAM;
@@ -799,10 +872,11 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
                                                 va_list args)
 {
     int *update_info = va_arg(args, int *);
-    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
 
-    if (update_info)
+    if (update_info && !ctx->yv12_frame_buffers.use_frame_threads)
     {
+        VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+
         *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
             + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
             + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
@@ -819,11 +893,11 @@ static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
                                               va_list args)
 {
     int *ref_info = va_arg(args, int *);
-    VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
-    VP8_COMMON *oci = &pbi->common;
 
-    if (ref_info)
+    if (ref_info && !ctx->yv12_frame_buffers.use_frame_threads)
     {
+        VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+        VP8_COMMON *oci = &pbi->common;
         *ref_info =
             (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
             (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
@@ -844,7 +918,7 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 
     if (corrupted)
     {
-        VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+        VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
         *corrupted = pbi->common.frame_to_show->corrupted;
 
         return VPX_CODEC_OK;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 0659407adc3b7f8a71d6b5147dce0b091d742598..f6feafb6ea3a690304554ea5a6754cfe9705a8f8 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,8 +89,15 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 
+# TODO(johann) make this generic
+ifeq ($(HAVE_SSE2),yes)
+vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
+vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
+endif
+
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 ifeq ($(HAVE_SSE2),yes)
@@ -112,7 +119,6 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 endif
 
-
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
 
 $(eval $(call asm_offsets_template,\
diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c
index eae36a4da4f4e7384bbea805b8c83b7f543e5989..4c29056e563e77eeab54941997044accd8740be4 100644
--- a/vp8_multi_resolution_encoder.c
+++ b/vp8_multi_resolution_encoder.c
@@ -216,7 +216,7 @@ int main(int argc, char **argv)
      * If target bitrate for highest-resolution level is set to 0,
      * (i.e. target_bitrate[0]=0), we skip encoding at that level.
      */
-    unsigned int         target_bitrate[NUM_ENCODERS]={1400, 500, 100};
+    unsigned int         target_bitrate[NUM_ENCODERS]={1000, 500, 100};
     /* Enter the frame rate of the input video */
     int                  framerate = 30;
     /* Set down-sampling factor for each resolution level.
@@ -351,27 +351,26 @@ int main(int argc, char **argv)
         if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
             die_codec(&codec[i], "Failed to set cpu_used");
     }
-    /* Set static thresh for highest-resolution encoder. Set it to 1000 for
-     * better performance. */
-    {
-        unsigned int static_thresh = 1000;
-        if(vpx_codec_control(&codec[0], VP8E_SET_STATIC_THRESHOLD, static_thresh))
-            die_codec(&codec[0], "Failed to set static threshold");
-    }
-    /* Set static thresh = 0 for other encoders for better quality */
-    for ( i=1; i<NUM_ENCODERS; i++)
+
+    /* Set static threshold. */
+    for ( i=0; i<NUM_ENCODERS; i++)
     {
-        unsigned int static_thresh = 0;
+        unsigned int static_thresh = 1;
         if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
             die_codec(&codec[i], "Failed to set static threshold");
     }
+
     /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
-    for ( i=0; i< NUM_ENCODERS; i++)
+    /* Enable denoising for the highest-resolution encoder. */
+    if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
+        die_codec(&codec[0], "Failed to set noise_sensitivity");
+    for ( i=1; i< NUM_ENCODERS; i++)
     {
         if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
             die_codec(&codec[i], "Failed to set noise_sensitivity");
     }
 
+
     frame_avail = 1;
     got_data = 0;
 
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 77b79395e3fb4979cd8fc8b1e0c672abd2ab4c91..2f709bf58da5bfbab451ee0931947d04a2b34e90 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -57,9 +57,9 @@ unsigned int vp9_sad16x3_c(const uint8_t *src_ptr,
 
 
 unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
-                                const int  source_stride,
+                                int  source_stride,
                                 const uint8_t *ref_ptr,
-                                const int  recon_stride,
+                                int  recon_stride,
                                 unsigned int *sse) {
   int sum;
   variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, sse, &sum);
@@ -67,9 +67,9 @@ unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
 }
 
 unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
-                                const int  source_stride,
+                                int  source_stride,
                                 const uint8_t *ref_ptr,
-                                const int  recon_stride,
+                                int  recon_stride,
                                 unsigned int *sse) {
   int sum;
   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, sse, &sum);
@@ -77,11 +77,11 @@ unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
 }
 
 unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
                                           const uint8_t *dst_ptr,
-                                          const int dst_pixels_per_line,
+                                          int dst_pixels_per_line,
                                           unsigned int *sse) {
   uint16_t FData3[16 * 3];  // Temp data buffer used in filtering
   uint8_t temp2[2 * 16];
@@ -98,11 +98,11 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
 }
 
 unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
                                           const uint8_t *dst_ptr,
-                                          const int dst_pixels_per_line,
+                                          int dst_pixels_per_line,
                                           unsigned int *sse) {
   uint16_t FData3[2 * 17];  // Temp data buffer used in filtering
   uint8_t temp2[2 * 16];
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 9cbf44c464d2ec6f3906368ba65265764d878bdc..e6dcff4d1a16c9a2cbefbd8af7b2d8e34b3c6eef 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -14,7 +14,7 @@ struct loop_filter_info;
 /* Encoder forward decls */
 struct block;
 struct macroblock;
-struct variance_vtable;
+struct vp9_variance_vtable;
 
 #define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
 union int_mv;
@@ -470,25 +470,25 @@ specialize vp9_sad8x8x8 sse4
 prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
-prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad64x64x4d sse2
 
-prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d sse2
 
-prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x16x4d sse2
 
-prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x8x4d sse2
 
-prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x16x4d sse2
 
-prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse2
 
-prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
@@ -577,16 +577,16 @@ specialize vp9_short_walsh8x4_x8
 #
 # Motion search
 #
-prototype int vp9_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
 
-prototype int vp9_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_refining_search_sad sse3
 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
 
-prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_diamond_search_sad sse3
 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
 
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fa24f4cd0da4bc56f13d74ba3f17470b2cc4812c..32f00e2893dade99ee9085866fee80cbc39b893c 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -113,7 +113,7 @@
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
@@ -198,7 +198,7 @@
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
@@ -542,7 +542,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
@@ -681,7 +681,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     mov         rcx, 0x0400040
 
     movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
+    movq        xmm5, rcx
     packsswb    xmm4, xmm4
     pshuflw     xmm0, xmm4, 0b              ;k0_k1
     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index d4212a52eea488176762121a634fdf51e1de5a63..86806d2d0143b6906688530ece299628f9090f32 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -565,7 +565,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
 };
 
 static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, unsigned int mb_col,
+                                int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
   int n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -688,7 +688,7 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
 }
 
 static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, unsigned int mb_col,
+                                int mb_row, int mb_col,
                                 BOOL_DECODER* const bc) {
   int n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4d0a299e8437325d20aec04dae42e077d3091ce4..337276d5919b81b29e3d28ae633a9c65d71d5ffd 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -751,7 +751,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
       ((cm->current_video_frame > 0) &&
        (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
        ((cpi->twopass.this_frame_stats->intra_error /
-         cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+        2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
   } else
@@ -1650,8 +1651,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   if ((cpi->twopass.kf_group_bits > 0) &&
       (cpi->twopass.kf_group_error_left > 0)) {
     cpi->twopass.gf_group_bits =
-      (int)((double)cpi->twopass.kf_group_bits *
-            (gf_group_err / cpi->twopass.kf_group_error_left));
+      (int64_t)(cpi->twopass.kf_group_bits *
+                (gf_group_err / cpi->twopass.kf_group_error_left));
   } else
     cpi->twopass.gf_group_bits = 0;
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index a51c786563d98a55a3749f8f7c4a1a8ef1706d60..1bca9d267c2a300c5e32b62dd4af99b284b94775 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -174,7 +174,7 @@ vp9_lookahead_peek(struct lookahead_ctx *ctx,
                    int                   index) {
   struct lookahead_entry *buf = NULL;
 
-  assert(index < ctx->max_sz);
+  assert(index < (int)ctx->max_sz);
   if (index < (int)ctx->sz) {
     index += ctx->read_idx;
     if (index >= (int)ctx->max_sz)
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 496be950c63949876a8472fa0398c7edfaac2824..59e33a46480c6fecfe7fa27e7fbfcba89d5a794a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -756,7 +756,7 @@ static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
     error += this_diff * this_diff;
   }
 
-  return error > INT_MAX ? INT_MAX : error;
+  return error > INT_MAX ? INT_MAX : (int)error;
 }
 
 #define DEBUG_ERROR 0
@@ -3067,9 +3067,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
-  ctx->single_pred_diff = comp_pred_diff[SINGLE_PREDICTION_ONLY];
-  ctx->comp_pred_diff   = comp_pred_diff[COMP_PREDICTION_ONLY];
-  ctx->hybrid_pred_diff = comp_pred_diff[HYBRID_PREDICTION];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
+  ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
 
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index dc21f02f638b804ead98c58fb9b2889ee9fc2791..af5526dce333bf133eff44b957ea3c63951ab766 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -383,7 +383,7 @@ void vp9_sad4x4x8_c(const uint8_t *src_ptr,
 
 void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       const uint8_t *ref_ptr[],
+                       const uint8_t* const ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
   sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
@@ -398,7 +398,7 @@ void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       const uint8_t *ref_ptr[],
+                       const uint8_t* const ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
   sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
@@ -413,7 +413,7 @@ void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       const uint8_t *ref_ptr[],
+                       const uint8_t* const ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
@@ -428,7 +428,7 @@ void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const uint8_t *ref_ptr[],
+                      const uint8_t* const ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
@@ -443,7 +443,7 @@ void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const uint8_t *ref_ptr[],
+                     const uint8_t* const ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
@@ -458,7 +458,7 @@ void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const uint8_t *ref_ptr[],
+                      const uint8_t* const ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
@@ -473,7 +473,7 @@ void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
 
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const uint8_t *ref_ptr[],
+                     const uint8_t* const ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 8b32524a24d5eab6d5dc2409c12e281c65617878..13dabbda41d711a8dce115cf2892182450083a14 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -33,7 +33,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
 
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
-                                     const uint8_t ** ref_ptr,
+                                     const uint8_t* const ref_ptr[],
                                      int  ref_stride, unsigned int *sad_array);
 
 typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
@@ -63,7 +63,7 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    const uint8_t *ref_ptr,
                                                    int  ref_stride);
 
-typedef struct variance_vtable {
+typedef struct vp9_variance_vtable {
     vp9_sad_fn_t            sdf;
     vp9_variance_fn_t       vf;
     vp9_subpixvariance_fn_t svf;
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
index 57b81a566282679b8e573cb3f2bfc0eed4544c3a..bbd6086dafd3b3dd0689ee969f4054f4e430e5ca 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -29,7 +29,7 @@
     movsxd      rax, dword ptr arg(2)
     lea         rcx, [rsi + rax*2]
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
@@ -53,7 +53,7 @@
     RESTORE_GOT
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.asm b/vp9/encoder/x86/vp9_quantize_sse2.asm
index 060acc2acfaef6a2afc59e30c602c751086edbc5..2a686f5a87b0fd425f3a11f60af9eebf2742223b 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ b/vp9/encoder/x86/vp9_quantize_sse2.asm
@@ -27,7 +27,7 @@ sym(vp9_regular_quantize_b_sse2):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -46,7 +46,7 @@ sym(vp9_regular_quantize_b_sse2):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -223,7 +223,7 @@ ZIGZAG_LOOP 15
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
@@ -247,7 +247,7 @@ sym(vp9_fast_quantize_b_sse2):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %else
@@ -261,7 +261,7 @@ sym(vp9_fast_quantize_b_sse2):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -361,7 +361,7 @@ sym(vp9_fast_quantize_b_sse2):
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
diff --git a/vp9/encoder/x86/vp9_quantize_sse4.asm b/vp9/encoder/x86/vp9_quantize_sse4.asm
index 1d43ce95863b60063777a552f1cf0eb950302f4c..d7779bd0df11519c30739caa96ec186df8c27a5d 100644
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ b/vp9/encoder/x86/vp9_quantize_sse4.asm
@@ -31,7 +31,7 @@ sym(vp9_regular_quantize_b_sse4):
     %define stack_size 32
     sub         rsp, stack_size
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 8, u
     push        rdi
     push        rsi
@@ -43,7 +43,7 @@ sym(vp9_regular_quantize_b_sse4):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -238,7 +238,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
     pop         rbp
 %else
   %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
     RESTORE_XMM
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 41edbc13ef25a3cb56b8406575ec32f4540b8f78..e082af1f5c46f65d418a3020874380c7ea0faba9 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -27,7 +27,7 @@ sym(vp9_fast_quantize_b_ssse3):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -38,7 +38,7 @@ sym(vp9_fast_quantize_b_ssse3):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -122,7 +122,7 @@ sym(vp9_fast_quantize_b_ssse3):
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm
index 5d84172701c377d94eb62fdbb5cbc1d775417886..2b90a5d54789fa46787e0e2e7efe1dc6c978fdb8 100644
--- a/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/vp9/encoder/x86/vp9_sad_sse3.asm
@@ -33,7 +33,7 @@
     movsxd      rax,        dword ptr arg(1)    ; src_stride
     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 7, u
     %define     src_ptr     rcx
     %define     src_stride  rdx
@@ -76,7 +76,7 @@
     pop         rsi
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index a559d5d5ae1b71c5ac7a60747fb751553ef757b7..d2d13b3839425b5d297e1af3823963965b049ea0 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -50,7 +50,7 @@ sym(vp9_temporal_filter_apply_sse2):
         ; 0x8000 >> (16 - strength)
         mov         rdx,            16
         sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
         psrlw       xmm5,           xmm4
         movdqa      [rsp + rounding_bit], xmm5
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index d2dec6f5dd0517375332cfd77ddc60b432337089..ddbc0a6d6b0584a5dbb16878e0928cbb9ac6adaa 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -65,12 +65,17 @@ extern "C" {
    *
    *  The available flags are specified by VPX_CODEC_USE_* defines.
    */
+#define VPX_CODEC_CAP_FRAME_THREADING   0x200000 /**< Can support frame-based
+                                                      multi-threading */
+
 #define VPX_CODEC_USE_POSTPROC   0x10000 /**< Postprocess decoded frame */
 #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded
   frames */
 #define VPX_CODEC_USE_INPUT_FRAGMENTS   0x40000 /**< The input frame should be
   passed to the decoder one
   fragment at a time */
+#define VPX_CODEC_USE_FRAME_THREADING   0x80000 /**< Enable frame-based
+                                                     multi-threading */
 
   /*!\brief Stream properties
    *
diff --git a/vpx_ports/emms.asm b/vpx_ports/emms.asm
index efad1a503db477b7b02c16f4d7ad4a6a9ba934bd..db8da287375206fad61ad631ba0abcb7c4f3709e 100644
--- a/vpx_ports/emms.asm
+++ b/vpx_ports/emms.asm
@@ -18,7 +18,7 @@ sym(vpx_reset_mmx_state):
     ret
 
 
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
 global sym(vpx_winx64_fldcw) PRIVATE
 sym(vpx_winx64_fldcw):
     sub   rsp, 8
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 0c9fe377405693d7fcb406d823519d3c2d623a9e..eccbfa35c2080965025fd4d7c831a89dfe78de19 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -78,6 +78,17 @@
 %endif
 
 
+; LIBVPX_YASM_WIN64
+; Set LIBVPX_YASM_WIN64 if output is Windows 64bit so the code will work if x64
+; or win64 is defined on the Yasm command line.
+%ifidn __OUTPUT_FORMAT__,win64
+%define LIBVPX_YASM_WIN64 1
+%elifidn __OUTPUT_FORMAT__,x64
+%define LIBVPX_YASM_WIN64 1
+%else
+%define LIBVPX_YASM_WIN64 0
+%endif
+
 ; sym()
 ; Return the proper symbol name for the target ABI.
 ;
@@ -90,7 +101,7 @@
 %define sym(x) x
 %elifidn __OUTPUT_FORMAT__,elfx32
 %define sym(x) x
-%elifidn __OUTPUT_FORMAT__,x64
+%elif LIBVPX_YASM_WIN64
 %define sym(x) x
 %else
 %define sym(x) _ %+ x
@@ -114,7 +125,7 @@
     %define PRIVATE :hidden
   %elifidn __OUTPUT_FORMAT__,elfx32
     %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,x64
+  %elif LIBVPX_YASM_WIN64
     %define PRIVATE
   %else
     %define PRIVATE :private_extern
@@ -131,7 +142,7 @@
 %else
   ; 64 bit ABI passes arguments in registers. This is a workaround to get up
   ; and running quickly. Relies on SHADOW_ARGS_TO_STACK
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     %define arg(x) [rbp+16+8*x]
   %else
     %define arg(x) [rbp-8-8*x]
@@ -230,6 +241,12 @@
   %elifidn __OUTPUT_FORMAT__,elfx32
     %define WRT_PLT wrt ..plt
     %define HIDDEN_DATA(x) x:data hidden
+  %elifidn __OUTPUT_FORMAT__,macho64
+    %ifdef CHROMIUM
+      %define HIDDEN_DATA(x) x:private_extern
+    %else
+      %define HIDDEN_DATA(x) x
+    %endif
   %else
     %define HIDDEN_DATA(x) x
   %endif
@@ -251,7 +268,7 @@
   %endm
   %define UNSHADOW_ARGS
 %else
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
   %macro SHADOW_ARGS_TO_STACK 1 ; argc
     %if %1 > 0
         mov arg(0),rcx
@@ -307,7 +324,7 @@
 ; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
 ; value. Typically we follow this up with 'push rbp' - re-aligning the stack -
 ; but in some cases this is not done and unaligned movs must be used.
-%ifidn __OUTPUT_FORMAT__,x64
+%if LIBVPX_YASM_WIN64
 %macro SAVE_XMM 1-2 a
   %if %1 < 6
     %error Only xmm registers 6-15 must be preserved
diff --git a/vpxenc.c b/vpxenc.c
index 2f3ae0f8cd59232a01f066d7e9bcb4849895d7b4..0df0a982ff929805f71bfbc2f605547b739cdde5 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -10,7 +10,7 @@
 
 #include "vpx_config.h"
 
-#if defined(_WIN32) || !CONFIG_OS_SUPPORT
+#if defined(_WIN32) || defined(__OS2__) || !CONFIG_OS_SUPPORT
 #define USE_POSIX_MMAP 0
 #else
 #define USE_POSIX_MMAP 1