diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 1926f20bdf7a4c5829439039e688b442ab2ede61..1cba5d35ac5eb02ce6c36ca87dbbe4935a932515 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -148,6 +148,7 @@ typedef enum {
 #define VP8_YMODES  (B_PRED + 1)
 #define VP8_UV_MODES (TM_PRED + 1)
 #define VP8_I8X8_MODES (TM_PRED + 1)
+#define VP8_I32X32_MODES (TM_PRED + 1)
 
 #define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
 
@@ -293,6 +294,11 @@ typedef struct {
     INTERPOLATIONFILTERTYPE interp_filter;
 #endif
 
+#if CONFIG_SUPERBLOCKS
+  // FIXME: a SB should carry a single array of 4 MB_MODE_INFOs so
+  // that only one encoded_as_sb flag is needed.
+  unsigned char encoded_as_sb;
+#endif
 } MB_MODE_INFO;
 
 typedef struct {
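
The new encoded_as_sb flag lives in each MB_MODE_INFO, so (until the FIXME
above is resolved) every superblock path must replicate the top-left MB's
mode info into the other three slots of the 2x2 grid, as the decoder changes
below do. A minimal sketch of that pattern; the struct is a stand-in, not
the codec's real MODE_INFO:

    /* Stand-in type, only so the sketch is self-contained. */
    typedef struct {
      struct { unsigned char encoded_as_sb; } mbmi;
    } mode_info_sketch;

    /* Copy the top-left MB's info across the 2x2 grid of a superblock;
     * mirrors the mi[1] = mi[stride] = mi[stride + 1] = mi[0] assignments
     * in decodemv.c/decodframe.c. `stride` is the mode-info row stride. */
    static void copy_sb_mode_info(mode_info_sketch *mi, int stride) {
      mi[stride + 1] = mi[stride] = mi[1] = mi[0];
    }
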
diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c
index 8d43ce8273a8a200b550898d15fc24a46467e47e..5627aa43a4a073062efb96511ae33c25ab9f7c48 100644
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -227,6 +227,14 @@ const vp8_tree_index vp8_mv_ref_tree[8] = {
   -NEWMV, -SPLITMV
 };
 
+#if CONFIG_SUPERBLOCKS
+const vp8_tree_index vp8_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
+};
+#endif
+
 const vp8_tree_index vp8_sub_mv_ref_tree[6] = {
   -LEFT4X4, 2,
   -ABOVE4X4, 4,
@@ -236,12 +244,18 @@ const vp8_tree_index vp8_sub_mv_ref_tree[6] = {
 
 struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
 struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES];
+#endif
 struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
 struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
-struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_UV_MODES];
+struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_I8X8_MODES];
 struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS];
 
 struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_mv_ref_encoding_array  [VP8_MVREFS];
+#endif
 struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
 
 
@@ -253,11 +267,18 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) {
     vp8_ymode_tree, x->fc.ymode_prob, bct, y_mode_cts, 256, 1);
   {
     int i;
-    for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++) {
       vp8_tree_probs_from_distribution(
         VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree,
         x->kf_ymode_prob[i], bct, kf_y_mode_cts[i],
         256, 1);
+#if CONFIG_SUPERBLOCKS
+      vp8_tree_probs_from_distribution(
+        VP8_I32X32_MODES, vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree,
+        x->sb_kf_ymode_prob[i], bct, kf_y_mode_cts[i],
+        256, 1);
+#endif
+    }
   }
   {
     int i;
@@ -360,6 +381,9 @@ void vp8_entropy_mode_init() {
   vp8_tokens_from_tree(vp8_bmode_encodings,   vp8_bmode_tree);
   vp8_tokens_from_tree(vp8_ymode_encodings,   vp8_ymode_tree);
   vp8_tokens_from_tree(vp8_kf_ymode_encodings, vp8_kf_ymode_tree);
+#if CONFIG_SUPERBLOCKS
+  vp8_tokens_from_tree(vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree);
+#endif
   vp8_tokens_from_tree(vp8_uv_mode_encodings,  vp8_uv_mode_tree);
   vp8_tokens_from_tree(vp8_i8x8_mode_encodings,  vp8_i8x8_mode_tree);
   vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree);
@@ -370,6 +394,10 @@ void vp8_entropy_mode_init() {
 
   vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array,
                               vp8_mv_ref_tree, NEARESTMV);
+#if CONFIG_SUPERBLOCKS
+  vp8_tokens_from_tree_offset(vp8_sb_mv_ref_encoding_array,
+                              vp8_sb_mv_ref_tree, NEARESTMV);
+#endif
   vp8_tokens_from_tree_offset(vp8_sub_mv_ref_encoding_array,
                               vp8_sub_mv_ref_tree, LEFT4X4);
 }
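
vp8_sb_mv_ref_tree is a standard VP8 token tree: negative entries are
negated symbol values (leaves), positive entries index the next node pair.
It intentionally has no SPLITMV leaf, since a 32x32 superblock is never
split into per-partition motion vectors. A sketch of the walk that
vp8_treed_read() and the token builders rely on; the reader callback and
the tree-index type are stand-ins:

    /* Stand-in for the tree-index type; leaves are stored negated. */
    typedef signed char tree_index_sketch;

    /* Walk a token tree: at node pair i, probs[i >> 1] is P(bit == 0);
     * read_bit is whatever boolean decoder is in use. */
    static int treed_read_sketch(int (*read_bit)(void *ctx, int prob),
                                 void *ctx,
                                 const tree_index_sketch *tree,
                                 const unsigned char *probs) {
      tree_index_sketch i = 0;
      while ((i = tree[i + read_bit(ctx, probs[i >> 1])]) > 0)
        continue;
      return -i;  /* leaf reached: return the symbol */
    }
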
diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h
index f9cc263b9e43a8896733797aa0cbba54d9a01a16..430c949a6263186e562f04ec1fb21d5075148608 100644
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -40,21 +40,25 @@ extern const vp8_tree_index vp8_bmode_tree[];
 extern const vp8_tree_index  vp8_ymode_tree[];
 extern const vp8_tree_index  vp8_kf_ymode_tree[];
 extern const vp8_tree_index  vp8_uv_mode_tree[];
+#define vp8_sb_ymode_tree vp8_uv_mode_tree
 extern const vp8_tree_index  vp8_i8x8_mode_tree[];
 extern const vp8_tree_index  vp8_mbsplit_tree[];
 extern const vp8_tree_index  vp8_mv_ref_tree[];
+extern const vp8_tree_index  vp8_sb_mv_ref_tree[];
 extern const vp8_tree_index  vp8_sub_mv_ref_tree[];
 
 extern struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
 extern struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
+extern struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES];
 extern struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
-extern struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_UV_MODES];
+extern struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_I8X8_MODES];
 extern struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
 extern struct vp8_token_struct vp8_mbsplit_encodings  [VP8_NUMMBSPLITS];
 
 /* Inter mode values do not start at zero */
 
 extern struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
+extern struct vp8_token_struct vp8_sb_mv_ref_encoding_array    [VP8_MVREFS];
 extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
 
 void vp8_entropy_mode_init(void);
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index b71ef750df084e943cbfaa051d548759b3d6b2d2..d28024cda565c7adb744463817a9735375598ed7 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -47,6 +47,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) {
   rtcd->recon.recon4      = vp8_recon4b_c;
   rtcd->recon.recon_mb    = vp8_recon_mb_c;
   rtcd->recon.recon_mby   = vp8_recon_mby_c;
+#if CONFIG_SUPERBLOCKS
+  rtcd->recon.build_intra_predictors_sby_s =
+    vp8_build_intra_predictors_sby_s;
+  rtcd->recon.build_intra_predictors_sbuv_s =
+    vp8_build_intra_predictors_sbuv_s;
+#endif
   rtcd->recon.build_intra_predictors_mby =
     vp8_build_intra_predictors_mby;
 #if CONFIG_COMP_INTRA_PRED
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index 05c00ef4e1ee79bede20d7853084f9e1524f0531..d9c4b54be9b720777542412a0c0fa6623d7a346b 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -325,7 +325,13 @@ void vp8_loop_filter_frame
           lfi.lim = lfi_n->lim[filter_level];
           lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-          if (mb_col > 0)
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
             vp8_loop_filter_mbv_c
             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
@@ -344,7 +350,13 @@ void vp8_loop_filter_frame
           }
 
           /* don't apply across umv border */
-          if (mb_row > 0)
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
             vp8_loop_filter_mbh_c
             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
@@ -362,7 +374,13 @@ void vp8_loop_filter_frame
           }
         } else {
           // FIXME: Not 8x8 aware
-          if (mb_col > 0)
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
 
@@ -371,7 +389,13 @@ void vp8_loop_filter_frame
             (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
 
           /* don't apply across umv border */
-          if (mb_row > 0)
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
 
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index a36347dca8e5f8b599197c2301e844d61817edf0..b7a543220a0f2b98c18f801c01c0160aa6f193da 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -226,12 +226,15 @@ typedef struct VP8Common {
 
   /* Y,U,V,Y2 */
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context;  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT_PLANES left_context[2];  /* 1 per MB row within a SB */
 
   /* keyframe block modes are predicted by their above, left neighbors */
 
   vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES - 1];
   vp8_prob kf_ymode_prob[8][VP8_YMODES - 1]; /* keyframe "" */
+#if CONFIG_SUPERBLOCKS
+  vp8_prob sb_kf_ymode_prob[8][VP8_I32X32_MODES - 1];
+#endif
   int kf_ymode_probs_index;
   int kf_ymode_probs_update;
   vp8_prob kf_uv_mode_prob[VP8_YMODES] [VP8_UV_MODES - 1];
@@ -239,6 +242,9 @@ typedef struct VP8Common {
   vp8_prob prob_intra_coded;
   vp8_prob prob_last_coded;
   vp8_prob prob_gf_coded;
+#if CONFIG_SUPERBLOCKS
+  vp8_prob sb_coded;
+#endif
 
   // Context probabilities when using predictive coding of segment id
   vp8_prob segment_pred_probs[PREDICTION_PROBS];
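
left_context becomes a two-entry array because a superblock spans two MB
rows. The 4 MBs of a SB are visited top-left, top-right, bottom-left,
bottom-right, so `i >> 1` selects the row and hence the context (see the
decodframe.c change below). A trivial standalone check of that indexing:

    #include <stdio.h>

    int main(void) {
      /* SB traversal order used by the codec's 4-MB loops. */
      const char *pos[4] = { "top-left", "top-right",
                             "bottom-left", "bottom-right" };
      int i;
      for (i = 0; i < 4; i++)
        printf("MB %d (%s) -> left_context[%d]\n", i, pos[i], i >> 1);
      return 0;
    }
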
diff --git a/vp8/common/pred_common.c b/vp8/common/pred_common.c
index ac5d8600980b1c9628d67d930be22ae5ae170881..cb80a0f7e92e438d823456989181c268abfb2e1e 100644
--- a/vp8/common/pred_common.c
+++ b/vp8/common/pred_common.c
@@ -224,10 +224,24 @@ void set_pred_flag(MACROBLOCKD *const xd,
   switch (pred_id) {
     case PRED_SEG_ID:
       xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride].mbmi.seg_id_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride+1].mbmi.seg_id_predicted = pred_flag;
+      }
+#endif
       break;
 
     case PRED_REF:
       xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride].mbmi.ref_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride+1].mbmi.ref_predicted = pred_flag;
+      }
+#endif
       break;
 
     case PRED_MBSKIP:
diff --git a/vp8/common/recon.c b/vp8/common/recon.c
index 8fc320863b85e2d375a63e699558aa11c1a97c0c..cf2d2fb85109940af7a42419adf05272d2386e76 100644
--- a/vp8/common/recon.c
+++ b/vp8/common/recon.c
@@ -124,6 +124,52 @@ void vp8_recon2b_c
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y;
+  BLOCKD *b = &xd->block[0];
+  int stride = b->dst_stride;
+  short *diff = b->diff;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 16;
+  }
+}
+
+void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, i;
+  uint8_t *dst = udst;
+
+  for (i = 0; i < 2; i++, dst = vdst) {
+    BLOCKD *b = &xd->block[16 + 4 * i];
+    int stride = b->dst_stride;
+    short *diff = b->diff;
+
+    for (y = 0; y < 8; y++) {
+      for (x = 0; x < 8; x++) {
+        int a = dst[x] + diff[x];
+        if (a < 0)
+          a = 0;
+        else if (a > 255)
+          a = 255;
+        dst[x] = a;
+      }
+      dst += stride;
+      diff += 8;
+    }
+  }
+}
+#endif
+
 void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
 #if ARCH_ARM
   BLOCKD *b = &xd->block[0];
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index 2626a218dddefb52966b4b1af7d015edc89ee8c9..3527fc14d872ab8df39bae31983509b330aa3bfa 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -100,6 +100,11 @@ extern prototype_recon_macroblock(vp8_recon_recon_mb);
 #endif
 extern prototype_recon_macroblock(vp8_recon_recon_mby);
 
+#ifndef vp8_recon_build_intra_predictors_sby_s
+#define vp8_recon_build_intra_predictors_sby_s vp8_build_intra_predictors_sby_s
+#endif
+extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sby_s);
+
 #ifndef vp8_recon_build_intra_predictors_mby
 #define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
 #endif
@@ -126,6 +131,11 @@ extern prototype_build_intra_predictors\
 extern prototype_build_intra_predictors\
 (vp8_recon_build_intra_predictors_mby_s);
 
+#ifndef vp8_recon_build_intra_predictors_sbuv_s
+#define vp8_recon_build_intra_predictors_sbuv_s vp8_build_intra_predictors_sbuv_s
+#endif
+extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sbuv_s);
+
 #ifndef vp8_recon_build_intra_predictors_mbuv
 #define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
 #endif
@@ -214,10 +224,16 @@ typedef struct vp8_recon_rtcd_vtable {
   vp8_recon_fn_t       recon4;
   vp8_recon_mb_fn_t    recon_mb;
   vp8_recon_mb_fn_t    recon_mby;
+#if CONFIG_SUPERBLOCKS
+  vp8_build_intra_pred_fn_t  build_intra_predictors_sby_s;
+#endif
   vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
   vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
 #if CONFIG_COMP_INTRA_PRED
   vp8_build_intra_pred_fn_t  build_comp_intra_predictors_mby;
+#endif
+#if CONFIG_SUPERBLOCKS
+  vp8_build_intra_pred_fn_t  build_intra_predictors_sbuv_s;
 #endif
   vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv_s;
   vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv;
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 0d82db78475b566cb7cf54c3185cf0a3c653e0e6..1b5ef837fb8549ada3d4f1b72e580e57b0dcacf8 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -759,6 +759,56 @@ void vp8_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
   vp8_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int n;
+
+  for (n = 0; n < 4; n++)
+  {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+    vp8_build_1st_inter16x16_predictors_mb(x,
+      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_ystride, dst_uvstride);
+    if (x->mode_info_context->mbmi.second_ref_frame) {
+      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+      vp8_build_2nd_inter16x16_predictors_mb(x,
+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_ystride, dst_uvstride);
+    }
+  }
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+}
+#endif
+
 /*
  * The following functions should be called after an initial
  * call to vp8_build_inter16x16_predictors_mb() or _mby()/_mbuv().
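
vp8_build_inter32x32_predictors_sb() assembles the 32x32 prediction from
four 16x16 MB predictions, offsetting source and destination by
(n & 1, n >> 1) MB units: 16 pixels per step in luma, 8 in chroma. A
standalone check of that offset arithmetic; the strides are example values,
not taken from the codec:

    #include <stdio.h>

    int main(void) {
      const int y_stride = 64, uv_stride = 32;  /* example strides */
      int n;
      for (n = 0; n < 4; n++) {
        const int x_idx = n & 1, y_idx = n >> 1;
        printf("MB %d: luma +%d, chroma +%d\n", n,
               y_idx * 16 * y_stride + x_idx * 16,
               y_idx * 8 * uv_stride + x_idx * 8);
      }
      return 0;
    }
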
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index 96bebc5be2ff8365b0addb3e05618466a9344184..d858cd153a586769cab1dd9fd6e3d1a5eb050dce 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -207,17 +207,18 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd,
   }
 }
 
-void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd,
-                                             unsigned char *ypred_ptr,
-                                             int y_stride, int mode) {
+void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
+                                         unsigned char *src, int src_stride,
+                                         unsigned char *ypred_ptr,
+                                         int y_stride, int mode, int bsize) {
 
-  unsigned char *yabove_row = xd->dst.y_buffer - xd->dst.y_stride;
-  unsigned char yleft_col[16];
+  unsigned char *yabove_row = src - src_stride;
+  unsigned char yleft_col[32];
   unsigned char ytop_left = yabove_row[-1];
   int r, c, i;
 
-  for (i = 0; i < 16; i++) {
-    yleft_col[i] = xd->dst.y_buffer [i * xd->dst.y_stride - 1];
+  for (i = 0; i < bsize; i++) {
+    yleft_col[i] = src[i * src_stride - 1];
   }
 
   /* for Y */
@@ -227,58 +228,58 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd,
       int i;
       int shift;
       int average = 0;
-
+      int log2_bsize_minus_1;
+
+      assert(bsize == 8 || bsize == 16 || bsize == 32);
+      if (bsize == 8) {
+        log2_bsize_minus_1 = 2;
+      } else if (bsize == 16) {
+        log2_bsize_minus_1 = 3;
+      } else /* bsize == 32 */ {
+        log2_bsize_minus_1 = 4;
+      }
 
       if (xd->up_available || xd->left_available) {
         if (xd->up_available) {
-          for (i = 0; i < 16; i++) {
+          for (i = 0; i < bsize; i++) {
             average += yabove_row[i];
           }
         }
 
         if (xd->left_available) {
-          for (i = 0; i < 16; i++) {
+          for (i = 0; i < bsize; i++) {
             average += yleft_col[i];
           }
         }
-        shift = 3 + xd->up_available + xd->left_available;
+        shift = log2_bsize_minus_1 + xd->up_available + xd->left_available;
         expected_dc = (average + (1 << (shift - 1))) >> shift;
       } else {
         expected_dc = 128;
       }
 
-      for (r = 0; r < 16; r++) {
-        vpx_memset(ypred_ptr, expected_dc, 16);
-        ypred_ptr += y_stride; /*16;*/
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bsize);
+        ypred_ptr += y_stride;
       }
     }
     break;
     case V_PRED: {
-
-      for (r = 0; r < 16; r++) {
-
-        ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
-        ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
-        ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
-        ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+      for (r = 0; r < bsize; r++) {
+        vpx_memcpy(ypred_ptr, yabove_row, bsize);
         ypred_ptr += y_stride;
       }
     }
     break;
     case H_PRED: {
-
-      for (r = 0; r < 16; r++) {
-
-        vpx_memset(ypred_ptr, yleft_col[r], 16);
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bsize);
         ypred_ptr += y_stride;
       }
-
     }
     break;
     case TM_PRED: {
-
-      for (r = 0; r < 16; r++) {
-        for (c = 0; c < 16; c++) {
+      for (r = 0; r < bsize; r++) {
+        for (c = 0; c < bsize; c++) {
           int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
 
           if (pred < 0)
@@ -292,31 +293,30 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd,
 
         ypred_ptr += y_stride;
       }
-
     }
     break;
     case D45_PRED: {
-      d45_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D135_PRED: {
-      d135_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D117_PRED: {
-      d117_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D153_PRED: {
-      d153_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D27_PRED: {
-      d27_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D63_PRED: {
-      d63_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case I8X8_PRED:
@@ -332,25 +332,36 @@ void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd,
 }
 
 void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mby_internal(xd, xd->predictor, 16,
-                                          xd->mode_info_context->mbmi.mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->predictor, 16,
+                                      xd->mode_info_context->mbmi.mode, 16);
 }
 
 void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mby_internal(xd, xd->dst.y_buffer,
-                                          xd->dst.y_stride,
-                                          xd->mode_info_context->mbmi.mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 16);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) {
+  vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride,
+                                      x->dst.y_buffer, x->dst.y_stride,
+                                      x->mode_info_context->mbmi.mode, 32);
+}
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
 void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
   unsigned char predictor[2][256];
   int i;
 
-  vp8_build_intra_predictors_mby_internal(
-    xd, predictor[0], 16, xd->mode_info_context->mbmi.mode);
-  vp8_build_intra_predictors_mby_internal(
-    xd, predictor[1], 16, xd->mode_info_context->mbmi.second_mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[0], 16,
+                                      xd->mode_info_context->mbmi.mode, 16);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[1], 16,
+                                      xd->mode_info_context->mbmi.second_mode, 16);
 
   for (i = 0; i < 256; i++) {
     xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
@@ -362,172 +373,37 @@ void vp8_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
                                               unsigned char *upred_ptr,
                                               unsigned char *vpred_ptr,
                                               int uv_stride,
-                                              int mode) {
-  YV12_BUFFER_CONFIG * dst = &xd->dst;
-  unsigned char *uabove_row = dst->u_buffer - dst->uv_stride;
-  unsigned char uleft_col[16];
-  unsigned char utop_left = uabove_row[-1];
-  unsigned char *vabove_row = dst->v_buffer - dst->uv_stride;
-  unsigned char vleft_col[20];
-  unsigned char vtop_left = vabove_row[-1];
-
-  int i, j;
-
-  for (i = 0; i < 8; i++) {
-    uleft_col[i] = dst->u_buffer [i * dst->uv_stride - 1];
-    vleft_col[i] = dst->v_buffer [i * dst->uv_stride - 1];
-  }
-
-  switch (mode) {
-    case DC_PRED: {
-      int expected_udc;
-      int expected_vdc;
-      int i;
-      int shift;
-      int Uaverage = 0;
-      int Vaverage = 0;
-
-      if (xd->up_available) {
-        for (i = 0; i < 8; i++) {
-          Uaverage += uabove_row[i];
-          Vaverage += vabove_row[i];
-        }
-      }
-
-      if (xd->left_available) {
-        for (i = 0; i < 8; i++) {
-          Uaverage += uleft_col[i];
-          Vaverage += vleft_col[i];
-        }
-      }
-
-      if (!xd->up_available && !xd->left_available) {
-        expected_udc = 128;
-        expected_vdc = 128;
-      } else {
-        shift = 2 + xd->up_available + xd->left_available;
-        expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
-        expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
-      }
-
-
-      /*vpx_memset(upred_ptr,expected_udc,64);*/
-      /*vpx_memset(vpred_ptr,expected_vdc,64);*/
-      for (i = 0; i < 8; i++) {
-        vpx_memset(upred_ptr, expected_udc, 8);
-        vpx_memset(vpred_ptr, expected_vdc, 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-    }
-    break;
-    case V_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        vpx_memcpy(upred_ptr, uabove_row, 8);
-        vpx_memcpy(vpred_ptr, vabove_row, 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-
-    }
-    break;
-    case H_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        vpx_memset(upred_ptr, uleft_col[i], 8);
-        vpx_memset(vpred_ptr, vleft_col[i], 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-    }
-
-    break;
-    case TM_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          int predu = uleft_col[i] + uabove_row[j] - utop_left;
-          int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
-          if (predu < 0)
-            predu = 0;
-
-          if (predu > 255)
-            predu = 255;
-
-          if (predv < 0)
-            predv = 0;
-
-          if (predv > 255)
-            predv = 255;
-
-          upred_ptr[j] = predu;
-          vpred_ptr[j] = predv;
-        }
-
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-
-    }
-    break;
-    case D45_PRED: {
-      d45_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d45_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D135_PRED: {
-      d135_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d135_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D117_PRED: {
-      d117_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d117_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D153_PRED: {
-      d153_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d153_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D27_PRED: {
-      d27_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d27_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D63_PRED: {
-      d63_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d63_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
+                                              int mode, int bsize) {
+  vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride,
+                                      upred_ptr, uv_stride, mode, bsize);
+  vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vpred_ptr, uv_stride, mode, bsize);
 }
 
 void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mbuv_internal(
-    xd, &xd->predictor[256], &xd->predictor[320],
-    8, xd->mode_info_context->mbmi.uv_mode);
+  vp8_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
 }
 
 void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mbuv_internal(
-    xd, xd->dst.u_buffer, xd->dst.v_buffer,
-    xd->dst.uv_stride, xd->mode_info_context->mbmi.uv_mode);
+  vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
+  vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           16);
+}
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
 void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
   unsigned char predictor[2][2][64];
@@ -541,7 +417,8 @@ void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
     xd->mode_info_context->mbmi.second_uv_mode);
   for (i = 0; i < 64; i++) {
     xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
-    xd->predictor[256 + 64 + i] = (predictor[1][0][i] + predictor[1][1][i] + 1) >> 1;
+    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
+                                   predictor[1][1][i] + 1) >> 1;
   }
 }
 #endif
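
In the generalized DC_PRED path, shift is log2(bsize) - 1 plus one per
available border, so the rounded sum is divided by exactly the number of
border samples: bsize 32 with both borders gives 64 samples and
shift = 4 + 1 + 1 = 6. A standalone sanity check of that arithmetic
(dc_pred_sketch is a hypothetical rewrite of the DC branch):

    #include <assert.h>

    static int dc_pred_sketch(int sum, int log2_bsize_minus_1,
                              int up, int left) {
      int shift;
      if (!up && !left)
        return 128;  /* no neighbors: fixed mid-gray predictor */
      shift = log2_bsize_minus_1 + up + left;
      return (sum + (1 << (shift - 1))) >> shift;
    }

    int main(void) {
      /* 32 above + 32 left samples, all equal to 100 -> predictor 100 */
      assert(dc_pred_sketch(64 * 100, 4, 1, 1) == 100);
      /* only the 32 above samples -> shift 5, still 100 */
      assert(dc_pred_sketch(32 * 100, 4, 1, 0) == 100);
      /* nothing available -> 128 */
      assert(dc_pred_sketch(0, 4, 0, 0) == 128);
      return 0;
    }
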
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 94826ef6c6941760c45ab5ede76d50f76faecd7a..5e0600c2d1fe9e5f1e73fbc2624b7360460ea86f 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -29,34 +29,31 @@ int dec_mvcount = 0;
 #endif
 
 static int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_bmode_tree, p);
 }
 
 
 static int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+  return vp8_treed_read(bc, vp8_ymode_tree, p);
+}
 
-  return i;
+#if CONFIG_SUPERBLOCKS
+static int vp8_sb_kfread_ymode(vp8_reader *bc, const vp8_prob *p) {
+  return vp8_treed_read(bc, vp8_uv_mode_tree, p);
 }
+#endif
 
 static int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_kf_ymode_tree, p);
 }
-static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_i8x8_mode_tree, p);
 
-  return i;
+static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) {
+  return vp8_treed_read(bc, vp8_i8x8_mode_tree, p);
 }
 
 
 static int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_uv_mode_tree, p);
 }
 
 // This function reads the current macroblock's segment id from the bitstream
@@ -112,8 +109,14 @@ static void vp8_kfread_modes(VP8D_COMP *pbi,
       m->mbmi.mb_skip_coeff = 0;
   }
 
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    y_mode = (MB_PREDICTION_MODE) vp8_sb_kfread_ymode(bc,
+      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+  } else
+#endif
   y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc,
-                                                 pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
 #if CONFIG_COMP_INTRA_PRED
   m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
@@ -398,16 +401,18 @@ static MV_REFERENCE_FRAME read_ref_frame(VP8D_COMP *pbi,
   return (MV_REFERENCE_FRAME)ref_frame;
 }
 
-static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_mv_ref_tree, p);
+#if CONFIG_SUPERBLOCKS
+static MB_PREDICTION_MODE read_sb_mv_ref(vp8_reader *bc, const vp8_prob *p) {
+  return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_sb_mv_ref_tree, p);
+}
+#endif
 
-  return (MB_PREDICTION_MODE)i;
+static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) {
+  return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_mv_ref_tree, p);
 }
 
 static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
-
-  return (B_PREDICTION_MODE)i;
+  return (B_PREDICTION_MODE) vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
 }
 
 #ifdef VPX_MODE_COUNT
@@ -537,15 +542,38 @@ static void read_mb_segment_id(VP8D_COMP *pbi,
         // Else .... decode it explicitly
         else {
           vp8_read_mb_segid(bc, mbmi, xd);
-          cm->last_frame_seg_map[index] = mbmi->segment_id;
         }
-
       }
       // Normal unpredicted coding mode
       else {
         vp8_read_mb_segid(bc, mbmi, xd);
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        cm->last_frame_seg_map[index] =
+        cm->last_frame_seg_map[index + 1] =
+        cm->last_frame_seg_map[index + cm->mb_cols] =
+        cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+      } else
+#endif
+      {
         cm->last_frame_seg_map[index] = mbmi->segment_id;
       }
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
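+        // FIXME: this && chain can only yield 0 or 1, so segment ids 2 and
+        // 3 from the previous frame's map are not recovered here.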
+        mbmi->segment_id =
+              cm->last_frame_seg_map[index] &&
+              cm->last_frame_seg_map[index + 1] &&
+              cm->last_frame_seg_map[index + cm->mb_cols] &&
+              cm->last_frame_seg_map[index + cm->mb_cols + 1];
+      } else
+#endif
+      {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+      }
     }
   } else {
     // The encoder explicitly sets the segment_id to 0
@@ -667,6 +693,11 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       mbmi->mode =
         get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
     } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+      } else
+#endif
       mbmi->mode = read_mv_ref(bc, mv_ref_p);
 
       vp8_accum_mv_refs(&pbi->common, mbmi->mode, rct);
@@ -963,6 +994,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       mbmi->mode = (MB_PREDICTION_MODE)
                    get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
     else {
+      // FIXME write using SB mode tree
       mbmi->mode = (MB_PREDICTION_MODE)
                    vp8_read_ymode(bc, pbi->common.fc.ymode_prob);
       pbi->common.fc.ymode_counts[mbmi->mode]++;
@@ -1045,6 +1077,9 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
     int mb_row = (sb_row << 1);
 
     for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+#if CONFIG_SUPERBLOCKS
+      mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, cm->sb_coded);
+#endif
       for (i = 0; i < 4; i++) {
 
         int dy = row_delta[i];
@@ -1059,6 +1094,10 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
           prev_mi += offset_extended;
           continue;
         }
+#if CONFIG_SUPERBLOCKS
+        if (i)
+          mi->mbmi.encoded_as_sb = 0;
+#endif
 
         // Make sure the MacroBlockD mode info pointer is set correctly
         xd->mode_info_context = mi;
@@ -1074,6 +1113,18 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
           read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row,
                            mb_col);
 
+#if CONFIG_SUPERBLOCKS
+        if (mi->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          mi[1] = mi[cm->mode_info_stride] =
+            mi[cm->mode_info_stride + 1] = mi[0];
+          mi += 2;
+          prev_mi += 2;
+          break;
+        }
+#endif
+
         /* next macroblock */
         mb_row += dy;
         mb_col += dx;
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 6ff91489352658e1d45c3f900753eb74d311d43b..61d3c8d2c4989ab36f70412a5a5d5a5a27cf7a43 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -175,10 +175,27 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) {
  */
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd);
+      RECON_INVOKE(&pbi->common.rtcd.recon,
+                   build_intra_predictors_sby_s)(xd);
+    } else {
+#endif
     RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
     RECON_INVOKE(&pbi->common.rtcd.recon,
                  build_intra_predictors_mby_s)(xd);
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
   } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else {
+#endif
     vp8_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
                                            xd->dst.u_buffer, xd->dst.v_buffer,
                                            xd->dst.y_stride, xd->dst.uv_stride);
@@ -188,6 +205,9 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) {
                                              xd->dst.u_buffer, xd->dst.v_buffer,
                                              xd->dst.y_stride, xd->dst.uv_stride);
     }
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
   }
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -204,11 +224,15 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) {
 
 extern const int vp8_i8x8_block[4];
 static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
-                              unsigned int mb_idx) {
+                              unsigned int mb_col) {
   int eobtotal = 0;
   MB_PREDICTION_MODE mode;
   int i;
   int tx_type;
+#if CONFIG_SUPERBLOCKS
+  VP8_COMMON *pc = &pbi->common;
+  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+#endif
 
 #if CONFIG_HYBRIDTRANSFORM
   int QIndex = xd->q_index;
@@ -264,11 +288,25 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   }
 #endif
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  }
+#endif
 
   tx_type = xd->mode_info_context->mbmi.txfm_size;
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
     vp8_reset_mb_tokens_context(xd);
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      xd->above_context++;
+      xd->left_context++;
+      vp8_reset_mb_tokens_context(xd);
+      xd->above_context--;
+      xd->left_context--;
+    }
+#endif
   } else if (!vp8dx_bool_error(xd->current_bc)) {
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
@@ -311,8 +349,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
      * */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
 
-    skip_recon_mb(pbi, xd);
-    return;
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
+#endif
+    {
+      skip_recon_mb(pbi, xd);
+      return;
+    }
   }
 
 #ifdef DEC_DEBUG
@@ -343,6 +386,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
   /* do prediction */
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sby_s)(xd);
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd);
+    } else
+#endif
     if (mode != I8X8_PRED) {
       RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
       if (mode != B_PRED) {
@@ -358,6 +407,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
 #endif
     }
   } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else
+#endif
     vp8_build_inter_predictors_mb(xd);
   }
 
@@ -481,6 +537,32 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     else
 #endif
     if (tx_type == TX_8X8) {
+#if CONFIG_SUPERBLOCKS
+      MODE_INFO *orig = xd->mode_info_context;
+      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
+      for (n = 0; n < num; n++) {
+        if (n != 0) {
+          for (i = 0; i < 25; i++) {
+            xd->block[i].eob = 0;
+            xd->eobs[i] = 0;
+          }
+          xd->above_context = pc->above_context + mb_col + (n & 1);
+          xd->left_context = pc->left_context + (n >> 1);
+          xd->mode_info_context = orig;
+          xd->mode_info_context += (n & 1);
+          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
+          if (!orig_skip_flag) {
+            eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd);
+            if (eobtotal == 0) // skip loopfilter
+              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+          } else {
+            vp8_reset_mb_tokens_context(xd);
+          }
+        }
+
+        if (xd->mode_info_context->mbmi.mb_skip_coeff)
+          continue; // only happens for SBs, which are already in dest buffer
+#endif
       DEQUANT_INVOKE(&pbi->dequant, block_2x2)(b);
 #ifdef DEC_DEBUG
       if (dec_debug) {
@@ -501,10 +583,27 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
       ((int *)b->qcoeff)[5] = 0;
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
-      DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8)
-      (xd->qcoeff, xd->block[0].dequant,
-       xd->predictor, xd->dst.y_buffer,
-       xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+        // do UV inline also
+        vp8_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+      } else
+#endif
+        DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8)(xd->qcoeff,
+          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      }
+      xd->mode_info_context = orig;
+#endif
     } else {
       DEQUANT_INVOKE(&pbi->dequant, block)(b);
       if (xd->eobs[24] > 1) {
@@ -529,7 +628,10 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     }
   }
 
-  if (tx_type == TX_8X8
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+      if (tx_type == TX_8X8
 #if CONFIG_TX16X16
       || tx_type == TX_16X16
 #endif
@@ -543,6 +645,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     (xd->qcoeff + 16 * 16, xd->block[16].dequant,
      xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
      xd->dst.uv_stride, xd->eobs + 16);
+#if CONFIG_SUPERBLOCKS
+  }
+#endif
 }
 
 
@@ -582,15 +687,21 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
   int row_delta[4] = { 0, +1,  0, -1};
   int col_delta[4] = { +1, -1, +1, +1};
   int sb_cols = (pc->mb_cols + 1) >> 1;
-  ENTROPY_CONTEXT_PLANES left_context[2];
 
   // For a SB there are 2 left contexts, each pertaining to a MB row within it
-  vpx_memset(left_context, 0, sizeof(left_context));
+  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
 
   mb_row = mbrow;
   mb_col = 0;
 
   for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+    MODE_INFO *mi = xd->mode_info_context;
+
+#if CONFIG_SUPERBLOCKS
+    if (pbi->interleaved_decoding)
+      mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, pc->sb_coded);
+#endif
+
     // Process the 4 MBs within the SB in the order:
     // top-left, top-right, bottom-left, bottom-right
     for (i = 0; i < 4; i++) {
@@ -598,6 +709,7 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
       int dx = col_delta[i];
       int offset_extended = dy * xd->mode_info_stride + dx;
 
+      mi = xd->mode_info_context;
       if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
         // MB lies outside frame, skip on to next
         mb_row += dy;
@@ -610,13 +722,10 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
 #ifdef DEC_DEBUG
       dec_debug = (pc->current_video_frame == 0 && mb_row == 0 && mb_col == 0);
 #endif
-      // Copy in the appropriate left context for this MB row
-      vpx_memcpy(&pc->left_context,
-                 &left_context[i >> 1],
-                 sizeof(ENTROPY_CONTEXT_PLANES));
 
       // Set above context pointer
       xd->above_context = pc->above_context + mb_col;
+      xd->left_context = pc->left_context + (i >> 1);
 
       /* Distance of Mb to the various image edges.
        * These are specified to 8th pel as they are always compared to
@@ -639,6 +748,10 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
       xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
       xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
+#if CONFIG_SUPERBLOCKS
+      if (i)
+        mi->mbmi.encoded_as_sb = 0;
+#endif
       if(pbi->interleaved_decoding)
         vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
 
@@ -681,15 +794,34 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
         xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
       }
 
-      decode_macroblock(pbi, xd, mb_row * pc->mb_cols + mb_col);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        mi[1] = mi[0];
+        mi[pc->mode_info_stride] = mi[0];
+        mi[pc->mode_info_stride + 1] = mi[0];
+      }
+#endif
+      decode_macroblock(pbi, xd, mb_col);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        mi[1].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+        mi[pc->mode_info_stride].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+        mi[pc->mode_info_stride + 1].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+      }
+#endif
 
       /* check if the boolean decoder has suffered an error */
       xd->corrupted |= vp8dx_bool_error(xd->current_bc);
 
-      // Store the modified left context for the MB row locally
-      vpx_memcpy(&left_context[i >> 1],
-                 &pc->left_context,
-                 sizeof(ENTROPY_CONTEXT_PLANES));
+#if CONFIG_SUPERBLOCKS
+      if (mi->mbmi.encoded_as_sb) {
+        assert(!i);
+        mb_col += 2;
+        xd->mode_info_context += 2;
+        xd->prev_mode_info_context += 2;
+        break;
+      }
+#endif
 
       // skip to next MB
       xd->mode_info_context += offset_extended;
@@ -806,7 +938,6 @@ static void init_frame(VP8D_COMP *pbi) {
     vp8_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
-  xd->left_context = &pc->left_context;
   xd->mode_info_context = pc->mi;
   xd->prev_mode_info_context = pc->prev_mi;
   xd->frame_type = pc->frame_type;
@@ -1151,6 +1282,10 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
     }
   }
 
+#if CONFIG_SUPERBLOCKS
+  pc->sb_coded = vp8_read_literal(bc, 8);
+#endif
+
   /* Read the loop filter level and type */
   pc->txfm_mode = (TXFM_MODE) vp8_read_bit(bc);
 
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index 36eea5d6f7d996ffd4045c96a81b110d9b3f2298..e97d3298f5c35f4537eba81227492f56fb555273 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -127,6 +127,19 @@ void vp8_dequant_dc_idct_add_y_block_8x8_c
 
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs, short *dc, MACROBLOCKD *xd) {
+
+  vp8_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8, dst + 8, stride, stride, dc[1]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride, dst + 8 * stride, stride, stride, dc[4]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8, dst + 8 * stride + 8, stride, stride, dc[8]);
+
+}
+#endif
+
 void vp8_dequant_idct_add_y_block_8x8_c
 (short *q, short *dq, unsigned char *pre,
  unsigned char *dst, int stride, char *eobs, MACROBLOCKD *xd) {
@@ -153,6 +166,18 @@ void vp8_dequant_idct_add_uv_block_8x8_c
   vp8_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_idct_add_uv_block_8x8_inplace_c
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, MACROBLOCKD *xd) {
+  vp8_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+
+  q    += 64;
+
+  vp8_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+}
+#endif
+
 #if CONFIG_LOSSLESS
 void vp8_dequant_dc_idct_add_y_block_lossless_c
 (short *q, short *dq, unsigned char *pre,
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 63499a8f7e68b9d2c379efa914f4803e12e05584..2e1364817fdf5cb8ac0236d80f7f9a69b38dcb84 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -149,7 +149,7 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) {
 
   pbi->decoded_key_frame = 0;
 
-  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV;
+  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV || CONFIG_SUPERBLOCKS;
 
   return (VP8D_PTR) pbi;
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 76aed7e2d908af7a5f0866b542ca498ce506c2eb..90bc8e98776eaa0812fc4228dc4a4a499bf59c9f 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -288,6 +288,12 @@ static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) {
   vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
 }
 
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) {
+  vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_sb_kf_ymode_encodings + m);
+}
+#endif
+
 static void write_i8x8_mode(vp8_writer *bc, int m, const vp8_prob *p) {
   vp8_write_token(bc, vp8_i8x8_mode_tree, p, vp8_i8x8_mode_encodings + m);
 }
@@ -533,6 +539,16 @@ static void write_mv_ref
                   vp8_mv_ref_encoding_array - NEARESTMV + m);
 }
 
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m < SPLITMV);
+#endif
+  vp8_write_token(w, vp8_sb_mv_ref_tree, p,
+                  vp8_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
 static void write_sub_mv_ref
 (
   vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
@@ -810,6 +826,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
 
       // Process the 4 MBs in the order:
       // top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+      vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
       for (i = 0; i < 4; i++) {
         MB_MODE_INFO *mi;
         MV_REFERENCE_FRAME rf;
@@ -872,7 +891,15 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
         if (pc->mb_no_coeff_skip &&
             (!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          vp8_encode_bool(w, mi->mb_skip_coeff,
+          int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+          if (mi->encoded_as_sb) {
+            skip_coeff &= m[1].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+          }
+#endif
+          vp8_encode_bool(w, skip_coeff,
                           get_pred_prob(pc, xd, PRED_MBSKIP));
         }
 
@@ -884,6 +911,8 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
           active_section = 6;
 #endif
 
+          // TODO(rbultje) write using SB tree structure
+
           if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
             write_ymode(w, mode, pc->fc.ymode_prob);
           }
@@ -949,7 +978,14 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
 
           // Is the segment coding of mode enabled
           if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-            write_mv_ref(w, mode, mv_ref_p);
+#if CONFIG_SUPERBLOCKS
+            if (mi->encoded_as_sb) {
+              write_sb_mv_ref(w, mode, mv_ref_p);
+            } else
+#endif
+            {
+              write_mv_ref(w, mode, mv_ref_p);
+            }
             vp8_accum_mv_refs(&cpi->common, mode, ct);
           }
 
@@ -1085,6 +1121,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
           }
         }
 
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          cpi->mb.partition_info += 2;
+          prev_m += 2;
+          break;
+        }
+#endif
+
         // Next MB
         mb_row += dy;
         mb_col += dx;
@@ -1151,6 +1198,9 @@ static void write_kfmodes(VP8_COMP *cpi) {
 
     mb_col = 0;
     for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+      vp8_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
       // Process the 4 MBs in the order:
       // top-left, top-right, bottom-left, bottom-right
       for (i = 0; i < 4; i++) {
@@ -1181,11 +1231,27 @@ static void write_kfmodes(VP8_COMP *cpi) {
         if (c->mb_no_coeff_skip &&
             (!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          vp8_encode_bool(bc, m->mbmi.mb_skip_coeff,
+          int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+          if (m->mbmi.encoded_as_sb) {
+            skip_coeff &= m[1].mbmi.mb_skip_coeff;
+            skip_coeff &= m[c->mode_info_stride].mbmi.mb_skip_coeff;
+            skip_coeff &= m[c->mode_info_stride + 1].mbmi.mb_skip_coeff;
+          }
+#endif
+          vp8_encode_bool(bc, skip_coeff,
                           get_pred_prob(c, xd, PRED_MBSKIP));
         }
-        kfwrite_ymode(bc, ym,
-                      c->kf_ymode_prob[c->kf_ymode_probs_index]);
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          sb_kfwrite_ymode(bc, ym,
+                           c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+        } else
+#endif
+        {
+          kfwrite_ymode(bc, ym,
+                        c->kf_ymode_prob[c->kf_ymode_probs_index]);
+        }
 
         if (ym == B_PRED) {
           const int mis = c->mode_info_stride;
@@ -1233,6 +1299,14 @@ static void write_kfmodes(VP8_COMP *cpi) {
         } else
           write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          break;
+        }
+#endif
         // Next MB
         mb_row += dy;
         mb_col += dx;
@@ -1793,7 +1867,7 @@ static void put_delta_q(vp8_writer *bc, int delta_q) {
   } else
     vp8_write_bit(bc, 0);
 }
-extern const unsigned int kf_y_mode_cts[8][VP8_YMODES];
+
 static void decide_kf_ymode_entropy(VP8_COMP *cpi) {
 
   int mode_cost[MB_MODE_COUNT];
@@ -1808,6 +1882,13 @@ static void decide_kf_ymode_entropy(VP8_COMP *cpi) {
     for (j = 0; j < VP8_YMODES; j++) {
       cost += mode_cost[j] * cpi->ymode_count[j];
     }
+#if CONFIG_SUPERBLOCKS
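+    /* The SB y modes share kf_ymode_probs_index, so fold their cost
+     * into the same decision. */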
+    vp8_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+                    vp8_sb_ymode_tree);
+    for (j = 0; j < VP8_I32X32_MODES; j++) {
+      cost += mode_cost[j] * cpi->sb_ymode_count[j];
+    }
+#endif
     if (cost < bestcost) {
       bestindex = i;
       bestcost = cost;
@@ -1906,11 +1987,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
       // Select the coding strategy (temporal or spatial)
       choose_segmap_coding_method(cpi);
 
-      // Take a copy of the segment map if it changed for
-      // future comparison
-      vpx_memcpy(pc->last_frame_seg_map,
-                 cpi->segmentation_map, pc->MBs);
-
       // Write out the chosen coding method.
       vp8_write_bit(bc, (pc->temporal_update) ? 1 : 0);
     }
@@ -2048,6 +2124,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
     }
   }
 
+#if CONFIG_SUPERBLOCKS
+  {
+    /* sb mode probability */
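+    /* sb_coded is the bool-coder probability, on its [1, 255] scale,
+     * that a SB is *not* coded as a single 32x32 block, derived from
+     * the number of superblocks used in this frame. */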
+    int sb_coded = 256 - (cpi->sb_count << 8) / (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+    if (sb_coded <= 0)
+      sb_coded = 1;
+    else if (sb_coded >= 256)
+      sb_coded = 255;
+    pc->sb_coded = sb_coded;
+    vp8_write_literal(bc, pc->sb_coded, 8);
+  }
+#endif
+
   vp8_write_bit(bc, pc->txfm_mode);
 
   // Encode the loop filter level and type
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index dfc1d743ec585cc7fa6dc0a2ef5f7c24bb891d2b..d73af4faafd241ff13e5dbf91ed487bbc87dbe99 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -82,7 +82,9 @@ typedef struct {
   int best_mode_index;
   int rddiv;
   int rdmult;
-
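+  /* per-prediction-type (hybrid/compound/single) RD differences,
+   * accumulated into cpi->rd_comp_pred_diff in update_state() */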
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
 } PICK_MODE_CONTEXT;
 
 typedef struct {
@@ -139,12 +141,6 @@ typedef struct {
   int mv_col_max;
   int mv_row_min;
   int mv_row_max;
-#if CONFIG_SUPERBLOCKS
-  int mv_col_min_sb;
-  int mv_col_max_sb;
-  int mv_row_min_sb;
-  int mv_row_max_sb;
-#endif
 
   int skip;
 
@@ -163,8 +159,6 @@ typedef struct {
   int optimize;
   int q_index;
 
-  int encode_as_sb;
-
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
   PICK_MODE_CONTEXT mb_context[4];
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e58c852a7b5288591efb6f9b38d19b3398327478..4472497e0b064254041fa3e4db60ea5fc828a95e 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -57,16 +57,24 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
                                       MB_ROW_COMP *mbr_ei,
                                       int mb_row,
                                       int count);
-extern int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                              int recon_yoffset, int recon_uvoffset,
+                              int *returnrate, int *returndistortion);
+extern void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                             int recon_yoffset,
-                                            int recon_uvoffset);
+                                            int recon_uvoffset, int *r, int *d);
 void vp8_build_block_offsets(MACROBLOCK *x);
 void vp8_setup_block_ptrs(MACROBLOCK *x);
 void vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
                                    int recon_yoffset, int recon_uvoffset,
                                    int output_enabled);
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row);
 void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int output_enabled);
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    TOKENEXTRA **t, int mb_col);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
 
 
@@ -378,6 +386,13 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
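+  /* an SB's mode info is replicated into all four of its MB slots */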
+  if (mi->mbmi.encoded_as_sb) {
+    vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+    vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride, mi, sizeof(MODE_INFO));
+    vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride + 1, mi, sizeof(MODE_INFO));
+  }
+#endif
 
   if (mb_mode == B_PRED) {
     for (i = 0; i < 16; i++) {
@@ -448,6 +463,10 @@ static void update_state(VP8_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 
     cpi->prediction_error += ctx->distortion;
     cpi->intra_error += ctx->intra_error;
+
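+    /* accumulate RD cost differences per prediction type:
+     * [0] single, [1] compound, [2] hybrid */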
+    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
   }
 }
 
@@ -458,7 +477,8 @@ static void pick_mb_modes(VP8_COMP *cpi,
                           MACROBLOCK  *x,
                           MACROBLOCKD *xd,
                           TOKENEXTRA **tp,
-                          int *totalrate) {
+                          int *totalrate,
+                          int *totaldist) {
   int i;
   int map_index;
   int recon_yoffset, recon_uvoffset;
@@ -477,7 +497,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
 
   /* Function should not modify L & A contexts; save and restore on exit */
   vpx_memcpy(left_context,
-             cpi->left_context,
+             cm->left_context,
              sizeof(left_context));
   vpx_memcpy(above_context,
              initial_above_context_ptr,
@@ -525,9 +545,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
 
     // Restore the appropriate left context depending on which
     // row in the SB the MB is situated
-    vpx_memcpy(&cm->left_context,
-               &cpi->left_context[i >> 1],
-               sizeof(ENTROPY_CONTEXT_PLANES));
+    xd->left_context = cm->left_context + (i >> 1);
 
     // Set up distance of MB to edge of frame in 1/8th pel units
     xd->mb_to_top_edge    = -((mb_row * 16) << 3);
@@ -568,9 +586,11 @@ static void pick_mb_modes(VP8_COMP *cpi,
     // Is segmentation enabled
     if (xd->segmentation_enabled) {
       // Code to set segment id in xd->mbmi.segment_id
-      if (cpi->segmentation_map[map_index] <= 3)
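+      /* read the id from the updated map if it is being refreshed,
+       * otherwise from the previous frame's map */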
+      if (xd->update_mb_segmentation_map)
         mbmi->segment_id = cpi->segmentation_map[map_index];
       else
+        mbmi->segment_id = cm->last_frame_seg_map[map_index];
+      if (mbmi->segment_id > 3)
         mbmi->segment_id = 0;
 
       vp8cx_mb_init_quantizer(cpi, x);
@@ -583,22 +603,29 @@ static void pick_mb_modes(VP8_COMP *cpi,
     /* force 4x4 transform for mode selection */
     mbmi->txfm_size = TX_4X4; // TODO IS this right??
 
+#if CONFIG_SUPERBLOCKS
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
     cpi->update_context = 0;    // TODO Do we need this now??
 
     // Find best coding mode & reconstruct the MB so it is available
     // as a predictor for MBs that follow in the SB
     if (cm->frame_type == KEY_FRAME) {
-      *totalrate += vp8_rd_pick_intra_mode(cpi, x);
-
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
+      int r, d;
+      vp8_rd_pick_intra_mode(cpi, x, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
 
       // Dummy encode, do not do the tokenization
       vp8cx_encode_intra_macro_block(cpi, x, tp, 0);
       // Note the encoder may have changed the segment_id
+
+      // Save the coding context
+      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+                 sizeof(MODE_INFO));
     } else {
-      int seg_id;
+      int seg_id, r, d;
 
       if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
           !segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
@@ -612,9 +639,10 @@ static void pick_mb_modes(VP8_COMP *cpi,
         cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
       }
 
-      *totalrate += vp8cx_pick_mode_inter_macroblock(cpi, x,
-                                                     recon_yoffset,
-                                                     recon_uvoffset);
+      vp8cx_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+                                       recon_uvoffset, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
 
       // Dummy encode, do not do the tokenization
       vp8cx_encode_inter_macroblock(cpi, x, tp,
@@ -639,11 +667,6 @@ static void pick_mb_modes(VP8_COMP *cpi,
       }
     }
 
-    // Keep a copy of the updated left context
-    vpx_memcpy(&cpi->left_context[i >> 1],
-               &cm->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
     // Next MB
     mb_row += dy;
     mb_col += dx;
@@ -664,7 +687,7 @@ static void pick_mb_modes(VP8_COMP *cpi,
   }
 
   /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cpi->left_context,
+  vpx_memcpy(cm->left_context,
              left_context,
              sizeof(left_context));
   vpx_memcpy(initial_above_context_ptr,
@@ -672,6 +695,156 @@ static void pick_mb_modes(VP8_COMP *cpi,
              sizeof(above_context));
 }
 
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes(VP8_COMP *cpi,
+                          VP8_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+    + mb_col;
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+  /* set above context pointer */
+  xd->above_context = cm->above_context + mb_col;
+
+  /* An SB spans both MB rows, so point at the start of the SB's
+   * left context */
+  xd->left_context = cm->left_context;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders assuming 32x32 block size */
+  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+  /* Copy current MB to a work buffer */
+  RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer,
+                                            x->src.y_stride,
+                                            x->thismb, 16);
+#endif
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp8_activity_masking(cpi, x);
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled) {
+    /* Code to set segment id in xd->mbmi.segment_id */
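+    /* The SB is put in segment 1 only when all four of its MBs have a
+     * nonzero segment id, and in segment 0 otherwise. */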
+    if (xd->update_mb_segmentation_map)
+      xd->mode_info_context->mbmi.segment_id =
+            cpi->segmentation_map[map_index] &&
+            cpi->segmentation_map[map_index + 1] &&
+            cpi->segmentation_map[map_index + cm->mb_cols] &&
+            cpi->segmentation_map[map_index + cm->mb_cols + 1];
+    else
+      xd->mode_info_context->mbmi.segment_id =
+            cm->last_frame_seg_map[map_index] &&
+            cm->last_frame_seg_map[map_index + 1] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+    if (xd->mode_info_context->mbmi.segment_id > 3)
+      xd->mode_info_context->mbmi.segment_id = 0;
+
+    vp8cx_mb_init_quantizer(cpi, x);
+  } else {
+    /* Set to Segment 0 by default */
+    xd->mode_info_context->mbmi.segment_id = 0;
+  }
+
+  x->active_ptr = cpi->active_map + map_index;
+
+  cpi->update_context = 0;    // TODO Do we need this now??
+
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_rd_pick_intra_mode_sb(cpi, x,
+                              totalrate,
+                              totaldist);
+
+    /* Save the coding context */
+    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  } else {
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+        segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+        check_segref(xd, 1, INTRA_FRAME)  +
+        check_segref(xd, 1, LAST_FRAME)   +
+        check_segref(xd, 1, GOLDEN_FRAME) +
+        check_segref(xd, 1, ALTREF_FRAME) == 1) {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    } else {
+      cpi->seg0_progress =
+        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+    }
+
+    vp8_rd_pick_inter_mode_sb(cpi, x,
+                              recon_yoffset,
+                              recon_uvoffset,
+                              totalrate,
+                              totaldist);
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+#endif
+
 static void encode_sb(VP8_COMP *cpi,
                       VP8_COMMON *cm,
                       int mbrow,
@@ -679,6 +852,7 @@ static void encode_sb(VP8_COMP *cpi,
                       MACROBLOCK  *x,
                       MACROBLOCKD *xd,
                       TOKENEXTRA **tp) {
+  VP8_COMMON *pc = cm;
   int i;
   int map_index;
   int mb_row, mb_col;
@@ -733,22 +907,19 @@ static void encode_sb(VP8_COMP *cpi,
 
     // Restore MB state to that when it was picked
 #if CONFIG_SUPERBLOCKS
-    if (x->encode_as_sb)
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
       update_state(cpi, x, &x->sb_context[i]);
-    else
+      cpi->sb_count++;
+    } else
 #endif
       update_state(cpi, x, &x->mb_context[i]);
 
-    // Copy in the appropriate left context
-    vpx_memcpy(&cm->left_context,
-               &cpi->left_context[i >> 1],
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
     map_index = (mb_row * cpi->common.mb_cols) + mb_col;
     x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
 
     // reset above block coeffs
     xd->above_context = cm->above_context + mb_col;
+    xd->left_context  = cm->left_context + (i >> 1);
 
     // Set up distance of MB to edge of the frame in 1/8th pel units
     xd->mb_to_top_edge    = -((mb_row * 16) << 3);
@@ -756,24 +927,28 @@ static void encode_sb(VP8_COMP *cpi,
     xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
     xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
 
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
 #if CONFIG_SUPERBLOCKS
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 32x32 block size
-    x->mv_row_min_sb = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min_sb = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max_sb = ((cm->mb_rows - mb_row) * 16 +
-                        (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-    x->mv_col_max_sb = ((cm->mb_cols - mb_col) * 16 +
-                        (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 32x32 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    } else {
+#endif
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 16x16 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+    }
 #endif
 
     xd->up_available = (mb_row != 0);
@@ -796,24 +971,21 @@ static void encode_sb(VP8_COMP *cpi,
 
     // Is segmentation enabled
     if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (cpi->segmentation_map[map_index] <= 3)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = 0;
-
       vp8cx_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
+    }
 
     x->active_ptr = cpi->active_map + map_index;
 
     cpi->update_context = 0;
 
     if (cm->frame_type == KEY_FRAME) {
-      vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
-      // Note the encoder may have changed the segment_id
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp8cx_encode_intra_super_block(cpi, x, tp, mb_col);
+      else
+#endif
+        vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
+      // Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
       y_modes[mbmi->mode]++;
@@ -822,9 +994,25 @@ static void encode_sb(VP8_COMP *cpi,
       unsigned char *segment_id;
       int seg_ref_active;
 
-      vp8cx_encode_inter_macroblock(cpi, x, tp,
-                                    recon_yoffset, recon_uvoffset, 1);
-      // Note the encoder may have changed the segment_id
+      if (xd->mode_info_context->mbmi.ref_frame) {
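+        /* count single vs. compound prediction usage per context */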
+        unsigned char pred_context;
+
+        pred_context = get_pred_context(cm, xd, PRED_COMP);
+
+        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+          cpi->single_pred_count[pred_context]++;
+        else
+          cpi->comp_pred_count[pred_context]++;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp8cx_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_col, mb_row);
+      else
+#endif
+        vp8cx_encode_inter_macroblock(cpi, x, tp,
+                                      recon_yoffset, recon_uvoffset, 1);
+      // Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
       inter_y_modes[mbmi->mode]++;
@@ -864,10 +1052,20 @@ static void encode_sb(VP8_COMP *cpi,
     // TODO Partitioning is broken!
     cpi->tplist[mb_row].stop = *tp;
 
-    // Copy back updated left context
-    vpx_memcpy(&cpi->left_context[i >> 1],
-               &cm->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
+#if CONFIG_SUPERBLOCKS
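+    /* The SB covered the whole 2x2 MB group: advance all pointers by
+     * two MBs (32 luma / 16 chroma pixels) and leave the MB loop. */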
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      x->src.y_buffer += 32;
+      x->src.u_buffer += 16;
+      x->src.v_buffer += 16;
+
+      x->gf_active_ptr      += 2;
+      x->partition_info     += 2;
+      xd->mode_info_context += 2;
+      xd->prev_mode_info_context += 2;
+
+      break;
+    }
+#endif
 
     // Next MB
     mb_row += dy;
@@ -911,14 +1109,13 @@ void encode_sb_row(VP8_COMP *cpi,
   int mb_cols = cm->mb_cols;
 
   // Initialize the left context for the new SB row
-  vpx_memset(cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(&cm->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
 
   // Code each SB in the row
   for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0;
+    int mb_rate = 0, mb_dist = 0;
 #if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX;
+    int sb_rate = INT_MAX, sb_dist;
 #endif
 
 #if CONFIG_DEBUG
@@ -930,8 +1127,14 @@ void encode_sb_row(VP8_COMP *cpi,
     unsigned char *vb = x->src.v_buffer;
 #endif
 
+#if CONFIG_SUPERBLOCKS
     // Pick modes assuming the SB is coded as 4 independent MBs
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate);
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
+    mb_rate += vp8_cost_bit(cm->sb_coded, 0);
+#endif
 
     x->src.y_buffer -= 32;
     x->src.u_buffer -= 16;
@@ -952,21 +1155,40 @@ void encode_sb_row(VP8_COMP *cpi,
 #endif
 
 #if CONFIG_SUPERBLOCKS
-    // Pick a mode assuming that it applies all 4 of the MBs in the SB
-    pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, &sb_rate);
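+    /* A 32x32 SB needs a complete 2x2 group of MBs, so skip the SB
+     * check when an odd frame dimension leaves a partial group at the
+     * right/bottom edge. */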
+    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
+          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+      sb_rate += vp8_cost_bit(cm->sb_coded, 1);
+    }
 
-    // Decide whether to encode as a SB or 4xMBs
-    if (sb_rate < mb_rate) {
-      x->encode_as_sb = 1;
+    /* Decide whether to encode as a SB or 4xMBs */
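+    /* Both rates already include the bit that signals the SB/MB choice
+     * (the vp8_cost_bit calls on cm->sb_coded above). */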
+    if (sb_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
       *totalrate += sb_rate;
     } else
 #endif
     {
-      x->encode_as_sb = 0;
+#if CONFIG_SUPERBLOCKS
+      xd->mode_info_context->mbmi.encoded_as_sb = 0;
+      if (cm->mb_cols - 1 > mb_col)
+        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+      if (cm->mb_rows - 1 > mb_row) {
+        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+        if (cm->mb_cols - 1 > mb_col)
+          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+      }
+#endif
       *totalrate += mb_rate;
     }
 
-    // Encode SB using best computed mode(s)
+    /* Encode SB using best computed mode(s) */
     encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
 
 #if CONFIG_DEBUG
@@ -1038,8 +1260,6 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) {
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
-  xd->left_context = &cm->left_context;
-
   vp8_zero(cpi->count_mb_ref_frame_usage)
   vp8_zero(cpi->bmode_count)
   vp8_zero(cpi->ymode_count)
@@ -1049,6 +1269,10 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) {
   vp8_zero(cpi->mbsplit_count)
   vp8_zero(cpi->common.fc.mv_ref_ct)
   vp8_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+  vp8_zero(cpi->sb_ymode_count)
+  cpi->sb_count = 0;
+#endif
   // vp8_zero(cpi->uv_mode_count)
 
   x->mvc = cm->fc.mvc;
@@ -1380,7 +1604,12 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x) {
   }
 #endif
 
-  ++cpi->ymode_count[m];
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    ++cpi->sb_ymode_count[m];
+  } else
+#endif
+    ++cpi->ymode_count[m];
   if (m != I8X8_PRED)
     ++cpi->y_uv_mode_count[m][uvm];
   else {
@@ -1418,6 +1647,160 @@ static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x) {
 #endif
 }
 
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP8_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       ENTROPY_CONTEXT_PLANES ta[4],
+                                       ENTROPY_CONTEXT_PLANES tl[4],
+                                       TOKENEXTRA *t[4],
+                                       TOKENEXTRA **tp,
+                                       int skip[4]) {
+  TOKENEXTRA tokens[4][16 * 24];
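+  /* room for one token per coefficient: 16 * 24 = 384 coeffs per MB */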
+  int n_tokens[4], n;
+
+  // if there were no skips, we don't need to do anything
+  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+    return;
+
+  // if we don't do coeff skipping for this frame, we don't
+  // need to do anything here
+  if (!cpi->common.mb_no_coeff_skip)
+    return;
+
+  // if all 4 MBs skipped coeff coding, nothing to be done
+  if (skip[0] && skip[1] && skip[2] && skip[3])
+    return;
+
+  // so the situation now is that we want to skip coeffs
+  // for some MBs, but not all, and we didn't code EOB
+  // coefficients for them. However, the skip flag for this
+  // SB will be 0 overall, so we need to insert EOBs in the
+  // middle of the token tree. Do so here.
+  n_tokens[0] = t[1] - t[0];
+  n_tokens[1] = t[2] - t[1];
+  n_tokens[2] = t[3] - t[2];
+  n_tokens[3] = *tp  - t[3];
+  if (n_tokens[0])
+    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+  if (n_tokens[1])
+    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+  if (n_tokens[2])
+    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+  if (n_tokens[3])
+    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+  // reset pointer, stuff EOBs where necessary
+  *tp = t[0];
+  for (n = 0; n < 4; n++) {
+    TOKENEXTRA *tbak = *tp;
+    if (skip[n]) {
+      x->e_mbd.above_context = &ta[n];
+      x->e_mbd.left_context  = &tl[n];
+      vp8_stuff_mb_8x8(cpi, &x->e_mbd, tp, 0);
+    } else {
+      if (n_tokens[n]) {
+        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+      }
+      (*tp) += n_tokens[n];
+    }
+  }
+}
+
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    TOKENEXTRA **t,
+                                    int mb_col) {
+  const int output_enabled = 1;
+  int n;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP8_COMMON *cm = &cpi->common;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp8_update_zbin_extra(cpi, x);
+  }
+
+  /* test code: set transform size based on mode selection */
+  if (cpi->common.txfm_mode == ALLOW_8X8) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[1].mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[cm->mode_info_stride].mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[cm->mode_info_stride+1].mbmi.txfm_size = TX_8X8;
+    cpi->t8x8_count++;
+  } else {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    cpi->t4x4_count++;
+  }
+
+  RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+  RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+
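+  /* SB intra coding currently assumes the 8x8 transform */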
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
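+    /* MB n within the SB: 0 = top-left, 1 = top-right,
+     * 2 = bottom-left, 3 = bottom-right */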
+
+    xd->above_context = cm->above_context + mb_col + (n & 1);
+    xd->left_context = cm->left_context + (n >> 1);
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp8_transform_intra_mby_8x8(x);
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mby_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+    if (x->optimize) {
+      vp8_optimize_mby_8x8(x, rtcd);
+      vp8_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (output_enabled) {
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+      vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  }
+
+  if (output_enabled) {
+    // Tokenize
+    xd->mode_info_context = mi;
+    sum_intra_stats(cpi, x);
+    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  }
+}
+#endif
+
 void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
                                     MACROBLOCK *x,
                                     TOKENEXTRA **t,
@@ -1484,6 +1867,9 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
   unsigned char ref_pred_flag;
 
   x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
 
 #if CONFIG_SWITCHABLE_INTERP
   vp8_setup_interp_filters(xd, mbmi->interp_filter, cm);
@@ -1648,3 +2034,190 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
     }
   }
 }
+
+#if CONFIG_SUPERBLOCKS
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row) {
+  const int output_enabled = 1;
+  VP8_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  int mis = xd->mode_info_stride;
+  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  x->skip = 0;
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp8_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value.
+  ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+                    get_pred_ref(cm, xd)));
+  set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  /* test code: set transform size based on mode selection */
+  if (cpi->common.txfm_mode == ALLOW_8X8
+      && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED
+      && x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+      && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+    cpi->t8x8_count++;
+  } else {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    cpi->t4x4_count++;
+  }
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+  } else {
+    int ref_fb_idx;
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                    recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                    recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                    recon_uvoffset;
+    }
+
+    vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      vp8_transform_intra_mby_8x8(x);
+    } else {
+      vp8_transform_mby_8x8(x);
+    }
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mby_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+    if (x->optimize) {
+      vp8_optimize_mby_8x8(x, rtcd);
+      vp8_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (!x->skip) {
+      if (output_enabled) {
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+        vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      }
+    } else {
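+      /* the skip-flag context is the sum of the left and above MBs'
+       * skip flags */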
+      int mb_skip_context =
+        cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+          0;
+      if (cpi->common.mb_no_coeff_skip) {
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        cpi->skip_true_count[mb_skip_context]++;
+        vp8_fix_contexts(xd);
+      } else {
+        vp8_stuff_mb(cpi, xd, t, 0);
+        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+        cpi->skip_false_count[mb_skip_context]++;
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index e03b47e2ced1c2b42fef23f32146c0be3d91d020..473f8ba3d531de14576dabb22e299c603fc41224 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -67,11 +67,10 @@ void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   }
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
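+/* The _s_ variants take separate source and destination strides, so a
+ * whole superblock can be subtracted directly from the frame buffers;
+ * the old entry points below wrap them with the fixed 16/8 strides. */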
+void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride,
+                           unsigned char *upred, unsigned char *vpred, int dst_stride) {
   short *udiff = diff + 256;
   short *vdiff = diff + 320;
-  unsigned char *upred = pred + 256;
-  unsigned char *vpred = pred + 320;
 
   int r, c;
 
@@ -81,8 +80,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
     }
 
     udiff += 8;
-    upred += 8;
-    usrc  += stride;
+    upred += dst_stride;
+    usrc  += src_stride;
   }
 
   for (r = 0; r < 8; r++) {
@@ -91,12 +90,19 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
     }
 
     vdiff += 8;
-    vpred += 8;
-    vsrc  += stride;
+    vpred += dst_stride;
+    vsrc  += src_stride;
   }
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+  unsigned char *upred = pred + 256;
+  unsigned char *vpred = pred + 320;
+
+  vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) {
   int r, c;
 
   for (r = 0; r < 16; r++) {
@@ -105,11 +111,16 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in
     }
 
     diff += 16;
-    pred += 16;
-    src  += stride;
+    pred += dst_stride;
+    src  += src_stride;
   }
 }
 
+void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+{
+  vp8_subtract_mby_s_c(diff, src, stride, pred, 16);
+}
+
 static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
   BLOCK *b = &x->block[0];
 
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 6390f3fe4fa6446e195e61e356975c37971d9fbd..6a5bf59d50d44ec99df1c0ad271a864dbfaf7429 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -23,24 +23,36 @@ extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER
 void vp8_cmachine_specific_config(VP8_COMP *cpi) {
 #if CONFIG_RUNTIME_CPU_DETECT
   cpi->rtcd.common                    = &cpi->common.rtcd;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32              = vp8_sad32x32_c;
+#endif
   cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
   cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
   cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
   cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
   cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x3            = vp8_sad32x32x3_c;
+#endif
   cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_c;
   cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_c;
   cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_c;
   cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
   cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x8            = vp8_sad32x32x8_c;
+#endif
   cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_c;
   cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_c;
   cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_c;
   cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_c;
   cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x4d           = vp8_sad32x32x4d_c;
+#endif
   cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
   cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
   cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
@@ -54,16 +66,34 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) {
   cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
   cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
   cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.var32x32              = vp8_variance32x32_c;
+#endif
 
   cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
   cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
   cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
   cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
   cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.subpixvar32x32        = vp8_sub_pixel_variance32x32_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_h     = vp8_variance_halfpixvar32x32_h_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_v     = vp8_variance_halfpixvar32x32_v_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_hv    = vp8_variance_halfpixvar32x32_hv_c;
+#endif
   cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.subpixmse32x32        = vp8_sub_pixel_mse32x32_c;
+#endif
 
   cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
   cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index ba4cd897dbc25f27bde5420f0a237b1709886128..a0621b649b3e426d95dffcbb623ae26cb830175e 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -243,7 +243,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   int y_stride;
   int offset;
 
-#if ARCH_X86 || ARCH_X86_64
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
   unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
   unsigned char *y;
   int buf_r1, buf_r2, buf_c1, buf_c2;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index bcbc857666d293ce0b88867237c04df688011fb0..deff0db08522e4ffd0304c9d3a07436a063e25f2 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -620,6 +620,42 @@ static void print_seg_map(VP8_COMP *cpi) {
   fclose(statsfile);
 }
 
+static void update_reference_segmentation_map(VP8_COMP *cpi) {
+  VP8_COMMON *cm = &cpi->common;
+  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+  MODE_INFO *mi = cm->mi;
+  uint8_t *segmap = cpi->segmentation_map;
+  uint8_t *segcache = cm->last_frame_seg_map;
+
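+  /* Copy the frame's final per-MB segment ids into the map used for
+   * temporal prediction of the next frame's segmentation. An SB has a
+   * single segment id that applies to all four of its MBs. */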
+  for (row = 0; row < sb_rows; row++) {
+    for (col = 0; col < sb_cols; col++) {
+      MODE_INFO *miptr = mi + col * 2;
+      uint8_t *seg = segmap + col * 2;
+      uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+      if (miptr->mbmi.encoded_as_sb) {
+        cache[0] = cache[1] = cache[cm->mb_cols] = cache[cm->mb_cols + 1] =
+          miptr->mbmi.segment_id;
+      } else
+#endif
+      {
+        cache[0] = miptr[0].mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr[1].mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] =
+              miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+        }
+      }
+    }
+    segmap += 2 * cm->mb_cols;
+    segcache += 2 * cm->mb_cols;
+    mi += 2 * cm->mode_info_stride;
+  }
+}
+
 static void set_default_lf_deltas(VP8_COMP *cpi) {
   cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -1736,6 +1772,9 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) {
   cm->prob_last_coded               = 128;
   cm->prob_gf_coded                 = 128;
   cm->prob_intra_coded              = 63;
+#if CONFIG_SUPERBLOCKS
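+  /* initial probability that a SB is not coded as 32x32, refined once
+   * per-frame SB counts are available */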
+  cm->sb_coded                      = 200;
+#endif
   for (i = 0; i < COMP_PRED_CONTEXTS; i++)
     cm->prob_comppred[i]         = 128;
 
@@ -1919,6 +1958,18 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) {
   init_mv_ref_counts();
 #endif
 
+#if CONFIG_SUPERBLOCKS
+  cpi->fn_ptr[BLOCK_32X32].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32);
+  cpi->fn_ptr[BLOCK_32X32].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32);
+  cpi->fn_ptr[BLOCK_32X32].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar32x32);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_h  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_h);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_v  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_v);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_hv);
+  cpi->fn_ptr[BLOCK_32X32].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x3);
+  cpi->fn_ptr[BLOCK_32X32].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x8);
+  cpi->fn_ptr[BLOCK_32X32].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x4d);
+#endif
+
   cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
   cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
   cpi->fn_ptr[BLOCK_16X16].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
@@ -3618,6 +3669,10 @@ static void encode_frame_to_data_rate
   cpi->dummy_packing = 0;
   vp8_pack_bitstream(cpi, dest, size);
 
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    update_reference_segmentation_map(cpi);
+  }
+
 #if CONFIG_PRED_FILTER
   // Select the prediction filtering mode to use for the
   // next frame based on the current frame selections
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index ff3a2110719179dbc3a793f15a8cb63bfed9f26e..7fb7dd2ff4b5c13c920e9d4fad324d79e0d34266 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -359,7 +359,9 @@ enum {
   BLOCK_8X8,
   BLOCK_4X4,
   BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS
+  BLOCK_MAX_SEGMENTS,
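+  /* 32x32 is appended after the MB block types; fn_ptr[] is sized by
+   * BLOCK_MAX_SB_SEGMENTS to make room for it */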
+  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_MAX_SB_SEGMENTS,
 };
 
 typedef struct VP8_COMP {
@@ -528,6 +530,10 @@ typedef struct VP8_COMP {
 
   int cq_target_quality;
 
+#if CONFIG_SUPERBLOCKS
+  int sb_count;
+  int sb_ymode_count [VP8_I32X32_MODES];
+#endif
   int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
   int bmode_count [VP8_BINTRAMODES];
   int i8x8_mode_count [VP8_I8X8_MODES];
@@ -628,7 +634,7 @@ typedef struct VP8_COMP {
   vp8_full_search_fn_t full_search_sad;
   vp8_refining_search_fn_t refining_search_sad;
   vp8_diamond_search_fn_t diamond_search_sad;
-  vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+  vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
   uint64_t time_receive_data;
   uint64_t time_compress_data;
   uint64_t time_pick_lpf;
@@ -732,9 +738,6 @@ typedef struct VP8_COMP {
 
   int droppable;
 
-  // Global store for SB left contexts, one for each MB row in the SB
-  ENTROPY_CONTEXT_PLANES left_context[2];
-
   // TODO Do we still need this??
   int update_context;
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 720736f33555ef44f3e0c6d0cbac88cef9461202..92a80ecbaeeec6e77f7d7fb9ed2a10ef61371af3 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -718,7 +718,7 @@ static void macro_block_yrd(MACROBLOCK *mb,
   *Rate = vp8_rdcost_mby(mb);
 }
 
-static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) {
+static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
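+  /* backup != 0: cost against scratch copies of the entropy contexts;
+   * backup == 0: advance the real contexts in place, as the superblock
+   * rate loop requires. */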
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -726,11 +726,16 @@ static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) {
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
 
   for (b = 0; b < 16; b += 4)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
@@ -775,7 +780,7 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = vp8_rdcost_mby_8x8(mb);
+  *Rate = vp8_rdcost_mby_8x8(mb, 1);
 }
 
 #if CONFIG_TX16X16
@@ -823,6 +828,66 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
   d[12] = p[12];
 }
 
+#if CONFIG_SUPERBLOCKS
+static void super_block_yrd_8x8(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                const VP8_ENCODER_RTCD *rtcd, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const by2 = x->block + 24;
+  BLOCKD *const bdy2  = xd->block + 24;
+  int d = 0, r = 0, n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[2];
+  ENTROPY_CONTEXT_PLANES t_left[2];
+  int skippable = 1;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_transform_mby_8x8(x);
+    vp8_quantize_mby_8x8(x);
+
+    /* remove 1st order dc to properly combine 1st/2nd order distortion */
+    x->coeff[  0] = 0;
+    x->coeff[ 64] = 0;
+    x->coeff[128] = 0;
+    x->coeff[192] = 0;
+    xd->dqcoeff[  0] = 0;
+    xd->dqcoeff[ 64] = 0;
+    xd->dqcoeff[128] = 0;
+    xd->dqcoeff[192] = 0;
+
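+    /* SSE of the Y plane (its four 8x8 DCs zeroed above) plus the
+     * second-order block, accumulated over the 4 MBs */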
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, 0);
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, berr)(by2->coeff, bdy2->dqcoeff, 16);
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += vp8_rdcost_mby_8x8(x, 0);
+    skippable = skippable && mby_is_skippable_8x8(xd);
+  }
+
+  *distortion = (d >> 2);
+  *rate       = r;
+  if (skip) *skip = skippable;
+  xd->above_context = ta;
+  xd->left_context = tl;
+  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
+
 static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
@@ -1062,6 +1127,45 @@ static int64_t rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rat
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_pick_intra_sby_mode(VP8_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      int *rate,
+                                      int *rate_tokenonly,
+                                      int *distortion) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  /* Y Search for 32x32 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    RECON_INVOKE(&cpi->common.rtcd.recon,
+                 build_intra_predictors_sby_s)(&x->e_mbd);
+
+    super_block_yrd_8x8(x, &this_rate_tokenonly,
+                        &this_distortion, IF_RTCD(&cpi->rtcd), NULL);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+#endif
 
 static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
                                       MACROBLOCK *x,
@@ -1372,18 +1476,23 @@ static int64_t rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb) {
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
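+  /* As in vp8_rdcost_mby_8x8: backup selects whether to cost against
+   * copies of the entropy contexts or update them in place. */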
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
 
   for (b = 16; b < 24; b += 4)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
@@ -1393,6 +1502,54 @@ static int rd_cost_mbuv_8x8(MACROBLOCK *mb) {
   return cost;
 }
 
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int n, r = 0, d = 0;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  int skippable = 1;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
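+  /* The 32x32 superblock's chroma is coded as four 8x8 quadrants per
+   * plane, one for each 16x16 sub-block; contexts are restored below. */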
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+    d += ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+    skippable = skippable && mbuv_is_skippable_8x8(xd);
+  }
+
+  *rate = r;
+  *distortion = d;
+  if (skip) *skip = skippable;
+  xd->left_context = tl;
+  xd->above_context = ta;
+  vpx_memcpy(xd->above_context, t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context, t_left, sizeof(t_left));
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
 
 static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
                                     int *distortion, int fullpixel) {
@@ -1403,7 +1560,7 @@ static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
 
   vp8_quantize_mbuv_8x8(x);
 
-  *rate       = rd_cost_mbuv_8x8(x);
+  *rate       = rd_cost_mbuv_8x8(x, 1);
   *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1527,7 +1684,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi,
 
     vp8_quantize_mbuv_8x8(x);
 
-    rate_to = rd_cost_mbuv_8x8(x);
+    rate_to = rd_cost_mbuv_8x8(x, 1);
     rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
     distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
@@ -1546,6 +1703,91 @@ static void rd_pick_intra_mbuv_mode_8x8(VP8_COMP *cpi,
   mbmi->uv_mode = mode_selected;
 }
 
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+                                 int *rate,
+                                 int *distortion,
+                                 const VP8_ENCODER_RTCD *rtcd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int d = 0, r = 0, n;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
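+  /* Same quadrant walk as rd_inter32x32_uv_8x8 above. */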
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, mbuverr)(x) >> 2;
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+  }
+
+  xd->above_context = ta;
+  xd->left_context = tl;
+  *distortion = (d >> 2);
+  *rate       = r;
+
+  vpx_memcpy(xd->above_context, t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context,  t_left,  sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP8_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       int *rate,
+                                       int *rate_tokenonly,
+                                       int *distortion) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    RECON_INVOKE(&cpi->rtcd.common->recon,
+                 build_intra_predictors_sbuv_s)(&x->e_mbd);
+
+    super_block_uvrd_8x8(x, &this_rate_tokenonly,
+                         &this_distortion, IF_RTCD(&cpi->rtcd));
+    this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
 int vp8_cost_mv_ref(VP8_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int near_mv_ref_ct[4]) {
@@ -2568,25 +2810,33 @@ static void vp8_estimate_ref_frame_costs(VP8_COMP *cpi, int segment_id, unsigned
   }
 }
 
-static void store_coding_context(MACROBLOCK *x, int mb_index,
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                                  int mode_index,
                                  PARTITION_INFO *partition,
                                  int_mv *ref_mv,
-                                 int_mv *second_ref_mv) {
+                                 int_mv *second_ref_mv,
+                                 int single_pred_diff,
+                                 int comp_pred_diff,
+                                 int hybrid_pred_diff) {
   MACROBLOCKD *xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  x->mb_context[mb_index].best_mode_index = mode_index;
-  vpx_memcpy(&x->mb_context[mb_index].mic, xd->mode_info_context,
+  ctx->best_mode_index = mode_index;
+  vpx_memcpy(&ctx->mic, xd->mode_info_context,
              sizeof(MODE_INFO));
-  vpx_memcpy(&x->mb_context[mb_index].partition_info, partition,
-             sizeof(PARTITION_INFO));
-  x->mb_context[mb_index].best_ref_mv.as_int = ref_mv->as_int;
-  x->mb_context[mb_index].second_best_ref_mv.as_int = second_ref_mv->as_int;
-
-  // x->mb_context[mb_index].rddiv = x->rddiv;
-  // x->mb_context[mb_index].rdmult = x->rdmult;
+  if (partition)
+    vpx_memcpy(&ctx->partition_info, partition,
+               sizeof(PARTITION_INFO));
+  ctx->best_ref_mv.as_int = ref_mv->as_int;
+  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+
+  // ctx->rddiv = x->rddiv;
+  // ctx->rdmult = x->rdmult;
+
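+  // Store the RD cost differences between single, compound and hybrid
+  // prediction so they can be accumulated once a coding mode is chosen.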
+  ctx->single_pred_diff = single_pred_diff;
+  ctx->comp_pred_diff   = comp_pred_diff;
+  ctx->hybrid_pred_diff = hybrid_pred_diff;
 }
 
 static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3464,7 +3714,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     }
 #endif
 
-    if (x->skip)
+    if (x->skip && !mode_excluded)
       break;
   }
 
@@ -3557,16 +3807,36 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   }
 
 end:
-  // TODO Save these to add in only if MB coding mode is selected?
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    cpi->rd_comp_pred_diff[i] += best_pred_diff[i];
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2]);
+}
 
-  store_coding_context(x, xd->mb_index, best_mode_index, &best_partition,
-                       &frame_best_ref_mv[mbmi->ref_frame],
-                       &frame_best_ref_mv[mbmi->second_ref_frame]);
+#if CONFIG_SUPERBLOCKS
+void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate,
+                               int *returndist) {
+  int rate_y, rate_uv;
+  int rate_y_tokenonly, rate_uv_tokenonly;
+  int error_y, error_uv;
+  int dist_y, dist_uv;
+
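+  // Superblocks are currently coded with the 8x8 transform only.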
+  x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+
+  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                     &dist_uv);
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y);
+
+  // TODO(rbultje): add rate_uv
+  *returnrate = rate_y;
+  *returndist = dist_y + (dist_uv >> 2);
 }
+#endif
 
-int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
+void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
+                            int *returnrate, int *returndist) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int64_t error4x4, error16x16;
@@ -3585,6 +3855,8 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
   int rate8x8, dist8x8;
   int mode16x16;
   int mode8x8[2][4];
+  int dist;
+  int rateuv8, rateuv_tokenonly8, distuv8;
 
   mbmi->ref_frame = INTRA_FRAME;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -3646,9 +3918,11 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
       rate += rate4x4;
 #endif
       mbmi->mode = B_PRED;
+      dist = dist4x4;
     } else {
       mbmi->mode = mode16x16;
       rate += rate16x16;
+      dist = dist16x16;
     }
   } else {
     if (error4x4 < error8x8) {
@@ -3663,17 +3937,727 @@ int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
       rate += rate4x4;
 #endif
       mbmi->mode = B_PRED;
+      dist = dist4x4;
     } else {
       mbmi->mode = I8X8_PRED;
       set_i8x8_block_modes(x, mode8x8);
       rate += rate8x8;
+      dist = dist8x8;
     }
   }
-  return rate;
+
+  // TODO(rbultje): should add rateuv here also
+  *returnrate = rate - rateuv;
+  *returndist = dist + (distuv >> 2);
 }
 
-int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                     int recon_yoffset, int recon_uvoffset) {
+#if CONFIG_SUPERBLOCKS
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                                  int recon_yoffset, int recon_uvoffset,
+                                  int *returnrate, int *returndistortion) {
+  VP8_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  int mis = xd->mode_info_stride;
+  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+  int comp_pred;
+  int_mv best_ref_mv, second_best_ref_mv;
+  int_mv mode_mv[MB_MODE_COUNT];
+  int_mv frame_nearest_mv[4];
+  int_mv frame_near_mv[4];
+  int_mv frame_best_ref_mv[4];
+  int_mv mc_search_result[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4];
+  unsigned char *u_buffer[4];
+  unsigned char *v_buffer[4];
+  static const int flag_list[4] = { 0, VP8_LAST_FLAG, VP8_GOLD_FLAG, VP8_ALT_FLAG };
+  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, cpi->common.alt_fb_idx };
+  int mdcounts[4];
+  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  int saddone = 0;
+  int sr = 0;  // search range returned by mv_pred(), in step_param levels (0-7)
+  int64_t best_rd = INT64_MAX;
+  int64_t best_comp_rd = INT64_MAX;
+  int64_t best_single_rd = INT64_MAX;
+  int64_t best_hybrid_rd = INT64_MAX;
+  int64_t best_yrd = INT64_MAX;
+  MB_MODE_INFO best_mbmode;
+  int mode_index = 0;
+#if 0
+  PARTITION_INFO best_partition;
+  union b_mode_info best_bmodes[16];
+#endif
+  unsigned int ref_costs[MAX_REF_FRAMES];
+
+  xd->mode_info_context->mbmi.segment_id = segment_id;
+  vp8_estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      YV12_BUFFER_CONFIG *ref_buf = &cpi->common.yv12_fb[idx_list[ref_frame]];
+
+      vp8_find_near_mvs(xd, xd->mode_info_context,
+                        xd->prev_mode_info_context,
+                        &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
+                        &frame_best_ref_mv[ref_frame], frame_mdcounts[ref_frame],
+                        ref_frame, cpi->common.ref_frame_sign_bias);
+
+      y_buffer[ref_frame] = ref_buf->y_buffer + recon_yoffset;
+      u_buffer[ref_frame] = ref_buf->u_buffer + recon_uvoffset;
+      v_buffer[ref_frame] = ref_buf->v_buffer + recon_uvoffset;
+    }
+    mc_search_result[ref_frame].as_int = INVALID_MV;
+  }
+
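+  // mc_search_result[] caches the single-prediction NEWMV search result
+  // per reference frame so compound NEWMV can reuse it below.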
+  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+    int_mv mvp;
+    int mode_excluded;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int rate2 = 0;
+    int distortion2 = 0;
+    int rate_y = 0;
+    int rate_uv = 0;
+    int distortion_uv;
+    int distortion;
+    int skippable_y, skippable_uv;
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index]) {
+      continue;
+    }
+
+    this_mode = vp8_mode_order[mode_index].mode;
+    ref_frame = vp8_mode_order[mode_index].ref_frame;
+    xd->mode_info_context->mbmi.ref_frame = ref_frame;
+    comp_pred = vp8_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+    xd->mode_info_context->mbmi.mode = this_mode;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+#if 0 && CONFIG_PRED_FILTER
+    xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+#if 0 && CONFIG_COMP_INTRA_PRED
+    xd->mode_info_context->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    xd->mode_info_context->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // intra modes and SPLITMV are not yet supported for superblocks
+    // TODO(rbultje): support intra coding
+    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+      continue;
+
+    if (comp_pred) {
+      int second_ref;
+
+      if (ref_frame == ALTREF_FRAME) {
+        second_ref = LAST_FRAME;
+      } else {
+        second_ref = ref_frame + 1;
+      }
+      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+        continue;
+      xd->mode_info_context->mbmi.second_ref_frame = second_ref;
+
+      xd->second_pre.y_buffer = y_buffer[second_ref];
+      xd->second_pre.u_buffer = u_buffer[second_ref];
+      xd->second_pre.v_buffer = v_buffer[second_ref];
+      second_best_ref_mv  = frame_best_ref_mv[second_ref];
+      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+    } else {
+      xd->mode_info_context->mbmi.second_ref_frame = INTRA_FRAME;
+      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
+
+    xd->pre.y_buffer = y_buffer[ref_frame];
+    xd->pre.u_buffer = u_buffer[ref_frame];
+    xd->pre.v_buffer = v_buffer[ref_frame];
+    mode_mv[ZEROMV].as_int = 0;
+    mode_mv[NEARESTMV] = frame_nearest_mv[ref_frame];
+    mode_mv[NEARMV] = frame_near_mv[ref_frame];
+    best_ref_mv = frame_best_ref_mv[ref_frame];
+    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !check_segref(xd, segment_id, ref_frame)) {
+      continue;
+    }
+    // If the segment mode feature is enabled....
+    // then do nothing if the current mode is not allowed..
+    else if (segfeature_active(xd, segment_id, SEG_LVL_MODE)  &&
+             (this_mode != get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    }
+    // Disable this drop out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    else if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+             !segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+      // unless ARNR filtering is enabled in which case we want
+      // an unfiltered alternative
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    if (!comp_pred) {
+      switch (this_mode) {
+        case NEWMV: {
+          int thissme;
+          int bestsme = INT_MAX;
+          int step_param = cpi->sf.first_step;
+          int further_steps;
+          int n;
+          int do_refine = 1;   /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+                                  we will do a final 1-away diamond refining search  */
+          int num00;
+
+          int sadpb = x->sadperbit16;
+          int_mv mvp_full;
+
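+          /* Full-pel search bounds: shrink the lower bound by one column/row
+             when best_ref_mv has a fractional component so the window stays
+             within MAX_FULL_PEL_VAL of it after rounding. */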
+          int col_min = (best_ref_mv.as_mv.col >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7) ? 1 : 0);
+          int row_min = (best_ref_mv.as_mv.row >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7) ? 1 : 0);
+          int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+          int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+
+          if (!saddone) {
+            vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]);
+            saddone = 1;
+          }
+
+          vp8_mv_pred(cpi, xd, xd->mode_info_context, &mvp,
+                      xd->mode_info_context->mbmi.ref_frame,
+                      cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+          mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+          mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+          // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+          if (x->mv_col_min < col_min)
+            x->mv_col_min = col_min;
+          if (x->mv_col_max > col_max)
+            x->mv_col_max = col_max;
+          if (x->mv_row_min < row_min)
+            x->mv_row_min = row_min;
+          if (x->mv_row_max > row_max)
+            x->mv_row_max = row_max;
+
+          // adjust search range according to sr from mv prediction
+          if (sr > step_param)
+            step_param = sr;
+
+          // Initial step/diamond search
+          {
+            bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.as_mv.first,
+                                              step_param, sadpb, &num00,
+                                              &cpi->fn_ptr[BLOCK_32X32],
+                                              XMVCOST, &best_ref_mv);
+            mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+            // Further step/diamond searches as necessary
+            n = 0;
+            further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+            n = num00;
+            num00 = 0;
+
+            /* If there won't be more n-step search, check to see if refining search is needed. */
+            if (n > further_steps)
+              do_refine = 0;
+
+            while (n < further_steps) {
+              n++;
+
+              if (num00)
+                num00--;
+              else {
+                thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+                                                  &d->bmi.as_mv.first, step_param + n, sadpb, &num00,
+                                                  &cpi->fn_ptr[BLOCK_32X32],
+                                                  XMVCOST, &best_ref_mv);
+
+                /* check to see if refining search is needed. */
+                if (num00 > (further_steps - n))
+                  do_refine = 0;
+
+                if (thissme < bestsme) {
+                  bestsme = thissme;
+                  mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+                } else {
+                  d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+                }
+              }
+            }
+          }
+
+          /* final 1-away diamond refining search */
+          if (do_refine == 1) {
+            int search_range;
+
+            // This may not be a good way to set search_range; needs further investigation.
+            // search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
+            search_range = 8;
+
+            thissme = cpi->refining_search_sad(x, b, d, &d->bmi.as_mv.first, sadpb,
+                                               search_range, &cpi->fn_ptr[BLOCK_32X32],
+                                               XMVCOST, &best_ref_mv);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+            } else {
+              d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+            }
+          }
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, &best_ref_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[BLOCK_32X32],
+                                         XMVCOST, &dis, &sse);
+          }
+          mc_search_result[xd->mode_info_context->mbmi.ref_frame].as_int =
+            d->bmi.as_mv.first.as_int;
+
+          mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+          // Add the new motion vector cost to our rolling cost variable
+          rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+        }
+
+        case NEARESTMV:
+        case NEARMV:
+          // Clip "next_nearest" so that it does not extend too far out of the image
+          vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+          // Do not bother proceeding if the vector (from newmv, nearest or near) is 0,0 as this should then be coded using the zeromv mode.
+          if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0)) {
+            continue;
+          }
+
+        case ZEROMV:
+          // Trap vectors that reach beyond the UMV borders
+          // Note that ALL New MV, Nearest MV, Near MV and Zero MV code drops through to this point
+          // because of the lack of break statements in the previous two cases.
+          if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+              ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+            continue;
+          }
+
+          vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+
+#if CONFIG_PRED_FILTER
+          // Filtered prediction:
+          xd->mode_info_context->mbmi.pred_filter_enabled =
+          vp8_mode_order[mode_index].pred_filter_flag;
+          rate2 += vp8_cost_bit(cpi->common.prob_pred_filter_off,
+                                xd->mode_info_context->mbmi.pred_filter_enabled);
+#endif
+
+          vp8_build_inter32x32_predictors_sb(xd,
+                                             xd->dst.y_buffer,
+                                             xd->dst.u_buffer,
+                                             xd->dst.v_buffer,
+                                             xd->dst.y_stride,
+                                             xd->dst.uv_stride);
+
+          compmode_cost =
+            vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0);
+
+          if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+            x->skip = 1;
+          } else if (x->encode_breakout) {
+            unsigned int sse;
+            unsigned int var;
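+            /* Breakout threshold scales with the square of the Y AC
+               dequantizer, floored at the configured encode_breakout. */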
+            int threshold = (xd->block[0].dequant[1] *
+                             xd->block[0].dequant[1] >> 4);
+
+            if (threshold < x->encode_breakout)
+              threshold = x->encode_breakout;
+
+            var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src),
+              b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse);
+
+            if (sse < threshold) {
+              unsigned int q2dc = xd->block[24].dequant[0];
+              /* If there is no codeable 2nd order dc
+                 or a very small uniform pixel change */
+              if ((sse - var < q2dc * q2dc >> 4) ||
+                  (sse / 2 > var && sse - var < 64)) {
+                // Check u and v to make sure skip is ok
+                int sse2, sse3;
+                int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                                  (x->src.u_buffer, x->src.uv_stride,
+                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
+                int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                                  (x->src.v_buffer, x->src.uv_stride,
+                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
+                sse2 += sse3;
+                if (sse2 * 2 < threshold) {
+                  x->skip = 1;
+                  distortion2 = sse + sse2;
+                  rate2 = 500;
+
+                  /* for best_yrd calculation */
+                  rate_uv = 0;
+                  distortion_uv = sse2;
+
+                  disable_skip = 1;
+                  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+                  break;
+                }
+              }
+            }
+          }
+
+          // Add in the Mv/mode cost
+          rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+          // Y cost and distortion - FIXME support other transform sizes
+          super_block_yrd_8x8(x, &rate_y, &distortion,
+                              IF_RTCD(&cpi->rtcd), &skippable_y);
+          rate2 += rate_y;
+          distortion2 += distortion;
+
+          rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+                               cpi->common.full_pixel, &skippable_uv);
+
+          rate2 += rate_uv;
+          distortion2 += distortion_uv;
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+          break;
+
+        default:
+          break;
+      }
+    } else { /* xd->mode_info_context->mbmi.second_ref_frame != 0 */
+      int ref1 = xd->mode_info_context->mbmi.ref_frame;
+      int ref2 = xd->mode_info_context->mbmi.second_ref_frame;
+
+      mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+      switch (this_mode) {
+        case NEWMV:
+          if (mc_search_result[ref1].as_int == INVALID_MV ||
+              mc_search_result[ref2].as_int == INVALID_MV)
+            continue;
+          xd->mode_info_context->mbmi.mv[0].as_int = mc_search_result[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = mc_search_result[ref2].as_int;
+          rate2 += vp8_mv_bit_cost(&mc_search_result[ref1],
+                                   &frame_best_ref_mv[ref1],
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+          rate2 += vp8_mv_bit_cost(&mc_search_result[ref2],
+                                   &frame_best_ref_mv[ref2],
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+          break;
+        case ZEROMV:
+          xd->mode_info_context->mbmi.mv[0].as_int = 0;
+          xd->mode_info_context->mbmi.mv[1].as_int = 0;
+          break;
+        case NEARMV:
+          if (frame_near_mv[ref1].as_int == 0 || frame_near_mv[ref2].as_int == 0) {
+            continue;
+          }
+          xd->mode_info_context->mbmi.mv[0].as_int = frame_near_mv[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = frame_near_mv[ref2].as_int;
+          break;
+        case NEARESTMV:
+          if (frame_nearest_mv[ref1].as_int == 0 || frame_nearest_mv[ref2].as_int == 0) {
+            continue;
+          }
+          xd->mode_info_context->mbmi.mv[0].as_int = frame_nearest_mv[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = frame_nearest_mv[ref2].as_int;
+          break;
+        default:
+          break;
+      }
+
+      /* Add in the Mv/mode cost */
+      rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+      vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[0], xd);
+      vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[1], xd);
+      if (((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) < x->mv_row_min) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) > x->mv_row_max) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) < x->mv_col_min) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) > x->mv_col_max) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) < x->mv_row_min) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) > x->mv_row_max) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) < x->mv_col_min) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) > x->mv_col_max)) {
+        continue;
+      }
+
+      /* build first and second prediction */
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+
+      /* Y cost and distortion - TODO(rbultje) support other transform sizes */
+      super_block_yrd_8x8(x, &rate_y, &distortion,
+                          IF_RTCD(&cpi->rtcd), &skippable_y);
+
+      rate2 += rate_y;
+      distortion2 += distortion;
+
+      /* UV cost and distortion */
+      rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+                           cpi->common.full_pixel, &skippable_uv);
+
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+
+      /* don't bother w/ skip, we would never have come here if skip were
+       * enabled */
+      xd->mode_info_context->mbmi.mode = this_mode;
+
+      /* We don't include the cost of the second reference here, because there
+       * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+       * other words if you present them in that order, the second one is
+       * always known if the first is known */
+      compmode_cost = vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 1);
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skippable = skippable_y && skippable_uv;
+        int mb_skip_allowed;
+
+        // Is Mb level skip allowed for this mb.
+        mb_skip_allowed =
+          !segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (mb_skippable) {
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp8_prob skip_prob =
+              get_pred_prob(cm, xd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp8_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no skip flag.
+        else if (mb_skip_allowed) {
+          int prob_skip_cost = vp8_cost_bit(get_pred_prob(cm, xd,
+                                                          PRED_MBSKIP), 0);
+          rate2 += prob_skip_cost;
+          other_cost += prob_skip_cost;
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+#if 0
+    // Keep record of best intra distortion
+    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+#endif
+
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      if (this_rd < best_comp_rd)
+        best_comp_rd = this_rd;
+      if (this_rd < best_single_rd)
+        best_single_rd = this_rd;
+      if (this_rd < best_hybrid_rd)
+        best_hybrid_rd = this_rd;
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+#if 0
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+        if (this_mode <= B_PRED) {
+          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+          /* required for left and above block mv */
+          xd->mode_info_context->mbmi.mv.as_int = 0;
+        }
+#endif
+
+        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+        /* Calculate the final y RD estimate for this mode */
+        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                          (distortion2 - distortion_uv));
+
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        vpx_memcpy(&best_mbmode, &xd->mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+      }
+#if 0
+      // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+      cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+    // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+    else {
+#if 0
+      cpi->rd_thresh_mult[mode_index] += 4;
+
+      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME &&
+          single_rd < best_single_rd) {
+        best_single_rd = single_rd;
+      } else if (xd->mode_info_context->mbmi.second_ref_frame != INTRA_FRAME &&
+                 single_rd < best_comp_rd) {
+        best_comp_rd = single_rd;
+      }
+      if (hybrid_rd < best_hybrid_rd) {
+        best_hybrid_rd = hybrid_rd;
+      }
+    }
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
+#if 0
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+  }
+#endif
+
+  // This code forces Altref,0,0 and skip for the frame that overlays
+  // an altref, unless the altref is filtered. However, this is unsafe if
+  // segment level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    xd->mode_info_context->mbmi.mode = ZEROMV;
+    xd->mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+    xd->mode_info_context->mbmi.mv[0].as_int = 0;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+    xd->mode_info_context->mbmi.mb_skip_coeff =
+      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    xd->mode_info_context->mbmi.partitioning = 0;
+
+    xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+    if (best_rd != INT64_MAX)
+      store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+        &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+        &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+        0, 0, 0);
+    return best_rd;
+  }
+
+  // macroblock modes
+  vpx_memcpy(&xd->mode_info_context->mbmi, &best_mbmode,
+             sizeof(MB_MODE_INFO));
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
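+  // INT_MIN marks a prediction type that was never evaluated
+  // (its best_*_rd is still INT64_MAX).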
+  if (best_rd != INT64_MAX)
+    store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+      &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+      &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+      (best_single_rd == INT64_MAX) ? INT_MIN : (best_rd - best_single_rd),
+      (best_comp_rd   == INT64_MAX) ? INT_MIN : (best_rd - best_comp_rd),
+      (best_hybrid_rd == INT64_MAX) ? INT_MIN : (best_rd - best_hybrid_rd));
+
+  return best_rd;
+}
+#endif
+
+void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                      int recon_yoffset,
+                                      int recon_uvoffset,
+                                      int *totalrate, int *totaldist) {
   VP8_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
@@ -3694,17 +4678,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
     vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
                            &distortion, &intra_error);
 
-    if (mbmi->ref_frame) {
-      unsigned char pred_context;
-
-      pred_context = get_pred_context(cm, xd, PRED_COMP);
-
-      if (mbmi->second_ref_frame == INTRA_FRAME)
-        cpi->single_pred_count[pred_context]++;
-      else
-        cpi->comp_pred_count[pred_context]++;
-    }
-
     /* restore cpi->zbin_mode_boost_enabled */
     cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
   }
@@ -3717,5 +4690,6 @@ int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
   x->mb_context[xd->mb_index].distortion  = distortion;
   x->mb_context[xd->mb_index].intra_error = intra_error;
 
-  return rate;
+  *totalrate = rate;
+  *totaldist = distortion;
 }
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index 2b5928de9593a01a54c3ba7610c496aff5795c37..0e36a519deaadf3fe1a6687b0be1b4c0c9f79ef4 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -18,7 +18,8 @@
 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset,
                                    int *returnrate, int *returndistortion, int64_t *returnintra);
-extern int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
+extern void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
 
 extern void vp8_mv_pred
 (
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 78a87f392c0798b3ff26678e41e5c37f0778c253..4fdfd11862040cbe3639ca28c52f10b0b1157dad 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -13,29 +13,6 @@
 #include "vpx_ports/config.h"
 #include "vpx/vpx_integer.h"
 
-unsigned int vp8_sad16x16_c(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-
 static __inline
 unsigned int sad_mx_n_c(
   const unsigned char *src_ptr,
@@ -60,6 +37,21 @@ unsigned int sad_mx_n_c(
   return sad;
 }
 
+unsigned int vp8_sad32x32_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
+
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
 
 unsigned int vp8_sad8x8_c(
   const unsigned char *src_ptr,
@@ -104,6 +96,7 @@ unsigned int vp8_sad4x4_c(
 
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
+
 #if CONFIG_NEWBESTREFMV
 unsigned int vp8_sad2x16_c(
   const unsigned char *src_ptr,
@@ -122,6 +115,34 @@ unsigned int vp8_sad16x2_c(
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2);
 }
 #endif
+
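+/* The x3/x8 variants return SADs at consecutive horizontal offsets of
+ * ref_ptr, for sliding-window evaluation in motion search. */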
+void vp8_sad32x32x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array
+                      ) {
+  sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad32x32x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array
+                      ) {
+  sad_array[0] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[3] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, 0x7fffffff);
+  sad_array[4] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+  sad_array[5] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+  sad_array[6] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, 0x7fffffff);
+  sad_array[7] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad16x16x3_c(
   const unsigned char *src_ptr,
   int  src_stride,
@@ -267,6 +288,18 @@ void vp8_sad4x4x8_c(
   sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
 }
 
+void vp8_sad32x32x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array
+                       ) {
+  sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 void vp8_sad16x16x4d_c(
   const unsigned char *src_ptr,
   int  src_stride,
diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c
index e9d02cdd41f32e0bf62bdfa802e9fdfdcd942a19..e88b80d3409395a906ed0bc067b18415efbe025e 100644
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -200,42 +200,59 @@ void choose_segmap_coding_method(VP8_COMP *cpi) {
   // in the frame
   xd->mode_info_context = cm->mi;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      segment_id = xd->mode_info_context->mbmi.segment_id;
-
-      // Count the number of hits on each segment with no prediction
-      no_pred_segcounts[segment_id]++;
-
-      // Temporal prediction not allowed on key frames
-      if (cm->frame_type != KEY_FRAME) {
-        // Test to see if the segment id matches the predicted value.
-        int seg_predicted =
-          (segment_id == get_pred_mb_segid(cm, segmap_index));
-
-        // Get the segment id prediction context
-        pred_context =
-          get_pred_context(cm, xd, PRED_SEG_ID);
-
-        // Store the prediction status for this mb and update counts
-        // as appropriate
-        set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-        temporal_predictor_count[pred_context][seg_predicted]++;
-
-        if (!seg_predicted)
-          // Update the "unpredicted" segment count
-          t_unpred_seg_counts[segment_id]++;
-      }
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+      for (i = 0; i < 4; i++) {
+        static const int dx[4] = { +1, -1, +1, +1 };
+        static const int dy[4] = {  0, +1,  0, -1 };
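+        // Visit the 2x2 group in raster order; dx/dy give the pointer step
+        // applied after entry i, for a net move of two columns per group.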
+        int x_idx = i & 1, y_idx = i >> 1;
+
+        if (mb_col + x_idx >= cm->mb_cols ||
+            mb_row + y_idx >= cm->mb_rows) {
+          goto end;
+        }
+
+        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+        segment_id = xd->mode_info_context->mbmi.segment_id;
+
+        // Count the number of hits on each segment with no prediction
+        no_pred_segcounts[segment_id]++;
+
+        // Temporal prediction not allowed on key frames
+        if (cm->frame_type != KEY_FRAME) {
+          // Test to see if the segment id matches the predicted value.
+          int seg_predicted =
+            (segment_id == get_pred_mb_segid(cm, segmap_index));
 
-      // Step on to the next mb
-      xd->mode_info_context++;
+          // Get the segment id prediction context
+          pred_context =
+            get_pred_context(cm, xd, PRED_SEG_ID);
 
-      // Step on to the next entry in the segment maps
-      segmap_index++;
+          // Store the prediction status for this mb and update counts
+          // as appropriate
+          set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+          temporal_predictor_count[pred_context][seg_predicted]++;
+
+          if (!seg_predicted)
+            // Update the "unpredicted" segment count
+            t_unpred_seg_counts[segment_id]++;
+        }
+
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          assert(!i);
+          xd->mode_info_context += 2;
+          break;
+        }
+#endif
+      end:
+        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+      }
     }
 
     // this is to account for the border in mode_info_context
-    xd->mode_info_context++;
+    xd->mode_info_context -= mb_col;
+    xd->mode_info_context += cm->mode_info_stride * 2;
   }
 
   // Work out probability tree for coding segments without prediction
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 73a0a6b99fdcd3b26239fa3c331e892af45190e5..e17733c58f3eeb4736137a4249b19a5918d452bd 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -145,8 +145,18 @@ extern prototype_sad(vp8_variance_sad16x8);
 #endif
 extern prototype_sad(vp8_variance_sad16x16);
 
+#ifndef vp8_variance_sad32x32
+#define vp8_variance_sad32x32 vp8_sad32x32_c
+#endif
+extern prototype_sad(vp8_variance_sad32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
+#ifndef vp8_variance_sad32x32x3
+#define vp8_variance_sad32x32x3 vp8_sad32x32x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad32x32x3);
+
 #ifndef vp8_variance_sad16x16x3
 #define vp8_variance_sad16x16x3 vp8_sad16x16x3_c
 #endif
@@ -172,6 +182,11 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
 #endif
 extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
 
+#ifndef vp8_variance_sad32x32x8
+#define vp8_variance_sad32x32x8 vp8_sad32x32x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad32x32x8);
+
 #ifndef vp8_variance_sad16x16x8
 #define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
 #endif
@@ -199,6 +214,11 @@ extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
 
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
+#ifndef vp8_variance_sad32x32x4d
+#define vp8_variance_sad32x32x4d vp8_sad32x32x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad32x32x4d);
+
 #ifndef vp8_variance_sad16x16x4d
 #define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c
 #endif
@@ -258,6 +278,11 @@ extern prototype_variance(vp8_variance_var16x8);
 #endif
 extern prototype_variance(vp8_variance_var16x16);
 
+#ifndef vp8_variance_var32x32
+#define vp8_variance_var32x32 vp8_variance32x32_c
+#endif
+extern prototype_variance(vp8_variance_var32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
 #ifndef vp8_variance_subpixvar4x4
@@ -285,26 +310,51 @@ extern prototype_subpixvariance(vp8_variance_subpixvar16x8);
 #endif
 extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
 
+#ifndef vp8_variance_subpixvar32x32
+#define vp8_variance_subpixvar32x32 vp8_sub_pixel_variance32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar32x32);
+
 #ifndef vp8_variance_halfpixvar16x16_h
 #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_h);
 
+#ifndef vp8_variance_halfpixvar32x32_h
+#define vp8_variance_halfpixvar32x32_h vp8_variance_halfpixvar32x32_h_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_h);
+
 #ifndef vp8_variance_halfpixvar16x16_v
 #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_v);
 
+#ifndef vp8_variance_halfpixvar32x32_v
+#define vp8_variance_halfpixvar32x32_v vp8_variance_halfpixvar32x32_v_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_v);
+
 #ifndef vp8_variance_halfpixvar16x16_hv
 #define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv);
 
+#ifndef vp8_variance_halfpixvar32x32_hv
+#define vp8_variance_halfpixvar32x32_hv vp8_variance_halfpixvar32x32_hv_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_hv);
+
 #ifndef vp8_variance_subpixmse16x16
 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
 #endif
 extern prototype_subpixvariance(vp8_variance_subpixmse16x16);
 
+#ifndef vp8_variance_subpixmse32x32
+#define vp8_variance_subpixmse32x32 vp8_sub_pixel_mse32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixmse32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
 #ifndef vp8_variance_getmbss
@@ -349,38 +399,66 @@ typedef struct {
   vp8_sad_fn_t             sad8x16;
   vp8_sad_fn_t             sad16x8;
   vp8_sad_fn_t             sad16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_fn_t             sad32x32;
+#endif
 
   vp8_variance_fn_t        var4x4;
   vp8_variance_fn_t        var8x8;
   vp8_variance_fn_t        var8x16;
   vp8_variance_fn_t        var16x8;
   vp8_variance_fn_t        var16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        var32x32;
+#endif
 
   vp8_subpixvariance_fn_t  subpixvar4x4;
   vp8_subpixvariance_fn_t  subpixvar8x8;
   vp8_subpixvariance_fn_t  subpixvar8x16;
   vp8_subpixvariance_fn_t  subpixvar16x8;
   vp8_subpixvariance_fn_t  subpixvar16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_subpixvariance_fn_t  subpixvar32x32;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_h;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_h;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_v;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_v;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_hv;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_hv;
+#endif
   vp8_subpixvariance_fn_t  subpixmse16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_subpixvariance_fn_t  subpixmse32x32;
+#endif
 
   vp8_getmbss_fn_t         getmbss;
   vp8_variance_fn_t        mse16x16;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi_fn_t       sad32x32x3;
+#endif
   vp8_sad_multi_fn_t       sad16x16x3;
   vp8_sad_multi_fn_t       sad16x8x3;
   vp8_sad_multi_fn_t       sad8x16x3;
   vp8_sad_multi_fn_t       sad8x8x3;
   vp8_sad_multi_fn_t       sad4x4x3;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi1_fn_t      sad32x32x8;
+#endif
   vp8_sad_multi1_fn_t      sad16x16x8;
   vp8_sad_multi1_fn_t      sad16x8x8;
   vp8_sad_multi1_fn_t      sad8x16x8;
   vp8_sad_multi1_fn_t      sad8x8x8;
   vp8_sad_multi1_fn_t      sad4x4x8;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi_d_fn_t     sad32x32x4d;
+#endif
   vp8_sad_multi_d_fn_t     sad16x16x4d;
   vp8_sad_multi_d_fn_t     sad16x8x4d;
   vp8_sad_multi_d_fn_t     sad8x16x4d;
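
The new CONFIG_SUPERBLOCKS members extend the encoder's per-block-size dispatch table rather than introducing a new mechanism. A hypothetical wiring sketch, modeled on how the existing 16x16 entries are typically bound (fn_ptr, BLOCK_32X32 and the sdf/vf/svf field names are assumptions for illustration, not names confirmed by this patch):

    #if CONFIG_SUPERBLOCKS
      // Illustrative: bind the 32x32 hooks so motion search can
      // index the table by block size, exactly as it does for 16x16.
      cpi->fn_ptr[BLOCK_32X32].sdf    = cpi->rtcd.variance.sad32x32;
      cpi->fn_ptr[BLOCK_32X32].vf     = cpi->rtcd.variance.var32x32;
      cpi->fn_ptr[BLOCK_32X32].svf    = cpi->rtcd.variance.subpixvar32x32;
      cpi->fn_ptr[BLOCK_32X32].sdx3f  = cpi->rtcd.variance.sad32x32x3;
      cpi->fn_ptr[BLOCK_32X32].sdx8f  = cpi->rtcd.variance.sad32x32x8;
      cpi->fn_ptr[BLOCK_32X32].sdx4df = cpi->rtcd.variance.sad32x32x4d;
    #endif
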
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index 0b9d569b018e1312b7511ce16e9b6d302a2084c4..cbe2a51d6f046d78eb624ed36191810cfa2fd150 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -55,6 +55,20 @@ static void variance(
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance32x32_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+  *sse = var;
+  return (var - (unsigned int)(((int64_t)avg * avg) >> 10));  // avg is a raw sum; its square can exceed 32 bits
+}
+#endif
 
 unsigned int vp8_variance16x16_c(
   const unsigned char *src_ptr,
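
On the ">> 10" in vp8_variance32x32_c: variance over an N-pixel block is SSE - sum^2/N, and N = 32*32 = 1024 = 2^10. Despite its name, avg holds the raw sum of pixel differences (up to 255*1024 in magnitude), which is why the squared term needs a 64-bit intermediate, per the cast above. A standalone reference version of the same computation (illustrative only):

    #include <stdint.h>

    static unsigned int variance32x32_ref(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse) {
      int64_t sum = 0, sq = 0;
      int r, c;
      for (r = 0; r < 32; r++) {
        for (c = 0; c < 32; c++) {
          int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          sum += d;
          sq  += d * d;
        }
      }
      *sse = (unsigned int)sq;                          // raw SSE
      return (unsigned int)(sq - ((sum * sum) >> 10));  // SSE - sum^2/1024
    }
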
@@ -334,6 +348,27 @@ unsigned int vp8_sub_pixel_variance16x16_c
   return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
+  unsigned char  temp2[36 * 32];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+  return vp8_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
 
 unsigned int vp8_variance_halfpixvar16x16_h_c(
   const unsigned char *src_ptr,
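
The buffer sizes in vp8_sub_pixel_variance32x32_c follow the same rule as the 16x16 version's 17 * 16: the vertical second pass blends each row with the row beneath it, so producing 32 output rows consumes 33 rows from the horizontal first pass, hence FData3[33 * 32] (temp2[36 * 32] simply carries some slack). A simplified sketch of the two passes, with made-up 1/16th-pel filter weights standing in for the vp8_bilinear_filters table:

    static void bilinear_2pass_32x32_sketch(const unsigned char *src, int stride,
                                            int fx, int fy,  /* 0..15 */
                                            unsigned char *dst) {
      unsigned short mid[33 * 32];  // (32 + 1) rows x 32 cols
      int r, c;
      for (r = 0; r < 33; r++)      // horizontal pass: one extra row
        for (c = 0; c < 32; c++)
          mid[r * 32 + c] = (unsigned short)
              (((16 - fx) * src[r * stride + c] +
                fx * src[r * stride + c + 1] + 8) >> 4);
      for (r = 0; r < 32; r++)      // vertical pass: reads rows r and r + 1
        for (c = 0; c < 32; c++)
          dst[r * 32 + c] = (unsigned char)
              (((16 - fy) * mid[r * 32 + c] +
                fy * mid[(r + 1) * 32 + c] + 8) >> 4);
    }
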
@@ -345,17 +380,38 @@ unsigned int vp8_variance_halfpixvar16x16_h_c(
                                        ref_ptr, recon_stride, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
 
-unsigned int vp8_variance_halfpixvar16x16_v_c(
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_v_c(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
                                        ref_ptr, recon_stride, sse);
 }
-
+#endif
 
 unsigned int vp8_variance_halfpixvar16x16_hv_c(
   const unsigned char *src_ptr,
@@ -367,6 +423,16 @@ unsigned int vp8_variance_halfpixvar16x16_hv_c(
                                        ref_ptr, recon_stride, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
 
 unsigned int vp8_sub_pixel_mse16x16_c
 (
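
The hunk below adds vp8_sub_pixel_mse32x32_c, mirroring the 16x16 trick: the variance helper already stores raw SSE through its out-parameter, and MSE here means that uncorrected SSE, so the wrapper simply discards the variance return value. A hypothetical sanity check of the identity SSE = variance + sum^2/1024 that this relies on:

    // Fragment for illustration: the gap between *sse and the returned
    // variance is exactly the squared-mean term (sum^2 >> 10) >= 0.
    static unsigned int sse_minus_var_32x32(const unsigned char *src, int src_stride,
                                            const unsigned char *dst, int dst_stride) {
      unsigned int sse;
      unsigned int var = vp8_sub_pixel_variance32x32_c(src, src_stride, 0, 0,
                                                       dst, dst_stride, &sse);
      return sse - var;  // always non-negative
    }
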
@@ -382,6 +448,19 @@ unsigned int vp8_sub_pixel_mse16x16_c
   return *sse;
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp8_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
+#endif
+
 unsigned int vp8_sub_pixel_variance16x8_c
 (
   const unsigned char  *src_ptr,