diff --git a/vp9/decoder/decodemv.c b/vp9/decoder/decodemv.c
index 5013462a3d38f8902fcdf7fcbed409ecb289b6c2..a22df8f3c38b1ba96c0742b384db223b83c36b7b 100644
--- a/vp9/decoder/decodemv.c
+++ b/vp9/decoder/decodemv.c
@@ -186,11 +186,6 @@ static void kfread_modes(VP9D_COMP *pbi,
   m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
 
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb)
-    m->mbmi.txfm_size = TX_8X8;
-  else
-#endif
   if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
       m->mbmi.mode <= I8X8_PRED) {
     // FIXME(rbultje) code ternary symbol once all experiments are merged
@@ -1132,11 +1127,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 #endif
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb)
-    mbmi->txfm_size = TX_8X8;
-  else
-#endif
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
diff --git a/vp9/decoder/decodframe.c b/vp9/decoder/decodframe.c
index 60bf7bacab1ae9b96ca603077c04d0aec9876326..562b55e8f112be9005e331cb0de8512b3d15dd88 100644
--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@@ -205,6 +205,146 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                              int mb_row, unsigned int mb_col,
+                              BOOL_DECODER* const bc) {
+  int i, n, eobtotal;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *orig_mi = xd->mode_info_context;
+
+  assert(xd->mode_info_context->mbmi.encoded_as_sb);
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    vp9_reset_mb_tokens_context(xd);
+    if (mb_col < pc->mb_cols - 1)
+      xd->above_context++;
+    if (mb_row < pc->mb_rows - 1)
+      xd->left_context++;
+    vp9_reset_mb_tokens_context(xd);
+    if (mb_col < pc->mb_cols - 1)
+      xd->above_context--;
+    if (mb_row < pc->mb_rows - 1)
+      xd->left_context--;
+
+    /* The superblock is signalled as skipped: reconstruct it from the
+     * prediction only and let the loopfilter treat it as a skip.
+     */
+    skip_recon_mb(pbi, xd);
+    return;
+  }
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sby_s(xd);
+    vp9_build_intra_predictors_sbuv_s(xd);
+  } else {
+    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  /* dequantization and idct */
+  for (n = 0; n < 4; n++) {
+    BLOCKD *b = &xd->block[24];
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
+      continue;
+
+    xd->above_context = pc->above_context + mb_col + x_idx;
+    xd->left_context = pc->left_context + y_idx;
+    xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
+    for (i = 0; i < 25; i++) {
+      xd->block[i].eob = 0;
+      xd->eobs[i] = 0;
+    }
+
+    if (tx_size == TX_16X16) {
+      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
+    } else if (tx_size == TX_8X8) {
+      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+    } else {
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+    }
+    if (eobtotal == 0) {  // skip loopfilter
+      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+      continue;
+    }
+
+    if (tx_size == TX_16X16) {
+      vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->dst.y_stride);
+      vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    } else if (tx_size == TX_8X8) {
+      vp9_dequantize_b_2x2(b);
+      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      ((int *)b->qcoeff)[0] = 0;  // 2nd order block is set to 0 after idct
+      ((int *)b->qcoeff)[1] = 0;
+      ((int *)b->qcoeff)[2] = 0;
+      ((int *)b->qcoeff)[3] = 0;
+      ((int *)b->qcoeff)[4] = 0;
+      ((int *)b->qcoeff)[5] = 0;
+      ((int *)b->qcoeff)[6] = 0;
+      ((int *)b->qcoeff)[7] = 0;
+      vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+      vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    } else {
+      vp9_dequantize_b(b);
+      if (xd->eobs[24] > 1) {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+        ((int *)b->qcoeff)[1] = 0;
+        ((int *)b->qcoeff)[2] = 0;
+        ((int *)b->qcoeff)[3] = 0;
+        ((int *)b->qcoeff)[4] = 0;
+        ((int *)b->qcoeff)[5] = 0;
+        ((int *)b->qcoeff)[6] = 0;
+        ((int *)b->qcoeff)[7] = 0;
+      } else {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+      }
+
+      vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+      vp9_dequant_idct_add_uv_block_4x4_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+    }
+  }
+
+  xd->above_context = pc->above_context + mb_col;
+  xd->left_context = pc->left_context;
+  xd->mode_info_context = orig_mi;
+}
+#endif
+
 static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
                               int mb_row, unsigned int mb_col,
                               BOOL_DECODER* const bc) {
@@ -213,9 +353,9 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
   int i;
   int tx_size;
   TX_TYPE tx_type;
-  VP9_COMMON *pc = &pbi->common;
+
 #if CONFIG_SUPERBLOCKS
-  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
 #endif
 
   // re-initialize macroblock dequantizer before detokenization
@@ -227,20 +367,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
     vp9_reset_mb_tokens_context(xd);
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb &&
-        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context++;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context++;
-      vp9_reset_mb_tokens_context(xd);
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context--;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context--;
-    }
-#endif
   } else if (!bool_error(bc)) {
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
@@ -267,14 +393,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
      * mb_skip_coeff are zero.
      * */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
-#endif
-    {
-      skip_recon_mb(pbi, xd);
-      return;
-    }
+    skip_recon_mb(pbi, xd);
+    return;
   }
 
   // moved to be performed before detokenization
@@ -283,12 +403,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
 
   /* do prediction */
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sby_s(xd);
-      vp9_build_intra_predictors_sbuv_s(xd);
-    } else
-#endif
     if (mode != I8X8_PRED) {
       vp9_build_intra_predictors_mbuv(xd);
       if (mode != B_PRED) {
@@ -296,13 +410,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
       }
     }
   } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else
-#endif
     vp9_build_inter_predictors_mb(xd);
   }
 
@@ -404,40 +511,9 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
                                      16, xd->dst.y_stride);
       }
     } else if (tx_size == TX_8X8) {
-#if CONFIG_SUPERBLOCKS
-      void *orig = xd->mode_info_context;
-      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
-      for (n = 0; n < num; n++) {
-        int x_idx = n & 1, y_idx = n >> 1;
-        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
-                         mb_row + y_idx >= pc->mb_rows))
-          continue;
-
-        if (n != 0) {
-          for (i = 0; i < 25; i++) {
-            xd->block[i].eob = 0;
-            xd->eobs[i] = 0;
-          }
-          xd->above_context = pc->above_context + mb_col + (n & 1);
-          xd->left_context = pc->left_context + (n >> 1);
-          xd->mode_info_context = orig;
-          xd->mode_info_context += (n & 1);
-          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
-          if (!orig_skip_flag) {
-            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-            if (eobtotal == 0) // skip loopfilter
-              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-          } else {
-            vp9_reset_mb_tokens_context(xd);
-          }
-        }
-
-        if (xd->mode_info_context->mbmi.mb_skip_coeff)
-          continue; // only happens for SBs, which are already in dest buffer
-#endif
       vp9_dequantize_b_2x2(b);
       IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
-      ((int *)b->qcoeff)[0] = 0;// 2nd order block are set to 0 after inverse transform
+      ((int *)b->qcoeff)[0] = 0;  // 2nd order block is set to 0 after idct
       ((int *)b->qcoeff)[1] = 0;
       ((int *)b->qcoeff)[2] = 0;
       ((int *)b->qcoeff)[3] = 0;
@@ -445,27 +521,9 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
       ((int *)b->qcoeff)[5] = 0;
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
-          xd->block[0].dequant,
-          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-        // do UV inline also
-        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
-          xd->block[16].dequant,
-          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.uv_stride, xd->eobs + 16, xd);
-      } else
-#endif
         vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
           xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
           xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-#if CONFIG_SUPERBLOCKS
-      }
-      xd->mode_info_context = orig;
-#endif
     } else {
       vp9_dequantize_b(b);
       if (xd->eobs[24] > 1) {
@@ -489,25 +547,19 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
     }
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    if ((tx_size == TX_8X8 &&
-         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-         xd->mode_info_context->mbmi.mode != SPLITMV)
-        || tx_size == TX_16X16
-       )
-      vp9_dequant_idct_add_uv_block_8x8
-          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16, xd); //
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16);
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
+  if ((tx_size == TX_8X8 &&
+       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+       xd->mode_info_context->mbmi.mode != SPLITMV)
+      || tx_size == TX_16X16
+     )
+    vp9_dequant_idct_add_uv_block_8x8
+        (xd->qcoeff + 16 * 16, xd->block[16].dequant,
+         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs + 16, xd);
+  else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+         xd->dst.uv_stride, xd->eobs + 16);
 }
 
 
@@ -661,9 +713,15 @@ decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
             mi[pc->mode_info_stride + 1] = mi[0];
         }
       }
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        decode_superblock(pbi, xd, mb_row, mb_col, bc);
+      } else {
+#endif
+        vp9_intra_prediction_down_copy(xd);
+        decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+#if CONFIG_SUPERBLOCKS
+      }
 #endif
-      vp9_intra_prediction_down_copy(xd);
-      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
 
       /* check if the boolean decoder has suffered an error */
       xd->corrupted |= bool_error(bc);
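
For reference, the per-quadrant addressing that decode_superblock relies on can be
illustrated in isolation.  The sketch below is not part of the patch; it only shows
how the loop counter n maps to a macroblock position inside the 32x32 superblock and
to offsets into the luma and chroma destination buffers (the stride values here are
made up for the example).

    #include <stdio.h>

    int main(void) {
      const int y_stride = 64, uv_stride = 32;  /* hypothetical strides */
      int n;

      for (n = 0; n < 4; n++) {
        const int x_idx = n & 1;   /* 0,1,0,1: macroblock column within the SB */
        const int y_idx = n >> 1;  /* 0,0,1,1: macroblock row within the SB    */
        const int y_off  = y_idx * 16 * y_stride  + x_idx * 16;
        const int uv_off = y_idx *  8 * uv_stride + x_idx *  8;

        printf("n=%d -> MB row %d, col %d, luma offset %d, chroma offset %d\n",
               n, y_idx, x_idx, y_off, uv_off);
      }
      return 0;
    }
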
diff --git a/vp9/decoder/dequantize.h b/vp9/decoder/dequantize.h
index 560c4a417b4f34e30617f4187757d510a36e2c18..026bd2af61589cd3a044b6a3531ed5a45a36096b 100644
--- a/vp9/decoder/dequantize.h
+++ b/vp9/decoder/dequantize.h
@@ -73,12 +73,24 @@ void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
                                                    int stride,
                                                    unsigned short *eobs,
                                                    short *dc, MACROBLOCKD *xd);
+
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, unsigned short *eobs,
+                                                   short *dc, MACROBLOCKD *xd);
+
 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
                                                  unsigned short *eobs,
                                                  MACROBLOCKD *xd);
+
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, unsigned short *eobs,
+                                                 MACROBLOCKD *xd);
 #endif
 
 #endif
diff --git a/vp9/decoder/idct_blk.c b/vp9/decoder/idct_blk.c
index d9fbf97c22361356fac00f3a22f0b10148dda26e..efe451e6cc45c6dc3f6dff9f535a2fb66cae7e49 100644
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@@ -36,6 +36,30 @@ void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, unsigned short *eobs,
+                                                   short *dc, MACROBLOCKD *xd) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]);
+      else
+        vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride);
+
+      q   += 16;
+      dst += 4;
+      dc++;
+    }
+
+    dst += 4 * stride - 16;
+  }
+}
+#endif
+
 void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
                                     unsigned char *pre,
                                     unsigned char *dst,
@@ -103,6 +127,47 @@ void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, unsigned short *eobs,
+                                                 MACROBLOCKD *xd) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride);
+      } else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      dstu += 4;
+    }
+
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride);
+      } else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      dstv += 4;
+    }
+
+    dstv += 4 * stride - 8;
+  }
+}
+#endif
 
 void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
                                            unsigned char *pre,
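
The new _inplace helpers follow the pattern of the existing 8x8 ones: the prediction
has already been written into the destination frame, so the inverse transform output
is added directly onto dst instead of onto a separate predictor buffer.  A minimal
sketch of that add-in-place step, with a hypothetical clamp helper rather than the
codec's own idct routines, looks like this:

    #include <stdint.h>

    uint8_t clamp_pixel(int v) {  /* hypothetical helper */
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Add a 4x4 block of residual values onto pixels that already hold the
     * prediction, writing the reconstruction back to the same location. */
    void add_residual_4x4_inplace(const int16_t *residual,
                                  uint8_t *dst, int stride) {
      int r, c;

      for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++)
          dst[c] = clamp_pixel(dst[c] + residual[r * 4 + c]);
        dst += stride;
      }
    }
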
diff --git a/vp9/encoder/bitstream.c b/vp9/encoder/bitstream.c
index b7bc99cb30dca1a166402d4c97cf8b0a7217b765..a25783a2607d1ba3a866ffcad90947674d01d715 100644
--- a/vp9/encoder/bitstream.c
+++ b/vp9/encoder/bitstream.c
@@ -919,7 +919,7 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
         MB_MODE_INFO *mi;
         MV_REFERENCE_FRAME rf;
         MB_PREDICTION_MODE mode;
-        int segment_id;
+        int segment_id, skip_coeff;
 
         int dy = row_delta[i];
         int dx = col_delta[i];
@@ -973,10 +973,11 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
           }
         }
 
+        skip_coeff = 1;
         if (pc->mb_no_coeff_skip &&
             (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          int skip_coeff = mi->mb_skip_coeff;
+          skip_coeff = mi->mb_skip_coeff;
 #if CONFIG_SUPERBLOCKS
           if (mi->encoded_as_sb) {
             skip_coeff &= m[1].mbmi.mb_skip_coeff;
@@ -1107,6 +1108,7 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
                       cpi->common.mcomp_filter_type);
             }
           }
+
           if (mi->second_ref_frame &&
               (mode == NEWMV || mode == SPLITMV)) {
             int_mv n1, n2;
@@ -1244,15 +1246,11 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
           }
         }
 
-        if (
-#if CONFIG_SUPERBLOCKS
-            !mi->encoded_as_sb &&
-#endif
-            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+        if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
              (rf != INTRA_FRAME && !(mode == SPLITMV &&
                                      mi->partitioning == PARTITIONING_4X4))) &&
             pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+            !((pc->mb_no_coeff_skip && skip_coeff) ||
               (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
           TX_SIZE sz = mi->txfm_size;
@@ -1389,11 +1387,7 @@ static void write_mb_modes_kf(const VP9_COMMON  *c,
   } else
     write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
-  if (
-#if CONFIG_SUPERBLOCKS
-      !m->mbmi.encoded_as_sb &&
-#endif
-      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
         (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
diff --git a/vp9/encoder/encodeframe.c b/vp9/encoder/encodeframe.c
index 703b49e35a047270eccf0bdecb750424a01a5fc6..21def264f2dac25def04bbcd891201ef723f7b30 100644
--- a/vp9/encoder/encodeframe.c
+++ b/vp9/encoder/encodeframe.c
@@ -55,7 +55,8 @@ int mb_row_debug, mb_col_debug;
 
 static void encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
-                                    int recon_uvoffset, int output_enabled);
+                                    int recon_uvoffset, int output_enabled,
+                                    int mb_col, int mb_row);
 
 static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
@@ -65,7 +66,7 @@ static void encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
                                      TOKENEXTRA **t, int output_enabled);
 
 static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                     TOKENEXTRA **t, int mb_col);
+                                     TOKENEXTRA **t, int mb_col, int mb_row);
 
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
@@ -466,9 +467,9 @@ static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
     cpi->prediction_error += ctx->distortion;
     cpi->intra_error += ctx->intra_error;
 
-    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
+    cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY]   += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[HYBRID_PREDICTION]      += ctx->hybrid_pred_diff;
   }
 }
 
@@ -645,7 +646,7 @@ static void pick_mb_modes(VP9_COMP *cpi,
 
       // Dummy encode, do not do the tokenization
       encode_inter_macroblock(cpi, x, tp,
-                              recon_yoffset, recon_uvoffset, 0);
+                              recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
 
       seg_id = mbmi->segment_id;
       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
@@ -975,7 +976,7 @@ static void encode_sb(VP9_COMP *cpi,
     if (cm->frame_type == KEY_FRAME) {
 #if CONFIG_SUPERBLOCKS
       if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_intra_super_block(cpi, x, tp, mb_col);
+        encode_intra_super_block(cpi, x, tp, mb_col, mb_row);
       else
 #endif
         encode_intra_macro_block(cpi, x, tp, 1);
@@ -1005,8 +1006,8 @@ static void encode_sb(VP9_COMP *cpi,
                                 mb_col, mb_row);
       else
 #endif
-        encode_inter_macroblock(cpi, x, tp,
-                                recon_yoffset, recon_uvoffset, 1);
+        encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
+                                mb_col, mb_row);
         // Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
@@ -1431,7 +1432,7 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
 
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id, skip;
   MODE_INFO *mi, *mi_ptr = cm->mi;
 #if CONFIG_SUPERBLOCKS
   MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
@@ -1451,17 +1452,45 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
 #if CONFIG_SUPERBLOCKS
       sb_mbmi = &sb_mi->mbmi;
 #endif
-      if (
+      if (mbmi->txfm_size > txfm_max) {
+#if CONFIG_SUPERBLOCKS
+        if (sb_mbmi->encoded_as_sb) {
+          if (!((mb_col & 1) || (mb_row & 1))) {
+            segment_id = mbmi->segment_id;
+            skip = mbmi->mb_skip_coeff;
+            if (mb_col < cm->mb_cols - 1) {
+              segment_id = segment_id && mi[1].mbmi.segment_id;
+              skip = skip && mi[1].mbmi.mb_skip_coeff;
+            }
+            if (mb_row < cm->mb_rows - 1) {
+              segment_id = segment_id &&
+                           mi[cm->mode_info_stride].mbmi.segment_id;
+              skip = skip && mi[cm->mode_info_stride].mbmi.mb_skip_coeff;
+              if (mb_col < cm->mb_cols - 1) {
+                segment_id = segment_id &&
+                             mi[cm->mode_info_stride + 1].mbmi.segment_id;
+                skip = skip && mi[cm->mode_info_stride + 1].mbmi.mb_skip_coeff;
+              }
+            }
+            xd->mode_info_context = mi;
+            assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                    vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+                   (cm->mb_no_coeff_skip && skip));
+            mbmi->txfm_size = txfm_max;
+          } else {
+            mbmi->txfm_size = sb_mbmi->txfm_size;
+          }
+        } else {
+#endif
+          segment_id = mbmi->segment_id;
+          xd->mode_info_context = mi;
+          assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                  vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+                 (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+          mbmi->txfm_size = txfm_max;
 #if CONFIG_SUPERBLOCKS
-          !sb_mbmi->encoded_as_sb &&
-#endif
-          mbmi->txfm_size > txfm_max) {
-        segment_id = mbmi->segment_id;
-        xd->mode_info_context = mi;
-        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-        mbmi->txfm_size = txfm_max;
+        }
+#endif
       }
 #if CONFIG_SUPERBLOCKS
       if (mb_col & 1)
@@ -1835,7 +1864,7 @@ static void update_sb_skip_coeff_state(VP9_COMP *cpi,
 }
 
 static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                     TOKENEXTRA **t, int mb_col) {
+                                     TOKENEXTRA **t, int mb_col, int mb_row) {
   const int output_enabled = 1;
   int n;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -1851,7 +1880,7 @@ static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
   const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
   TOKENEXTRA *tp[4];
   int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  MODE_INFO *mi = xd->mode_info_context;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
 
   if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
@@ -1862,7 +1891,6 @@ static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
   vp9_build_intra_predictors_sby_s(&x->e_mbd);
   vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -1881,15 +1909,9 @@ static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
+    vp9_fidct_mb(x, rtcd);
+    vp9_recon_mby_s_c(xd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(xd,
                        udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                        vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
 
@@ -1898,16 +1920,35 @@ static void encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
       memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
       tp[n] = *t;
       xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      vp9_tokenize_mb(cpi, xd, t, 0);
       skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
     }
   }
 
   if (output_enabled) {
+    int segment_id;
+
     // Tokenize
     xd->mode_info_context = mi;
+    segment_id = mi->mbmi.segment_id;
     sum_intra_stats(cpi, x);
     update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count[mi->mbmi.txfm_size]++;
+    } else {
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
+      mi->mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[1].mbmi.txfm_size = sz;
+      if (mb_row < cm->mb_rows - 1) {
+        mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+        if (mb_col < cm->mb_cols - 1)
+          mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+      }
+    }
   }
 }
 #endif /* CONFIG_SUPERBLOCKS */
@@ -1962,7 +2003,8 @@ static void encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
 }
 static void encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int recon_yoffset,
-                                    int recon_uvoffset, int output_enabled) {
+                                    int recon_uvoffset, int output_enabled,
+                                    int mb_col, int mb_row) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
@@ -2151,8 +2193,8 @@ static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int recon_uvoffset,
                                     int mb_col, int mb_row) {
   const int output_enabled = 1;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *src = x->src.y_buffer;
   uint8_t *dst = xd->dst.y_buffer;
   const uint8_t *usrc = x->src.u_buffer;
@@ -2162,13 +2204,13 @@ static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
   const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
   int seg_ref_active;
   unsigned char ref_pred_flag;
   int n;
   TOKENEXTRA *tp[4];
   int skip[4];
   MODE_INFO *mi = x->e_mbd.mode_info_context;
+  unsigned int segment_id = mi->mbmi.segment_id;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
 
   x->skip = 0;
@@ -2248,7 +2290,6 @@ static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                                        xd->dst.y_stride, xd->dst.uv_stride);
   }
 
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
 
@@ -2264,13 +2305,7 @@ static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_fidct_mb(x, rtcd);
     vp9_recon_mby_s_c(&x->e_mbd,
                       dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
     vp9_recon_mbuv_s_c(&x->e_mbd,
@@ -2313,5 +2348,21 @@ static void encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x,
 
   xd->mode_info_context = mi;
   update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  if (cm->txfm_mode == TX_MODE_SELECT &&
+      !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    cpi->txfm_count[mi->mbmi.txfm_size]++;
+  } else {
+    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
+    mi->mbmi.txfm_size = sz;
+    if (mb_col < cm->mb_cols - 1)
+      mi[1].mbmi.txfm_size = sz;
+    if (mb_row < cm->mb_rows - 1) {
+      mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+    }
+  }
 }
 #endif
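
Both superblock encode paths above end with the same bookkeeping: whatever transform
size is finally chosen is written into all four MODE_INFO entries covered by the
superblock, guarding against the right and bottom frame edges.  A simplified,
self-contained version of that propagation, using a stand-in mode-info struct rather
than the encoder's own types, is:

    typedef enum { TXS_4X4, TXS_8X8, TXS_16X16 } TxSize;  /* stand-in enum      */
    typedef struct { TxSize txfm_size; } MiStub;          /* stand-in MODE_INFO */

    /* Write the chosen size into the (up to) 2x2 group of mode-info entries
     * that a 32x32 superblock covers; mi points at the top-left entry. */
    void set_sb_txfm_size(MiStub *mi, int stride, TxSize sz,
                          int mb_row, int mb_col, int mb_rows, int mb_cols) {
      mi[0].txfm_size = sz;
      if (mb_col < mb_cols - 1)
        mi[1].txfm_size = sz;
      if (mb_row < mb_rows - 1) {
        mi[stride].txfm_size = sz;
        if (mb_col < mb_cols - 1)
          mi[stride + 1].txfm_size = sz;
      }
    }

This mirrors the tail of encode_intra_super_block and encode_inter_superblock, where
stride is cm->mode_info_stride.
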
diff --git a/vp9/encoder/encodemb.c b/vp9/encoder/encodemb.c
index d828c51c640357da0d4a61d0c83ba855287afe82..71e81edd0031f5be4e8c7ad8803a0a6c177425e1 100644
--- a/vp9/encoder/encodemb.c
+++ b/vp9/encoder/encodemb.c
@@ -884,13 +884,10 @@ static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
   vp9_optimize_mbuv_8x8(x, rtcd);
 }
 
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
+void vp9_fidct_mb(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
-  vp9_build_inter_predictors_mb(xd);
-  subtract_mb(rtcd, x);
-
   if (tx_size == TX_16X16) {
     vp9_transform_mb_16x16(x);
     vp9_quantize_mb_16x16(x);
@@ -924,7 +921,14 @@ void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
       optimize_mb_4x4(x, rtcd);
     vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
   }
+}
+
+void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
 
+  vp9_build_inter_predictors_mb(xd);
+  subtract_mb(rtcd, x);
+  vp9_fidct_mb(x, rtcd);
   vp9_recon_mb(xd);
 }
 
diff --git a/vp9/encoder/encodemb.h b/vp9/encoder/encodemb.h
index 8a3d38f1de31ce17b8ee2897a8df30291118156a..e59ed8a2748e55bcfe70c3946d25631da4fac169 100644
--- a/vp9/encoder/encodemb.h
+++ b/vp9/encoder/encodemb.h
@@ -55,6 +55,8 @@ void vp9_transform_mb_16x16(MACROBLOCK *mb);
 void vp9_transform_mby_16x16(MACROBLOCK *x);
 void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
 
+void vp9_fidct_mb(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
 #if CONFIG_SUPERBLOCKS
diff --git a/vp9/encoder/rdopt.c b/vp9/encoder/rdopt.c
index 19b96af4023ba3acea05562f07b8c495ab9bf1e9..ef92b62580c2c56087011ea6b6c9eed1f860d123 100644
--- a/vp9/encoder/rdopt.c
+++ b/vp9/encoder/rdopt.c
@@ -610,7 +610,7 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb) {
+static int rdcost_mby_4x4(MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -618,11 +618,16 @@ static int rdcost_mby_4x4(MACROBLOCK *mb) {
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   for (b = 0; b < 16; b++)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
@@ -640,7 +645,7 @@ static void macro_block_yrd_4x4(MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
                                 const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
+                                int *skippable, int backup) {
   int b;
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK   *const mb_y2 = mb->block + 24;
@@ -674,7 +679,7 @@ static void macro_block_yrd_4x4(MACROBLOCK *mb,
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_4x4(mb);
+  *Rate = rdcost_mby_4x4(mb, backup);
   *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
 }
 
@@ -711,7 +716,7 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
                                 const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
+                                int *skippable, int backup) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK   *const mb_y2 = mb->block + 24;
   BLOCKD *const x_y2  = xd->block + 24;
@@ -735,28 +740,34 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_8x8(mb, 1);
+  *Rate = rdcost_mby_8x8(mb, backup);
   *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
 }
 
-static int rdcost_mby_16x16(MACROBLOCK *mb) {
+static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
   int cost;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
   return cost;
 }
 
 static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
+                                  const VP9_ENCODER_RTCD *rtcd, int *skippable,
+                                  int backup) {
   int d;
   MACROBLOCKD *xd = &mb->e_mbd;
   BLOCKD *b  = &mb->e_mbd.block[0];
@@ -780,125 +791,97 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = rdcost_mby_16x16(mb);
+  *Rate = rdcost_mby_16x16(mb, backup);
   *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
 }
 
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  MACROBLOCKD *xd = &x->e_mbd;
-  int can_skip = cm->mb_no_coeff_skip;
-  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int s0, s1;
-  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
-  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
-  int d16x16, r16x16, r16x16s, s16x16;
-  int64_t rd16x16, rd16x16s;
-
-  vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
-                   x->block[0].src_stride);
-
-  if (skip_prob == 0)
-    skip_prob = 1;
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
-    }
-  } else {
-    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
-  }
-  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
-    }
-  } else {
-    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
-  }
-  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
-    }
-  } else {
-    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
-  }
-  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
-  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
-    }
-  } else {
-    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
-  }
-  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
+static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int r[2][TX_SIZE_MAX], int *rate,
+                                     int d[TX_SIZE_MAX], int *distortion,
+                                     int s[TX_SIZE_MAX], int *skip,
+                                     int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  vp9_prob skip_prob = cm->mb_no_coeff_skip ?
+                       vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  int64_t rd[2][TX_SIZE_MAX];
+  int n;
+
+  r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) +
+                   vp9_cost_one(cm->prob_tx[1]);
+  r[1][TX_8X8]   = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) +
+                   vp9_cost_zero(cm->prob_tx[1]);
+  r[1][TX_4X4]   = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]);
+
+  if (cm->mb_no_coeff_skip) {
+    int s0, s1;
+
+    assert(skip_prob > 0);
+    s0 = vp9_cost_bit(skip_prob, 0);
+    s1 = vp9_cost_bit(skip_prob, 1);
+
+    for (n = TX_4X4; n <= TX_16X16; n++) {
+      if (s[n]) {
+        rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      } else {
+        rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
+        rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
+      }
     }
   } else {
-    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
-  }
-  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
+    for (n = TX_4X4; n <= TX_16X16; n++) {
+      rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
+      rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
     }
-  } else {
-    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
   }
 
-  if ( cpi->common.txfm_mode == ALLOW_16X16 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT &&
-       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
+  if ( cm->txfm_mode == ALLOW_16X16 ||
+      (cm->txfm_mode == TX_MODE_SELECT &&
+       rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
     mbmi->txfm_size = TX_16X16;
-    *skippable = s16x16;
-    *distortion = d16x16;
-    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
-  } else
-  if ( cpi->common.txfm_mode == ALLOW_8X8 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
+  } else if (cm->txfm_mode == ALLOW_8X8 ||
+           (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
     mbmi->txfm_size = TX_8X8;
-    *skippable = s8x8;
-    *distortion = d8x8;
-    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
   } else {
-    assert(cpi->common.txfm_mode == ONLY_4X4 ||
-           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
+    assert(cm->txfm_mode == ONLY_4X4 ||
+          (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
     mbmi->txfm_size = TX_4X4;
-    *skippable = s4x4;
-    *distortion = d4x4;
-    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
   }
 
-  txfm_cache[ONLY_4X4] = rd4x4;
-  txfm_cache[ALLOW_8X8] = rd8x8;
-  txfm_cache[ALLOW_16X16] = rd16x16;
-  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
-    txfm_cache[TX_MODE_SELECT] = rd16x16s;
+  *distortion = d[mbmi->txfm_size];
+  *rate       = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
+  *skip       = s[mbmi->txfm_size];
+
+  txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
+  txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
+  txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
+  if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
+    txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
   else
-    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
+    txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
+                                 rd[1][TX_4X4] : rd[1][TX_8X8];
+}
+
+static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int *distortion, int *skippable,
+                            int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
+
+  vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
+                   x->block[0].src_stride);
 
+  macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
+                        IF_RTCD(&cpi->rtcd), &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8],
+                      IF_RTCD(&cpi->rtcd), &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4],
+                      IF_RTCD(&cpi->rtcd), &s[TX_4X4], 1);
+
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
+                           txfm_cache);
 }
 
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
@@ -911,62 +894,61 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
 }
 
 #if CONFIG_SUPERBLOCKS
-static void super_block_yrd_8x8(MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                const VP9_ENCODER_RTCD *rtcd, int *skip)
-{
+static void super_block_yrd(VP9_COMP *cpi,
+                            MACROBLOCK *x, int *rate, int *distortion,
+                            const VP9_ENCODER_RTCD *rtcd, int *skip,
+                            int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const by2 = x->block + 24;
-  BLOCKD *const bdy2  = xd->block + 24;
-  int d = 0, r = 0, n;
+  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-  ENTROPY_CONTEXT_PLANES t_above[2];
-  ENTROPY_CONTEXT_PLANES t_left[2];
-  int skippable = 1;
-
-  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+  ENTROPY_CONTEXT_PLANES t_above[3][2], *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[3][2], *orig_left = xd->left_context;
+
+  for (n = TX_4X4; n <= TX_16X16; n++) {
+    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
+    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
+    r[0][n] = 0;
+    d[n] = 0;
+    s[n] = 1;
+  }
 
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
+    int r_tmp, d_tmp, s_tmp;
 
     vp9_subtract_mby_s_c(x->src_diff,
                          src + x_idx * 16 + y_idx * 16 * src_y_stride,
                          src_y_stride,
                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
                          dst_y_stride);
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-
-    /* remove 1st order dc to properly combine 1st/2nd order distortion */
-    x->coeff[  0] = 0;
-    x->coeff[ 64] = 0;
-    x->coeff[128] = 0;
-    x->coeff[192] = 0;
-    xd->dqcoeff[  0] = 0;
-    xd->dqcoeff[ 64] = 0;
-    xd->dqcoeff[128] = 0;
-    xd->dqcoeff[192] = 0;
-
-    d += vp9_mbblock_error(x, 0);
-    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rdcost_mby_8x8(x, 0);
-    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
+
+    xd->above_context = &t_above[TX_16X16][x_idx];
+    xd->left_context = &t_left[TX_16X16][y_idx];
+    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_16X16] += d_tmp;
+    r[0][TX_16X16] += r_tmp;
+    s[TX_16X16] = s[TX_16X16] && s_tmp;
+
+    xd->above_context = &t_above[TX_4X4][x_idx];
+    xd->left_context = &t_left[TX_4X4][y_idx];
+    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_4X4] += d_tmp;
+    r[0][TX_4X4] += r_tmp;
+    s[TX_4X4] = s[TX_4X4] && s_tmp;
+
+    xd->above_context = &t_above[TX_8X8][x_idx];
+    xd->left_context = &t_left[TX_8X8][y_idx];
+    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    d[TX_8X8] += d_tmp;
+    r[0][TX_8X8] += r_tmp;
+    s[TX_8X8] = s[TX_8X8] && s_tmp;
   }
 
-  *distortion = (d >> 2);
-  *rate       = r;
-  if (skip) *skip = skippable;
-  xd->above_context = ta;
-  xd->left_context = tl;
-  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
-  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache);
+
+  xd->above_context = orig_above;
+  xd->left_context = orig_left;
 }
 #endif
 
@@ -1190,7 +1172,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
                                       int *rate,
                                       int *rate_tokenonly,
                                       int *distortion,
-                                      int *skippable) {
+                                      int *skippable,
+                                      int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int this_rate, this_rate_tokenonly;
@@ -1202,8 +1185,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd);
 
-    super_block_yrd_8x8(x, &this_rate_tokenonly,
-                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    super_block_yrd(cpi, x, &this_rate_tokenonly,
+                    &this_distortion, IF_RTCD(&cpi->rtcd), &s, txfm_cache);
     this_rate = this_rate_tokenonly +
                 x->mbmode_cost[x->e_mbd.frame_type]
                               [x->e_mbd.mode_info_context->mbmi.mode];
@@ -1239,12 +1222,12 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
   MB_PREDICTION_MODE mode2;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
 #endif
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int rate, ratey;
   int distortion, skip;
   int64_t best_rd = INT64_MAX;
   int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int i;
   for (i = 0; i < NB_TXFM_MODES; i++)
@@ -1261,11 +1244,11 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
       mbmi->second_mode = mode2;
       if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
 #endif
-        vp9_build_intra_predictors_mby(&x->e_mbd);
+        vp9_build_intra_predictors_mby(xd);
 #if CONFIG_COMP_INTRA_PRED
       } else {
         continue; // i.e. disable for now
-        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
+        vp9_build_comp_intra_predictors_mby(xd);
       }
 #endif
 
@@ -1273,7 +1256,7 @@ static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
 
       // FIXME add compoundmode cost
       // FIXME add rate for mode2
-      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
+      rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
 
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
@@ -1519,18 +1502,23 @@ static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int rd_cost_mbuv(MACROBLOCK *mb) {
+static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
 
   for (b = 16; b < 24; b++)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
@@ -1541,15 +1529,13 @@ static int rd_cost_mbuv(MACROBLOCK *mb) {
 }
 
 
-static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
+static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip,
+                                    int do_ctx_backup) {
   vp9_transform_mbuv_4x4(x);
   vp9_quantize_mbuv_4x4(x);
 
-  *rate       = rd_cost_mbuv(x);
+  *rate       = rd_cost_mbuv_4x4(x, do_ctx_backup);
   *distortion = vp9_mbuverror(x) / 4;
   *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
 
@@ -1582,10 +1568,24 @@ static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
   return cost;
 }
 
+static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip,
+                                    int do_ctx_backup) {
+  vp9_transform_mbuv_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+
+  *rate       = rd_cost_mbuv_8x8(x, do_ctx_backup);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
 #if CONFIG_SUPERBLOCKS
-static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
   MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int n, r = 0, d = 0;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
@@ -1600,7 +1600,10 @@ static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 
   for (n = 0; n < 4; n++) {
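+    /* Each iteration covers one 16x16 quadrant of the 32x32 block; the
+     * entropy context pointers below are moved onto that quadrant before
+     * the rate costing. */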
     int x_idx = n & 1, y_idx = n >> 1;
+    int d_tmp, s_tmp, r_tmp;
 
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
     vp9_subtract_mbuv_s_c(x->src_diff,
                           usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
                           vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
@@ -1609,58 +1612,35 @@ static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
 
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
+    if (mbmi->txfm_size == TX_4X4) {
+      rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+    } else {
+      rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+    }
 
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-    d += vp9_mbuverror(x) / 4;
-    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
+    r += r_tmp;
+    d += d_tmp;
+    skippable = skippable && s_tmp;
   }
 
   *rate = r;
   *distortion = d;
-  if (skip) *skip = skippable;
+  *skip = skippable;
   xd->left_context = tl;
   xd->above_context = ta;
   memcpy(xd->above_context, t_above, sizeof(t_above));
   memcpy(xd->left_context, t_left, sizeof(t_left));
 
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+  return RDCOST(x->rdmult, x->rddiv, r, d);
 }
 #endif
 
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(x, 1);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skippable, int fullpixel) {
+                              int *distortion, int *skip, int fullpixel) {
   vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
 }
 
 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
@@ -1707,7 +1687,7 @@ static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
       vp9_transform_mbuv_4x4(x);
       vp9_quantize_mbuv_4x4(x);
 
-      rate_to = rd_cost_mbuv(x);
+      rate_to = rd_cost_mbuv_4x4(x, 1);
       rate = rate_to
              + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
@@ -2434,8 +2414,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
     // store everything needed to come back to this!!
     for (i = 0; i < 16; i++) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-
       bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
       if (mbmi->second_ref_frame)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
@@ -3114,12 +3092,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                                  PARTITION_INFO *partition,
                                  int_mv *ref_mv,
                                  int_mv *second_ref_mv,
-                                 int single_pred_diff,
-                                 int comp_pred_diff,
-                                 int hybrid_pred_diff,
+                                 int64_t comp_pred_diff[NB_PREDICTION_TYPES],
                                  int64_t txfm_size_diff[NB_TXFM_MODES]) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
@@ -3135,15 +3110,11 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   // ctx[mb_index].rddiv = x->rddiv;
   // ctx[mb_index].rdmult = x->rdmult;
 
-  ctx->single_pred_diff = single_pred_diff;
-  ctx->comp_pred_diff   = comp_pred_diff;
-  ctx->hybrid_pred_diff = hybrid_pred_diff;
+  ctx->single_pred_diff = comp_pred_diff[SINGLE_PREDICTION_ONLY];
+  ctx->comp_pred_diff   = comp_pred_diff[COMP_PREDICTION_ONLY];
+  ctx->hybrid_pred_diff = comp_pred_diff[HYBRID_PREDICTION];
 
-  if (txfm_size_diff) {
-    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
-  } else {
-    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
-  }
+  memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
 static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3159,12 +3130,15 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
   *distortion2 += *distortion;
 
   // UV cost and distortion
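+  // Compute the UV residual for whichever txfm-size branch runs below.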
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
   if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
     rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable);
+                         cpi->common.full_pixel, &uv_skippable, 1);
   else
-    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
-                     &uv_skippable);
+    rd_inter16x16_uv_4x4(cpi, x, rate_uv, distortion_uv,
+                         cpi->common.full_pixel, &uv_skippable, 1);
+
   *rate2 += *rate_uv;
   *distortion2 += *distortion_uv;
   *skippable = y_skippable && uv_skippable;
@@ -3183,8 +3157,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                unsigned char *u_buffer[4],
                                unsigned char *v_buffer[4]) {
   YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   vp9_find_near_mvs(xd, xd->mode_info_context,
                     xd->prev_mode_info_context,
@@ -3435,14 +3409,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_SUPERBLOCKS
       int skippable_y, skippable_uv;
 
-      // Y cost and distortion - FIXME support other transform sizes
-      super_block_yrd_8x8(x, rate_y, distortion_y,
-                          IF_RTCD(&cpi->rtcd), &skippable_y);
+      // Y cost and distortion
+      super_block_yrd(cpi, x, rate_y, distortion_y,
+                      IF_RTCD(&cpi->rtcd), &skippable_y, txfm_cache);
       *rate2 += *rate_y;
       *distortion += *distortion_y;
 
-      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                           cm->full_pixel, &skippable_uv);
+      rd_inter32x32_uv(cpi, x, rate_uv, distortion_uv,
+                       cm->full_pixel, &skippable_uv);
 
       *rate2 += *rate_uv;
       *distortion += *distortion_uv;
@@ -4053,8 +4027,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       /* keep record of best compound/single-only prediction */
-      if (!disable_skip &&
-          mbmi->ref_frame != INTRA_FRAME) {
+      if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
         int64_t single_rd, hybrid_rd;
         int single_rate, hybrid_rate;
 
@@ -4202,12 +4175,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
 end:
-  store_coding_context(x, &x->mb_context[xd->mb_index],
-    best_mode_index, &best_partition,
-    &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
-    &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
-    (int)best_pred_diff[0], (int)best_pred_diff[1], (int)best_pred_diff[2],
-    best_txfm_diff);
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
+                       &best_partition, &frame_best_ref_mv[mbmi->ref_frame],
+                       &frame_best_ref_mv[mbmi->second_ref_frame],
+                       best_pred_diff, best_txfm_diff);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -4221,13 +4192,14 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int error_y, error_uv;
   int dist_y, dist_uv;
   int y_skip, uv_skip;
+  int64_t txfm_cache[NB_TXFM_MODES];
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
 
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y, &y_skip, txfm_cache);
   error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                                      &dist_uv, &uv_skip);
-  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                   &dist_y, &y_skip);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4408,7 +4380,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   MB_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
-  int comp_pred;
+  int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int_mv frame_best_ref_mv[4];
   int frame_mdcounts[4][4];
@@ -4423,10 +4395,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
   int saddone = 0;
   int64_t best_rd = INT64_MAX;
-  int64_t best_comp_rd = INT64_MAX;
-  int64_t best_single_rd = INT64_MAX;
-  int64_t best_hybrid_rd = INT64_MAX;
   int64_t best_yrd = INT64_MAX;
+  int64_t best_txfm_rd[NB_TXFM_MODES];
+  int64_t best_txfm_diff[NB_TXFM_MODES];
+  int64_t best_pred_diff[NB_PREDICTION_TYPES];
+  int64_t best_pred_rd[NB_PREDICTION_TYPES];
   MB_MODE_INFO best_mbmode;
   int mode_index, best_mode_index;
   unsigned int ref_costs[MAX_REF_FRAMES];
@@ -4436,6 +4409,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
 
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    best_txfm_rd[i] = INT64_MAX;
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
@@ -4606,14 +4584,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 #endif
 
-    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-      if (this_rd < best_comp_rd)
-        best_comp_rd = this_rd;
-      if (this_rd < best_single_rd)
-        best_single_rd = this_rd;
-      if (this_rd < best_hybrid_rd)
-        best_hybrid_rd = this_rd;
-    }
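+    /* Intra modes update the best rd seen for every prediction type. */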
+    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
 
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
@@ -4673,14 +4646,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
-        best_single_rd = single_rd;
+      if (mbmi->second_ref_frame == INTRA_FRAME &&
+          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
       } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                 single_rd < best_comp_rd) {
-        best_comp_rd = single_rd;
+                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
       }
-      if (hybrid_rd < best_hybrid_rd) {
-        best_hybrid_rd = hybrid_rd;
+      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+    }
+
+    /* keep record of best txfm size */
+    if (!mode_excluded && this_rd != INT64_MAX) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd;
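+        /* this_rd was computed with cm->txfm_mode; adjust it by the cached
+         * rd difference to estimate the cost under txfm mode i (B_PRED is
+         * left unadjusted). */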
+        if (this_mode != B_PRED) {
+          adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+        } else {
+          adj_rd = this_rd;
+        }
+        if (adj_rd < best_txfm_rd[i])
+          best_txfm_rd[i] = adj_rd;
       }
     }
 
@@ -4719,31 +4706,40 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
     mbmi->partitioning = 0;
-    mbmi->txfm_size = TX_8X8;
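+    /* In this early-out path, pick TX_16X16 when the txfm size is selected
+     * per-MB, otherwise follow the frame-level txfm mode. */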
+    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
+                      TX_16X16 : cm->txfm_mode;
 
-    if (best_rd != INT64_MAX)
-      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                           &frame_best_ref_mv[mbmi->ref_frame],
-                           &frame_best_ref_mv[mbmi->second_ref_frame],
-                           0, 0, 0, NULL);
-    return best_rd;
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+    goto end;
   }
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  mbmi->txfm_size = TX_8X8;
-
-  if (best_rd != INT64_MAX)
-    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                         &frame_best_ref_mv[mbmi->ref_frame],
-                         &frame_best_ref_mv[mbmi->second_ref_frame],
-                         (best_single_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_single_rd),
-                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_comp_rd),
-                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_hybrid_rd),
-                         NULL);
+
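+  /* Convert the best rd seen for each prediction type and txfm mode into a
+   * difference from the overall best rd; INT_MIN marks entries that were
+   * never evaluated. */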
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      if (best_txfm_rd[i] == INT64_MAX)
+        best_txfm_diff[i] = INT_MIN;
+      else
+        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+    }
+  } else {
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+  }
+
+end:
+  store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                       &frame_best_ref_mv[mbmi->ref_frame],
+                       &frame_best_ref_mv[mbmi->second_ref_frame],
+                       best_pred_diff, best_txfm_diff);
 
   return best_rd;
 }