diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 214c72b85990d7a906ffae8294f50bbf1a75cded..c33e5c9f1321f983bffe8e45b9ffb71b994be3d4 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -443,18 +443,61 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
   }
 }
 
-static void decode_sb_16x16(MACROBLOCKD *mb, int y_size) {
-  const int y_count = y_size * y_size;
-  const int uv_size = y_size / 2;
-  const int uv_count = uv_size * uv_size;
+static INLINE void decode_sby_32x32(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl;  // blocks per row
+  const int bhl = mb_height_log2(bsize) - 1, bh = 1 << bhl;  // block rows
+  const int y_count = bw * bh;  // total 32x32 luma calls made below
   int n;
 
   for (n = 0; n < y_count; n++) {
-    const int x_idx = n % y_size;
-    const int y_idx = n / y_size;
+    const int x_idx = n & (bw - 1);  // column; bw is a power of two
+    const int y_idx = n >> bwl;  // row, i.e. n / bw
+    const int y_offset = (y_idx * 32) * mb->dst.y_stride + (x_idx * 32);
+    vp9_dequant_idct_add_32x32(BLOCK_OFFSET(mb->plane[0].qcoeff, n, 1024),
+                               mb->block[0].dequant ,
+                               mb->dst.y_buffer + y_offset,
+                               mb->dst.y_buffer + y_offset,  // same pointer
+                               mb->dst.y_stride, mb->dst.y_stride,
+                               mb->plane[0].eobs[n * 64]);
+  }
+}
+
+// Chroma (U and V) 32x32 dequantization + inverse transform for a 64x64 SB.
+static INLINE void decode_sbuv_32x32(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bw = (1 << bwl) / 2;
+  const int bhl = mb_height_log2(bsize) - 1, bh = (1 << bhl) / 2;
+  int n;
+  assert(bsize == BLOCK_SIZE_SB64X64);  // sole caller case; keeps bwl - 1 >= 0
+  for (n = 0; n < bw * bh; n++) {
+    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
+    const int uv_offset = (y_idx * 32) * mb->dst.uv_stride + (x_idx * 32);
+    vp9_dequant_idct_add_32x32(BLOCK_OFFSET(mb->plane[1].qcoeff, n, 1024),
+                               mb->block[16].dequant,
+                               mb->dst.u_buffer + uv_offset,
+                               mb->dst.u_buffer + uv_offset,
+                               mb->dst.uv_stride, mb->dst.uv_stride,
+                               mb->plane[1].eobs[n * 64]);
+    vp9_dequant_idct_add_32x32(BLOCK_OFFSET(mb->plane[2].qcoeff, n, 1024),
+                               mb->block[20].dequant,
+                               mb->dst.v_buffer + uv_offset,
+                               mb->dst.v_buffer + uv_offset,
+                               mb->dst.uv_stride, mb->dst.uv_stride,
+                               mb->plane[2].eobs[n * 64]);
+  }
+}
+
+static INLINE void decode_sby_16x16(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bw = 1 << bwl;
+  const int bhl = mb_height_log2(bsize), bh = 1 << bhl;
+  const int y_count = bw * bh;
+  int n;
+
+  for (n = 0; n < y_count; n++) {
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> bwl;
     const int y_offset = (y_idx * 16) * mb->dst.y_stride + (x_idx * 16);
     const TX_TYPE tx_type = get_tx_type_16x16(mb,
-                                (y_idx * (4 * y_size) + x_idx) * 4);
+                                (y_idx * (4 * bw) + x_idx) * 4);
     if (tx_type == DCT_DCT) {
       vp9_dequant_idct_add_16x16(BLOCK_OFFSET(mb->plane[0].qcoeff, n, 256),
                                  mb->block[0].dequant ,
@@ -472,10 +515,19 @@ static void decode_sb_16x16(MACROBLOCKD *mb, int y_size) {
                                   mb->plane[0].eobs[n * 16]);
     }
   }
+}
+
+static INLINE void decode_sbuv_16x16(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bw = (1 << bwl) / 2;
+  const int bhl = mb_height_log2(bsize), bh = (1 << bhl) / 2;
+  const int uv_count = bw * bh;
+  int n;
+
+  assert(bsize >= BLOCK_SIZE_SB32X32);
 
   for (n = 0; n < uv_count; n++) {
-    const int x_idx = n % uv_size;
-    const int y_idx = n / uv_size;
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> (bwl - 1);
     const int uv_offset = (y_idx * 16) * mb->dst.uv_stride + (x_idx * 16);
     vp9_dequant_idct_add_16x16(BLOCK_OFFSET(mb->plane[1].qcoeff, n, 256),
                                mb->block[16].dequant,
@@ -492,19 +544,19 @@ static void decode_sb_16x16(MACROBLOCKD *mb, int y_size) {
   }
 }
 
-static INLINE void decode_sb_8x8(MACROBLOCKD *xd, int y_size) {
-  const int y_count = y_size * y_size;
-  const int uv_size = y_size / 2;
-  const int uv_count = uv_size * uv_size;
+static INLINE void decode_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize)  + 1, bw = 1 << bwl;
+  const int bhl = mb_height_log2(bsize) + 1, bh = 1 << bhl;
+  const int y_count = bw * bh;
   int n;
 
   // luma
   for (n = 0; n < y_count; n++) {
-    const int x_idx = n % y_size;
-    const int y_idx = n / y_size;
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> bwl;
     const int y_offset = (y_idx * 8) * xd->dst.y_stride + (x_idx * 8);
     const TX_TYPE tx_type = get_tx_type_8x8(xd,
-                                            (y_idx * (2 * y_size) + x_idx) * 2);
+                                            (y_idx * (2 * bw) + x_idx) * 2);
     if (tx_type == DCT_DCT) {
       vp9_dequant_idct_add_8x8_c(BLOCK_OFFSET(xd->plane[0].qcoeff, n, 64),
                                  xd->block[0].dequant,
@@ -522,11 +574,18 @@ static INLINE void decode_sb_8x8(MACROBLOCKD *xd, int y_size) {
                                 xd->plane[0].eobs[n * 4]);
     }
   }
+}
+
+static INLINE void decode_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize)  + 1, bw = 1 << (bwl - 1);
+  const int bhl = mb_height_log2(bsize) + 1, bh = 1 << (bhl - 1);
+  const int uv_count = bw * bh;
+  int n;
 
   // chroma
   for (n = 0; n < uv_count; n++) {
-    const int x_idx = n % uv_size;
-    const int y_idx = n / uv_size;
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> (bwl - 1);
     const int uv_offset = (y_idx * 8) * xd->dst.uv_stride + (x_idx * 8);
     vp9_dequant_idct_add_8x8_c(BLOCK_OFFSET(xd->plane[1].qcoeff, n, 64),
                                xd->block[16].dequant,
@@ -543,18 +602,17 @@ static INLINE void decode_sb_8x8(MACROBLOCKD *xd, int y_size) {
   }
 }
 
-
-static void decode_sb_4x4(MACROBLOCKD *xd, int y_size) {
-  const int y_count = y_size * y_size;
-  const int uv_size = y_size / 2;
-  const int uv_count = uv_size * uv_size;
+static INLINE void decode_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize)  + 2, bw = 1 << bwl;
+  const int bhl = mb_height_log2(bsize) + 2, bh = 1 << bhl;
+  const int y_count = bw * bh;
   int n;
 
   for (n = 0; n < y_count; n++) {
-    const int x_idx = n % y_size;
-    const int y_idx = n / y_size;
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> bwl;
     const int y_offset = (y_idx * 4) * xd->dst.y_stride + (x_idx * 4);
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * y_size + x_idx);
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
     if (tx_type == DCT_DCT) {
       xd->itxm_add(BLOCK_OFFSET(xd->plane[0].qcoeff, n, 16),
                    xd->block[0].dequant,
@@ -573,10 +631,17 @@ static void decode_sb_4x4(MACROBLOCKD *xd, int y_size) {
                             xd->plane[0].eobs[n]);
     }
   }
+}
+
+static INLINE void decode_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize)  + 2, bw = 1 << (bwl - 1);
+  const int bhl = mb_height_log2(bsize) + 2, bh = 1 << (bhl - 1);
+  const int uv_count = bw * bh;
+  int n;
 
   for (n = 0; n < uv_count; n++) {
-    const int x_idx = n % uv_size;
-    const int y_idx = n / uv_size;
+    const int x_idx = n & (bw - 1);
+    const int y_idx = n >> (bwl - 1);
     const int uv_offset = (y_idx * 4) * xd->dst.uv_stride + (x_idx * 4);
     xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, n, 16),
         xd->block[16].dequant,
@@ -591,14 +656,34 @@ static void decode_sb_4x4(MACROBLOCKD *xd, int y_size) {
   }
 }
 
-static void decode_sb64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
-                        BOOL_DECODER* const bc) {
+// TODO(jingning): combine luma and chroma dequantization and inverse
+// transform into a single function looping over planes.
+static void decode_sb_32x32(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  decode_sby_32x32(mb, bsize);  // luma: always the 32x32 path
+  if (bsize != BLOCK_SIZE_SB64X64)
+    decode_sbuv_16x16(mb, bsize);  // chroma of non-64x64 blocks: 16x16 path
+  else
+    decode_sbuv_32x32(mb, bsize);  // chroma of a 64x64 block: 32x32 path
+}
+
+static void decode_sb_16x16(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
+  decode_sby_16x16(mb, bsize);  // luma: always the 16x16 path
+  if (bsize < BLOCK_SIZE_SB32X32)
+    decode_sbuv_8x8(mb, bsize);  // chroma below SB32X32: 8x8 path
+  else
+    decode_sbuv_16x16(mb, bsize);  // chroma for SB32X32 and up: 16x16 path
+}
+
+static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
+                      BOOL_DECODER* const bc, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
+  const int bw = 1 << bwl, bh = 1 << bhl;
   int n, eobtotal;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
   const int mis = pc->mode_info_stride;
 
-  assert(mi->mbmi.sb_type == BLOCK_SIZE_SB64X64);
+  assert(mi->mbmi.sb_type == bsize);
 
   if (pbi->common.frame_type != KEY_FRAME)
     vp9_setup_interp_filters(xd, mi->mbmi.interp_filter, pc);
@@ -608,7 +693,7 @@ static void decode_sb64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
     mb_init_dequantizer(pbi, xd);
 
   if (mi->mbmi.mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_SB64X64);
+    vp9_reset_sb_tokens_context(xd, bsize);
 
     // Special case:  Force the loopfilter to skip when eobtotal and
     // mb_skip_coeff are zero.
@@ -616,19 +701,32 @@ static void decode_sb64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
     return;
   }
 
-  // do prediction
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sb64y_s(xd);
-    vp9_build_intra_predictors_sb64uv_s(xd);
+  // TODO(jingning): need to combine intra/inter predictor functions and
+  // make them block size independent.
+  // generate prediction
+  if (bsize == BLOCK_SIZE_SB64X64) {
+    assert(bsize == BLOCK_SIZE_SB64X64);
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      vp9_build_intra_predictors_sb64y_s(xd);
+      vp9_build_intra_predictors_sb64uv_s(xd);
+    } else {
+      vp9_build_inter64x64_predictors_sb(xd, mb_row, mb_col);
+    }
   } else {
-    vp9_build_inter64x64_predictors_sb(xd, mb_row, mb_col);
+    assert(bsize == BLOCK_SIZE_SB32X32);
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      vp9_build_intra_predictors_sby_s(xd);
+      vp9_build_intra_predictors_sbuv_s(xd);
+    } else {
+      vp9_build_inter32x32_predictors_sb(xd, mb_row, mb_col);
+    }
   }
 
   // dequantization and idct
-  eobtotal = vp9_decode_tokens(pbi, xd, bc, BLOCK_SIZE_SB64X64);
+  eobtotal = vp9_decode_tokens(pbi, xd, bc, bsize);
   if (eobtotal == 0) {  // skip loopfilter
-    for (n = 0; n < 16; n++) {
-      const int x_idx = n & 3, y_idx = n >> 2;
+    for (n = 0; n < bw * bh; n++) {
+      const int x_idx = n & (bw - 1), y_idx = n >> bwl;
 
       if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows)
         mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
@@ -636,108 +734,18 @@ static void decode_sb64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
   } else {
     switch (xd->mode_info_context->mbmi.txfm_size) {
       case TX_32X32:
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int y_offset = x_idx * 32 + y_idx * xd->dst.y_stride * 32;
-          vp9_dequant_idct_add_32x32(BLOCK_OFFSET(xd->plane[0].qcoeff, n, 1024),
-              xd->block[0].dequant,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_stride, xd->dst.y_stride, xd->plane[0].eobs[n * 64]);
-        }
-        vp9_dequant_idct_add_32x32(xd->plane[1].qcoeff,
-            xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->plane[1].eobs[0]);
-        vp9_dequant_idct_add_32x32(xd->plane[2].qcoeff,
-            xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->plane[2].eobs[0]);
-        break;
-      case TX_16X16:
-        decode_sb_16x16(xd, 4);
-        break;
-      case TX_8X8:
-        decode_sb_8x8(xd, 8);
-        break;
-      case TX_4X4:
-        decode_sb_4x4(xd, 16);
-        break;
-      default: assert(0);
-    }
-  }
-#if CONFIG_CODE_NONZEROCOUNT
-  propagate_nzcs(&pbi->common, xd);
-#endif
-}
-
-static void decode_sb32(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
-                        BOOL_DECODER* const bc) {
-  int eobtotal;
-  VP9_COMMON *const pc = &pbi->common;
-  MODE_INFO *mi = xd->mode_info_context;
-  const int mis = pc->mode_info_stride;
-
-  assert(mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, mi->mbmi.interp_filter, pc);
-
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  if (mi->mbmi.mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_SB32X32);
-
-    // Special case:  Force the loopfilter to skip when eobtotal and
-    // mb_skip_coeff are zero.
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
-  }
-
-
-  // do prediction
-  if (mi->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(xd);
-    vp9_build_intra_predictors_sbuv_s(xd);
-  } else {
-    vp9_build_inter32x32_predictors_sb(xd, mb_row, mb_col);
-  }
-
-  // dequantization and idct
-  eobtotal = vp9_decode_tokens(pbi, xd, bc, BLOCK_SIZE_SB32X32);
-  if (eobtotal == 0) {  // skip loopfilter
-    mi->mbmi.mb_skip_coeff = 1;
-    if (mb_col + 1 < pc->mb_cols)
-      mi[1].mbmi.mb_skip_coeff = 1;
-    if (mb_row + 1 < pc->mb_rows) {
-      mi[mis].mbmi.mb_skip_coeff = 1;
-      if (mb_col + 1 < pc->mb_cols)
-        mi[mis + 1].mbmi.mb_skip_coeff = 1;
-    }
-  } else {
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_dequant_idct_add_32x32(xd->plane[0].qcoeff, xd->block[0].dequant,
-                                   xd->dst.y_buffer, xd->dst.y_buffer,
-                                   xd->dst.y_stride, xd->dst.y_stride,
-                                   xd->plane[0].eobs[0]);
-        vp9_dequant_idct_add_16x16(xd->plane[1].qcoeff, xd->block[16].dequant,
-                                   xd->dst.u_buffer, xd->dst.u_buffer,
-                                   xd->dst.uv_stride, xd->dst.uv_stride,
-                                   xd->plane[1].eobs[0]);
-        vp9_dequant_idct_add_16x16(xd->plane[2].qcoeff, xd->block[16].dequant,
-                                   xd->dst.v_buffer, xd->dst.v_buffer,
-                                   xd->dst.uv_stride, xd->dst.uv_stride,
-                                   xd->plane[2].eobs[0]);
+        decode_sb_32x32(xd, bsize);
         break;
       case TX_16X16:
-        decode_sb_16x16(xd, 2);
+        decode_sb_16x16(xd, bsize);
         break;
       case TX_8X8:
-        decode_sb_8x8(xd, 4);
+        decode_sby_8x8(xd, bsize);
+        decode_sbuv_8x8(xd, bsize);
         break;
       case TX_4X4:
-        decode_sb_4x4(xd, 8);
+        decode_sby_4x4(xd, bsize);
+        decode_sbuv_4x4(xd, bsize);
         break;
       default: assert(0);
     }
@@ -747,6 +755,8 @@ static void decode_sb32(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col,
 #endif
 }
 
+// TODO(jingning): Need to merge SB and MB decoding. The MB decoding currently
+// has special-case handling for the I8x8, B_PRED, and splitmv modes.
 static void decode_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
                      int mb_row, int mb_col,
                      BOOL_DECODER* const bc) {
@@ -943,7 +953,7 @@ static void decode_sb_row(VP9D_COMP *pbi, int mb_row, vp9_reader* r) {
       set_offsets(pbi, 64, mb_row, mb_col);
       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, r);
       set_refs(pbi, 64, mb_row, mb_col);
-      decode_sb64(pbi, xd, mb_row, mb_col, r);
+      decode_sb(pbi, xd, mb_row, mb_col, r, BLOCK_SIZE_SB64X64);
       xd->corrupted |= bool_error(r);
     } else {
       // not SB64
@@ -962,7 +972,7 @@ static void decode_sb_row(VP9D_COMP *pbi, int mb_row, vp9_reader* r) {
           set_offsets(pbi, 32, y_idx_sb, x_idx_sb);
           vp9_decode_mb_mode_mv(pbi, xd, y_idx_sb, x_idx_sb, r);
           set_refs(pbi, 32, y_idx_sb, x_idx_sb);
-          decode_sb32(pbi, xd, y_idx_sb, x_idx_sb, r);
+          decode_sb(pbi, xd, y_idx_sb, x_idx_sb, r, BLOCK_SIZE_SB32X32);
           xd->corrupted |= bool_error(r);
         } else {
           // not SB32