diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 0bfaec7a0c5e8573318ddf01643e012ff810c2ce..b58945e51b129ee680e3b6dd1a998de3979c45fb 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -417,6 +417,7 @@ typedef struct macroblockd {
 
 static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
   switch (subsize) {
+    case BLOCK_SIZE_SB64X64:
     case BLOCK_SIZE_SB64X32:
     case BLOCK_SIZE_SB32X64:
     case BLOCK_SIZE_SB32X32:
@@ -444,10 +445,10 @@ static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             BLOCK_SIZE_TYPE sb_type,
                                             BLOCK_SIZE_TYPE sb_size) {
-  int bsl = mi_width_log2(sb_size), bs = 1 << bsl;
-  int bwl = mi_width_log2(sb_type);
-  int bhl = mi_height_log2(sb_type);
-  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+  int bwl = b_width_log2(sb_type);
+  int bhl = b_height_log2(sb_type);
+  int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
   int i;
 
 #if !CONFIG_AB4X4
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3e3a94e51da35652474eb4616e97175edb3d4bfb..a44fe4ca59d997d605eda339b78cc2b2937fce18 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -107,10 +107,10 @@ const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
                                   [PARTITION_TYPES - 1] = {
   // FIXME(jingning,rbultje) put real probabilities here
 #if CONFIG_AB4X4
-  {202, 162, 107},
-  {16,  2,   169},
-  {3,   246,  19},
-  {104, 90,  134},
+  {105,  88,  252},
+  {113,  88,  249},
+  {113, 106,  251},
+  {126, 105,  107},
 #endif
   {202, 162, 107},
   {16,  2,   169},
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 3864d3c862e40d2c7991c41e8c885ec15c357fdc..d34bfa74d4e34d1b241541ccd68d0c8b1962ab0b 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -540,8 +540,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
   int_mv *const mv0 = &mbmi->mv[0];
   int_mv *const mv1 = &mbmi->mv[1];
-  const int bw = 1 << mi_width_log2(mi->mbmi.sb_type);
-  const int bh = 1 << mi_height_log2(mi->mbmi.sb_type);
+  BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
 
   const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
                                        cm->height == cm->last_height &&
@@ -549,6 +550,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                                        cm->last_show_frame;
 
   int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
+  int j, idx, idy;
 
   mbmi->need_to_clamp_mvs = 0;
   mbmi->need_to_clamp_secondmv = 0;
@@ -562,7 +564,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
   // that are in 1/8th pel units
-  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+  set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize),
+                         mi_col, 1 << mi_width_log2(bsize));
 
   mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
   mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
@@ -613,14 +616,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         mbmi->mode = ZEROMV;
       } else {
 #if CONFIG_AB4X4
-        if (mbmi->sb_type >= BLOCK_SIZE_SB8X8)
+        if (bsize >= BLOCK_SIZE_SB8X8)
           mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
         else
           mbmi->mode = SPLITMV;
 #else
-        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
-                                     read_sb_mv_ref(r, mv_ref_p)
-                                   : read_mv_ref(r, mv_ref_p);
+        mbmi->mode = bsize > BLOCK_SIZE_SB8X8 ?
+                                   read_sb_mv_ref(r, mv_ref_p)
+                                 : read_mv_ref(r, mv_ref_p);
 #endif
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
       }
@@ -685,80 +688,87 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
     mbmi->uv_mode = DC_PRED;
     switch (mbmi->mode) {
-      case SPLITMV: {
-        const int num_p = 4;
-        int j = 0;
-
-        mbmi->need_to_clamp_mvs = 0;
-        do {  // for each subset j
-          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
-          int_mv blockmv, secondmv;
-          int mv_contz;
-          int blockmode;
-          int k = j;
-
-          leftmv.as_int = left_block_mv(xd, mi, k);
-          abovemv.as_int = above_block_mv(mi, k, mis);
-          second_leftmv.as_int = 0;
-          second_abovemv.as_int = 0;
-          if (mbmi->second_ref_frame > 0) {
-            second_leftmv.as_int = left_block_second_mv(xd, mi, k);
-            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
-          }
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-          blockmode = read_sub_mv_ref(r, cm->fc.sub_mv_ref_prob[mv_contz]);
-          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
-
-          switch (blockmode) {
-            case NEW4X4:
-              decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
-                        &cm->fc.NMVcount, xd->allow_high_precision_mv);
-
-              if (mbmi->second_ref_frame > 0)
-                decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                          &cm->fc.NMVcount, xd->allow_high_precision_mv);
-
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][3]++;
-#endif
-              break;
-            case LEFT4X4:
-              blockmv.as_int = leftmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_leftmv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][0]++;
-#endif
-              break;
-            case ABOVE4X4:
-              blockmv.as_int = abovemv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_abovemv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][1]++;
-#endif
-              break;
-            case ZERO4X4:
-              blockmv.as_int = 0;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = 0;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][2]++;
+      case SPLITMV:
+#if !CONFIG_AB4X4
+        bw = 1, bh = 1;
 #endif
-              break;
-            default:
-              break;
+        mbmi->need_to_clamp_mvs = 0;
+        for (idy = 0; idy < 2; idy += bh) {
+          for (idx = 0; idx < 2; idx += bw) {
+            int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+            int_mv blockmv, secondmv;
+            int mv_contz;
+            int blockmode;
+            int i, k;
+            j = idy * 2 + idx;
+            k = j;
+
+            leftmv.as_int = left_block_mv(xd, mi, k);
+            abovemv.as_int = above_block_mv(mi, k, mis);
+            second_leftmv.as_int = 0;
+            second_abovemv.as_int = 0;
+            if (mbmi->second_ref_frame > 0) {
+              second_leftmv.as_int = left_block_second_mv(xd, mi, k);
+              second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+            }
+            mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+            blockmode = read_sub_mv_ref(r, cm->fc.sub_mv_ref_prob[mv_contz]);
+            cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
+
+            switch (blockmode) {
+              case NEW4X4:
+                decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+                           &cm->fc.NMVcount, xd->allow_high_precision_mv);
+
+                if (mbmi->second_ref_frame > 0)
+                  decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+                            &cm->fc.NMVcount, xd->allow_high_precision_mv);
+
+  #ifdef VPX_MODE_COUNT
+                vp9_mv_cont_count[mv_contz][3]++;
+  #endif
+                break;
+              case LEFT4X4:
+                blockmv.as_int = leftmv.as_int;
+                if (mbmi->second_ref_frame > 0)
+                  secondmv.as_int = second_leftmv.as_int;
+  #ifdef VPX_MODE_COUNT
+                vp9_mv_cont_count[mv_contz][0]++;
+  #endif
+                break;
+              case ABOVE4X4:
+                blockmv.as_int = abovemv.as_int;
+                if (mbmi->second_ref_frame > 0)
+                  secondmv.as_int = second_abovemv.as_int;
+  #ifdef VPX_MODE_COUNT
+                vp9_mv_cont_count[mv_contz][1]++;
+  #endif
+                break;
+              case ZERO4X4:
+                blockmv.as_int = 0;
+                if (mbmi->second_ref_frame > 0)
+                  secondmv.as_int = 0;
+  #ifdef VPX_MODE_COUNT
+                vp9_mv_cont_count[mv_contz][2]++;
+  #endif
+                break;
+              default:
+                break;
+            }
+            mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+            if (mbmi->second_ref_frame > 0)
+              mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
+
+            for (i = 1; i < bh; ++i)
+              vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j]));
+            for (i = 1; i < bw; ++i)
+              vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j]));
           }
-          mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
-          if (mbmi->second_ref_frame > 0)
-            mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
-        } while (++j < num_p);
-      }
-
-      mv0->as_int = mi->bmi[3].as_mv[0].as_int;
-      mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+        }
 
-      break;  /* done with SPLITMV */
+        mv0->as_int = mi->bmi[3].as_mv[0].as_int;
+        mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+        break;  /* done with SPLITMV */
 
       case NEARMV:
         // Clip "next_nearest" so that it does not extend to far out of image
@@ -822,14 +832,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     mv0->as_int = 0;
 
 #if CONFIG_AB4X4
-    if (mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+    if (bsize >= BLOCK_SIZE_SB8X8) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
       mbmi->mode = I4X4_PRED;
     }
 #else
-    if (mbmi->sb_type > BLOCK_SIZE_SB8X8) {
+    if (bsize > BLOCK_SIZE_SB8X8) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
@@ -840,7 +850,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
     // If MB mode is I4X4_PRED read the block modes
 #if CONFIG_AB4X4
-    if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    if (bsize < BLOCK_SIZE_SB8X8) {
 #else
     if (mbmi->mode == I4X4_PRED) {
 #endif
@@ -857,21 +867,21 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   }
 
 #if CONFIG_AB4X4
-    if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-        mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+      bsize >= BLOCK_SIZE_SB8X8) {
 #else
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != I4X4_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
 #endif
-    const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
-    const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
+    const int allow_16x16 = bsize >= BLOCK_SIZE_MB16X16;
+    const int allow_32x32 = bsize >= BLOCK_SIZE_SB32X32;
     mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
-  } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 &&
+  } else if (bsize >= BLOCK_SIZE_SB32X32 &&
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-             mbmi->sb_type >= BLOCK_SIZE_MB16X16
+             bsize >= BLOCK_SIZE_MB16X16
 #if !CONFIG_AB4X4
       && ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))
@@ -880,7 +890,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 &&
 #if CONFIG_AB4X4
-      (mbmi->sb_type >= BLOCK_SIZE_SB8X8))
+      (bsize >= BLOCK_SIZE_SB8X8))
 #else
       (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
        !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV)))
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b200e6ccceb43bfd1098dfed3bb9988b3f24a43b..e58c579796ca06e385fd0d826295131b1abf27f5 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -413,6 +413,11 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
                            vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &pbi->mb;
 
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+#endif
   set_offsets(pbi, bsize, mi_row, mi_col);
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
   set_refs(pbi, mi_row, mi_col);
@@ -465,6 +470,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
   }
 
   subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
 
   switch (partition) {
     case PARTITION_NONE:
@@ -472,11 +478,13 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
       break;
     case PARTITION_HORZ:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
       if (mi_row + bs < pc->mi_rows)
         decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
       break;
     case PARTITION_VERT:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
       if (mi_col + bs < pc->mi_cols)
         decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
       break;
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 6a55e8fb8157e90134ab4b993ee4e7818c8e4695..9761bd6f8d2066d3003ae5185be53df0e7aaabf4 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -34,7 +34,7 @@
 static void recon_write_yuv_frame(const char *name,
                                   const YV12_BUFFER_CONFIG *s,
                                   int w, int _h) {
-  FILE *yuv_file = fopen((char *)name, "ab");
+  FILE *yuv_file = fopen(name, "ab");
   const uint8_t *src = s->y_buffer;
   int h = _h;
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index bcec13c4bedcafe87ad40e0026169bfe362a9207..b09da88e4706390fb4fda9d4698391c4471cf092 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -696,39 +696,50 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
                         nmvc, xd->allow_high_precision_mv);
         break;
       case SPLITMV: {
-        int j = 0;
-
-        do {
-          B_PREDICTION_MODE blockmode;
-          int_mv blockmv;
-          int k = -1;  /* first block in subset j */
-          int mv_contz;
-          int_mv leftmv, abovemv;
-
-          blockmode = cpi->mb.partition_info->bmi[j].mode;
-          blockmv = cpi->mb.partition_info->bmi[j].mv;
-          k = j;
-          leftmv.as_int = left_block_mv(xd, m, k);
-          abovemv.as_int = above_block_mv(m, k, mis);
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
-          write_sub_mv_ref(bc, blockmode,
-                           cpi->common.fc.sub_mv_ref_prob[mv_contz]);
-          cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-          if (blockmode == NEW4X4) {
+        int j;
+        B_PREDICTION_MODE blockmode;
+        int_mv blockmv;
+        int k = -1;  /* first block in subset j */
+        int mv_contz;
+        int_mv leftmv, abovemv;
+        int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
+        int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+        int idx, idy;
+#if !CONFIG_AB4X4
+        bw = 1, bh = 1;
+#endif
+        for (idy = 0; idy < 2; idy += bh) {
+          for (idx = 0; idx < 2; idx += bw) {
+            j = idy * 2 + idx;
+            blockmode = cpi->mb.partition_info->bmi[j].mode;
+            blockmv = cpi->mb.partition_info->bmi[j].mv;
+            k = j;
+            leftmv.as_int = left_block_mv(xd, m, k);
+            abovemv.as_int = above_block_mv(m, k, mis);
+            mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+
+            write_sub_mv_ref(bc, blockmode,
+                             cpi->common.fc.sub_mv_ref_prob[mv_contz]);
+            cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+            if (blockmode == NEW4X4) {
 #ifdef ENTROPY_STATS
-            active_section = 11;
+              active_section = 11;
 #endif
-            vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
-                          nmvc, xd->allow_high_precision_mv);
-
-            if (mi->second_ref_frame > 0)
-              vp9_encode_mv(bc,
-                            &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                            &mi->best_second_mv.as_mv,
+              vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
                             nmvc, xd->allow_high_precision_mv);
+
+              if (mi->second_ref_frame > 0)
+                vp9_encode_mv(bc,
+                              &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                              &mi->best_second_mv.as_mv,
+                              nmvc, xd->allow_high_precision_mv);
+            }
           }
-        } while (++j < cpi->mb.partition_info->count);
+        }
+
+#ifdef MODE_STATS
+        ++count_mb_seg[mi->partitioning];
+#endif
         break;
       }
       default:
@@ -837,6 +848,11 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+#endif
   xd->mode_info_context = m;
   set_mi_row_col(&cpi->common, xd, mi_row,
                  1 << mi_height_log2(m->mbmi.sb_type),
@@ -891,7 +907,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
 
 #if CONFIG_AB4X4
   if (bsize < BLOCK_SIZE_SB8X8)
-    if (xd->ab_index != 0)
+    if (xd->ab_index > 0)
       return;
 #endif
 
@@ -910,6 +926,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
   }
 
   subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
 
   switch (partition) {
     case PARTITION_NONE:
@@ -917,11 +934,13 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
       break;
     case PARTITION_HORZ:
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
       if ((mi_row + bs) < cm->mi_rows)
         write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col);
       break;
     case PARTITION_VERT:
       write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
       if ((mi_col + bs) < cm->mi_cols)
         write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs);
       break;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 27f693d64585cb119a573310ba4dd7137a7c272e..3e108c8fcfd9e4d96855d6d10bc9f9f372877da9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -786,6 +786,12 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
 
   if (sub_index != -1)
     *(get_sb_index(xd, bsize)) = sub_index;
+
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+#endif
   set_offsets(cpi, mi_row, mi_col, bsize);
   update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
   encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
@@ -828,13 +834,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
 
   if (bsl == bwl && bsl == bhl) {
 #if CONFIG_AB4X4
-    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) {
-      if (bsize > BLOCK_SIZE_SB8X8 ||
-          (bsize == BLOCK_SIZE_SB8X8 && c1 == bsize))
+    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
         cpi->partition_count[pl][PARTITION_NONE]++;
-      else
-        cpi->partition_count[pl][PARTITION_SPLIT]++;
-    }
 #else
     if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
@@ -909,7 +910,6 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
       return;
     }
 #endif
-
   assert(mi_height_log2(bsize) == mi_width_log2(bsize));
 
   // buffer the above/left context information of the block in search.
@@ -939,7 +939,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     for (i = 0; i < 4; ++i) {
       int x_idx = (i & 1) * (ms >> 1);
       int y_idx = (i >> 1) * (ms >> 1);
-      int r, d;
+      int r = 0, d = 0;
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
@@ -966,10 +966,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
-  // TODO(jingning): need to enable 4x8 and 8x4 partition coding
   // PARTITION_HORZ
   if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+#if CONFIG_AB4X4
+      (bsize >= BLOCK_SIZE_SB8X8)) {
+#else
       (bsize >= BLOCK_SIZE_MB16X16)) {
+#endif
     int r2, d2;
     int mb_skip = 0;
     subsize = get_subsize(bsize, PARTITION_HORZ);
@@ -978,7 +981,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
                   get_block_context(x, subsize));
 
     if (mi_row + ms <= cm->mi_rows) {
-      int r, d;
+      int r = 0, d = 0;
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
       *(get_sb_index(xd, subsize)) = 1;
@@ -992,8 +995,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     }
     set_partition_seg_context(cm, xd, mi_row, mi_col);
     pl = partition_plane_context(xd, bsize);
+#if CONFIG_AB4X4
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_HORZ];
+#else
     r2 += x->partition_cost[pl][PARTITION_HORZ];
-
+#endif
     if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
          RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
       srate = r2;
@@ -1005,7 +1012,11 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
 
   // PARTITION_VERT
   if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+#if CONFIG_AB4X4
+      (bsize >= BLOCK_SIZE_SB8X8)) {
+#else
       (bsize >= BLOCK_SIZE_MB16X16)) {
+#endif
     int r2, d2;
     int mb_skip = 0;
     subsize = get_subsize(bsize, PARTITION_VERT);
@@ -1013,7 +1024,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
                   get_block_context(x, subsize));
     if (mi_col + ms <= cm->mi_cols) {
-      int r, d;
+      int r = 0, d = 0;
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
       *(get_sb_index(xd, subsize)) = 1;
@@ -1027,8 +1038,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
     }
     set_partition_seg_context(cm, xd, mi_row, mi_col);
     pl = partition_plane_context(xd, bsize);
+#if CONFIG_AB4X4
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_VERT];
+#else
     r2 += x->partition_cost[pl][PARTITION_VERT];
-
+#endif
     if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
          RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
       srate = r2;
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index d180e46cf88015103a8fa036e6168f1ca5e51276..e2cd8838c31930f8847af13c7b29e404edeb3c62 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -573,7 +573,11 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x,
   int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl;
   int idx, idy;
 
+#if CONFIG_AB4X4
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+#else
   if (mbmi->mode == SPLITMV) {
+#endif
     int i;
     PARTITION_INFO *pi = x->partition_info;
 #if !CONFIG_AB4X4
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f928e7afe86be512c8b5a585675e45c580076340..ff437c18e99a34538616acf6858bb436426803d0 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -910,6 +910,11 @@ static int labels2mode(MACROBLOCK *x,
   MB_MODE_INFO * mbmi = &mic->mbmi;
   const int mis = xd->mode_info_stride;
   int i, cost = 0, thismvcost = 0;
+#if CONFIG_AB4X4
+  int idx, idy;
+  int bw = 1 << b_width_log2(mbmi->sb_type);
+  int bh = 1 << b_height_log2(mbmi->sb_type);
+#endif
 
   /* We have to be careful retrieving previously-encoded motion vectors.
    Ones from this macroblock have to be pulled from the BLOCKD array
@@ -993,6 +998,17 @@ static int labels2mode(MACROBLOCK *x,
     x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
     if (mbmi->second_ref_frame > 0)
       x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+#if CONFIG_AB4X4
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                   &mic->bmi[i], sizeof(mic->bmi[i]));
+        vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+      }
+    }
+#endif
   }
 
   cost += thismvcost;
@@ -1007,8 +1023,15 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i;
+  int i, k;
   MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
+#if !CONFIG_AB4X4
+  bw = 1, bh = 1;
+#endif
 
   *labelyrate = 0;
   *distortion = 0;
@@ -1018,10 +1041,10 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       uint8_t* const src =
       raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
                                 x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
+      int16_t* src_diff =
       raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
                                 x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+      int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
       uint8_t* const pre =
       raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
                                 xd->plane[0].pre[0].buf,
@@ -1030,7 +1053,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
                                 xd->plane[0].dst.buf,
                                 xd->plane[0].dst.stride);
-      int thisdistortion;
+      int thisdistortion = 0;
+      int thisrate = 0;
 
       vp9_build_inter_predictor(pre,
                                 xd->plane[0].pre[0].stride,
@@ -1038,7 +1062,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                 xd->plane[0].dst.stride,
                                 &xd->mode_info_context->bmi[i].as_mv[0],
                                 &xd->scale_factor[0],
-                                4, 4, 0 /* no avg */, &xd->subpix);
+                                4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
 
       // TODO(debargha): Make this work properly with the
       // implicit-compoundinter-weight experiment when implicit
@@ -1051,22 +1075,33 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
         vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
                                   dst, xd->plane[0].dst.stride,
                                   &xd->mode_info_context->bmi[i].as_mv[1],
-                                  &xd->scale_factor[1], 4, 4, 1,
+                                  &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
                                   &xd->subpix);
       }
 
-      vp9_subtract_block(4, 4, src_diff, 8,
+      vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
                          src, src_stride,
                          dst, xd->plane[0].dst.stride);
-      x->fwd_txm4x4(src_diff, coeff, 16);
-      x->quantize_b_4x4(x, i, DCT_DCT, 16);
-      thisdistortion = vp9_block_error(coeff,
-                                       BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                    i, 16), 16);
+
+      k = i;
+      for (idy = 0; idy < bh; ++idy) {
+        for (idx = 0; idx < bw; ++idx) {
+          k += (idy * 2 + idx);
+          src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                               x->plane[0].src_diff);
+          coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+          x->fwd_txm4x4(src_diff, coeff, 16);
+          x->quantize_b_4x4(x, k, DCT_DCT, 16);
+          thisdistortion += vp9_block_error(coeff,
+                                            BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                         k, 16), 16);
+          thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                                  ta + (k & 1),
+                                  tl + (k >> 1), TX_4X4, 16);
+        }
+      }
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + (i & 1),
-                                 tl + (i >> 1), TX_4X4, 16);
+      *labelyrate += thisrate;
     }
   }
   *distortion >>= 2;
@@ -1155,15 +1190,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
   int best_eobs[4] = { 0 };
-#if CONFIG_AB4X4
   BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
-  int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-#endif
-
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
 
-  ENTROPY_CONTEXT t_above[2], t_left[2];
-  ENTROPY_CONTEXT t_above_b[2], t_left_b[2];
+  ENTROPY_CONTEXT t_above[4], t_left[4];
+  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
+
+#if !CONFIG_AB4X4
+  bh = 1, bw = 1;
+#endif
 
   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
@@ -1181,183 +1219,367 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
+#if !CONFIG_AB4X4
   rate += vp9_cost_mv_ref(cpi, SPLITMV,
                           mbmi->mb_mode_context[mbmi->ref_frame]);
   this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
   br += rate;
+#endif
   other_segment_rd = this_segment_rd;
 
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
-
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT t_above_s[2], t_left_s[2];
-
-      vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
-      vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
-
-      // motion search for newmv (single predictor case only)
-      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        const struct buf_2d orig_src = x->plane[0].src;
-        const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
-
-        /* Is the best so far sufficiently good that we cant justify doing
-         * and new motion search. */
-        if (best_label_rd < label_mv_thresh)
-          break;
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding
+#if CONFIG_AB4X4
+      int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+      int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+      B_PREDICTION_MODE mode_selected = ZERO4X4;
+      int bestlabelyrate = 0;
+      i = idy * 2 + idx;
+
+      // search for the best motion vector on this segment
+      for (this_mode = LEFT4X4; this_mode <= NEW4X4; ++this_mode) {
+        int64_t this_rd;
+        int distortion;
+        int labelyrate;
+        ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+
+        vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
+        vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
+
+        // motion search for newmv (single predictor case only)
+        if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
+          int sseshift, n;
+          int step_param = 0;
+          int further_steps;
+          int thissme, bestsme = INT_MAX;
+          const struct buf_2d orig_src = x->plane[0].src;
+          const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search? */
+          if (best_label_rd < label_mv_thresh)
+            break;
 
-        if (cpi->compressor_speed) {
-          // use previous block's result as next block's MV predictor.
-          if (i > 0) {
-            bsi->mvp.as_int =
-            x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
-            if (i == 2)
+          if (cpi->compressor_speed) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
               bsi->mvp.as_int =
-              x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
-            step_param = 2;
+              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int =
+                x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+              step_param = 2;
+            }
           }
-        }
 
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-          // find first label
-          n = i;
-
-          // adjust src pointer for this segment
-          x->plane[0].src.buf =
-          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                    x->plane[0].src.buf,
-                                    x->plane[0].src.stride);
-          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
-          x->e_mbd.plane[0].pre[0].buf =
-          raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
-                                    x->e_mbd.plane[0].pre[0].buf,
-                                    x->e_mbd.plane[0].pre[0].stride);
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                           sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
-
-          sseshift = 0;
-
-          // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv,
-                                           n);
-
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              mode_mv[NEW4X4].as_int =
-              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-            } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-              mode_mv[NEW4X4].as_int;
+          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+          {
+            int sadpb = x->sadperbit4;
+            int_mv mvp_full;
+
+            mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+            mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+            // find first label
+            n = i;
+
+            // adjust src pointer for this segment
+            x->plane[0].src.buf =
+            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                      x->plane[0].src.buf,
+                                      x->plane[0].src.stride);
+            assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
+            x->e_mbd.plane[0].pre[0].buf =
+            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                      x->e_mbd.plane[0].pre[0].buf,
+                                      x->e_mbd.plane[0].pre[0].stride);
+
+            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                             sadpb, further_steps, 0, v_fn_ptr,
+                                             bsi->ref_mv, &mode_mv[NEW4X4]);
+
+            sseshift = 0;
+
+            // Should we do a full search (best quality only)
+            if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+              /* Check if mvp_full is within the range. */
+              clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                       x->mv_row_min, x->mv_row_max);
+
+              thissme = cpi->full_search_sad(x, &mvp_full,
+                                             sadpb, 16, v_fn_ptr,
+                                             x->nmvjointcost, x->mvcost,
+                                             bsi->ref_mv,
+                                             n);
+
+              if (thissme < bestsme) {
+                bestsme = thissme;
+                mode_mv[NEW4X4].as_int =
+                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
+              } else {
+                /* The full search result is actually worse so re-instate the
+                 * previous best vector */
+                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
+                mode_mv[NEW4X4].as_int;
+              }
             }
           }
-        }
 
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       x->nmvjointcost, x->mvcost,
-                                       &distortion, &sse);
+          if (bestsme < INT_MAX) {
+            int distortion;
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
+                                         bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                         x->nmvjointcost, x->mvcost,
+                                         &distortion, &sse);
 
-          // safe motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+            // save motion search result for use in compound prediction
+            seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+          }
+
+          // restore src pointers
+          x->plane[0].src = orig_src;
+          x->e_mbd.plane[0].pre[0] = orig_pre;
+        } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
+          /* NEW4X4 */
+          /* motion search not completed? Then skip newmv for this block with
+           * comppred */
+          if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+            continue;
+          }
         }
 
-        // restore src pointers
-        x->plane[0].src = orig_src;
-        x->e_mbd.plane[0].pre[0] = orig_pre;
-      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
-        /* NEW4X4 */
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+        rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                           &second_mode_mv[this_mode], seg_mvs[i],
+                           bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                           x->mvcost, cpi);
+
+        // Trap vectors that reach beyond the UMV borders
+        if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+            ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
           continue;
         }
-      }
+        if (mbmi->second_ref_frame > 0 &&
+            mv_check_bounds(x, &second_mode_mv[this_mode]))
+          continue;
 
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                         x->mvcost, cpi);
+        this_rd = encode_inter_mb_segment(&cpi->common,
+                                          x, labels, i, &labelyrate,
+                                          &distortion, t_above_s, t_left_s);
+        this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+        rate += labelyrate;
+
+        if (this_rd < best_label_rd) {
+          sbr = rate;
+          sbd = distortion;
+          bestlabelyrate = labelyrate;
+          mode_selected = this_mode;
+          best_label_rd = this_rd;
+          for (j = 0; j < 4; j++)
+            if (labels[j] == i)
+              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
+
+          vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
+          vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
+        }
+      } /*for each 4x4 mode*/
 
-      // Trap vectors that reach beyond the UMV borders
-      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
-          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
-        continue;
-      }
-      if (mbmi->second_ref_frame > 0 &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
+      vpx_memcpy(t_above, t_above_b, sizeof(t_above));
+      vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      this_rd = encode_inter_mb_segment(&cpi->common,
-                                        x, labels, i, &labelyrate,
-                                        &distortion, t_above_s, t_left_s);
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
-
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        for (j = 0; j < 4; j++)
-          if (labels[j] == i)
-            best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-
-        vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
-        vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
-      }
-    } /*for each 4x4 mode*/
+      labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                  &second_mode_mv[mode_selected], seg_mvs[i],
+                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  x->mvcost, cpi);
+#else
+      int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+      int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+      B_PREDICTION_MODE mode_selected = ZERO4X4;
+      int bestlabelyrate = 0;
+      i = idy * 2 + idx;
+
+      // search for the best motion vector on this segment
+      for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
+        int64_t this_rd;
+        int distortion;
+        int labelyrate;
+        ENTROPY_CONTEXT t_above_s[2], t_left_s[2];
+
+        vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
+        vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
+
+        // motion search for newmv (single predictor case only)
+        if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
+          int sseshift, n;
+          int step_param = 0;
+          int further_steps;
+          int thissme, bestsme = INT_MAX;
+          const struct buf_2d orig_src = x->plane[0].src;
+          const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search? */
+          if (best_label_rd < label_mv_thresh)
+            break;
 
-    vpx_memcpy(t_above, t_above_b, sizeof(t_above));
-    vpx_memcpy(t_left, t_left_b, sizeof(t_left));
+          if (cpi->compressor_speed) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
+              bsi->mvp.as_int =
+              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int =
+                x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+              step_param = 2;
+            }
+          }
+
+          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+          {
+            int sadpb = x->sadperbit4;
+            int_mv mvp_full;
+
+            mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+            mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+            // find first label
+            n = i;
+
+            // adjust src pointer for this segment
+            x->plane[0].src.buf =
+            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                      x->plane[0].src.buf,
+                                      x->plane[0].src.stride);
+            assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
+            x->e_mbd.plane[0].pre[0].buf =
+            raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, n,
+                                      x->e_mbd.plane[0].pre[0].buf,
+                                      x->e_mbd.plane[0].pre[0].stride);
 
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                x->mvcost, cpi);
+            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                             sadpb, further_steps, 0, v_fn_ptr,
+                                             bsi->ref_mv, &mode_mv[NEW4X4]);
+
+            sseshift = 0;
+
+            // Should we do a full search (best quality only)
+            if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+              /* Check if mvp_full is within the range. */
+              clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                       x->mv_row_min, x->mv_row_max);
+
+              thissme = cpi->full_search_sad(x, &mvp_full,
+                                             sadpb, 16, v_fn_ptr,
+                                             x->nmvjointcost, x->mvcost,
+                                             bsi->ref_mv,
+                                             n);
+
+              if (thissme < bestsme) {
+                bestsme = thissme;
+                mode_mv[NEW4X4].as_int =
+                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
+              } else {
+                /* The full search result is actually worse so re-instate the
+                 * previous best vector */
+                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
+                mode_mv[NEW4X4].as_int;
+              }
+            }
+          }
+
+          if (bestsme < INT_MAX) {
+            int distortion;
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
+                                         bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                         x->nmvjointcost, x->mvcost,
+                                         &distortion, &sse);
+
+            // save motion search result for use in compound prediction
+            seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+          }
+
+          // restore src pointers
+          x->plane[0].src = orig_src;
+          x->e_mbd.plane[0].pre[0] = orig_pre;
+        } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
+          /* NEW4X4 */
+          /* motion search not completed? Then skip newmv for this block with
+           * comppred */
+          if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+            continue;
+          }
+        }
+
+        rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                           &second_mode_mv[this_mode], seg_mvs[i],
+                           bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                           x->mvcost, cpi);
+
+        // Trap vectors that reach beyond the UMV borders
+        if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+            ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+          continue;
+        }
+        if (mbmi->second_ref_frame > 0 &&
+            mv_check_bounds(x, &second_mode_mv[this_mode]))
+          continue;
 
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
+        this_rd = encode_inter_mb_segment(&cpi->common,
+                                          x, labels, i, &labelyrate,
+                                          &distortion, t_above_s, t_left_s);
+        this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+        rate += labelyrate;
+
+        if (this_rd < best_label_rd) {
+          sbr = rate;
+          sbd = distortion;
+          bestlabelyrate = labelyrate;
+          mode_selected = this_mode;
+          best_label_rd = this_rd;
+          for (j = 0; j < 4; j++)
+            if (labels[j] == i)
+              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
+
+          vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
+          vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
+        }
+      } /*for each 4x4 mode*/
+
+      vpx_memcpy(t_above, t_above_b, sizeof(t_above));
+      vpx_memcpy(t_left, t_left_b, sizeof(t_left));
+
+      labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                  &second_mode_mv[mode_selected], seg_mvs[i],
+                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  x->mvcost, cpi);
+#endif
+
+      br += sbr;
+      bd += sbd;
+      segmentyrate += bestlabelyrate;
+      this_segment_rd += best_label_rd;
+      other_segment_rd += best_other_rd;
+
+      for (j = 1; j < bh; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j * 2],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+      for (j = 1; j < bw; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+    }
   } /* for each label */
 
   if (this_segment_rd < bsi->segment_rd) {
@@ -2504,12 +2726,23 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
+    int i;
+
+    for (i = 0; i < NB_TXFM_MODES; ++i)
+      txfm_cache[i] = INT64_MAX;
 
     // Test best rd so far against threshold for trying this mode.
+#if CONFIG_AB4X4
+    if (bsize >= BLOCK_SIZE_SB8X8 &&
+        (best_rd < cpi->rd_threshes[mode_index] ||
+         cpi->rd_threshes[mode_index] == INT_MAX))
+      continue;
+#else
     if (best_rd <= cpi->rd_threshes[mode_index] ||
         cpi->rd_threshes[mode_index] == INT_MAX) {
       continue;
     }
+#endif
 
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
@@ -2520,7 +2753,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       continue;
     }
 
+#if CONFIG_AB4X4
+    if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
+#else
     if (cpi->speed > 0) {
+#endif
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
@@ -2652,6 +2889,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       distortion2 += dist_uv[TX_4X4];
       distortion_uv = dist_uv[TX_4X4];
       mbmi->uv_mode = mode_uv[TX_4X4];
+#if CONFIG_AB4X4
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+#endif
     } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       vp9_build_intra_predictors_sby_s(xd, bsize);
@@ -2785,6 +3027,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;
 
+#if CONFIG_AB4X4
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+#endif
+
       if (!mode_excluded) {
         if (is_comp_pred)
           mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
@@ -2855,7 +3103,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       // Is Mb level skip allowed (i.e. not coded at segment level).
       mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
+#if CONFIG_AB4X4
+      if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
+#else
       if (skippable) {
+#endif
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
         // for best_yrd calculation
@@ -3001,12 +3253,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
+        int64_t adj_rd = INT64_MAX;
         if (this_mode != I4X4_PRED) {
           adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
         } else {
           adj_rd = this_rd;
         }
+
         if (adj_rd < best_txfm_rd[i])
           best_txfm_rd[i] = adj_rd;
       }
@@ -3073,7 +3326,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
       cpi->is_src_frame_alt_ref &&
       (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)
+#if CONFIG_AB4X4
+      && bsize >= BLOCK_SIZE_SB8X8
+#endif
+     ) {
     mbmi->mode = ZEROMV;
     mbmi->ref_frame = ALTREF_FRAME;
     mbmi->second_ref_frame = NONE;