From 8b2f57d0b8da5a51e4579da6baa3e7bf4ea40b5b Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Mon, 6 Aug 2012 10:51:20 -0700
Subject: [PATCH] a new way of determining reference motion vector

Use the reconstructed pixels immediately above and to the left of the current
macroblock to select, by SAD, the best-matching candidate motion vector to use
as the reference motion vector for MV encoding.

Test results:
         AVGPSNR  GLBPSNR  VPXSSIM
Derf:    1.107%   1.062%   0.992%
Std-hd:  1.209%   1.176%   1.029%

Change-Id: I8f10e09ee6538c05df2fb9f069abcaf1edb3fca6
---
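Notes (not part of the commit message): a minimal sketch of the selection step
that vp8_find_best_ref_mvs() below implements. It assumes the int_mv type, the
vp8_sad16x2_c/vp8_sad2x16_c helpers added by this patch, and <limits.h>; the
pick_ref_mv() name is hypothetical, and MV clamping and precision lowering are
omitted for brevity.

    /* Score each non-zero candidate MV by the SAD between the two reconstructed
     * rows above / two columns left of the current MB and the same border in the
     * reference frame displaced by the candidate; keep the lowest score. */
    static int_mv pick_ref_mv(const int_mv *cand, int n,
                              const unsigned char *above_src,
                              const unsigned char *left_src, int src_stride,
                              const unsigned char *above_ref,
                              const unsigned char *left_ref, int ref_stride) {
      int_mv best;
      int i;
      /* Baseline: the zero MV, i.e. the co-located border in the reference. */
      int best_sad =
          vp8_sad16x2_c(above_src, src_stride, above_ref, ref_stride, INT_MAX) +
          vp8_sad2x16_c(left_src, src_stride, left_ref, ref_stride, INT_MAX);
      best.as_int = 0;
      for (i = 0; i < n; i++) {
        int row, col, off, sad;
        if (!cand[i].as_int) continue;
        /* 1/8-pel MV -> nearest full-pel offset, ties rounded toward zero. */
        row = cand[i].as_mv.row > 0 ? (cand[i].as_mv.row + 3) >> 3
                                    : (cand[i].as_mv.row + 4) >> 3;
        col = cand[i].as_mv.col > 0 ? (cand[i].as_mv.col + 3) >> 3
                                    : (cand[i].as_mv.col + 4) >> 3;
        off = row * ref_stride + col;
        sad = vp8_sad16x2_c(above_src, src_stride, above_ref + off, ref_stride,
                            INT_MAX) +
              vp8_sad2x16_c(left_src, src_stride, left_ref + off, ref_stride,
                            INT_MAX);
        if (sad < best_sad) {
          best_sad = sad;
          best.as_int = cand[i].as_int;
        }
      }
      return best;
    }
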
 configure                 |   1 +
 vp8/common/blockd.h       |   6 +
 vp8/common/findnearmv.c   | 139 ++++++++++++++++++---
 vp8/common/findnearmv.h   |  47 +++----
 vp8/decoder/decodemv.c    |  80 ++++++++++--
 vp8/decoder/decodframe.c  |   9 +-
 vp8/decoder/onyxd_if.c    |   2 +-
 vp8/encoder/bitstream.c   |   9 +-
 vp8/encoder/encodeframe.c |  22 ++--
 vp8/encoder/onyx_int.h    |   2 +-
 vp8/encoder/rdopt.c       |  41 ++++++-
 vp8/encoder/sad_c.c       |  19 ++-
 vp8/encoder/tokenize.c    | 252 +++++++++++++++++++++++---------------
 13 files changed, 464 insertions(+), 165 deletions(-)

diff --git a/configure b/configure
index b135874bde..2593a0e60c 100755
--- a/configure
+++ b/configure
@@ -226,6 +226,7 @@ EXPERIMENT_LIST="
     hybridtransform8x8
     switchable_interp
     tx16x16
+    newbestrefmv
 "
 CONFIG_LIST="
     external_build
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 2932fd4973..a0fe46cfa8 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -267,6 +267,9 @@ typedef struct {
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   TX_SIZE txfm_size;
   int_mv mv, second_mv;
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv, second_ref_mv;
+#endif
   unsigned char partitioning;
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
@@ -423,6 +426,9 @@ typedef struct MacroBlockD {
 #endif
 
   int mb_index;   // Index of the MB in the SB (0..3)
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv[4];
+#endif
 
 #if CONFIG_HYBRIDTRANSFORM
   int q_index;
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index d35e2c4d48..303893d9db 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -10,6 +10,7 @@
 
 
 #include "findnearmv.h"
+#include <limits.h>
 
 const unsigned char vp8_mbsplit_offset[4][16] = {
   { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
@@ -18,6 +19,15 @@ const unsigned char vp8_mbsplit_offset[4][16] = {
   { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
 };
 
+static void lower_mv_precision(int_mv *mv)
+{
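+  /* Drop the 1/8-pel bit by moving odd MV components one unit toward zero. */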
+  if (mv->as_mv.row & 1)
+    mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
+  if (mv->as_mv.col & 1)
+    mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
+}
+
+
 /* Predict motion vectors using those from already-decoded nearby blocks.
    Note that we only consider one 4x4 subblock from each candidate 16x16
    macroblock.   */
@@ -32,8 +42,7 @@ void vp8_find_near_mvs
   int_mv *best_mv,
   int cnt[4],
   int refframe,
-  int *ref_frame_sign_bias
-) {
+  int *ref_frame_sign_bias) {
   const MODE_INFO *above = here - xd->mode_info_stride;
   const MODE_INFO *left = here - 1;
   const MODE_INFO *aboveleft = above - 1;
@@ -43,16 +52,30 @@ void vp8_find_near_mvs
   int             *cntx = cnt;
   enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
 
+#if CONFIG_NEWBESTREFMV
+  int_mv          *ref_mv = xd->ref_mv;
+#endif
+
   /* Zero accumulators */
   mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
   cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+#if CONFIG_NEWBESTREFMV
+  ref_mv[0].as_int = ref_mv[1].as_int
+                   = ref_mv[2].as_int
+                   = ref_mv[3].as_int
+                   = 0;
+#endif
 
   /* Process above */
   if (above->mbmi.ref_frame != INTRA_FRAME) {
     if (above->mbmi.mv.as_int) {
-      (++mv)->as_int = above->mbmi.mv.as_int;
+      ++mv;
+      mv->as_int = above->mbmi.mv.as_int;
       mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
               refframe, mv, ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+      ref_mv[0].as_int = mv->as_int;
+#endif
       ++cntx;
     }
     *cntx += 2;
@@ -65,10 +88,13 @@ void vp8_find_near_mvs
       this_mv.as_int = left->mbmi.mv.as_int;
       mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
               refframe, &this_mv, ref_frame_sign_bias);
-
+#if CONFIG_NEWBESTREFMV
+      ref_mv[1].as_int = this_mv.as_int;
+#endif
       if (this_mv.as_int != mv->as_int) {
-        (++mv)->as_int = this_mv.as_int;
-        ++cntx;
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
       }
       *cntx += 2;
     } else
@@ -79,9 +105,21 @@ void vp8_find_near_mvs
       (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
     if (aboveleft->mbmi.mv.as_int) {
       third = aboveleft;
+#if CONFIG_NEWBESTREFMV
+      ref_mv[2].as_int = aboveleft->mbmi.mv.as_int;
+      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+              refframe, (ref_mv+2), ref_frame_sign_bias);
+#endif
     } else if (lf_here->mbmi.mv.as_int) {
       third = lf_here;
     }
+#if CONFIG_NEWBESTREFMV
+    if (lf_here->mbmi.mv.as_int) {
+      ref_mv[3].as_int = lf_here->mbmi.mv.as_int;
+      mv_bias(ref_frame_sign_bias[lf_here->mbmi.ref_frame],
+              refframe, (ref_mv+3), ref_frame_sign_bias);
+    }
+#endif
     if (third) {
       int_mv this_mv;
       this_mv.as_int = third->mbmi.mv.as_int;
@@ -89,8 +127,9 @@ void vp8_find_near_mvs
               refframe, &this_mv, ref_frame_sign_bias);
 
       if (this_mv.as_int != mv->as_int) {
-        (++mv)->as_int = this_mv.as_int;
-        ++cntx;
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
       }
       *cntx += 1;
     } else
@@ -134,18 +173,9 @@ void vp8_find_near_mvs
    * is not being used, by truncating the last bit towards 0
    */
   if (!xd->allow_high_precision_mv) {
-    if (best_mv->as_mv.row & 1)
-      best_mv->as_mv.row += (best_mv->as_mv.row > 0 ? -1 : 1);
-    if (best_mv->as_mv.col & 1)
-      best_mv->as_mv.col += (best_mv->as_mv.col > 0 ? -1 : 1);
-    if (nearest->as_mv.row & 1)
-      nearest->as_mv.row += (nearest->as_mv.row > 0 ? -1 : 1);
-    if (nearest->as_mv.col & 1)
-      nearest->as_mv.col += (nearest->as_mv.col > 0 ? -1 : 1);
-    if (nearby->as_mv.row & 1)
-      nearby->as_mv.row += (nearby->as_mv.row > 0 ? -1 : 1);
-    if (nearby->as_mv.col & 1)
-      nearby->as_mv.col += (nearby->as_mv.col > 0 ? -1 : 1);
+    lower_mv_precision(best_mv);
+    lower_mv_precision(nearest);
+    lower_mv_precision(nearby);
   }
 
   // TODO: move clamp outside findnearmv
@@ -163,3 +193,72 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc,
   p[3] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[3]] [3];
   return p;
 }
+
+#if CONFIG_NEWBESTREFMV
+/* Check a list of candidate motion vectors by SAD score, using rows of
+ * reconstructed pixels above and columns of reconstructed pixels to the left,
+ * and select the one with the best score as the reference motion vector.
+ */
+void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *best_mv) {
+  int_mv *ref_mv = xd->ref_mv;
+  int bestsad = INT_MAX;
+  int i;
+  unsigned char *above_src;
+  unsigned char *left_src;
+  unsigned char *above_ref;
+  unsigned char *left_ref;
+  int sad;
+
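+  /* Point at the two reconstructed rows above and the two columns to the left
+   * of the current macroblock, in both the current and the reference frame. */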
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+  left_src  = xd->dst.y_buffer - 2;
+  above_ref = ref_y_buffer - ref_y_stride * 2;
+  left_ref  = ref_y_buffer - 2;
+
+  bestsad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+                          above_ref, ref_y_stride,
+                          INT_MAX);
+  bestsad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+                           left_ref, ref_y_stride,
+                           INT_MAX);
+  best_mv->as_int = 0;
+
+  for (i = 0; i < 4; ++i) {
+    if (ref_mv[i].as_int) {
+      int_mv this_mv;
+      int offset = 0;
+      int row_offset, col_offset;
+      this_mv.as_int = ref_mv[i].as_int;
+      vp8_clamp_mv(&this_mv,
+                   xd->mb_to_left_edge - LEFT_TOP_MARGIN + 16,
+                   xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+                   xd->mb_to_top_edge - LEFT_TOP_MARGIN + 16,
+                   xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+
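+      /* Round the 1/8-pel candidate MV to the nearest whole pel (ties toward
+       * zero) to form a pixel offset into the reference frame border. */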
+      row_offset = (this_mv.as_mv.row > 0) ?
+        ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
+      col_offset = (this_mv.as_mv.col > 0) ?
+        ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
+      offset = ref_y_stride * row_offset + col_offset;
+
+      sad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+                          above_ref + offset, ref_y_stride, INT_MAX);
+
+      sad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+                           left_ref + offset, ref_y_stride, INT_MAX);
+
+      if (sad < bestsad) {
+        bestsad = sad;
+        best_mv->as_int = this_mv.as_int;
+      }
+    }
+  }
+  if (!xd->allow_high_precision_mv)
+    lower_mv_precision(best_mv);
+
+  vp8_clamp_mv2(best_mv, xd);
+}
+
+#endif
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index d4769e6085..3bb2024c23 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -33,20 +33,14 @@ static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, co
 
 #define LEFT_TOP_MARGIN (16 << 3)
 #define RIGHT_BOTTOM_MARGIN (16 << 3)
-static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
-  if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-    mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-  else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-    mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-  if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-    mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-    mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
 
-static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
-                         int mb_to_top_edge, int mb_to_bottom_edge) {
+
+
+static void vp8_clamp_mv(int_mv *mv,
+                         int mb_to_left_edge,
+                         int mb_to_right_edge,
+                         int mb_to_top_edge,
+                         int mb_to_bottom_edge) {
   mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
                   mb_to_left_edge : mv->as_mv.col;
   mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
@@ -56,15 +50,26 @@ static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
   mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
                   mb_to_bottom_edge : mv->as_mv.row;
 }
-static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
-                                        int mb_to_right_edge, int mb_to_top_edge,
+
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  vp8_clamp_mv(mv,
+              xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+              xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+              xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+              xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+
+
+static unsigned int vp8_check_mv_bounds(int_mv *mv,
+                                        int mb_to_left_edge,
+                                        int mb_to_right_edge,
+                                        int mb_to_top_edge,
                                         int mb_to_bottom_edge) {
-  unsigned int need_to_clamp;
-  need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
-  return need_to_clamp;
+  return (mv->as_mv.col < mb_to_left_edge) ||
+         (mv->as_mv.col > mb_to_right_edge) ||
+         (mv->as_mv.row < mb_to_top_edge) ||
+         (mv->as_mv.row > mb_to_bottom_edge);
 }
 
 void vp8_find_near_mvs
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 5a11f646c9..2e0049df07 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -619,10 +619,44 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     int_mv nearest_second, nearby_second, best_mv_second;
     vp8_prob mv_ref_p [VP8_MVREFS - 1];
 
+#if CONFIG_NEWBESTREFMV
+    int recon_y_stride, recon_yoffset;
+    int recon_uv_stride, recon_uvoffset;
+#endif
+
     vp8_find_near_mvs(xd, mi,
                       prev_mi,
                       &nearest, &nearby, &best_mv, rct,
-                      mbmi->ref_frame, pbi->common.ref_frame_sign_bias);
+                      mbmi->ref_frame, cm->ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+    {
+      int ref_fb_idx;
+
+      /* Select the appropriate reference frame for this MB */
+      if (mbmi->ref_frame == LAST_FRAME)
+        ref_fb_idx = cm->lst_fb_idx;
+      else if (mbmi->ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = cm->gld_fb_idx;
+      else
+        ref_fb_idx = cm->alt_fb_idx;
+
+      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      vp8_find_best_ref_mvs(xd,
+                            xd->pre.y_buffer,
+                            recon_y_stride,
+                            &best_mv);
+    }
+#endif
+
     vp8_mv_ref_probs(&pbi->common, mv_ref_p, rct);
 
     // Is the segment level mode feature enabled for this segment
@@ -672,11 +706,41 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       mbmi->second_ref_frame = mbmi->ref_frame + 1;
       if (mbmi->second_ref_frame == 4)
         mbmi->second_ref_frame = 1;
-
-      vp8_find_near_mvs(xd, mi,
-                        prev_mi,
-                        &nearest_second, &nearby_second, &best_mv_second, rct,
-                        mbmi->second_ref_frame, pbi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+      if (mbmi->second_ref_frame) {
+        int second_ref_fb_idx;
+        /* Select the appropriate reference frame for this MB */
+        if (mbmi->second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = cm->lst_fb_idx;
+        else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+          second_ref_fb_idx = cm->gld_fb_idx;
+        else
+          second_ref_fb_idx = cm->alt_fb_idx;
+
+        xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        vp8_find_near_mvs(xd, mi, prev_mi,
+                          &nearest_second, &nearby_second, &best_mv_second,
+                          rct,
+                          mbmi->second_ref_frame,
+                          cm->ref_frame_sign_bias);
+        vp8_find_best_ref_mvs(xd,
+                              xd->second_pre.y_buffer,
+                              recon_y_stride,
+                              &best_mv_second);
+      }
+#else
+      vp8_find_near_mvs(xd, mi, prev_mi,
+                        &nearest_second, &nearby_second, &best_mv_second,
+                        rct,
+                        mbmi->second_ref_frame,
+                        pbi->common.ref_frame_sign_bias);
+#endif
     } else {
       mbmi->second_ref_frame = 0;
     }
@@ -941,7 +1005,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
       }
     } else {
       mbmi->uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(
-		      bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
       pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
     }
 
@@ -1021,7 +1085,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
 void vpx_decode_mode_mvs_init(VP8D_COMP *pbi){
   VP8_COMMON *cm = &pbi->common;
   mb_mode_mv_init(pbi);
-  if (cm->frame_type == KEY_FRAME &&!cm->kf_ymode_probs_update)
+  if (cm->frame_type == KEY_FRAME && !cm->kf_ymode_probs_update)
     cm->kf_ymode_probs_index = vp8_read_literal(&pbi->bc, 3);
 }
 void vpx_decode_mb_mode_mv(VP8D_COMP *pbi,
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 59f453edff..ffa7f0cc1e 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -631,10 +631,6 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
       xd->up_available = (mb_row != 0);
       xd->left_available = (mb_col != 0);
 
-      if(pbi->interleaved_decoding)
-        vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
-
-      update_blockd_bmi(xd);
 
       recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
       recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
@@ -643,6 +639,11 @@ decode_sb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mbrow, MACROBLOCKD *xd) {
       xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
       xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
+      if(pbi->interleaved_decoding)
+        vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
+
+      update_blockd_bmi(xd);
+
       /* Select the appropriate reference frame for this MB */
       if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
         ref_fb_idx = pc->lst_fb_idx;
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index f7d93b2015..f9195b6263 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -149,7 +149,7 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) {
 
   pbi->decoded_key_frame = 0;
 
-  pbi->interleaved_decoding = 1;
+  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV;
 
   return (VP8D_PTR) pbi;
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 574427cf70..103391c2c6 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -933,6 +933,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
 
             vp8_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
                               rf, cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_mv.as_int = mi->ref_mv.as_int;
+#endif
             vp8_mv_ref_probs(&cpi->common, mv_ref_p, ct);
 
 #ifdef ENTROPY_STATS
@@ -983,7 +986,11 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
             vp8_find_near_mvs(xd, m,
                               prev_m,
                               &n1, &n2, &best_second_mv, ct,
-                              mi->second_ref_frame, cpi->common.ref_frame_sign_bias);
+                              mi->second_ref_frame,
+                              cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_second_mv.as_int = mi->second_ref_mv.as_int;
+#endif
           }
 
           // does the feature use compound prediction or not
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e1d0bf4a70..460c160ac5 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -47,7 +47,8 @@ int enc_debug = 0;
 int mb_row_debug, mb_col_debug;
 #endif
 
-extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x,
+                         TOKENEXTRA **t, int dry_run);
 
 extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
 extern void vp8_auto_select_speed(VP8_COMP *cpi);
@@ -1467,8 +1468,12 @@ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
   if (output_enabled) {
     // Tokenize
     sum_intra_stats(cpi, x);
-    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
   }
+#if CONFIG_NEWBESTREFMV
+  else
+    vp8_tokenize_mb(cpi, &x->e_mbd, t, 1);
+#endif
 }
 #ifdef SPEEDSTATS
 extern int cnt_pm;
@@ -1624,8 +1629,9 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
       fflush(stdout);
     }
 #endif
-    if (output_enabled)
-      vp8_tokenize_mb(cpi, xd, t);
+
+    vp8_tokenize_mb(cpi, xd, t, !output_enabled);
+
 #ifdef ENC_DEBUG
     if (enc_debug) {
       printf("Tokenized\n");
@@ -1640,12 +1646,14 @@ void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
       0;
     if (cpi->common.mb_no_coeff_skip) {
       xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-      cpi->skip_true_count[mb_skip_context]++;
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
       vp8_fix_contexts(xd);
     } else {
-      vp8_stuff_mb(cpi, xd, t);
+      vp8_stuff_mb(cpi, xd, t, !output_enabled);
       xd->mode_info_context->mbmi.mb_skip_coeff = 0;
-      cpi->skip_false_count[mb_skip_context]++;
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
     }
   }
 }
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 6a0a902128..6b6167b239 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -761,7 +761,7 @@ void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
 
 int rd_cost_intra_mb(MACROBLOCKD *x);
 
-void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 67bf33d6fd..df76fc316e 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2730,6 +2730,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 #if CONFIG_PRED_FILTER
   int best_filter_state;
 #endif
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv[MAX_REF_FRAMES] = {0};
+#endif
+
   // int all_rds[MAX_MODES];        // Experimental debug code.
   // int all_rates[MAX_MODES];
   // int all_dist[MAX_MODES];
@@ -2789,6 +2793,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
     u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
     v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[LAST_FRAME],
+                          lst_yv12->y_stride,
+                          &frame_best_ref_mv[LAST_FRAME]);
+    ref_mv[LAST_FRAME].as_int = frame_best_ref_mv[LAST_FRAME].as_int;
+#endif
   }
 
   if (cpi->ref_frame_flags & VP8_GOLD_FLAG) {
@@ -2802,6 +2813,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
     u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
     v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[GOLDEN_FRAME],
+                          gld_yv12->y_stride,
+                          &frame_best_ref_mv[GOLDEN_FRAME]);
+    ref_mv[GOLDEN_FRAME].as_int = frame_best_ref_mv[GOLDEN_FRAME].as_int;
+#endif
   }
 
   if (cpi->ref_frame_flags & VP8_ALT_FLAG) {
@@ -2815,6 +2833,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
     u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
     v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[ALTREF_FRAME],
+                          alt_yv12->y_stride,
+                          &frame_best_ref_mv[ALTREF_FRAME]);
+    ref_mv[ALTREF_FRAME].as_int = frame_best_ref_mv[ALTREF_FRAME].as_int;
+#endif
   }
 
   *returnintra = INT64_MAX;
@@ -2872,6 +2897,12 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
       vp8_mode_order[mode_index].ref_frame;
     xd->mode_info_context->mbmi.second_ref_frame =
       vp8_mode_order[mode_index].second_ref_frame;
+#if CONFIG_NEWBESTREFMV
+    x->e_mbd.mode_info_context->mbmi.ref_mv =
+      ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+    x->e_mbd.mode_info_context->mbmi.second_ref_mv =
+      ref_mv[x->e_mbd.mode_info_context->mbmi.second_ref_frame];
+#endif
 #if CONFIG_PRED_FILTER
     xd->mode_info_context->mbmi.pred_filter_enabled = 0;
 #endif
@@ -3851,8 +3882,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
   }
 
   // macroblock modes
-  vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-
+  vpx_memcpy(&x->e_mbd.mode_info_context->mbmi,
+             &best_mbmode, sizeof(MB_MODE_INFO));
+#if CONFIG_NEWBESTREFMV
+  x->e_mbd.mode_info_context->mbmi.ref_mv =
+    ref_mv[best_mbmode.ref_frame];
+  x->e_mbd.mode_info_context->mbmi.second_ref_mv =
+    ref_mv[best_mbmode.second_ref_frame];
+#endif
   if (best_mbmode.mode == B_PRED) {
     for (i = 0; i < 16; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 5ce13ec129..78a87f392c 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -104,7 +104,24 @@ unsigned int vp8_sad4x4_c(
 
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
-
+#if CONFIG_NEWBESTREFMV
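+/* SADs over the thin 2x16 (left) and 16x2 (above) border strips used by
+ * vp8_find_best_ref_mvs(); the max_sad argument is accepted but not used. */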
+unsigned int vp8_sad2x16_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 2, 16);
+}
+unsigned int vp8_sad16x2_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2);
+}
+#endif
 void vp8_sad16x16x3_c(
   const unsigned char *src_ptr,
   int  src_stride,
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 105aa6a7c7..a8b6436efd 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -38,10 +38,13 @@ extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
                     [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
 #endif
 #endif
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
-void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+void vp8_stuff_mb(VP8_COMP *cpi,
+                  MACROBLOCKD *x, TOKENEXTRA **t, int dry_run);
+void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+                      MACROBLOCKD *x, TOKENEXTRA **t, int dry_run);
 #if CONFIG_TX16X16
-void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x,
+                        TOKENEXTRA **t, int dry_run);
 #endif
 void vp8_fix_contexts(MACROBLOCKD *x);
 
@@ -110,9 +113,15 @@ static void fill_value_tokens() {
 }
 
 #if CONFIG_TX16X16
-static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, const BLOCKD *const b, TOKENEXTRA **tp,
-                                      const int type, const FRAME_TYPE frametype, ENTROPY_CONTEXT *a,
-                                      ENTROPY_CONTEXT *l, VP8_COMP *cpi) {
+static void tokenize1st_order_b_16x16(MACROBLOCKD *xd,
+                                      const BLOCKD *const b,
+                                      TOKENEXTRA **tp,
+                                      const int type,
+                                      const FRAME_TYPE frametype,
+                                      ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l,
+                                      VP8_COMP *cpi,
+                                      int dry_run) {
   int pt; /* near block/prev token context index */
   int c = 0;                  /* start at DC unless type 0 */
   const int eob = b->eob;     /* one beyond last nonzero coeff */
@@ -147,8 +156,8 @@ static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, const BLOCKD *const b, TO
     t->context_tree = cpi->common.fc.coef_probs_16x16[type][band][pt];
 
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-    ++cpi->coef_counts_16x16[type][band][pt][x];
+    if (!dry_run)
+      ++cpi->coef_counts_16x16[type][band][pt][x];
   } while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < seg_eob);
 
   *tp = t;
@@ -166,8 +175,8 @@ static void tokenize2nd_order_b_8x8
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   int c = 0;          /* start at DC */
   const int eob = b->eob;     /* one beyond last nonzero coeff */
@@ -213,8 +222,8 @@ static void tokenize2nd_order_b_8x8
              x, vp8_coef_encodings[x].Len, t->skip_eob_node, eob, c, band, type,
              cpi->count, mb_row_debug, mb_col_debug);
 #endif
-
-    ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
+    if (!dry_run)
+      ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
   } while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < seg_eob);
 
 
@@ -224,8 +233,10 @@ static void tokenize2nd_order_b_8x8
 
 }
 
-static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,
-                                VP8_COMP *cpi) {
+static void tokenize2nd_order_b(MACROBLOCKD *xd,
+                                TOKENEXTRA **tp,
+                                VP8_COMP *cpi,
+                                int dry_run) {
   int pt;             /* near block/prev token context index */
   int c;              /* start at DC */
   TOKENEXTRA *t = *tp;/* store tokens starting here */
@@ -261,7 +272,8 @@ static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,
 
     t->skip_eob_node = ((pt == 0) && (band > 0));
 
-    ++cpi->coef_counts       [1] [band] [pt] [token];
+    if (!dry_run)
+      ++cpi->coef_counts       [1] [band] [pt] [token];
 
     pt = vp8_prev_token_class[token];
     t++;
@@ -274,7 +286,8 @@ static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,
 
     t->skip_eob_node = ((pt == 0) && (band > 0));
 
-    ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
 
     t++;
   }
@@ -295,8 +308,8 @@ static void tokenize1st_order_b_8x8
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   int c = type ? 0 : 1;       /* start at DC unless type 0 */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
@@ -324,7 +337,9 @@ static void tokenize1st_order_b_8x8
     t->context_tree = cpi->common.fc.coef_probs_8x8[type][band][pt];
 
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
-    ++cpi->coef_counts_8x8[type][band][pt][x];
+
+    if (!dry_run)
+      ++cpi->coef_counts_8x8[type][band][pt][x];
 
     pt = vp8_prev_token_class[x];
     ++t;
@@ -337,7 +352,8 @@ static void tokenize1st_order_b_8x8
     t->context_tree = cpi->common.fc.coef_probs_8x8 [type] [band] [pt];
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
 
-    ++cpi->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN];
     ++t;
   }
 
@@ -350,7 +366,8 @@ static void tokenize1st_order_b_8x8
 static void tokenize1st_order_ht(   MACROBLOCKD *xd,
                                     TOKENEXTRA **tp,
                                     int type,
-                                    VP8_COMP    *cpi) {
+                                    VP8_COMP *cpi,
+                                    int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -428,7 +445,8 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
       t->skip_eob_node = pt == 0 &&
           ((band > 0 && type > 0) || (band > 1 && type == 0));
 
-      ++cpi->coef_counts       [type] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -441,8 +459,8 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
 
       t->skip_eob_node = pt == 0 &&
           ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -478,7 +496,8 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -490,9 +509,8 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
       t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
-
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
-
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
       t++;
     }
 
@@ -510,8 +528,8 @@ static void tokenize1st_order_chroma
   MACROBLOCKD *xd,
   TOKENEXTRA **tp,
   int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -557,7 +575,8 @@ static void tokenize1st_order_chroma
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -570,7 +589,8 @@ static void tokenize1st_order_chroma
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -586,8 +606,8 @@ static void tokenize1st_order_b
   MACROBLOCKD *xd,
   TOKENEXTRA **tp,
   int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -626,8 +646,8 @@ static void tokenize1st_order_b
 
       t->skip_eob_node = pt == 0 &&
                          ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -640,8 +660,8 @@ static void tokenize1st_order_b
 
       t->skip_eob_node = pt == 0 &&
                          ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -671,7 +691,8 @@ static void tokenize1st_order_b
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -684,7 +705,8 @@ static void tokenize1st_order_b
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -757,12 +779,16 @@ int mb_is_skippable_16x16(MACROBLOCKD *x) {
 }
 #endif
 
-void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_tokenize_mb(VP8_COMP *cpi,
+                     MACROBLOCKD *x,
+                     TOKENEXTRA **t,
+                     int dry_run) {
   int plane_type;
   int has_y2_block;
   int b;
   int tx_type = x->mode_info_context->mbmi.txfm_size;
   int mb_skip_context = get_pred_context(&cpi->common, x, PRED_MBSKIP);
+  TOKENEXTRA *t_backup = *t;
 
   // If the MB is going to be skipped because of a segment level flag
   // exclude this from the skip count stats used to calculate the
@@ -804,25 +830,28 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
   }
 
   if (x->mode_info_context->mbmi.mb_skip_coeff) {
-    cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
     if (!cpi->common.mb_no_coeff_skip) {
 #if CONFIG_TX16X16
       if (tx_type == TX_16X16)
-        vp8_stuff_mb_16x16(cpi, x, t);
+        vp8_stuff_mb_16x16(cpi, x, t, dry_run);
       else
 #endif
       if (tx_type == TX_8X8)
-        vp8_stuff_mb_8x8(cpi, x, t);
+        vp8_stuff_mb_8x8(cpi, x, t, dry_run);
       else
-        vp8_stuff_mb(cpi, x, t);
+        vp8_stuff_mb(cpi, x, t, dry_run);
     } else {
       vp8_fix_contexts(x);
     }
-
+    if (dry_run)
+      *t = t_backup;
     return;
   }
 
-  cpi->skip_false_count[mb_skip_context] += skip_inc;
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
 
   plane_type = 3;
   if (has_y2_block) {
@@ -832,9 +861,10 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
       tokenize2nd_order_b_8x8(x,
                               x->block + 24, t, 1, x->frame_type,
                               A + vp8_block2above_8x8[24],
-                              L + vp8_block2left_8x8[24], cpi);
+                              L + vp8_block2left_8x8[24],
+                              cpi, dry_run);
     } else
-      tokenize2nd_order_b(x, t, cpi);
+      tokenize2nd_order_b(x, t, cpi, dry_run);
 
     plane_type = 0;
   }
@@ -843,14 +873,15 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
   if (tx_type == TX_16X16) {
     ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
     ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
-    tokenize1st_order_b_16x16(x, x->block, t, 3, x->frame_type, A, L, cpi);
+    tokenize1st_order_b_16x16(x, x->block, t, 3,
+                              x->frame_type, A, L, cpi, dry_run);
     for (b = 1; b < 16; b++) {
       *(A + vp8_block2above[b]) = *(A);
       *(L + vp8_block2left[b] ) = *(L);
     }
     for (b = 16; b < 24; b += 4) {
       tokenize1st_order_b_8x8(x, x->block + b, t, 2, x->frame_type,
-          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi);
+          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi, dry_run);
       *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
     }
@@ -867,7 +898,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
                               x->block + b, t, plane_type, x->frame_type,
                               A + vp8_block2above_8x8[b],
                               L + vp8_block2left_8x8[b],
-                              cpi);
+                              cpi, dry_run);
       *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
     }
@@ -876,14 +907,14 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
                               x->block + b, t, 2, x->frame_type,
                               A + vp8_block2above_8x8[b],
                               L + vp8_block2left_8x8[b],
-                              cpi);
+                              cpi, dry_run);
       *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b] + 1) = *(L + vp8_block2left_8x8[b]);
     }
   } else {
 #if CONFIG_HYBRIDTRANSFORM
     if(active_ht) {
-      tokenize1st_order_ht(x, t, plane_type, cpi);
+      tokenize1st_order_ht(x, t, plane_type, cpi, dry_run);
     } else {
 
 #if CONFIG_HYBRIDTRANSFORM8X8
@@ -896,23 +927,25 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
                                   x->frame_type,
                                   A + vp8_block2above_8x8[b],
                                   L + vp8_block2left_8x8[b],
-                                  cpi);
+                                  cpi, dry_run);
           *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
           *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
         }
-        tokenize1st_order_chroma(x, t, PLANE_TYPE_UV, cpi);
+        tokenize1st_order_chroma(x, t, PLANE_TYPE_UV, cpi, dry_run);
       } else {
-        tokenize1st_order_b(x, t, plane_type, cpi);
+        tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
       }
 #else
-      tokenize1st_order_b(x, t, plane_type, cpi);
+      tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
 #endif
 
     }
 #else
-    tokenize1st_order_b(x, t, plane_type, cpi);
+    tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
 #endif
   }
+  if (dry_run)
+    *t = t_backup;
 }
 
 
@@ -1178,16 +1211,15 @@ void vp8_tokenize_initialize() {
 }
 
 
-static __inline void stuff2nd_order_b_8x8
-(
+static __inline void stuff2nd_order_b_8x8(
   const BLOCKD *const b,
   TOKENEXTRA **tp,
   const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1202,7 +1234,8 @@ static __inline void stuff2nd_order_b_8x8
   ++t;
 
   *tp = t;
-  ++cpi->coef_counts_8x8       [1] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8       [1] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0;
   *a = *l = pt;
 
@@ -1216,8 +1249,8 @@ static __inline void stuff1st_order_b_8x8
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1231,7 +1264,8 @@ static __inline void stuff1st_order_b_8x8
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts_8x8       [0] [1] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8[0] [1] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
@@ -1247,8 +1281,8 @@ void stuff1st_order_buv_8x8
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1262,13 +1296,17 @@ void stuff1st_order_buv_8x8
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts_8x8[2] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8[2] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
 }
 
-void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+                      MACROBLOCKD *x,
+                      TOKENEXTRA **t,
+                      int dry_run) {
   ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
   int plane_type;
@@ -1276,14 +1314,14 @@ void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
 
   stuff2nd_order_b_8x8(x->block + 24, t, 1, x->frame_type,
                        A + vp8_block2above_8x8[24],
-                       L + vp8_block2left_8x8[24], cpi);
+                       L + vp8_block2left_8x8[24], cpi, dry_run);
   plane_type = 0;
 
   for (b = 0; b < 16; b += 4) {
     stuff1st_order_b_8x8(x->block + b, t, plane_type, x->frame_type,
                          A + vp8_block2above_8x8[b],
                          L + vp8_block2left_8x8[b],
-                         cpi);
+                         cpi, dry_run);
     *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
   }
@@ -1292,7 +1330,7 @@ void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
     stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
                            A + vp8_block2above[b],
                            L + vp8_block2left[b],
-                           cpi);
+                           cpi, dry_run);
     *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b] + 1) = *(L + vp8_block2left_8x8[b]);
   }
@@ -1301,9 +1339,13 @@ void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
 
 #if CONFIG_TX16X16
 static __inline
-void stuff1st_order_b_16x16(const BLOCKD *const b, TOKENEXTRA **tp, const FRAME_TYPE frametype,
-                            ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi)
-{
+void stuff1st_order_b_16x16(const BLOCKD *const b,
+                            TOKENEXTRA **tp,
+                            const FRAME_TYPE frametype,
+                            ENTROPY_CONTEXT *a,
+                            ENTROPY_CONTEXT *l,
+                            VP8_COMP *cpi,
+                            int dry_run) {
     int pt; /* near block/prev token context index */
     TOKENEXTRA *t = *tp;        /* store tokens starting here */
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1315,17 +1357,21 @@ void stuff1st_order_b_16x16(const BLOCKD *const b, TOKENEXTRA **tp, const FRAME_
     t->skip_eob_node = 0;
     ++t;
     *tp = t;
-    ++cpi->coef_counts_16x16[3][1][pt][DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts_16x16[3][1][pt][DCT_EOB_TOKEN];
     pt = 0; /* 0 <-> all coeff data is zero */
     *a = *l = pt;
 }
 
-void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb_16x16(VP8_COMP *cpi,
+                        MACROBLOCKD *x,
+                        TOKENEXTRA **t,
+                        int dry_run) {
   ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
   int b, i;
 
-  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi);
+  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi, dry_run);
   for (i = 1; i < 16; i++) {
     *(A + vp8_block2above[i]) = *(A);
     *(L +  vp8_block2left[i]) = *(L);
@@ -1334,7 +1380,7 @@ void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
     stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
         A + vp8_block2above[b],
         L + vp8_block2left[b],
-        cpi);
+        cpi, dry_run);
     *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
   }
@@ -1348,8 +1394,8 @@ static __inline void stuff2nd_order_b
   TOKENEXTRA **tp,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1359,20 +1405,19 @@ static __inline void stuff2nd_order_b
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
 
   pt = 0;
   *a = *l = pt;
 
 }
 
-static __inline void stuff1st_order_b
-(
-  TOKENEXTRA **tp,
-  ENTROPY_CONTEXT *a,
-  ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+static __inline void stuff1st_order_b(TOKENEXTRA **tp,
+                                      ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l,
+                                      VP8_COMP *cpi,
+                                      int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1382,7 +1427,8 @@ static __inline void stuff1st_order_b
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts[0] [1] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
@@ -1393,8 +1439,8 @@ void stuff1st_order_buv
   TOKENEXTRA **tp,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1404,31 +1450,39 @@ void stuff1st_order_buv
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 }
 
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t, int dry_run) {
   ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
   int plane_type;
   int b;
+  TOKENEXTRA *t_backup = *t;
 
   stuff2nd_order_b(t,
-                   A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+                   A + vp8_block2above[24],
+                   L + vp8_block2left[24],
+                   cpi, dry_run);
   plane_type = 0;
 
   for (b = 0; b < 16; b++)
     stuff1st_order_b(t,
                      A + vp8_block2above[b],
-                     L + vp8_block2left[b], cpi);
+                     L + vp8_block2left[b],
+                     cpi, dry_run);
 
   for (b = 16; b < 24; b++)
     stuff1st_order_buv(t,
                        A + vp8_block2above[b],
-                       L + vp8_block2left[b], cpi);
+                       L + vp8_block2left[b],
+                       cpi, dry_run);
 
+  if (dry_run)
+    *t = t_backup;
 }
 void vp8_fix_contexts(MACROBLOCKD *x) {
   /* Clear entropy contexts for Y2 blocks */
-- 
GitLab