diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index af91eeb61e944bd343e2908b92b33debf73f069c..fb5a58a4c8288079c8a0b576e32c29604df72b95 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -122,13 +122,6 @@ typedef enum {
 
 #define WHT_UPSCALE_FACTOR 2
 
-#define TX_SIZE_PROBS  6  // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
-
-#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
-                            (c)->fc.tx_probs_8x8p :    \
-                            (b) < BLOCK_SIZE_SB32X32 ? \
-                            (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a27e6b09e3342082f7854788987c5ed1c17c12fd..2c833ab536e085881daf734437fcc3dda778f35b 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1678,6 +1678,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
+    // printf("Switching to lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
     cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
@@ -1687,6 +1688,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
     cpi->zbin_mode_boost_enabled = 0;
     cpi->common.txfm_mode = ONLY_4X4;
   } else {
+    // printf("Not lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
     cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
@@ -1695,7 +1697,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
 }
 
 static void switch_txfm_mode(VP9_COMP *cpi) {
-  if (cpi->sf.use_largest_txform &&
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
       cpi->common.txfm_mode >= ALLOW_32X32)
     cpi->common.txfm_mode = ALLOW_32X32;
 }
@@ -1728,6 +1730,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   vp9_zero(cm->fc.switchable_interp_count);
   vp9_zero(cpi->best_switchable_interp_count);
+  vp9_zero(cpi->txfm_stepdown_count);
 
   xd->mode_info_context = cm->mi;
   xd->prev_mode_info_context = cm->prev_mi;
@@ -1930,6 +1933,47 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   }
 }
 
+static int get_frame_type(VP9_COMP *cpi) {
+  int frame_type;
+  if (cpi->common.frame_type == KEY_FRAME)
+    frame_type = 0;
+  else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
+    frame_type = 3;
+  else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+    frame_type = 1;
+  else
+    frame_type = 2;
+  return frame_type;
+}
+
+static void select_txfm_mode(VP9_COMP *cpi) {
+  if (cpi->oxcf.lossless) {
+    cpi->common.txfm_mode = ONLY_4X4;
+  } else if (cpi->common.current_video_frame == 0) {
+    cpi->common.txfm_mode = TX_MODE_SELECT;
+  } else {
+    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+      int frame_type = get_frame_type(cpi);
+      cpi->common.txfm_mode =
+          cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
+          > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+          ALLOW_32X32 : TX_MODE_SELECT;
+    } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+      cpi->common.txfm_mode = ALLOW_32X32;
+    } else {
+      unsigned int total = 0;
+      int i;
+      for (i = 0; i < TX_SIZE_MAX_SB; ++i)
+        total += cpi->txfm_stepdown_count[i];
+      if (total) {
+        double fraction = (double)cpi->txfm_stepdown_count[0] / total;
+        cpi->common.txfm_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
+        // printf("fraction = %f\n", fraction);
+      }  // else keep unchanged
+    }
+  }
+}
+
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON * const cm = &cpi->common;
 
@@ -1940,7 +1984,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
   // side behaviour is where the ALT ref buffer has oppositie sign bias to
   // the other two.
   if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
-      == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+       == cm->ref_frame_sign_bias[GOLDEN_FRAME])
       || (cm->ref_frame_sign_bias[ALTREF_FRAME]
           == cm->ref_frame_sign_bias[LAST_FRAME])) {
     cm->allow_comp_inter_inter = 0;
@@ -1952,9 +1996,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
   }
 
   if (cpi->sf.RD) {
-    int i, frame_type, pred_type;
-    TXFM_MODE txfm_type;
-
+    int i, pred_type;
     /*
      * This code does a single RD pass over the whole frame assuming
      * either compound, single or hybrid prediction as per whatever has
@@ -1964,26 +2006,19 @@ void vp9_encode_frame(VP9_COMP *cpi) {
      * that for subsequent frames.
      * It does the same analysis for transform size selection also.
      */
-    if (cpi->common.frame_type == KEY_FRAME)
-      frame_type = 0;
-    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
-      frame_type = 3;
-    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-      frame_type = 1;
-    else
-      frame_type = 2;
+    int frame_type = get_frame_type(cpi);
 
     /* prediction (compound, single or hybrid) mode selection */
     if (frame_type == 3 || !cm->allow_comp_inter_inter)
       pred_type = SINGLE_PREDICTION_ONLY;
     else if (cpi->rd_prediction_type_threshes[frame_type][1]
-        > cpi->rd_prediction_type_threshes[frame_type][0]
-        && cpi->rd_prediction_type_threshes[frame_type][1]
-            > cpi->rd_prediction_type_threshes[frame_type][2]
-        && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+             > cpi->rd_prediction_type_threshes[frame_type][0]
+             && cpi->rd_prediction_type_threshes[frame_type][1]
+             > cpi->rd_prediction_type_threshes[frame_type][2]
+             && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
       pred_type = COMP_PREDICTION_ONLY;
     else if (cpi->rd_prediction_type_threshes[frame_type][0]
-        > cpi->rd_prediction_type_threshes[frame_type][2])
+             > cpi->rd_prediction_type_threshes[frame_type][2])
       pred_type = SINGLE_PREDICTION_ONLY;
     else
       pred_type = HYBRID_PREDICTION;
@@ -1992,43 +2027,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
     cpi->mb.e_mbd.lossless = 0;
     if (cpi->oxcf.lossless) {
-      txfm_type = ONLY_4X4;
       cpi->mb.e_mbd.lossless = 1;
-    } else
-#if 0
-      /* FIXME (rbultje): this code is disabled until we support cost updates
-       * while a frame is being encoded; the problem is that each time we
-       * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
-       * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
-       * further behind and not being chosen for subsequent frames either. This
-       * is essentially a local minimum problem that we can probably fix by
-       * estimating real costs more closely within a frame, perhaps by re-
-       * calculating costs on-the-fly as frame encoding progresses. */
-      if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-          cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
-          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
-          cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-        txfm_type = TX_MODE_SELECT;
-      } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
-          && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
-      ) {
-        txfm_type = ONLY_4X4;
-      } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-        txfm_type = ALLOW_16X16;
-      } else
-      txfm_type = ALLOW_8X8;
-#else
-      txfm_type =
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
-              > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-              ALLOW_32X32 : TX_MODE_SELECT;
-#endif
-    cpi->common.txfm_mode = txfm_type;
+    }
+
+    select_txfm_mode(cpi);
     cpi->common.comp_pred_mode = pred_type;
     encode_frame_internal(cpi);
 
@@ -2043,7 +2045,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       int diff;
       if (i == TX_MODE_SELECT)
         pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
-            2048 * (TX_SIZE_MAX_SB - 1), 0);
+                     2048 * (TX_SIZE_MAX_SB - 1), 0);
       diff = (int) (pd / cpi->common.MBs);
       cpi->rd_tx_select_threshes[frame_type][i] += diff;
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
@@ -2102,7 +2104,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
         cpi->common.txfm_mode = ALLOW_8X8;
         reset_skip_txfm_size(cpi, TX_8X8);
       } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
-          && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+                 && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
         cpi->common.txfm_mode = ONLY_4X4;
         reset_skip_txfm_size(cpi, TX_4X4);
       } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 03d8ea3027f8b95a5bd663ff48efc3200bbff706..49582b264f2e16015af916295813946de70b7b81 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -701,7 +701,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
   sf->adaptive_rd_thresh = 0;
   sf->use_lastframe_partitioning = 0;
-  sf->use_largest_txform = 0;
+  sf->tx_size_search_method = USE_FULL_RD;
   sf->use_8tap_always = 0;
   sf->use_avoid_tested_higherror = 0;
   sf->skip_lots_of_modes = 0;
@@ -744,17 +744,15 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       if (speed == 1) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->less_rectangular_check  = 1;
-        sf->use_largest_txform        = !(cpi->common.frame_type == KEY_FRAME ||
-                                          cpi->common.intra_only ||
-                                          cpi->common.show_frame == 0);
-
+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+                                      cpi->common.intra_only ||
+                                      cpi->common.show_frame == 0) ?
+                                     USE_FULL_RD :
+                                     USE_LARGESTINTRA);
         sf->disable_splitmv =
             (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
       }
       if (speed == 2) {
-        sf->use_largest_txform        = !(cpi->common.frame_type == KEY_FRAME ||
-                                                  cpi->common.intra_only ||
-                                                  cpi->common.show_frame == 0);
         sf->adjust_thresholds_by_speed = 1;
         sf->less_rectangular_check  = 1;
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -763,15 +761,30 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_lastframe_partitioning = 1;
         sf->adjust_partitioning_from_last_frame = 1;
         sf->last_partitioning_redo_frequency = 3;
+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+                                      cpi->common.intra_only ||
+                                      cpi->common.show_frame == 0) ?
+                                     USE_FULL_RD :
+                                     USE_LARGESTALL);
       }
       if (speed == 3) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->partition_by_variance = 1;
+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+                                      cpi->common.intra_only ||
+                                      cpi->common.show_frame == 0) ?
+                                     USE_FULL_RD :
+                                     USE_LARGESTALL);
       }
       if (speed == 4) {
         sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->use_one_partition_size_always = 1;
         sf->always_this_block_size = BLOCK_SIZE_MB16X16;
+        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
+                                      cpi->common.intra_only ||
+                                      cpi->common.show_frame == 0) ?
+                                     USE_FULL_RD :
+                                     USE_LARGESTALL);
       }
       /*
       if (speed == 2) {
@@ -788,7 +801,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
       }
       */
 
-     break;
+      break;
 
   }; /* switch */
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 153abbbe0412b4c1219130199a974f4691eb0099..bc1e54b0e50022b34a934a3152b9d1cb90ae5e0c 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -200,6 +200,13 @@ typedef enum {
   HEX = 2
 } SEARCH_METHODS;
 
+typedef enum {
+  USE_FULL_RD = 0,
+  USE_LARGESTINTRA,
+  USE_LARGESTINTRA_MODELINTER,
+  USE_LARGESTALL
+} TX_SIZE_SEARCH_METHOD;
+
 typedef struct {
   int RD;
   SEARCH_METHODS search_method;
@@ -219,7 +226,7 @@ typedef struct {
   int adaptive_rd_thresh;
   int skip_encode_sb;
   int use_lastframe_partitioning;
-  int use_largest_txform;
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
   int use_8tap_always;
   int use_avoid_tested_higherror;
   int skip_lots_of_modes;
@@ -589,6 +596,8 @@ typedef struct VP9_COMP {
                                       [VP9_SWITCHABLE_FILTERS];
   unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
 
+  unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB];
+
   int initial_width;
   int initial_height;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 22a2c41218e2ed4f33262b4a2dc523e6f8884366..38460a5f5417cd6193bb1a45b7a1df8dcbdac653 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -279,6 +279,242 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   }
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  return get_block_size(plane_block_width(bsize, pd),
+                        plane_block_height(bsize, pd));
+}
+
+static double linear_interpolate(double x, int ntab, int inv_step,
+                                 const double *tab) {
+  double y = x * inv_step;
+  int d = (int) y;
+  if (d >= ntab - 1) {
+    return tab[ntab - 1];
+  } else {
+    double a = y - d;
+    return tab[d] * (1 - a) + tab[d + 1] * a;
+  }
+}
+
+static double model_rate_norm(double x) {
+  // Normalized rate
+  // This function models the rate for a Laplacian source
+  // source with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+  // and H(x) is the binary entropy function.
+  static const int inv_rate_tab_step = 8;
+  static const double rate_tab[] = {
+    64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
+    2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
+    1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
+    0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
+    0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
+    0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
+    0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
+    0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
+    0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
+    0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
+    0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
+    0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
+    0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
+    0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
+    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
+  };
+  const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
+  assert(x >= 0.0);
+  return linear_interpolate(x, rate_tab_num, inv_rate_tab_step, rate_tab);
+}
+
+static double model_dist_norm(double x) {
+  // Normalized distortion
+  // This function models the normalized distortion for a Laplacian source
+  // source with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+  // where x = qpstep / sqrt(variance)
+  // Note the actual distortion is Dn * variance.
+  static const int inv_dist_tab_step = 8;
+  static const double dist_tab[] = {
+    0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
+    0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
+    0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
+    0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
+    0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
+    0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
+    0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
+    0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
+    0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
+    0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
+    0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
+    0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
+    0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
+    0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
+    0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
+  };
+  const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
+  assert(x >= 0.0);
+  return linear_interpolate(x, dist_tab_num, inv_dist_tab_step, dist_tab);
+}
+
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+                                     int *rate, int64_t *dist) {
+  // This function models the rate and distortion for a Laplacian
+  // source with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expressions are in:
+  // Hang and Chen, "Source Model for transform video coder and its
+  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+  // Sys. for Video Tech., April 1997.
+  vp9_clear_system_state();
+  if (var == 0 || n == 0) {
+    *rate = 0;
+    *dist = 0;
+  } else {
+    double D, R;
+    double s2 = (double) var / n;
+    double x = qstep / sqrt(s2);
+    D = model_dist_norm(x);
+    R = model_rate_norm(x);
+    if (R < 0) {
+      R = 0;
+      D = var;
+    }
+    *rate = (n * R * 256 + 0.5);
+    *dist = (n * D * s2 + 0.5);
+  }
+  vp9_clear_system_state();
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, &sse);
+    // sse works better than var, since there is no dc prediction used
+    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}
+
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                                 TX_SIZE tx_size,
+                                 MACROBLOCK *x, MACROBLOCKD *xd,
+                                 int *out_rate_sum, int64_t *out_dist_sum,
+                                 int *out_skip) {
+  int t, j, k;
+  enum BlockSize bs;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  int rate_sum = 0;
+  int64_t dist_sum = 0;
+
+  if (tx_size == TX_4X4) {
+    bs = BLOCK_4X4;
+    t = 4;
+  } else if (tx_size == TX_8X8) {
+    bs = BLOCK_8X8;
+    t = 8;
+  } else if (tx_size == TX_16X16) {
+    bs = BLOCK_16X16;
+    t = 16;
+  } else if (tx_size == TX_32X32) {
+    bs = BLOCK_32X32;
+    t = 32;
+  } else {
+    assert(0);
+  }
+  assert(bs <= get_block_size(bw, bh));
+  *out_skip = 1;
+  for (j = 0; j < bh; j+=t) {
+    for (k = 0; k < bw; k+=t) {
+      int rate;
+      int64_t dist;
+      unsigned int sse;
+      (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
+                                p->src.stride,
+                                pd->dst.buf + j * pd->dst.stride + k,
+                                pd->dst.stride, &sse);
+      // sse works better than var, since there is no dc prediction used
+      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+                               &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+      *out_skip &= (rate < 1024);
+    }
+  }
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = (dist_sum << 4);
+}
+
 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -423,12 +659,199 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   return cost;
 }
 
+struct rdcost_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  int bw;
+  int bh;
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int64_t best_rd;
+  int skip;
+};
+
+static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  int64_t this_sse;
+  int shift = args->tx_size == TX_32X32 ? 0 : 2;
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                &this_sse) >> shift;
+  args->sse += this_sse >> shift;
+}
+
+static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  int x_idx, y_idx;
+  MACROBLOCKD * const xd = &args->x->e_mbd;
+
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
+                           &y_idx);
+
+  args->rate += cost_coeffs(args->cm, args->x, plane, block,
+                            xd->plane[plane].plane_type, args->t_above + x_idx,
+                            args->t_left + y_idx, args->tx_size,
+                            args->bw * args->bh);
+}
+
+
+static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
+                        BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+    0, 0, 0, 0, 0 };
+
+  vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
+             sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
+             sizeof(ENTROPY_CONTEXT) * bh);
+
+  foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);
+  return args.rate;
+}
+
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
+
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+  }
+  return cost;
+}
+
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += (unsigned)this_diff * this_diff;
+  }
+  error >>= shift;
+
+  return error > INT_MAX ? INT_MAX : (int)error;
+}
+
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                           int shift, int64_t *sse) {
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e;
+}
+
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                                int shift, int64_t *sse) {
+  int64_t sum = 0, this_sse;
+  int plane;
+
+  *sse = 0;
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    struct macroblockd_plane *p = &x->e_mbd.plane[plane];
+    const int bw = plane_block_width(bsize, p);
+    const int bh = plane_block_height(bsize, p);
+    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                           bw * bh, &this_sse);
+    *sse += this_sse;
+  }
+  *sse >>= shift;
+  return sum >> shift;
+}
+
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args *args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct encode_b_args encode_args = {args->cm, x, NULL};
+
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
+  else
+    xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
+
+  dist_block(plane, block, bsize, ss_txfrm_size, args);
+  rate_block(plane, block, bsize, ss_txfrm_size, args);
+}
+
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int64_t *distortion,
+                                     int *skippable, int64_t *sse,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
+                                    0, 0, 0, 0, 0 };
+  xd->mode_info_context->mbmi.txfm_size = tx_size;
+  vpx_memcpy(&args.t_above, pd->above_context, sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, pd->left_context, sizeof(ENTROPY_CONTEXT) * bh);
+
+  foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args);
+  *distortion = args.dist;
+  *rate       = args.rate;
+  *sse        = args.sse;
+  *skippable  = vp9_sby_is_skippable(xd, bsize);
+}
+
+static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int *rate, int64_t *distortion,
+                                     int *skip, int64_t *sse,
+                                     BLOCK_SIZE_TYPE bs) {
+  const TX_SIZE max_txfm_size = TX_32X32
+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  if (max_txfm_size == TX_32X32 &&
+      (cm->txfm_mode == ALLOW_32X32 ||
+       cm->txfm_mode == TX_MODE_SELECT)) {
+    mbmi->txfm_size = TX_32X32;
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              cm->txfm_mode == TX_MODE_SELECT)) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode != ONLY_4X4) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
+  super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+                           &sse[mbmi->txfm_size], bs,
+                           mbmi->txfm_size);
+  cpi->txfm_stepdown_count[0]++;
+}
+
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
                                      int64_t *d, int64_t *distortion,
                                      int *s, int *skip,
                                      int64_t txfm_cache[NB_TXFM_MODES],
-                                     TX_SIZE max_txfm_size) {
+                                     BLOCK_SIZE_TYPE bs) {
+  const TX_SIZE max_txfm_size = TX_32X32
+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -502,137 +925,122 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   else
     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
-}
-
-static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                                int shift, int64_t *sse) {
-  int64_t sum = 0, this_sse;
-  int plane;
 
-  *sse = 0;
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    struct macroblockd_plane *p = &x->e_mbd.plane[plane];
-    const int bw = plane_block_width(bsize, p);
-    const int bh = plane_block_height(bsize, p);
-    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                           bw * bh, &this_sse);
-    *sse += this_sse;
+  if (max_txfm_size == TX_32X32 &&
+      rd[TX_32X32][1] < rd[TX_16X16][1] &&
+      rd[TX_32X32][1] < rd[TX_8X8][1] &&
+      rd[TX_32X32][1] < rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[0]++;
+  } else if (max_txfm_size >= TX_16X16 &&
+             rd[TX_16X16][1] < rd[TX_8X8][1] &&
+             rd[TX_16X16][1] < rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+  } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+  } else {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
   }
-  *sse >>= shift;
-  return sum >> shift;
 }
 
-struct rdcost_block_args {
-  VP9_COMMON *cm;
-  MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[16];
-  ENTROPY_CONTEXT t_left[16];
-  TX_SIZE tx_size;
-  int bw;
-  int bh;
-  int rate;
-  int64_t dist;
-  int64_t sse;
-  int64_t best_rd;
-  int skip;
-};
-
-static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
-                       int ss_txfrm_size, void *arg) {
-  struct rdcost_block_args* args = arg;
-  MACROBLOCK* const x = args->x;
-  MACROBLOCKD* const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[0];
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  int64_t this_sse;
-  int shift = args->tx_size == TX_32X32 ? 0 : 2;
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
-  args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                &this_sse) >> shift;
-  args->sse += this_sse >> shift;
-}
-
-static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
-                       int ss_txfrm_size, void *arg) {
-  struct rdcost_block_args* args = arg;
-  int x_idx, y_idx;
-  MACROBLOCKD * const xd = &args->x->e_mbd;
-
-  txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
-                           &y_idx);
-
-  args->rate += cost_coeffs(args->cm, args->x, plane, block,
-                            xd->plane[plane].plane_type, args->t_above + x_idx,
-                            args->t_left + y_idx, args->tx_size,
-                            args->bw * args->bh);
-}
-
-static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
-                        BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-  const int bw = 1 << bwl, bh = 1 << bhl;
-  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
-                                    0, 0, 0, 0, 0 };
-
-  vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
-             sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
-             sizeof(ENTROPY_CONTEXT) * bh);
-
-  foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args);
+static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
+                                          int (*r)[2], int *rate,
+                                          int64_t *d, int64_t *distortion,
+                                          int *s, int *skip, int64_t *sse,
+                                          BLOCK_SIZE_TYPE bs,
+                                          int *model_used) {
+  const TX_SIZE max_txfm_size = TX_32X32
+      - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+  int64_t rd[TX_SIZE_MAX_SB][2];
+  int n, m;
+  int s0, s1;
+  double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00};
+  // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00};
 
-  return args.rate;
-}
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
 
-static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
-                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
-  int cost = 0, plane;
+  // for (n = TX_4X4; n <= max_txfm_size; n++)
+  //   r[n][0] = (r[n][0] * scale_r[n]);
 
-  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    r[n][1] = r[n][0];
+    for (m = 0; m <= n - (n == max_txfm_size); m++) {
+      if (m == n)
+        r[n][1] += vp9_cost_zero(tx_probs[m]);
+      else
+        r[n][1] += vp9_cost_one(tx_probs[m]);
+    }
   }
-  return cost;
-}
 
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
-                           int ss_txfrm_size, void *arg) {
-  struct rdcost_block_args *args = arg;
-  MACROBLOCK *const x = args->x;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args encode_args = {args->cm, x, NULL};
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
-  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
-    encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
-  else
-    xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    if (s[n]) {
+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+    } else {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+    }
+  }
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    rd[n][0] = (scale_rd[n] * rd[n][0]);
+    rd[n][1] = (scale_rd[n] * rd[n][1]);
+  }
 
-  dist_block(plane, block, bsize, ss_txfrm_size, args);
-  rate_block(plane, block, bsize, ss_txfrm_size, args);
-}
+  if (max_txfm_size == TX_32X32 &&
+      (cm->txfm_mode == ALLOW_32X32 ||
+       (cm->txfm_mode == TX_MODE_SELECT &&
+        rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+        rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+        rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
+    mbmi->txfm_size = TX_32X32;
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              (cm->txfm_mode == TX_MODE_SELECT &&
+               rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+               rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode == ALLOW_8X8 ||
+             cm->txfm_mode == ALLOW_16X16 ||
+             cm->txfm_mode == ALLOW_32X32 ||
+           (cm->txfm_mode == TX_MODE_SELECT &&
+            rd[TX_8X8][1] <= rd[TX_4X4][1])) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
 
-static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
-                                     int *rate, int64_t *distortion,
-                                     int *skippable, int64_t *sse,
-                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y;
-  const int bw = 1 << bwl, bh = 1 << bhl;
-  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh,
-                                    0, 0, 0, 0, 0 };
-  xd->mode_info_context->mbmi.txfm_size = tx_size;
-  vpx_memcpy(&args.t_above, pd->above_context, sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&args.t_left, pd->left_context, sizeof(ENTROPY_CONTEXT) * bh);
+  if (model_used[mbmi->txfm_size]) {
+    // Actually encode using the chosen mode if a model was used, but do not
+    // update the r, d costs
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip,
+                             &sse[mbmi->txfm_size], bs, mbmi->txfm_size);
+  } else {
+    *distortion = d[mbmi->txfm_size];
+    *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
+    *skip       = s[mbmi->txfm_size];
+  }
 
-  foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args);
-  *distortion = args.dist;
-  *rate       = args.rate;
-  *sse        = args.sse;
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
+  if (max_txfm_size == TX_32X32 &&
+      rd[TX_32X32][1] <= rd[TX_16X16][1] &&
+      rd[TX_32X32][1] <= rd[TX_8X8][1] &&
+      rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[0]++;
+  } else if (max_txfm_size >= TX_16X16 &&
+             rd[TX_16X16][1] <= rd[TX_8X8][1] &&
+             rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
+  } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
+  } else {
+    cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
+  }
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
@@ -649,38 +1057,67 @@ static void super_block_yrd(VP9_COMP *cpi,
   if (mbmi->ref_frame[0] > INTRA_FRAME)
     vp9_subtract_sby(x, bs);
 
-  if (cpi->sf.use_largest_txform) {
-    if (bs >= BLOCK_SIZE_SB32X32) {
-      mbmi->txfm_size = TX_32X32;
-    } else if (bs >= BLOCK_SIZE_MB16X16) {
-      mbmi->txfm_size = TX_16X16;
-    } else if (bs >= BLOCK_SIZE_SB8X8) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
+      (cpi->sf.tx_size_search_method != USE_FULL_RD &&
+       mbmi->ref_frame[0] == INTRA_FRAME)) {
     vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
-    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
-                             mbmi->txfm_size);
+    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, bs);
     if (psse)
-      *psse = sse[0];
+      *psse = sse[mbmi->txfm_size];
     return;
   }
-  if (bs >= BLOCK_SIZE_SB32X32)
-    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                             &sse[TX_32X32], bs, TX_32X32);
-  if (bs >= BLOCK_SIZE_MB16X16)
-    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                             &sse[TX_16X16], bs, TX_16X16);
-  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
-                           &sse[TX_8X8], bs, TX_8X8);
-  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
-                           &sse[TX_4X4], bs, TX_4X4);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
-                           skip, txfm_cache,
-                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
-                           - (bs < BLOCK_SIZE_MB16X16));
+
+  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
+      mbmi->ref_frame[0] > INTRA_FRAME) {
+    int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1};
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      if (model_used[TX_32X32]) {
+        model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
+                             &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+      } else {
+        super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+                                 &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32);
+      }
+    }
+    if (bs >= BLOCK_SIZE_MB16X16) {
+      if (model_used[TX_16X16]) {
+        model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
+                             &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+      } else {
+        super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+                                 &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16);
+      }
+    }
+    if (model_used[TX_8X8]) {
+      model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
+                           &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+    } else {
+      super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                               &sse[TX_8X8], bs, TX_8X8);
+    }
+    if (model_used[TX_4X4]) {
+      model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
+                           &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
+    } else {
+      super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                               &sse[TX_4X4], bs, TX_4X4);
+    }
+    choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
+                                  skip, sse, bs, model_used);
+  } else {
+    if (bs >= BLOCK_SIZE_SB32X32)
+      super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32],
+                               &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32);
+    if (bs >= BLOCK_SIZE_MB16X16)
+      super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16],
+                               &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16);
+    super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                             &sse[TX_8X8], bs, TX_8X8);
+    super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                             &sse[TX_4X4], bs, TX_4X4);
+    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                             skip, txfm_cache, bs);
+  }
   if (psse)
     *psse = sse[mbmi->txfm_size];
 }
@@ -909,8 +1346,10 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     return best_rd;
   }
 
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
+  if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+    for (i = 0; i < NB_TXFM_MODES; i++)
+      txfm_cache[i] = INT64_MAX;
+  }
 
   /* Y Search for 32x32 intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
@@ -943,11 +1382,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
       *skippable      = s;
     }
 
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                       local_txfm_cache[cpi->common.txfm_mode];
-      if (adj_rd < txfm_cache[i]) {
-        txfm_cache[i] = adj_rd;
+    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd = this_rd + local_txfm_cache[i] -
+            local_txfm_cache[cpi->common.txfm_mode];
+        if (adj_rd < txfm_cache[i]) {
+          txfm_cache[i] = adj_rd;
+        }
       }
     }
   }
@@ -1246,50 +1687,6 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   return r;
 }
 
-static enum BlockSize get_block_size(int bw, int bh) {
-  if (bw == 4 && bh == 4)
-    return BLOCK_4X4;
-
-  if (bw == 4 && bh == 8)
-    return BLOCK_4X8;
-
-  if (bw == 8 && bh == 4)
-    return BLOCK_8X4;
-
-  if (bw == 8 && bh == 8)
-    return BLOCK_8X8;
-
-  if (bw == 8 && bh == 16)
-    return BLOCK_8X16;
-
-  if (bw == 16 && bh == 8)
-    return BLOCK_16X8;
-
-  if (bw == 16 && bh == 16)
-    return BLOCK_16X16;
-
-  if (bw == 32 && bh == 32)
-    return BLOCK_32X32;
-
-  if (bw == 32 && bh == 16)
-    return BLOCK_32X16;
-
-  if (bw == 16 && bh == 32)
-    return BLOCK_16X32;
-
-  if (bw == 64 && bh == 32)
-    return BLOCK_64X32;
-
-  if (bw == 32 && bh == 64)
-    return BLOCK_32X64;
-
-  if (bw == 64 && bh == 64)
-    return BLOCK_64X64;
-
-  assert(0);
-  return -1;
-}
-
 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
   x->plane[0].src.buf =
@@ -1837,195 +2234,6 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
   return scaled_ref_frame;
 }
 
-static double linear_interpolate(double x, int ntab, double step,
-                                 const double *tab) {
-  double y = x / step;
-  int d = (int) y;
-  double a = y - d;
-  if (d >= ntab - 1)
-    return tab[ntab - 1];
-  else
-    return tab[d] * (1 - a) + tab[d + 1] * a;
-}
-
-static double model_rate_norm(double x) {
-  // Normalized rate
-  // This function models the rate for a Laplacian source
-  // source with given variance when quantized with a uniform quantizer
-  // with given stepsize. The closed form expressions are in:
-  // Hang and Chen, "Source Model for transform video coder and its
-  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
-  // Sys. for Video Tech., April 1997.
-  static const double rate_tab_step = 0.125;
-  static const double rate_tab[] = {
-    256.0000, 4.944453, 3.949276, 3.371593,
-    2.965771, 2.654550, 2.403348, 2.193612,
-    2.014208, 1.857921, 1.719813, 1.596364,
-    1.484979, 1.383702, 1.291025, 1.205767,
-    1.126990, 1.053937, 0.985991, 0.922644,
-    0.863472, 0.808114, 0.756265, 0.707661,
-    0.662070, 0.619287, 0.579129, 0.541431,
-    0.506043, 0.472828, 0.441656, 0.412411,
-    0.384980, 0.359260, 0.335152, 0.312563,
-    0.291407, 0.271600, 0.253064, 0.235723,
-    0.219508, 0.204351, 0.190189, 0.176961,
-    0.164611, 0.153083, 0.142329, 0.132298,
-    0.122945, 0.114228, 0.106106, 0.098541,
-    0.091496, 0.084937, 0.078833, 0.073154,
-    0.067872, 0.062959, 0.058392, 0.054147,
-    0.050202, 0.046537, 0.043133, 0.039971,
-    0.037036, 0.034312, 0.031783, 0.029436,
-    0.027259, 0.025240, 0.023367, 0.021631,
-    0.020021, 0.018528, 0.017145, 0.015863,
-    0.014676, 0.013575, 0.012556, 0.011612,
-    0.010738, 0.009929, 0.009180, 0.008487,
-    0.007845, 0.007251, 0.006701, 0.006193,
-    0.005722, 0.005287, 0.004884, 0.004512,
-    0.004168, 0.003850, 0.003556, 0.003284,
-    0.003032, 0.002800, 0.002585, 0.002386,
-    0.002203, 0.002034, 0.001877, 0.001732,
-    0.001599, 0.001476, 0.001362, 0.001256,
-    0.001159, 0.001069, 0.000987, 0.000910,
-    0.000840, 0.000774, 0.000714, 0.000659,
-    0.000608, 0.000560, 0.000517, 0.000476,
-    0.000439, 0.000405, 0.000373, 0.000344,
-    0.000317, 0.000292, 0.000270, 0.000248,
-    0.000229, 0.000211, 0.000195, 0.000179,
-    0.000165, 0.000152, 0.000140, 0.000129,
-    0.000119, 0.000110, 0.000101, 0.000093,
-    0.000086, 0.000079, 0.000073, 0.000067,
-    0.000062, 0.000057, 0.000052, 0.000048,
-    0.000044, 0.000041, 0.000038, 0.000035,
-    0.000032, 0.000029, 0.000027, 0.000025,
-    0.000023, 0.000021, 0.000019, 0.000018,
-    0.000016, 0.000015, 0.000014, 0.000013,
-    0.000012, 0.000011, 0.000010, 0.000009,
-    0.000008, 0.000008, 0.000007, 0.000007,
-    0.000006, 0.000006, 0.000005, 0.000005,
-    0.000004, 0.000004, 0.000004, 0.000003,
-    0.000003, 0.000003, 0.000003, 0.000002,
-    0.000002, 0.000002, 0.000002, 0.000002,
-    0.000002, 0.000001, 0.000001, 0.000001,
-    0.000001, 0.000001, 0.000001, 0.000001,
-    0.000001, 0.000001, 0.000001, 0.000001,
-    0.000001, 0.000001, 0.000000, 0.000000,
-  };
-  const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
-  assert(x >= 0.0);
-  return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab);
-}
-
-static double model_dist_norm(double x) {
-  // Normalized distortion
-  // This function models the normalized distortion for a Laplacian source
-  // source with given variance when quantized with a uniform quantizer
-  // with given stepsize. The closed form expression is:
-  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
-  // where x = qpstep / sqrt(variance)
-  // Note the actual distortion is Dn * variance.
-  static const double dist_tab_step = 0.25;
-  static const double dist_tab[] = {
-    0.000000, 0.005189, 0.020533, 0.045381,
-    0.078716, 0.119246, 0.165508, 0.215979,
-    0.269166, 0.323686, 0.378318, 0.432034,
-    0.484006, 0.533607, 0.580389, 0.624063,
-    0.664475, 0.701581, 0.735418, 0.766092,
-    0.793751, 0.818575, 0.840761, 0.860515,
-    0.878045, 0.893554, 0.907238, 0.919281,
-    0.929857, 0.939124, 0.947229, 0.954306,
-    0.960475, 0.965845, 0.970512, 0.974563,
-    0.978076, 0.981118, 0.983750, 0.986024,
-    0.987989, 0.989683, 0.991144, 0.992402,
-    0.993485, 0.994417, 0.995218, 0.995905,
-    0.996496, 0.997002, 0.997437, 0.997809,
-    0.998128, 0.998401, 0.998635, 0.998835,
-    0.999006, 0.999152, 0.999277, 0.999384,
-    0.999475, 0.999553, 0.999619, 0.999676,
-    0.999724, 0.999765, 0.999800, 0.999830,
-    0.999855, 0.999877, 0.999895, 0.999911,
-    0.999924, 0.999936, 0.999945, 0.999954,
-    0.999961, 0.999967, 0.999972, 0.999976,
-    0.999980, 0.999983, 0.999985, 0.999988,
-    0.999989, 0.999991, 0.999992, 0.999994,
-    0.999995, 0.999995, 0.999996, 0.999997,
-    0.999997, 0.999998, 0.999998, 0.999998,
-    0.999999, 0.999999, 0.999999, 0.999999,
-    0.999999, 0.999999, 0.999999, 1.000000,
-  };
-  const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
-  assert(x >= 0.0);
-  return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab);
-}
-
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
-                                     int *rate, int64_t *dist) {
-  // This function models the rate and distortion for a Laplacian
-  // source with given variance when quantized with a uniform quantizer
-  // with given stepsize. The closed form expression is:
-  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
-  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance)
-  vp9_clear_system_state();
-  if (var == 0 || n == 0) {
-    *rate = 0;
-    *dist = 0;
-  } else {
-    double D, R;
-    double s2 = (double) var / n;
-    double x = qstep / sqrt(s2);
-    // TODO(debargha): Make the modeling functions take (qstep^2 / s2)
-    // as argument rather than qstep / sqrt(s2) to obviate the need for
-    // the sqrt() operation.
-    D = model_dist_norm(x);
-    R = model_rate_norm(x);
-    if (R < 0) {
-      R = 0;
-      D = var;
-    }
-    *rate = (n * R * 256 + 0.5);
-    *dist = (n * D * s2 + 0.5);
-  }
-  vp9_clear_system_state();
-}
-
-static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
-                                           struct macroblockd_plane *pd) {
-  return get_block_size(plane_block_width(bsize, pd),
-                        plane_block_height(bsize, pd));
-}
-
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
-                            MACROBLOCK *x, MACROBLOCKD *xd,
-                            int *out_rate_sum, int64_t *out_dist_sum) {
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  unsigned int sse;
-  int i, rate_sum = 0;
-  int64_t dist_sum = 0;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    struct macroblock_plane *const p = &x->plane[i];
-    struct macroblockd_plane *const pd = &xd->plane[i];
-
-    // TODO(dkovalev) the same code in get_plane_block_size
-    const int bw = plane_block_width(bsize, pd);
-    const int bh = plane_block_height(bsize, pd);
-    const enum BlockSize bs = get_block_size(bw, bh);
-    int rate;
-    int64_t dist;
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                       pd->dst.buf, pd->dst.stride, &sse);
-
-    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
-
-    rate_sum += rate;
-    dist_sum += dist;
-  }
-
-  *out_rate_sum = rate_sum;
-  *out_dist_sum = dist_sum << 4;
-}
-
 static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -2564,7 +2772,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int rate4x4_y, rate4x4_y_tokenonly;
   int64_t dist4x4_y;
   int64_t err4x4 = INT64_MAX;
-  int i;
 
   vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
   ctx->skip = 0;
@@ -2597,11 +2804,14 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
     xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
+    int i;
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
+    if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
+      }
     }
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
     xd->mode_info_context->mbmi.mode = mode;