diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index af91eeb61e944bd343e2908b92b33debf73f069c..fb5a58a4c8288079c8a0b576e32c29604df72b95 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -122,13 +122,6 @@ typedef enum { #define WHT_UPSCALE_FACTOR 2 -#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2) - -#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \ - (c)->fc.tx_probs_8x8p : \ - (b) < BLOCK_SIZE_SB32X32 ? \ - (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a27e6b09e3342082f7854788987c5ed1c17c12fd..2c833ab536e085881daf734437fcc3dda778f35b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1678,6 +1678,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { + // printf("Switching to lossless\n"); cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; @@ -1687,6 +1688,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->zbin_mode_boost_enabled = 0; cpi->common.txfm_mode = ONLY_4X4; } else { + // printf("Not lossless\n"); cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; @@ -1695,7 +1697,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { } static void switch_txfm_mode(VP9_COMP *cpi) { - if (cpi->sf.use_largest_txform && + if (cpi->sf.tx_size_search_method == USE_LARGESTALL && cpi->common.txfm_mode >= ALLOW_32X32) cpi->common.txfm_mode = ALLOW_32X32; } @@ -1728,6 +1730,7 @@ static void encode_frame_internal(VP9_COMP 
*cpi) { vp9_zero(cm->fc.switchable_interp_count); vp9_zero(cpi->best_switchable_interp_count); + vp9_zero(cpi->txfm_stepdown_count); xd->mode_info_context = cm->mi; xd->prev_mode_info_context = cm->prev_mi; @@ -1930,6 +1933,47 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { } } +static int get_frame_type(VP9_COMP *cpi) { + int frame_type; + if (cpi->common.frame_type == KEY_FRAME) + frame_type = 0; + else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) + frame_type = 3; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + frame_type = 1; + else + frame_type = 2; + return frame_type; +} + +static void select_txfm_mode(VP9_COMP *cpi) { + if (cpi->oxcf.lossless) { + cpi->common.txfm_mode = ONLY_4X4; + } else if (cpi->common.current_video_frame == 0) { + cpi->common.txfm_mode = TX_MODE_SELECT; + } else { + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + int frame_type = get_frame_type(cpi); + cpi->common.txfm_mode = + cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] + > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? + ALLOW_32X32 : TX_MODE_SELECT; + } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + cpi->common.txfm_mode = ALLOW_32X32; + } else { + unsigned int total = 0; + int i; + for (i = 0; i < TX_SIZE_MAX_SB; ++i) + total += cpi->txfm_stepdown_count[i]; + if (total) { + double fraction = (double)cpi->txfm_stepdown_count[0] / total; + cpi->common.txfm_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT; + // printf("fraction = %f\n", fraction); + } // else keep unchanged + } + } +} + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON * const cm = &cpi->common; @@ -1940,7 +1984,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behaviour is where the ALT ref buffer has oppositie sign bias to // the other two. 
if ((cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) || (cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[LAST_FRAME])) { cm->allow_comp_inter_inter = 0; @@ -1952,9 +1996,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { } if (cpi->sf.RD) { - int i, frame_type, pred_type; - TXFM_MODE txfm_type; - + int i, pred_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -1964,26 +2006,19 @@ void vp9_encode_frame(VP9_COMP *cpi) { * that for subsequent frames. * It does the same analysis for transform size selection also. */ - if (cpi->common.frame_type == KEY_FRAME) - frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) - frame_type = 3; - else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - frame_type = 1; - else - frame_type = 2; + int frame_type = get_frame_type(cpi); /* prediction (compound, single or hybrid) mode selection */ if (frame_type == 3 || !cm->allow_comp_inter_inter) pred_type = SINGLE_PREDICTION_ONLY; else if (cpi->rd_prediction_type_threshes[frame_type][1] - > cpi->rd_prediction_type_threshes[frame_type][0] - && cpi->rd_prediction_type_threshes[frame_type][1] - > cpi->rd_prediction_type_threshes[frame_type][2] - && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + > cpi->rd_prediction_type_threshes[frame_type][0] + && cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][2] + && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) pred_type = COMP_PREDICTION_ONLY; else if (cpi->rd_prediction_type_threshes[frame_type][0] - > cpi->rd_prediction_type_threshes[frame_type][2]) + > cpi->rd_prediction_type_threshes[frame_type][2]) pred_type = SINGLE_PREDICTION_ONLY; else pred_type = HYBRID_PREDICTION; @@ -1992,43 +2027,10 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->mb.e_mbd.lossless = 0; 
if (cpi->oxcf.lossless) { - txfm_type = ONLY_4X4; cpi->mb.e_mbd.lossless = 1; - } else -#if 0 - /* FIXME (rbultje): this code is disabled until we support cost updates - * while a frame is being encoded; the problem is that each time we - * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities - * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging - * further behind and not being chosen for subsequent frames either. This - * is essentially a local minimum problem that we can probably fix by - * estimating real costs more closely within a frame, perhaps by re- - * calculating costs on-the-fly as frame encoding progresses. */ - if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = TX_MODE_SELECT; - } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] - && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] - ) { - txfm_type = ONLY_4X4; - } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = ALLOW_16X16; - } else - txfm_type = ALLOW_8X8; -#else - txfm_type = - cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] - > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? 
- ALLOW_32X32 : TX_MODE_SELECT; -#endif - cpi->common.txfm_mode = txfm_type; + } + + select_txfm_mode(cpi); cpi->common.comp_pred_mode = pred_type; encode_frame_internal(cpi); @@ -2043,7 +2045,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { int diff; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, - 2048 * (TX_SIZE_MAX_SB - 1), 0); + 2048 * (TX_SIZE_MAX_SB - 1), 0); diff = (int) (pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; @@ -2102,7 +2104,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->common.txfm_mode = ALLOW_8X8; reset_skip_txfm_size(cpi, TX_8X8); } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 - && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { cpi->common.txfm_mode = ONLY_4X4; reset_skip_txfm_size(cpi, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 03d8ea3027f8b95a5bd663ff48efc3200bbff706..49582b264f2e16015af916295813946de70b7b81 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -701,7 +701,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4; sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = 0; - sf->use_largest_txform = 0; + sf->tx_size_search_method = USE_FULL_RD; sf->use_8tap_always = 0; sf->use_avoid_tested_higherror = 0; sf->skip_lots_of_modes = 0; @@ -744,17 +744,15 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (speed == 1) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->less_rectangular_check = 1; - sf->use_largest_txform = !(cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0); - + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? 
+ USE_FULL_RD : + USE_LARGESTINTRA); sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; } if (speed == 2) { - sf->use_largest_txform = !(cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0); sf->adjust_thresholds_by_speed = 1; sf->less_rectangular_check = 1; sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; @@ -763,15 +761,30 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lastframe_partitioning = 1; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); } if (speed == 3) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->partition_by_variance = 1; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); } if (speed == 4) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->use_one_partition_size_always = 1; sf->always_this_block_size = BLOCK_SIZE_MB16X16; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? 
+ USE_FULL_RD : + USE_LARGESTALL); } /* if (speed == 2) { @@ -788,7 +801,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { } */ - break; + break; }; /* switch */ diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 153abbbe0412b4c1219130199a974f4691eb0099..bc1e54b0e50022b34a934a3152b9d1cb90ae5e0c 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -200,6 +200,13 @@ typedef enum { HEX = 2 } SEARCH_METHODS; +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTINTRA, + USE_LARGESTINTRA_MODELINTER, + USE_LARGESTALL +} TX_SIZE_SEARCH_METHOD; + typedef struct { int RD; SEARCH_METHODS search_method; @@ -219,7 +226,7 @@ typedef struct { int adaptive_rd_thresh; int skip_encode_sb; int use_lastframe_partitioning; - int use_largest_txform; + TX_SIZE_SEARCH_METHOD tx_size_search_method; int use_8tap_always; int use_avoid_tested_higherror; int skip_lots_of_modes; @@ -589,6 +596,8 @@ typedef struct VP9_COMP { [VP9_SWITCHABLE_FILTERS]; unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS]; + unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB]; + int initial_width; int initial_height; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 22a2c41218e2ed4f33262b4a2dc523e6f8884366..38460a5f5417cd6193bb1a45b7a1df8dcbdac653 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -279,6 +279,242 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } } +static enum BlockSize get_block_size(int bw, int bh) { + if (bw == 4 && bh == 4) + return BLOCK_4X4; + + if (bw == 4 && bh == 8) + return BLOCK_4X8; + + if (bw == 8 && bh == 4) + return BLOCK_8X4; + + if (bw == 8 && bh == 8) + return BLOCK_8X8; + + if (bw == 8 && bh == 16) + return BLOCK_8X16; + + if (bw == 16 && bh == 8) + return BLOCK_16X8; + + if (bw == 16 && bh == 16) + return BLOCK_16X16; + + if (bw == 32 && bh == 32) + return BLOCK_32X32; + + if (bw == 32 && bh == 16) + return BLOCK_32X16; + + if (bw == 16 && bh == 32) + return BLOCK_16X32; + + 
if (bw == 64 && bh == 32) + return BLOCK_64X32; + + if (bw == 32 && bh == 64) + return BLOCK_32X64; + + if (bw == 64 && bh == 64) + return BLOCK_64X64; + + assert(0); + return -1; +} + +static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, + struct macroblockd_plane *pd) { + return get_block_size(plane_block_width(bsize, pd), + plane_block_height(bsize, pd)); +} + +static double linear_interpolate(double x, int ntab, int inv_step, + const double *tab) { + double y = x * inv_step; + int d = (int) y; + if (d >= ntab - 1) { + return tab[ntab - 1]; + } else { + double a = y - d; + return tab[d] * (1 - a) + tab[d + 1] * a; + } +} + +static double model_rate_norm(double x) { + // Normalized rate + // This function models the rate for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. 
+ static const int inv_rate_tab_step = 8; + static const double rate_tab[] = { + 64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194, + 2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206, + 1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708, + 0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412, + 0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236, + 0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132, + 0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073, + 0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040, + 0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022, + 0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012, + 0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006, + 0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003, + 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, + 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, + 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000, + }; + const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]); + assert(x >= 0.0); + return linear_interpolate(x, rate_tab_num, inv_rate_tab_step, rate_tab); +} + +static double model_dist_norm(double x) { + // Normalized distortion + // This function models the normalized distortion for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance) + // Note the actual distortion is Dn * variance. 
+ static const int inv_dist_tab_step = 8; + static const double dist_tab[] = { + 0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061, + 0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242, + 0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458, + 0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645, + 0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780, + 0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870, + 0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925, + 0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957, + 0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976, + 0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987, + 0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993, + 0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996, + 0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998, + 0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000, + }; + const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]); + assert(x >= 0.0); + return linear_interpolate(x, dist_tab_num, inv_dist_tab_step, dist_tab); +} + +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
+  vp9_clear_system_state();
+  if (var == 0 || n == 0) {
+    *rate = 0;
+    *dist = 0;
+  } else {
+    double D, R;
+    double s2 = (double) var / n;
+    double x = qstep / sqrt(s2);
+    D = model_dist_norm(x);
+    R = model_rate_norm(x);
+    if (R < 0) {
+      R = 0;
+      D = var;
+    }
+    *rate = (n * R * 256 + 0.5);
+    *dist = (n * D * s2 + 0.5);
+  }
+  vp9_clear_system_state();
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum) {
+  // Sums the modeled rate and distortion of every plane of the block.
+  // Our transform coeffs, and hence the quantizer step, are 8 times an
+  // orthogonal transform, so the step is divided by 8 before modeling.
+  int i, rate_sum = 0;
+  int64_t dist_sum = 0;  // 64-bit: holds int64_t plane dists, then << 4
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, &sse);
+    // sse works better than var, since there is no dc prediction used
+    model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}
+
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                                 TX_SIZE tx_size,
+                                 MACROBLOCK *x, MACROBLOCKD *xd,
+                                 int *out_rate_sum, int64_t *out_dist_sum,
+                                 int *out_skip) {
+  int t, j, k;
+  enum BlockSize bs;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  int rate_sum = 0;
+  int64_t dist_sum = 0;
+
+  if (tx_size == TX_4X4) {
+
bs = BLOCK_4X4;
+    t = 4;
+  } else if (tx_size == TX_8X8) {
+    bs = BLOCK_8X8;
+    t = 8;
+  } else if (tx_size == TX_16X16) {
+    bs = BLOCK_16X16;
+    t = 16;
+  } else {
+    // Only TX_32X32 remains. Making it the final branch keeps bs and t
+    // initialized even when assert() is compiled out (NDEBUG).
+    assert(tx_size == TX_32X32);
+    bs = BLOCK_32X32;
+    t = 32;
+  }
+  assert(bs <= get_block_size(bw, bh));
+  *out_skip = 1;
+  for (j = 0; j < bh; j+=t) {
+    for (k = 0; k < bw; k+=t) {
+      int rate;
+      int64_t dist;
+      unsigned int sse;
+      (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
+                                p->src.stride,
+                                pd->dst.buf + j * pd->dst.stride + k,
+                                pd->dst.stride, &sse);
+      // sse works better than var, since there is no dc prediction used
+      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+                               &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+      *out_skip &= (rate < 1024);
+    }
+  }
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = (dist_sum << 4);
+}
+
 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
@@ -423,12 +659,199 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   return cost;
 }
 
+struct rdcost_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  int bw;
+  int bh;
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int64_t best_rd;
+  int skip;
+};
+
+static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];  // honor plane arg
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int64_t this_sse;
+  int shift = args->tx_size == TX_32X32 ?
0 : 2; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse += this_sse >> shift; +} + +static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + int x_idx, y_idx; + MACROBLOCKD * const xd = &args->x->e_mbd; + + txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, + &y_idx); + + args->rate += cost_coeffs(args->cm, args->x, plane, block, + xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->bw * args->bh); +} + + +static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD * const xd = &x->e_mbd; + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + const int bw = 1 << bwl, bh = 1 << bhl; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, 0, 0 }; + + vpx_memcpy(&args.t_above, xd->plane[plane].above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, xd->plane[plane].left_context, + sizeof(ENTROPY_CONTEXT) * bh); + + foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); + return args.rate; +} + +static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + int cost = 0, plane; + + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + cost += rdcost_plane(cm, x, plane, bsize, tx_size); + } + return cost; +} + +static int block_error(int16_t *coeff, int16_t *dqcoeff, + int block_size, int shift) { + int i; + int64_t error = 0; + + for (i = 0; i < block_size; i++) { + int this_diff = coeff[i] - dqcoeff[i]; + error += (unsigned)this_diff * this_diff; + } + error >>= shift; + + 
return error > INT_MAX ? INT_MAX : (int)error;
+}
+
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                           int shift, int64_t *sse) {
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e > INT_MAX ? INT_MAX : (int)e;  // clamp like block_error()
+}
+
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+                                int shift, int64_t *sse) {
+  int64_t sum = 0, this_sse;
+  int plane;
+
+  *sse = 0;
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    struct macroblockd_plane *p = &x->e_mbd.plane[plane];
+    const int bw = plane_block_width(bsize, p);
+    const int bh = plane_block_height(bsize, p);
+    sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                           bw * bh, &this_sse);
+    *sse += this_sse;
+  }
+  *sse >>= shift;
+  return sum >> shift;
+}
+
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args *args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct encode_b_args encode_args = {args->cm, x, NULL};
+
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args);
+  else
+    xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args);
+
+  dist_block(plane, block, bsize, ss_txfrm_size, args);
+  rate_block(plane, block, bsize, ss_txfrm_size, args);
+}
+
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int64_t *distortion,
+                                     int *skippable, int64_t *sse,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y;
+  const int bw = 1 << bwl,
bh = 1 << bhl; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, 0, 0 }; + xd->mode_info_context->mbmi.txfm_size = tx_size; + vpx_memcpy(&args.t_above, pd->above_context, sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, pd->left_context, sizeof(ENTROPY_CONTEXT) * bh); + + foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); + *distortion = args.dist; + *rate = args.rate; + *sse = args.sse; + *skippable = vp9_sby_is_skippable(xd, bsize); +} + +static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + if (max_txfm_size == TX_32X32 && + (cm->txfm_mode == ALLOW_32X32 || + cm->txfm_mode == TX_MODE_SELECT)) { + mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->txfm_mode == ALLOW_16X16 || + cm->txfm_mode == ALLOW_32X32 || + cm->txfm_mode == TX_MODE_SELECT)) { + mbmi->txfm_size = TX_16X16; + } else if (cm->txfm_mode != ONLY_4X4) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], bs, + mbmi->txfm_size); + cpi->txfm_stepdown_count[0]++; +} + static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, int64_t *d, int64_t *distortion, int *s, int *skip, int64_t txfm_cache[NB_TXFM_MODES], - TX_SIZE max_txfm_size) { + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -502,137 +925,122 @@ static void 
choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, else txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? rd[TX_4X4][1] : rd[TX_8X8][1]; -} - -static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int shift, int64_t *sse) { - int64_t sum = 0, this_sse; - int plane; - *sse = 0; - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - struct macroblockd_plane *p = &x->e_mbd.plane[plane]; - const int bw = plane_block_width(bsize, p); - const int bh = plane_block_height(bsize, p); - sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - bw * bh, &this_sse); - *sse += this_sse; + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] < rd[TX_16X16][1] && + rd[TX_32X32][1] < rd[TX_8X8][1] && + rd[TX_32X32][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] < rd[TX_8X8][1] && + rd[TX_16X16][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; } - *sse >>= shift; - return sum >> shift; } -struct rdcost_block_args { - VP9_COMMON *cm; - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int rate; - int64_t dist; - int64_t sse; - int64_t best_rd; - int skip; -}; - -static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - struct rdcost_block_args* args = arg; - MACROBLOCK* const x = args->x; - MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; - int64_t this_sse; - int shift = args->tx_size == TX_32X32 ? 
0 : 2; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, - &this_sse) >> shift; - args->sse += this_sse >> shift; -} - -static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - struct rdcost_block_args* args = arg; - int x_idx, y_idx; - MACROBLOCKD * const xd = &args->x->e_mbd; - - txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, - &y_idx); - - args->rate += cost_coeffs(args->cm, args->x, plane, block, - xd->plane[plane].plane_type, args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->bw * args->bh); -} - -static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, - 0, 0, 0, 0, 0 }; - - vpx_memcpy(&args.t_above, xd->plane[plane].above_context, - sizeof(ENTROPY_CONTEXT) * bw); - vpx_memcpy(&args.t_left, xd->plane[plane].left_context, - sizeof(ENTROPY_CONTEXT) * bh); - - foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); +static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, + int (*r)[2], int *rate, + int64_t *d, int64_t *distortion, + int *s, int *skip, int64_t *sse, + BLOCK_SIZE_TYPE bs, + int *model_used) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP); + int64_t rd[TX_SIZE_MAX_SB][2]; + int n, 
m; + int s0, s1; + double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00}; + // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00}; - return args.rate; -} + const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); -static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - int cost = 0, plane; + // for (n = TX_4X4; n <= max_txfm_size; n++) + // r[n][0] = (r[n][0] * scale_r[n]); - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - cost += rdcost_plane(cm, x, plane, bsize, tx_size); + for (n = TX_4X4; n <= max_txfm_size; n++) { + r[n][1] = r[n][0]; + for (m = 0; m <= n - (n == max_txfm_size); m++) { + if (m == n) + r[n][1] += vp9_cost_zero(tx_probs[m]); + else + r[n][1] += vp9_cost_one(tx_probs[m]); + } } - return cost; -} -static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - struct rdcost_block_args *args = arg; - MACROBLOCK *const x = args->x; - MACROBLOCKD *const xd = &x->e_mbd; - struct encode_b_args encode_args = {args->cm, x, NULL}; + assert(skip_prob > 0); + s0 = vp9_cost_bit(skip_prob, 0); + s1 = vp9_cost_bit(skip_prob, 1); - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args); - else - xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args); + for (n = TX_4X4; n <= max_txfm_size; n++) { + if (s[n]) { + rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); + } else { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); + } + } + for (n = TX_4X4; n <= max_txfm_size; n++) { + rd[n][0] = (scale_rd[n] * rd[n][0]); + rd[n][1] = (scale_rd[n] * rd[n][1]); + } - dist_block(plane, block, bsize, ss_txfrm_size, args); - rate_block(plane, block, bsize, ss_txfrm_size, args); -} + if (max_txfm_size == TX_32X32 && + (cm->txfm_mode == ALLOW_32X32 || + (cm->txfm_mode == TX_MODE_SELECT && + 
rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->txfm_mode == ALLOW_16X16 || + cm->txfm_mode == ALLOW_32X32 || + (cm->txfm_mode == TX_MODE_SELECT && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_16X16; + } else if (cm->txfm_mode == ALLOW_8X8 || + cm->txfm_mode == ALLOW_16X16 || + cm->txfm_mode == ALLOW_32X32 || + (cm->txfm_mode == TX_MODE_SELECT && + rd[TX_8X8][1] <= rd[TX_4X4][1])) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } -static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, - 0, 0, 0, 0, 0 }; - xd->mode_info_context->mbmi.txfm_size = tx_size; - vpx_memcpy(&args.t_above, pd->above_context, sizeof(ENTROPY_CONTEXT) * bw); - vpx_memcpy(&args.t_left, pd->left_context, sizeof(ENTROPY_CONTEXT) * bh); + if (model_used[mbmi->txfm_size]) { + // Actually encode using the chosen mode if a model was used, but do not + // update the r, d costs + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], bs, mbmi->txfm_size); + } else { + *distortion = d[mbmi->txfm_size]; + *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT]; + *skip = s[mbmi->txfm_size]; + } - foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); - *distortion = args.dist; - *rate = args.rate; - *sse = args.sse; - *skippable = vp9_sby_is_skippable(xd, bsize); + if 
(max_txfm_size == TX_32X32 && + rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + } } static void super_block_yrd(VP9_COMP *cpi, @@ -649,38 +1057,67 @@ static void super_block_yrd(VP9_COMP *cpi, if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sby(x, bs); - if (cpi->sf.use_largest_txform) { - if (bs >= BLOCK_SIZE_SB32X32) { - mbmi->txfm_size = TX_32X32; - } else if (bs >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (bs >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = TX_8X8; - } else { - mbmi->txfm_size = TX_4X4; - } + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || + (cpi->sf.tx_size_search_method != USE_FULL_RD && + mbmi->ref_frame[0] == INTRA_FRAME)) { vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs, - mbmi->txfm_size); + choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, bs); if (psse) - *psse = sse[0]; + *psse = sse[mbmi->txfm_size]; return; } - if (bs >= BLOCK_SIZE_SB32X32) - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - &sse[TX_32X32], bs, TX_32X32); - if (bs >= BLOCK_SIZE_MB16X16) - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - &sse[TX_16X16], bs, TX_16X16); - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], bs, TX_8X8); - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], bs, TX_4X4); - - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, - 
TX_32X32 - (bs < BLOCK_SIZE_SB32X32) - - (bs < BLOCK_SIZE_MB16X16)); + + if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && + mbmi->ref_frame[0] > INTRA_FRAME) { + int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1}; + if (bs >= BLOCK_SIZE_SB32X32) { + if (model_used[TX_32X32]) { + model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, + &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32); + } + } + if (bs >= BLOCK_SIZE_MB16X16) { + if (model_used[TX_16X16]) { + model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, + &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16); + } + } + if (model_used[TX_8X8]) { + model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, + &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], bs, TX_8X8); + } + if (model_used[TX_4X4]) { + model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, + &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], bs, TX_4X4); + } + choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, + skip, sse, bs, model_used); + } else { + if (bs >= BLOCK_SIZE_SB32X32) + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], bs, TX_32X32); + if (bs >= BLOCK_SIZE_MB16X16) + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], bs, TX_16X16); + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], bs, TX_8X8); + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], bs, TX_4X4); + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); + } if (psse) *psse = 
sse[mbmi->txfm_size]; } @@ -909,8 +1346,10 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } - for (i = 0; i < NB_TXFM_MODES; i++) - txfm_cache[i] = INT64_MAX; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) + txfm_cache[i] = INT64_MAX; + } /* Y Search for 32x32 intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { @@ -943,11 +1382,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = s; } - for (i = 0; i < NB_TXFM_MODES; i++) { - int64_t adj_rd = this_rd + local_txfm_cache[i] - - local_txfm_cache[cpi->common.txfm_mode]; - if (adj_rd < txfm_cache[i]) { - txfm_cache[i] = adj_rd; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) { + int64_t adj_rd = this_rd + local_txfm_cache[i] - + local_txfm_cache[cpi->common.txfm_mode]; + if (adj_rd < txfm_cache[i]) { + txfm_cache[i] = adj_rd; + } } } } @@ -1246,50 +1687,6 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { return r; } -static enum BlockSize get_block_size(int bw, int bh) { - if (bw == 4 && bh == 4) - return BLOCK_4X4; - - if (bw == 4 && bh == 8) - return BLOCK_4X8; - - if (bw == 8 && bh == 4) - return BLOCK_8X4; - - if (bw == 8 && bh == 8) - return BLOCK_8X8; - - if (bw == 8 && bh == 16) - return BLOCK_8X16; - - if (bw == 16 && bh == 8) - return BLOCK_16X8; - - if (bw == 16 && bh == 16) - return BLOCK_16X16; - - if (bw == 32 && bh == 32) - return BLOCK_32X32; - - if (bw == 32 && bh == 16) - return BLOCK_32X16; - - if (bw == 16 && bh == 32) - return BLOCK_16X32; - - if (bw == 64 && bh == 32) - return BLOCK_64X32; - - if (bw == 32 && bh == 64) - return BLOCK_32X64; - - if (bw == 64 && bh == 64) - return BLOCK_64X64; - - assert(0); - return -1; -} - static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; x->plane[0].src.buf = @@ -1837,195 +2234,6 @@ static 
YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { return scaled_ref_frame; } -static double linear_interpolate(double x, int ntab, double step, - const double *tab) { - double y = x / step; - int d = (int) y; - double a = y - d; - if (d >= ntab - 1) - return tab[ntab - 1]; - else - return tab[d] * (1 - a) + tab[d + 1] * a; -} - -static double model_rate_norm(double x) { - // Normalized rate - // This function models the rate for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expressions are in: - // Hang and Chen, "Source Model for transform video coder and its - // application - Part I: Fundamental Theory", IEEE Trans. Circ. - // Sys. for Video Tech., April 1997. - static const double rate_tab_step = 0.125; - static const double rate_tab[] = { - 256.0000, 4.944453, 3.949276, 3.371593, - 2.965771, 2.654550, 2.403348, 2.193612, - 2.014208, 1.857921, 1.719813, 1.596364, - 1.484979, 1.383702, 1.291025, 1.205767, - 1.126990, 1.053937, 0.985991, 0.922644, - 0.863472, 0.808114, 0.756265, 0.707661, - 0.662070, 0.619287, 0.579129, 0.541431, - 0.506043, 0.472828, 0.441656, 0.412411, - 0.384980, 0.359260, 0.335152, 0.312563, - 0.291407, 0.271600, 0.253064, 0.235723, - 0.219508, 0.204351, 0.190189, 0.176961, - 0.164611, 0.153083, 0.142329, 0.132298, - 0.122945, 0.114228, 0.106106, 0.098541, - 0.091496, 0.084937, 0.078833, 0.073154, - 0.067872, 0.062959, 0.058392, 0.054147, - 0.050202, 0.046537, 0.043133, 0.039971, - 0.037036, 0.034312, 0.031783, 0.029436, - 0.027259, 0.025240, 0.023367, 0.021631, - 0.020021, 0.018528, 0.017145, 0.015863, - 0.014676, 0.013575, 0.012556, 0.011612, - 0.010738, 0.009929, 0.009180, 0.008487, - 0.007845, 0.007251, 0.006701, 0.006193, - 0.005722, 0.005287, 0.004884, 0.004512, - 0.004168, 0.003850, 0.003556, 0.003284, - 0.003032, 0.002800, 0.002585, 0.002386, - 0.002203, 0.002034, 0.001877, 0.001732, - 0.001599, 0.001476, 0.001362, 0.001256, - 
0.001159, 0.001069, 0.000987, 0.000910, - 0.000840, 0.000774, 0.000714, 0.000659, - 0.000608, 0.000560, 0.000517, 0.000476, - 0.000439, 0.000405, 0.000373, 0.000344, - 0.000317, 0.000292, 0.000270, 0.000248, - 0.000229, 0.000211, 0.000195, 0.000179, - 0.000165, 0.000152, 0.000140, 0.000129, - 0.000119, 0.000110, 0.000101, 0.000093, - 0.000086, 0.000079, 0.000073, 0.000067, - 0.000062, 0.000057, 0.000052, 0.000048, - 0.000044, 0.000041, 0.000038, 0.000035, - 0.000032, 0.000029, 0.000027, 0.000025, - 0.000023, 0.000021, 0.000019, 0.000018, - 0.000016, 0.000015, 0.000014, 0.000013, - 0.000012, 0.000011, 0.000010, 0.000009, - 0.000008, 0.000008, 0.000007, 0.000007, - 0.000006, 0.000006, 0.000005, 0.000005, - 0.000004, 0.000004, 0.000004, 0.000003, - 0.000003, 0.000003, 0.000003, 0.000002, - 0.000002, 0.000002, 0.000002, 0.000002, - 0.000002, 0.000001, 0.000001, 0.000001, - 0.000001, 0.000001, 0.000001, 0.000001, - 0.000001, 0.000001, 0.000001, 0.000001, - 0.000001, 0.000001, 0.000000, 0.000000, - }; - const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]); - assert(x >= 0.0); - return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab); -} - -static double model_dist_norm(double x) { - // Normalized distortion - // This function models the normalized distortion for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance) - // Note the actual distortion is Dn * variance. 
- static const double dist_tab_step = 0.25; - static const double dist_tab[] = { - 0.000000, 0.005189, 0.020533, 0.045381, - 0.078716, 0.119246, 0.165508, 0.215979, - 0.269166, 0.323686, 0.378318, 0.432034, - 0.484006, 0.533607, 0.580389, 0.624063, - 0.664475, 0.701581, 0.735418, 0.766092, - 0.793751, 0.818575, 0.840761, 0.860515, - 0.878045, 0.893554, 0.907238, 0.919281, - 0.929857, 0.939124, 0.947229, 0.954306, - 0.960475, 0.965845, 0.970512, 0.974563, - 0.978076, 0.981118, 0.983750, 0.986024, - 0.987989, 0.989683, 0.991144, 0.992402, - 0.993485, 0.994417, 0.995218, 0.995905, - 0.996496, 0.997002, 0.997437, 0.997809, - 0.998128, 0.998401, 0.998635, 0.998835, - 0.999006, 0.999152, 0.999277, 0.999384, - 0.999475, 0.999553, 0.999619, 0.999676, - 0.999724, 0.999765, 0.999800, 0.999830, - 0.999855, 0.999877, 0.999895, 0.999911, - 0.999924, 0.999936, 0.999945, 0.999954, - 0.999961, 0.999967, 0.999972, 0.999976, - 0.999980, 0.999983, 0.999985, 0.999988, - 0.999989, 0.999991, 0.999992, 0.999994, - 0.999995, 0.999995, 0.999996, 0.999997, - 0.999997, 0.999998, 0.999998, 0.999998, - 0.999999, 0.999999, 0.999999, 0.999999, - 0.999999, 0.999999, 0.999999, 1.000000, - }; - const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]); - assert(x >= 0.0); - return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab); -} - -static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int64_t *dist) { - // This function models the rate and distortion for a Laplacian - // source with given variance when quantized with a uniform quantizer - // with given stepsize. 
The closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance) - vp9_clear_system_state(); - if (var == 0 || n == 0) { - *rate = 0; - *dist = 0; - } else { - double D, R; - double s2 = (double) var / n; - double x = qstep / sqrt(s2); - // TODO(debargha): Make the modeling functions take (qstep^2 / s2) - // as argument rather than qstep / sqrt(s2) to obviate the need for - // the sqrt() operation. - D = model_dist_norm(x); - R = model_rate_norm(x); - if (R < 0) { - R = 0; - D = var; - } - *rate = (n * R * 256 + 0.5); - *dist = (n * D * s2 + 0.5); - } - vp9_clear_system_state(); -} - -static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, - struct macroblockd_plane *pd) { - return get_block_size(plane_block_width(bsize, pd), - plane_block_height(bsize, pd)); -} - -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum) { - // Note our transform coeffs are 8 times an orthogonal transform. - // Hence quantizer step is also 8 times. To get effective quantizer - // we need to divide by 8 before sending to modeling function. 
- unsigned int sse; - int i, rate_sum = 0; - int64_t dist_sum = 0; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); - const enum BlockSize bs = get_block_size(bw, bh); - int rate; - int64_t dist; - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - - model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist); - - rate_sum += rate; - dist_sum += dist; - } - - *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum << 4; -} - static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -2564,7 +2772,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int rate4x4_y, rate4x4_y_tokenonly; int64_t dist4x4_y; int64_t err4x4 = INT64_MAX; - int i; vpx_memset(&txfm_cache,0,sizeof(txfm_cache)); ctx->skip = 0; @@ -2597,11 +2804,14 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); xd->mode_info_context->mbmi.txfm_size = TX_4X4; } else { + int i; *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist_y + (dist_uv >> 2); - for (i = 0; i < NB_TXFM_MODES; i++) { - ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode]; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) { + ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode]; + } } xd->mode_info_context->mbmi.txfm_size = txfm_size; xd->mode_info_context->mbmi.mode = mode;