Commit 0aae1000 authored by Yunqing Wang's avatar Yunqing Wang

Reuse inter prediction result in real-time speed 6

In real-time speed 6, no partition search is done. The inter
prediction results obtained during mode picking can be reused in the
subsequent encoding process. A speed feature, reuse_inter_pred_sby,
is added to enable this reuse only in speed 6.

This patch doesn't change the encoding result. RTC set tests showed
that the encoding speed gain is 2% - 5%.

Change-Id: I3884780f64ef95dd8be10562926542528713b92c
parent ab9755f3
......@@ -3364,7 +3364,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
&xd->block_refs[ref]->sf);
}
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
if (!cpi->sf.reuse_inter_pred_sby)
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
if (!x->skip) {
mbmi->skip = 1;
......
......@@ -23,6 +23,7 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rdopt.h"
......@@ -183,6 +184,22 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
*out_dist_sum += dist << 4;
}
// Claims the first idle entry among the first `len` entries of `p`.
// On success the entry is flagged in_use and its index is returned;
// when every entry is already taken (or len <= 0) the result is -1.
static int get_pred_buffer(PRED_BUFFER *p, int len) {
  int slot = 0;

  // Advance past every entry that is currently claimed.
  while (slot < len && p[slot].in_use) {
    ++slot;
  }

  if (slot >= len) {
    return -1;
  }

  p[slot].in_use = 1;
  return slot;
}
// Releases a prediction buffer entry by clearing its in_use flag, making it
// available to get_pred_buffer() again.
// A NULL pointer is accepted and ignored: callers track "no buffer held yet"
// with NULL, so releasing in that state must be a harmless no-op rather than
// a null dereference.
static void free_pred_buffer(PRED_BUFFER *p) {
  if (p != NULL) {
    p->in_use = 0;
  }
}
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// this needs various further optimizations. to be continued..
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
......@@ -229,6 +246,31 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
get_chessboard_index(cm)) % 2;
// For speed 6, the result of interp filter is reused later in actual encoding
// process.
int bh = num_4x4_blocks_high_lookup[bsize] << 2;
int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
int pixels_in_block = bh * bw;
// tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
PRED_BUFFER tmp[4];
DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
struct buf_2d orig_dst = pd->dst;
PRED_BUFFER *best_pred = NULL;
PRED_BUFFER *this_mode_pred = NULL;
int i;
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
tmp[i].stride = bw;
tmp[i].in_use = 0;
}
tmp[3].data = pd->dst.buf;
tmp[3].stride = pd->dst.stride;
tmp[3].in_use = 0;
}
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
x->skip = 0;
......@@ -324,6 +366,16 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Search for the best prediction filter type, when the resulting
// motion vector is at sub-pixel accuracy level for luma component, i.e.,
// the last three bits are not all zeros.
if (cpi->sf.reuse_inter_pred_sby) {
if (this_mode == NEARESTMV) {
this_mode_pred = &tmp[3];
} else {
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
pd->dst.buf = this_mode_pred->data;
pd->dst.stride = bw;
}
}
if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
pred_filter_search &&
((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
......@@ -334,6 +386,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
unsigned int pf_sse[3];
int64_t best_cost = INT64_MAX;
INTERP_FILTER best_filter = SWITCHABLE, filter;
PRED_BUFFER *current_pred = this_mode_pred;
for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
int64_t cost;
......@@ -345,12 +398,28 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_get_switchable_rate(cpi) + pf_rate[filter],
pf_dist[filter]);
if (cost < best_cost) {
best_filter = filter;
best_cost = cost;
skip_txfm = x->skip_txfm;
best_filter = filter;
best_cost = cost;
skip_txfm = x->skip_txfm;
if (cpi->sf.reuse_inter_pred_sby) {
if (this_mode_pred != current_pred) {
free_pred_buffer(this_mode_pred);
this_mode_pred = current_pred;
}
if (filter < EIGHTTAP_SHARP) {
current_pred = &tmp[get_pred_buffer(tmp, 3)];
pd->dst.buf = current_pred->data;
pd->dst.stride = bw;
}
}
}
}
if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
free_pred_buffer(current_pred);
mbmi->interp_filter = best_filter;
rate = pf_rate[mbmi->interp_filter];
dist = pf_dist[mbmi->interp_filter];
......@@ -451,6 +520,16 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_filter = mbmi->interp_filter;
best_ref_frame = ref_frame;
skip_txfm = x->skip_txfm;
if (cpi->sf.reuse_inter_pred_sby) {
if (best_pred != NULL)
free_pred_buffer(best_pred);
best_pred = this_mode_pred;
}
} else {
if (cpi->sf.reuse_inter_pred_sby)
free_pred_buffer(this_mode_pred);
}
if (x->skip)
......@@ -458,6 +537,19 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
// If best prediction is not in dst buf, then copy the prediction block from
// temp buf to dst buf.
if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
uint8_t *copy_from, *copy_to;
pd->dst = orig_dst;
copy_to = pd->dst.buf;
copy_from = best_pred->data;
vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
bw, bh);
}
mbmi->mode = best_mode;
mbmi->interp_filter = best_pred_filter;
......@@ -471,12 +563,21 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip && best_rd > inter_mode_thresh &&
bsize <= cpi->sf.max_intra_bsize) {
for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
if (cpi->sf.reuse_inter_pred_sby) {
pd->dst.buf = tmp[0].data;
pd->dst.stride = bw;
}
vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
mbmi->tx_size, this_mode,
&p->src.buf[0], p->src.stride,
&pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
if (cpi->sf.reuse_inter_pred_sby)
pd->dst = orig_dst;
rate += cpi->mbmode_cost[this_mode];
rate += intra_cost_penalty;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
......@@ -494,6 +595,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
}
#if CONFIG_DENOISING
vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
#endif
......
......@@ -17,6 +17,12 @@
extern "C" {
#endif
// Bookkeeping record for one candidate luma prediction buffer used by
// vp9_pick_inter_mode() when inter prediction results are reused.
typedef struct {
// Start of the pixel storage: either a slice of a temporary buffer or the
// destination plane buffer.
uint8_t *data;
// Row stride of `data` (block width for temporary slices, the destination
// stride when `data` is the destination buffer).
int stride;
// Non-zero while the entry is claimed; set by get_pred_buffer() and cleared
// by free_pred_buffer().
int in_use;
} PRED_BUFFER;
int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const struct TileInfo *const tile,
int mi_row, int mi_col,
......
......@@ -274,6 +274,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
// is checked for a partition block. Later, we can try to allow large
// partitions to do intra mode checking.
sf->max_intra_bsize = BLOCK_8X8;
// This feature is only enabled when partition search is disabled.
sf->reuse_inter_pred_sby = 1;
}
if (speed >= 7) {
......@@ -339,6 +342,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
for (i = 0; i < BLOCK_SIZES; ++i)
sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_64X64;
sf->reuse_inter_pred_sby = 0;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
sf->always_this_block_size = BLOCK_16X16;
......
......@@ -353,6 +353,11 @@ typedef struct SPEED_FEATURES {
// The threshold used in SOURCE_VAR_BASED_PARTITION search type.
unsigned int source_var_thresh;
// When partition is pre-set, the inter prediction result from pick_inter_mode
// can be reused in final block encoding process. It is enabled only for real-
// time mode speed 6.
int reuse_inter_pred_sby;
} SPEED_FEATURES;
struct VP9_COMP;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment