From 5ade423774690e2bf877559dc1a1f9547db5dac1 Mon Sep 17 00:00:00 2001 From: Deb Mukherjee <debargha@google.com> Date: Tue, 5 Nov 2013 17:25:38 -0800 Subject: [PATCH] Removes conditional statements from band getting Implements scan order to band map with arrays in both the encoder and decoder to remove conditional statements. Encoding seems to be about 1% faster at speed 0, tested on football. Decoding seems to be about 0.5-1% faster on a set of 25 videos. Change-Id: Idb233ca0b9e0efd790e30880642e8717e1c5c8dd --- vp9/common/vp9_entropy.h | 11 ----- vp9/decoder/vp9_decodframe.c | 79 ++++++++++++++++++++++++++---------- vp9/decoder/vp9_detokenize.c | 23 ++++++----- vp9/decoder/vp9_detokenize.h | 3 +- vp9/decoder/vp9_onyxd_int.h | 3 +- vp9/encoder/vp9_block.h | 3 ++ vp9/encoder/vp9_encodemb.c | 12 +++--- vp9/encoder/vp9_onyx_if.c | 7 ++++ vp9/encoder/vp9_tokenize.c | 6 ++- 9 files changed, 95 insertions(+), 52 deletions(-) diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index c58e852fe1..ccb9c4c552 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -127,12 +127,6 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]; extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]; - -static int get_coef_band(const uint8_t * band_translate, int coef_index) { - return (coef_index > MAXBAND_INDEX) - ? (COEF_BANDS-1) : band_translate[coef_index]; -} - // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly @@ -181,11 +175,6 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static const uint8_t *get_band_translate(TX_SIZE tx_size) { - return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 - : vp9_coefband_trans_8x8plus; -} - static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx, const int16_t **scan, const int16_t **scan_nb) { diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index aa3903e5a0..bf70e1392a 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -45,6 +45,7 @@ typedef struct TileWorkerData { DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]); DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]); + const uint8_t *band_translate[2]; } TileWorkerData; static int read_be32(const uint8_t *p) { @@ -294,7 +295,8 @@ struct intra_args { VP9_COMMON *cm; MACROBLOCKD *xd; vp9_reader *r; - unsigned char* token_cache; + uint8_t *token_cache; + const uint8_t *band_translate[2]; }; static void predict_and_reconstruct_intra_block(int plane, int block, @@ -303,6 +305,9 @@ static void predict_and_reconstruct_intra_block(int plane, int block, struct intra_args *const args = arg; VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; + const uint8_t *band_translate[2] = { + args->band_translate[0], args->band_translate[1] + }; struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi_8x8[0]; @@ -324,7 +329,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block, if (!mi->mbmi.skip_coeff) { vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, - args->r, args->token_cache); + args->r, args->token_cache, band_translate); inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } } @@ -334,7 +339,8 @@ struct inter_args { MACROBLOCKD *xd; vp9_reader *r; int *eobtotal; - unsigned char* token_cache; + uint8_t *token_cache; + const uint8_t *band_translate[2]; }; static void reconstruct_inter_block(int plane, int block, @@ -343,10 +349,14 @@ static void reconstruct_inter_block(int plane, int block, struct inter_args *args = arg; VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; + const uint8_t *band_translate[2] = { + args->band_translate[0], args->band_translate[1] + }; *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, - args->r, args->token_cache); + args->r, args->token_cache, + band_translate); inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } @@ -398,7 +408,8 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE bsize, - unsigned char *token_cache) { + uint8_t *token_cache, + const uint8_t *band_translate[2]) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; @@ -420,7 +431,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, } if (!is_inter_block(mbmi)) { - struct intra_args arg = { cm, xd, r, token_cache }; + struct intra_args arg = { + cm, xd, r, token_cache, {band_translate[0], band_translate[1]} + }; foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, &arg); } else { @@ -438,7 +451,10 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, // Reconstruction if (!mbmi->skip_coeff) { int eobtotal = 0; - struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; + struct inter_args arg = { + cm, xd, r, &eobtotal, token_cache, + {band_translate[0], band_translate[1]} + }; foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) mbmi->skip_coeff = 1; // skip loopfilter @@ -478,7 +494,8 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader* r, BLOCK_SIZE bsize, - unsigned char *token_cache) { + uint8_t *token_cache, + const uint8_t *band_translate[2]) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; BLOCK_SIZE subsize; @@ -489,33 +506,37 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache, + band_translate); } else { switch (partition) { case PARTITION_NONE: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache, + band_translate); break; case PARTITION_HORZ: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache, + band_translate); if (mi_row + hbs < cm->mi_rows) decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, - token_cache); + token_cache, band_translate); break; case PARTITION_VERT: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache, + band_translate); if (mi_col + hbs < cm->mi_cols) decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, - token_cache); + token_cache, band_translate); break; case PARTITION_SPLIT: decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, - token_cache); + token_cache, band_translate); decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, - token_cache); + token_cache, band_translate); decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, - token_cache); + token_cache, band_translate); decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, - token_cache); + token_cache, band_translate); break; default: assert(!"Invalid partition type"); @@ -798,9 +819,13 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, vp9_zero(xd->left_context); vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) + mi_col += MI_BLOCK_SIZE) { + const uint8_t *band_translate[2] = { + vp9_coefband_trans_4x4, pbi->coefband_trans_8x8plus + }; decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, - pbi->token_cache); + pbi->token_cache, band_translate); + } if (pbi->do_loopfilter_inline) { const int lf_start = mi_row - MI_BLOCK_SIZE; @@ -948,7 +973,7 @@ static void setup_tile_macroblockd(TileWorkerData *const tile_data) { } static int tile_worker_hook(void *arg1, void *arg2) { - TileWorkerData *tile_data = (TileWorkerData*)arg1; + TileWorkerData *const tile_data = (TileWorkerData*)arg1; const TileInfo *const tile = (TileInfo*)arg2; int mi_row, mi_col; @@ -960,7 +985,8 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_modes_sb(tile_data->cm, &tile_data->xd, tile, mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, - tile_data->token_cache); + tile_data->token_cache, + tile_data->band_translate); } } return !tile_data->xd.corrupted; @@ -1019,6 +1045,8 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { tile_data->cm = cm; tile_data->xd = pbi->mb; tile_data->xd.corrupted = 0; + tile_data->band_translate[0] = vp9_coefband_trans_4x4; + tile_data->band_translate[1] = pbi->coefband_trans_8x8plus; vp9_tile_init(tile, tile_data->cm, 0, tile_col); setup_token_decoder(data, data_end, size, &cm->error, @@ -1299,6 +1327,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + vpx_memset(pbi->coefband_trans_8x8plus, + (COEF_BANDS - 1), + sizeof(pbi->coefband_trans_8x8plus)); + vpx_memcpy(pbi->coefband_trans_8x8plus, + vp9_coefband_trans_8x8plus, + sizeof(vp9_coefband_trans_8x8plus)); + if (!first_partition_size) { // showing a frame directly *p_data_end = data + 1; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index b8d670b965..65786dd8c3 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -93,7 +93,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr, TX_SIZE tx_size, const int16_t *dq, int pt, - uint8_t *token_cache) { + uint8_t *token_cache, + const uint8_t *band_translate) { const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); @@ -108,31 +109,30 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] = counts->eob_branch[tx_size][type][ref]; const int16_t *scan, *nb; - const uint8_t *const band_translate = get_band_translate(tx_size); + const uint8_t *cat6; get_scan(xd, tx_size, type, block_idx, &scan, &nb); - while (1) { + while (c < seg_eob) { int val; - const uint8_t *cat6 = cat6_prob; - if (c >= seg_eob) - break; if (c) pt = get_coef_context(nb, token_cache, c); - band = get_coef_band(band_translate, c); + band = *band_translate++; prob = coef_probs[band][pt]; if (!cm->frame_parallel_decoding_mode) ++eob_branch_count[band][pt]; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; + goto DECODE_ZERO; SKIP_START: if (c >= seg_eob) break; if (c) pt = get_coef_context(nb, token_cache, c); - band = get_coef_band(band_translate, c); + band = *band_translate++; prob = coef_probs[band][pt]; + DECODE_ZERO: if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; @@ -200,6 +200,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5); } val = 0; + cat6 = cat6_prob; while (*cat6) { val = (val << 1) | vp9_read(r, *cat6++); } @@ -218,7 +219,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, vp9_reader *r, - uint8_t *token_cache) { + uint8_t *token_cache, + const uint8_t *band_translate[2]) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, tx_size); @@ -229,7 +231,8 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block), - tx_size, pd->dequant, pt, token_cache); + tx_size, pd->dequant, pt, token_cache, + band_translate[tx_size != TX_4X4]); set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 04939ead36..9b8c17a455 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -18,6 +18,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, vp9_reader *r, - uint8_t *token_cache); + uint8_t *token_cache, + const uint8_t *band_translate[2]); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 7ad05e6b29..e29b453ff6 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -54,7 +54,8 @@ typedef struct VP9Decompressor { ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; PARTITION_CONTEXT *above_seg_context; - DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); + DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); + DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]); } VP9D_COMP; #endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 4d9a92a712..04ce1f4169 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -184,6 +184,9 @@ struct macroblock { BLOCK_SIZE sb64_partitioning; void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); + + // band cache + DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]); }; // TODO(jingning): the variables used here are little complicated. need further diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 75ed8eab75..70008103e4 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -138,7 +138,9 @@ static void optimize_b(MACROBLOCK *mb, uint8_t token_cache[1024]; const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); const int16_t *dequant_ptr = pd->dequant; - const uint8_t *const band_translate = get_band_translate(tx_size); + const uint8_t *const band_translate = (tx_size == TX_4X4 ? + vp9_coefband_trans_4x4 : + mb->coefband_trans_8x8plus); assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); @@ -179,7 +181,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = (vp9_dct_value_tokens_ptr + x)->token; /* Consider both possible successor states. */ if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] @@ -230,7 +232,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token; } if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; if (t0 != DCT_EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] @@ -264,7 +266,7 @@ static void optimize_b(MACROBLOCK *mb, /* There's no choice to make for a zero coefficient, so we don't * add a new trellis node, but we do need to update the costs. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ @@ -284,7 +286,7 @@ static void optimize_b(MACROBLOCK *mb, } /* Now pick the best path through the whole trellis. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = combine_entropy_contexts(*a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index f4106934c6..7603ac03db 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1223,6 +1223,13 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->fixed_divide[0] = 0; for (i = 1; i < 512; i++) cpi->fixed_divide[i] = 0x80000 / i; + + vpx_memset(cpi->mb.coefband_trans_8x8plus, + (COEF_BANDS-1), + sizeof(cpi->mb.coefband_trans_8x8plus)); + vpx_memcpy(cpi->mb.coefband_trans_8x8plus, + vp9_coefband_trans_8x8plus, + sizeof(vp9_coefband_trans_8x8plus)); } diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 7d4676e97b..11dd0c0af3 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -115,7 +115,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); - const uint8_t *const band_translate = get_band_translate(tx_size); + const uint8_t *const band_translate = (tx_size == TX_4X4 ? + vp9_coefband_trans_4x4 : + cpi->mb.coefband_trans_8x8plus); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); int aoff, loff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); @@ -127,7 +129,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, get_scan(xd, tx_size, type, block, &scan, &nb); c = 0; do { - const int band = get_coef_band(band_translate, c); + const int band = band_translate[c]; int token; int v = 0; rc = scan[c]; -- GitLab