diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index af35432c44cc8998ccea332801a5fc124ad2aa69..64f14c993ed065c7fe6b0c226264a0fed2e713ec 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -71,12 +71,6 @@ static INLINE int dct_const_round_shift(int input) { return rv; } -static INLINE int dct_32_round(int input) { - int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - assert(-131072 <= rv && rv <= 131071); - return rv; -} - typedef void (*transform_1d)(int16_t*, int16_t*); typedef struct { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 17d1e801c7727f4bbf69c3cecf9471f9f4cdd24c..a405aab8d8c79ad6cf5f3e777701b23a375ad6ec 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -572,6 +572,9 @@ specialize vp9_short_fdct8x4 sse2 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct32x32 +prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct32x32_rd + prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct16x16 sse2 diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index e78f54eb4913e589b65136f506033b81633ae1f1..59cc3d95cf479a832982d953f0d4a285d5cac1f1 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -139,6 +139,9 @@ struct macroblock { int optimize; + // indicate if it is in the rd search loop or encoding process + int rd_search; + // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. 
PICK_MODE_CONTEXT ab4x4_context[4][4][4]; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 8d4eec1394bf36e8bcce7ab24e3bf189696a02fa..a90bcf5df6d6e4ce10bba6651d1d33ee2a940fd1 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -991,8 +991,18 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, } } +static INLINE int dct_32_round(int input) { + int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + assert(-131072 <= rv && rv <= 131071); + return rv; +} + +static INLINE int half_round_shift(int input) { + int rv = (input + 1 + (input < 0)) >> 2; + return rv; +} -static void dct32_1d(int *input, int *output) { +static void dct32_1d(int *input, int *output, int round) { int step[32]; // Stage 1 step[0] = input[0] + input[(32 - 1)]; @@ -1101,6 +1111,44 @@ static void dct32_1d(int *input, int *output) { step[30] = output[30] + output[25]; step[31] = output[31] + output[24]; + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. 
+ if (round) { + step[0] = half_round_shift(step[0]); + step[1] = half_round_shift(step[1]); + step[2] = half_round_shift(step[2]); + step[3] = half_round_shift(step[3]); + step[4] = half_round_shift(step[4]); + step[5] = half_round_shift(step[5]); + step[6] = half_round_shift(step[6]); + step[7] = half_round_shift(step[7]); + step[8] = half_round_shift(step[8]); + step[9] = half_round_shift(step[9]); + step[10] = half_round_shift(step[10]); + step[11] = half_round_shift(step[11]); + step[12] = half_round_shift(step[12]); + step[13] = half_round_shift(step[13]); + step[14] = half_round_shift(step[14]); + step[15] = half_round_shift(step[15]); + + step[16] = half_round_shift(step[16]); + step[17] = half_round_shift(step[17]); + step[18] = half_round_shift(step[18]); + step[19] = half_round_shift(step[19]); + step[20] = half_round_shift(step[20]); + step[21] = half_round_shift(step[21]); + step[22] = half_round_shift(step[22]); + step[23] = half_round_shift(step[23]); + step[24] = half_round_shift(step[24]); + step[25] = half_round_shift(step[25]); + step[26] = half_round_shift(step[26]); + step[27] = half_round_shift(step[27]); + step[28] = half_round_shift(step[28]); + step[29] = half_round_shift(step[29]); + step[30] = half_round_shift(step[30]); + step[31] = half_round_shift(step[31]); + } + // Stage 4 output[0] = step[0] + step[3]; output[1] = step[1] + step[2]; @@ -1283,12 +1331,12 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { int output[32 * 32]; // Columns - for (i = 0; i < 32; i++) { + for (i = 0; i < 32; ++i) { int temp_in[32], temp_out[32]; - for (j = 0; j < 32; j++) + for (j = 0; j < 32; ++j) temp_in[j] = input[j * shortpitch + i] << 2; - dct32_1d(temp_in, temp_out); - for (j = 0; j < 32; j++) + dct32_1d(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } @@ -1297,8 +1345,37 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { int temp_in[32], 
temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - dct32_1d(temp_in, temp_out); + dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } } + +// Note that although we use dct_32_round in dct32_1d computation flow, +// this 2d fdct32x32 for rate-distortion optimization loop is operating +// within 16 bits precision. +void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { + int shortpitch = pitch >> 1; + int i, j; + int output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = input[j * shortpitch + i] << 2; + dct32_1d(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + dct32_1d(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) + out[j + i * 32] = temp_out[j]; + } +} diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f0f2ef5d3779ae9b050affea93172883cf353ae8..54b6e24404bea0f92e3cbee268ec6554d5ef66df 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -602,6 +602,8 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + x->rd_search = 1; + if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) return; @@ -1974,6 +1976,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, const int mis = cm->mode_info_stride; const int bwl = mi_width_log2(bsize); const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize); + x->rd_search = 0; if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 
f0202450eefc7e0ef0249742fd9265268da35d6e..4f45496df0654ee73534d667968fb888c001b45f 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -462,7 +462,10 @@ static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, switch (ss_txfrm_size / 2) { case TX_32X32: - vp9_short_fdct32x32(src_diff, coeff, bw * 2); + if (x->rd_search) + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2); + else + vp9_short_fdct32x32(src_diff, coeff, bw * 2); break; case TX_16X16: tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;