diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 715df98bd52bcd6a5e52fc01cc8363cda94a7afd..76f80d4d74dc07e80c21bbb39f79b21e8a6006df 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -340,7 +340,7 @@ struct buf_2d { int stride; }; -struct mb_plane { +struct macroblockd_plane { DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]); DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]); DECLARE_ALIGNED(16, uint16_t, eobs[256]); @@ -360,7 +360,7 @@ struct mb_plane { BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16)) typedef struct macroblockd { - struct mb_plane plane[MAX_MB_PLANE]; + struct macroblockd_plane plane[MAX_MB_PLANE]; /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */ BLOCKD block[24]; @@ -924,6 +924,18 @@ static INLINE void foreach_predicted_block_uv( foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); } } +static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, + int plane, int block) { + const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int stride = 4 << bw; + const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1)); + return y * stride + x; +} +static int16_t* raster_block_offset_int16(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, + int plane, int block, int16_t *base) { + return base + raster_block_offset(xd, bsize, plane, block); +} #if CONFIG_CODE_ZEROGROUP static int get_zpc_used(TX_SIZE tx_size) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 1f838a574dee2bfcd5768a0204ade58c817f9990..020841c42e710bf5d9314fa54f6c492449600849 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -617,12 +617,6 @@ prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" specialize vp9_block_error mmx sse2 vp9_block_error_sse2=vp9_block_error_xmm -prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch" -# TODO(jingning): The prototype function in c has been changed to remove -# the use of predictor buffer in MACROBLOCKD. Need to modify the mmx and sse2 -# versions accordingly. -specialize vp9_subtract_b - # # Structured Similarity (SSIM) # diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index fefa2af556de3ece9f0b4f752ef6f36d5ea067ee..91aca6f54861641160d2cf20ba35d7b361cc5a29 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -305,7 +305,7 @@ static void decode_8x8(MACROBLOCKD *xd) { static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) { BLOCKD *const b = &xd->block[idx]; - struct mb_plane *const y = &xd->plane[0]; + struct macroblockd_plane *const y = &xd->plane[0]; if (tx_type != DCT_DCT) { vp9_dequant_iht_add_c(tx_type, BLOCK_OFFSET(y->qcoeff, idx, 16), diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c index 1a770dcf74df8310c3c8e78422fe8f7ed37480e1..db9a4fc4373a7ce170dd4119f8cf471938c00f7d 100644 --- a/vp9/encoder/vp9_asm_enc_offsets.c +++ b/vp9/encoder/vp9_asm_enc_offsets.c @@ -20,7 +20,6 @@ BEGIN /* regular quantize */ -DEFINE(vp9_block_coeff, offsetof(BLOCK, coeff)); DEFINE(vp9_block_zbin, offsetof(BLOCK, zbin)); DEFINE(vp9_block_round, offsetof(BLOCK, round)); DEFINE(vp9_block_quant, offsetof(BLOCK, quant)); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index e511be8cd1c0f9c8bcd0bbbeccfd91ed99174316..a054269aac85a80cf00b6aa8c859b29669206dc0 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -25,7 +25,6 @@ typedef struct { typedef struct block { // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries - int16_t *src_diff; int16_t *coeff; // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries @@ -83,9 +82,13 @@ typedef struct { int64_t txfm_rd_diff[NB_TXFM_MODES]; } PICK_MODE_CONTEXT; +struct macroblock_plane { + DECLARE_ALIGNED(16, int16_t, src_diff[64*64]); +}; + typedef struct macroblock MACROBLOCK; struct macroblock { - DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]); + struct macroblock_plane plane[MAX_MB_PLANE]; DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]); // 16 Y blocks, 4 U blocks, 4 V blocks, BLOCK block[24]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1f20d5e1c3861d87f4cf0e44fffd5e4659bb4da1..fd65dfa45e6936ca7c7b1c1e12946f37f23fef50 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1734,25 +1734,8 @@ void vp9_encode_frame(VP9_COMP *cpi) { } void vp9_setup_block_ptrs(MACROBLOCK *x) { - int r, c; int i; - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) - x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4; - } - - for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) - x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4; - } - - - for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) - x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4; - } - for (i = 0; i < 24; i++) x->block[i].coeff = x->coeff + i * 16; } @@ -2100,14 +2083,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const uint8_t *src = x->src.y_buffer; - uint8_t *dst = xd->plane[0].dst.buf; - const uint8_t *usrc = x->src.u_buffer; - uint8_t *udst = xd->plane[1].dst.buf; - const uint8_t *vsrc = x->src.v_buffer; - uint8_t *vdst = xd->plane[2].dst.buf; - int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride; - int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride; int n; MODE_INFO *mi = x->e_mbd.mode_info_context; unsigned int segment_id = mi->mbmi.segment_id; @@ -2187,10 +2162,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (!x->skip) { - vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride, - bsize); - vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride, - udst, vdst, dst_uv_stride, bsize); + vp9_subtract_sb(x, bsize); switch (xd->mode_info_context->mbmi.txfm_size) { case TX_32X32: diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index 5c6559bdb5b846fae7e86942ac3bb42ac61b3c3a..32a92b8e7bd395a87328e9ac995ddb4f7194c3ee 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -37,7 +37,7 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { } } - return vp9_get_mb_ss(x->src_diff); + return vp9_get_mb_ss(x->plane[0].src_diff); } static void encode_intra4x4block(MACROBLOCK *x, int ib) { @@ -45,6 +45,9 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib) { BLOCK *be = &x->block[ib]; MACROBLOCKD * const xd = &x->e_mbd; TX_TYPE tx_type; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib, + x->plane[0].src_diff); assert(ib < 16); @@ -54,16 +57,18 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib) { vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, *(b->base_dst) + b->dst, b->dst_stride); - vp9_subtract_b(be, b, 16); + vp9_subtract_block(4, 4, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(b->base_dst) + b->dst, b->dst_stride); tx_type = get_tx_type_4x4(&x->e_mbd, ib); if (tx_type != DCT_DCT) { - vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib, tx_type); vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), b->diff, 16, tx_type); } else { - x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->fwd_txm4x4(src_diff, be->coeff, 32); x->quantize_b_4x4(x, ib, 16); vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib], BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), @@ -86,10 +91,7 @@ void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) { TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16); - vp9_subtract_sby_s_c(x->src_diff, - x->src.y_buffer, x->src.y_stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - BLOCK_SIZE_MB16X16); + vp9_subtract_sby(x, BLOCK_SIZE_MB16X16); switch (tx_size) { case TX_16X16: @@ -123,11 +125,7 @@ void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) { TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16); - vp9_subtract_sbuv_s_c(x->src_diff, - x->src.u_buffer, x->src.v_buffer, x->src.uv_stride, - xd->plane[1].dst.buf, xd->plane[2].dst.buf, - xd->plane[1].dst.stride, - BLOCK_SIZE_MB16X16); + vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16); switch (tx_size) { case TX_4X4: @@ -153,6 +151,9 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { MACROBLOCKD *xd = &x->e_mbd; BLOCKD *b = &xd->block[ib]; BLOCK *be = &x->block[ib]; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib, + x->plane[0].src_diff); const int iblock[4] = {0, 1, 4, 5}; int i; TX_TYPE tx_type; @@ -160,7 +161,9 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, *(b->base_dst) + b->dst, b->dst_stride); // generate residual blocks - vp9_subtract_4b_c(be, b, 16); + vp9_subtract_block(8, 8, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(b->base_dst) + b->dst, b->dst_stride); if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { int idx = (ib & 0x02) ? (ib + 2) : ib; @@ -169,12 +172,12 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { assert(idx < 16); tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) { - vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); + vp9_short_fht8x8(src_diff, (x->block + idx)->coeff, 16, tx_type); x->quantize_b_8x8(x, idx, tx_type, 16); vp9_short_iht8x8(dqcoeff, xd->block[ib].diff, 16, tx_type); } else { - x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->fwd_txm8x8(src_diff, (x->block + idx)->coeff, 32); x->quantize_b_8x8(x, idx, DCT_DCT, 16); vp9_short_idct8x8(dqcoeff, xd->block[ib].diff, 32); } @@ -182,18 +185,21 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { for (i = 0; i < 4; i++) { int idx = ib + iblock[i]; int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16); + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx, + x->plane[0].src_diff); assert(idx < 16); b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; tx_type = get_tx_type_4x4(xd, ib + iblock[i]); if (tx_type != DCT_DCT) { - vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); vp9_short_iht4x4(dqcoeff, b->diff, 16, tx_type); } else if (!(i & 1) && get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) { - x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->fwd_txm8x4(src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16); vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]], dqcoeff, b->diff, 32); @@ -201,7 +207,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { dqcoeff + 16, (b + 1)->diff, 32); i++; } else { - x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->fwd_txm4x4(src_diff, be->coeff, 32); x->quantize_b_4x4(x, ib + iblock[i], 16); vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]], dqcoeff, b->diff, 32); @@ -231,14 +237,20 @@ static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) { int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib); const int plane = ib < 20 ? 1 : 2; const int block = ib < 20 ? ib - 16 : ib - 20; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block, + x->plane[plane].src_diff); assert(ib >= 16 && ib < 24); vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, *(b->base_dst) + b->dst, b->dst_stride); - vp9_subtract_b(be, b, 8); + assert(xd->plane[1].subsampling_x == 1); + vp9_subtract_block(4, 4, src_diff, 8, + *(be->base_src) + be->src, be->src_stride, + *(b->base_dst) + b->dst, b->dst_stride); - x->fwd_txm4x4(be->src_diff, be->coeff, 16); + x->fwd_txm4x4(src_diff, be->coeff, 16); x->quantize_b_4x4(x, ib, 16); vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block], dqcoeff, b->diff, 16); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index c841c2823728d2190e17f29f8dad2e4334d56fe0..b6fe6dfa787d658e14cedf8999c48f7ac44374dd 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -20,102 +20,54 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9_rtcd.h" -void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) { - uint8_t *src_ptr = (*(be->base_src) + be->src); - int16_t *diff_ptr = be->src_diff; - uint8_t *pred_ptr = *(bd->base_dst) + bd->dst; - int src_stride = be->src_stride; - int dst_stride = bd->dst_stride; - +void vp9_subtract_block(int rows, int cols, + int16_t *diff_ptr, int diff_stride, + const uint8_t *src_ptr, int src_stride, + const uint8_t *pred_ptr, int pred_stride) { int r, c; - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - diff_ptr += pitch; - pred_ptr += dst_stride; + diff_ptr += diff_stride; + pred_ptr += pred_stride; src_ptr += src_stride; } } -void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) { - uint8_t *src_ptr = (*(be->base_src) + be->src); - int16_t *diff_ptr = be->src_diff; - uint8_t *pred_ptr = *(bd->base_dst) + bd->dst; - int src_stride = be->src_stride; - int dst_stride = bd->dst_stride; - int r, c; - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) - diff_ptr[c] = src_ptr[c] - pred_ptr[c]; +static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { + const MACROBLOCKD * const xd = &x->e_mbd; + const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x); + const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y); + const uint8_t *src = plane == 0 ? x->src.y_buffer : + plane == 1 ? x->src.u_buffer : x->src.v_buffer; + const int src_stride = plane == 0 ? x->src.y_stride : x->src.uv_stride; - diff_ptr += pitch; - pred_ptr += dst_stride; - src_ptr += src_stride; - } + assert(plane < 3); + vp9_subtract_block(bh, bw, + x->plane[plane].src_diff, bw, src, src_stride, + xd->plane[plane].dst.buf, xd->plane[plane].dst.stride); } -void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride, - const uint8_t *pred, int dst_stride, - BLOCK_SIZE_TYPE bsize) { - const int bh = 16 << mb_height_log2(bsize), bw = 16 << mb_width_log2(bsize); - int r, c; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - diff[c] = src[c] - pred[c]; - - diff += bw; - pred += dst_stride; - src += src_stride; - } +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { + subtract_plane(x, bsize, 0); } -void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc, - const uint8_t *vsrc, int src_stride, - const uint8_t *upred, - const uint8_t *vpred, int dst_stride, - BLOCK_SIZE_TYPE bsize) { - const int bhl = mb_height_log2(bsize), bwl = mb_width_log2(bsize); - const int uoff = (16 * 16) << (bhl + bwl), voff = (uoff * 5) >> 2; - const int bw = 8 << bwl, bh = 8 << bhl; - int16_t *udiff = diff + uoff; - int16_t *vdiff = diff + voff; - int r, c; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - udiff[c] = usrc[c] - upred[c]; - - udiff += bw; - upred += dst_stride; - usrc += src_stride; - } +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { + int i; - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - vdiff[c] = vsrc[c] - vpred[c]; - - vdiff += bw; - vpred += dst_stride; - vsrc += src_stride; - } + for (i = 1; i < MAX_MB_PLANE; i++) + subtract_plane(x, bsize, i); } -static void subtract_mb(MACROBLOCK *x) { - MACROBLOCKD *xd = &x->e_mbd; - vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - BLOCK_SIZE_MB16X16); - vp9_subtract_sbuv_s_c(x->src_diff, x->src.u_buffer, x->src.v_buffer, - x->src.uv_stride, - xd->plane[1].dst.buf, xd->plane[2].dst.buf, - xd->plane[1].dst.stride, - BLOCK_SIZE_MB16X16); +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { + vp9_subtract_sby(x, bsize); + vp9_subtract_sbuv(x, bsize); } + void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl; const int bh = 1 << (mb_height_log2(bsize) - 1); @@ -125,7 +77,7 @@ void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { for (n = 0; n < bw * bh; n++) { const int x_idx = n & (bw - 1), y_idx = n >> bwl; - vp9_short_fdct32x32(x->src_diff + y_idx * stride * 32 + x_idx * 32, + vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32, x->coeff + n * 1024, stride * 2); } } @@ -143,10 +95,11 @@ void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { (y_idx * bstride + x_idx) * 4); if (tx_type != DCT_DCT) { - vp9_short_fht16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16, + vp9_short_fht16x16(x->plane[0].src_diff + + y_idx * stride * 16 + x_idx * 16, x->coeff + n * 256, stride, tx_type); } else { - x->fwd_txm16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16, + x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16, x->coeff + n * 256, stride * 2); } } @@ -164,10 +117,10 @@ void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2); if (tx_type != DCT_DCT) { - vp9_short_fht8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8, + vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8, x->coeff + n * 64, stride, tx_type); } else { - x->fwd_txm8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8, + x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8, x->coeff + n * 64, stride * 2); } } @@ -185,10 +138,10 @@ void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { const TX_TYPE tx_type = get_tx_type_4x4(xd, n); if (tx_type != DCT_DCT) { - vp9_short_fht4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4, + vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4, x->coeff + n * 16, stride, tx_type); } else { - x->fwd_txm4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4, + x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4, x->coeff + n * 16, stride * 2); } } @@ -197,9 +150,9 @@ void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { assert(bsize == BLOCK_SIZE_SB64X64); vp9_clear_system_state(); - vp9_short_fdct32x32(x->src_diff + 4096, + vp9_short_fdct32x32(x->plane[1].src_diff, x->coeff + 4096, 64); - vp9_short_fdct32x32(x->src_diff + 4096 + 1024, + vp9_short_fdct32x32(x->plane[2].src_diff, x->coeff + 4096 + 1024, 64); } @@ -214,9 +167,9 @@ void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { for (n = 0; n < bw * bh; n++) { const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1); - x->fwd_txm16x16(x->src_diff + uoff + y_idx * stride * 16 + x_idx * 16, + x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16, x->coeff + uoff + n * 256, stride * 2); - x->fwd_txm16x16(x->src_diff + voff + y_idx * stride * 16 + x_idx * 16, + x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16, x->coeff + voff + n * 256, stride * 2); } } @@ -232,9 +185,9 @@ void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { for (n = 0; n < bw * bh; n++) { const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1); - x->fwd_txm8x8(x->src_diff + uoff + y_idx * stride * 8 + x_idx * 8, + x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8, x->coeff + uoff + n * 64, stride * 2); - x->fwd_txm8x8(x->src_diff + voff + y_idx * stride * 8 + x_idx * 8, + x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8, x->coeff + voff + n * 64, stride * 2); } } @@ -250,9 +203,9 @@ void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { for (n = 0; n < bw * bh; n++) { const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1); - x->fwd_txm4x4(x->src_diff + uoff + y_idx * stride * 4 + x_idx * 4, + x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4, x->coeff + uoff + n * 16, stride * 2); - x->fwd_txm4x4(x->src_diff + voff + y_idx * stride * 4 + x_idx * 4, + x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4, x->coeff + voff + n * 16, stride * 2); } } @@ -844,7 +797,7 @@ void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16); - subtract_mb(x); + vp9_subtract_sb(x, BLOCK_SIZE_MB16X16); vp9_fidct_mb(cm, x); vp9_recon_sb(xd, BLOCK_SIZE_MB16X16); } @@ -854,9 +807,7 @@ void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) { MACROBLOCKD *xd = &x->e_mbd; vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16); - vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - BLOCK_SIZE_MB16X16); + vp9_subtract_sby(x, BLOCK_SIZE_MB16X16); vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16); vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16); diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 3c0d760a1933db794a41cffb0625301319dd4923..da134a86b847c5780f52a2042300dd17779e14bb 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -56,15 +56,12 @@ void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x, void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x); -void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch); - -void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride, - const uint8_t *pred, int dst_stride, - BLOCK_SIZE_TYPE bsize); -void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc, - const uint8_t *vsrc, int src_stride, - const uint8_t *upred, - const uint8_t *vpred, int dst_stride, - BLOCK_SIZE_TYPE bsize); +void vp9_subtract_block(int rows, int cols, + int16_t *diff_ptr, int diff_stride, + const uint8_t *src_ptr, int src_stride, + const uint8_t *pred_ptr, int pred_stride); +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); #endif // VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 18ad10b8e9eefe2b2c006635defec8be81f2517e..5517b157402d54003f42fe013c1546ab3570c9ea 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -845,12 +845,9 @@ static void super_block_yrd(VP9_COMP *cpi, int *skip, BLOCK_SIZE_TYPE bs, int64_t txfm_cache[NB_TXFM_MODES]) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; - uint8_t *src = x->src.y_buffer, *dst = xd->plane[0].dst.buf; - int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride; - vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride, bs); + vp9_subtract_sby(x, bs); if (bs >= BLOCK_SIZE_SB32X32) super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], @@ -877,7 +874,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, VP9_COMMON *const cm = &cpi->common; BLOCK *be = x->block + ib; BLOCKD *b = xd->block + ib; - + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib, + x->plane[0].src_diff); ENTROPY_CONTEXT ta = *a, tempa = *a; ENTROPY_CONTEXT tl = *l, templ = *l; TX_TYPE tx_type = DCT_DCT; @@ -917,15 +916,17 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, #endif vp9_intra4x4_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride); - vp9_subtract_b(be, b, 16); + vp9_subtract_block(4, 4, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(b->base_dst) + b->dst, b->dst_stride); b->bmi.as_mode.first = mode; tx_type = get_tx_type_4x4(xd, be - x->block); if (tx_type != DCT_DCT) { - vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, be - x->block, tx_type); } else { - x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->fwd_txm4x4(src_diff, be->coeff, 32); x->quantize_b_4x4(x, be - x->block, 16); } @@ -1107,10 +1108,12 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT_PLANES ta, tl; ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0; ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0; - // perform transformation of dimension 8x8 // note the input and output index mapping int idx = (ib & 0x02) ? (ib + 2) : ib; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib, + x->plane[0].src_diff); assert(ib < 16); for (mode = DC_PRED; mode <= TM_PRED; mode++) { @@ -1123,14 +1126,16 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vp9_intra8x8_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride); - vp9_subtract_4b_c(be, b, 16); + vp9_subtract_block(8, 8, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(b->base_dst) + b->dst, b->dst_stride); if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { TX_TYPE tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) - vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); + vp9_short_fht8x8(src_diff, (x->block + idx)->coeff, 16, tx_type); else - x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->fwd_txm8x8(src_diff, (x->block + idx)->coeff, 32); x->quantize_b_8x8(x, idx, tx_type, 16); // compute quantization mse of 8x8 block @@ -1162,20 +1167,24 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, distortion = 0; rate_t = 0; for (i = 0; i < 4; ++i) { + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, + 0, ib + iblock[i], + x->plane[0].src_diff); int do_two = 0; b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; tx_type = get_tx_type_4x4(xd, ib + iblock[i]); if (tx_type != DCT_DCT) { - vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); } else if (!(i & 1) && get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) { - x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->fwd_txm8x4(src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16); do_two = 1; } else { - x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->fwd_txm4x4(src_diff, be->coeff, 32); x->quantize_b_4x4(x, ib + iblock[i], 16); } distortion += vp9_block_error_c(be->coeff, @@ -1531,12 +1540,8 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - uint8_t *usrc = x->src.u_buffer, *udst = xd->plane[1].dst.buf; - uint8_t *vsrc = x->src.v_buffer, *vdst = xd->plane[2].dst.buf; - int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride; - vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride, - udst, vdst, dst_uv_stride, bsize); + vp9_subtract_sbuv(x, bsize); if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) { super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize); @@ -1738,6 +1743,9 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, if (labels[i] == which_label) { BLOCKD *bd = &x->e_mbd.block[i]; BLOCK *be = &x->block[i]; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i, + x->plane[0].src_diff); int thisdistortion; vp9_build_inter_predictor(*(bd->base_pre) + bd->pre, @@ -1760,8 +1768,10 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, &xd->subpix); } - vp9_subtract_b(be, bd, 16); - x->fwd_txm4x4(be->src_diff, be->coeff, 32); + vp9_subtract_block(4, 4, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(bd->base_dst) + bd->dst, bd->dst_stride); + x->fwd_txm4x4(src_diff, be->coeff, 32); x->quantize_b_4x4(x, i, 16); thisdistortion = vp9_block_error(be->coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16); @@ -1809,6 +1819,9 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, const int idx = (ib & 8) + ((ib & 2) << 1); BLOCKD *bd = &xd->block[ib]; BLOCK *be = &x->block[ib], *be2 = &x->block[idx]; + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib, + x->plane[0].src_diff); int thisdistortion; assert(idx < 16); @@ -1826,11 +1839,13 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, &xd->subpix); } - vp9_subtract_4b_c(be, bd, 16); + vp9_subtract_block(8, 8, src_diff, 16, + *(be->base_src) + be->src, be->src_stride, + *(bd->base_dst) + bd->dst, bd->dst_stride); if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { if (otherrd) { - x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->fwd_txm8x8(src_diff, be2->coeff, 32); x->quantize_b_8x8(x, idx, DCT_DCT, 16); thisdistortion = vp9_block_error_c(be2->coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64); @@ -1843,9 +1858,13 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, xd->mode_info_context->mbmi.txfm_size = TX_4X4; } for (j = 0; j < 4; j += 2) { + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, + 0, ib + iblock[j], + x->plane[0].src_diff); bd = &xd->block[ib + iblock[j]]; be = &x->block[ib + iblock[j]]; - x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->fwd_txm8x4(src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16); thisdistortion = vp9_block_error_c(be->coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32); @@ -1866,7 +1885,11 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, if (otherrd) { for (j = 0; j < 4; j += 2) { BLOCK *be = &x->block[ib + iblock[j]]; - x->fwd_txm8x4(be->src_diff, be->coeff, 32); + int16_t* const src_diff = + raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, + 0, ib + iblock[j], + x->plane[0].src_diff); + x->fwd_txm8x4(src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16); thisdistortion = vp9_block_error_c(be->coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32); @@ -1886,7 +1909,7 @@ static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, xd->mode_info_context->mbmi.txfm_size = TX_8X8; } } - x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->fwd_txm8x8(src_diff, be2->coeff, 32); x->quantize_b_8x8(x, idx, DCT_DCT, 16); thisdistortion = vp9_block_error_c(be2->coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64); @@ -3782,12 +3805,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_build_inter_predictors_sbuv(&x->e_mbd, mb_row, mb_col, BLOCK_SIZE_MB16X16); - vp9_subtract_sbuv_s_c(x->src_diff, - x->src.u_buffer, - x->src.v_buffer, x->src.uv_stride, - xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - BLOCK_SIZE_MB16X16); + vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16); super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv, &uv_skippable, BLOCK_SIZE_MB16X16); diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c index 04383fcb4a3b772e9b1b02c94f33100636e2a312..6016e14eb14ba167275a01aab3b6940781ac460a 100644 --- a/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c @@ -17,7 +17,7 @@ // TODO(jimbankoski) Consider rewriting the c to take the same values rather // than going through these pointer conversions -#if HAVE_MMX +#if 0 && HAVE_MMX void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { vp9_short_fdct4x4_mmx(input, output, pitch); vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); @@ -38,7 +38,7 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { #endif -#if HAVE_SSE2 +#if 0 && HAVE_SSE2 void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, short *diff, unsigned char *predictor, int pitch);