From 993d10a21774a29d6c708131e54ee3331bda20df Mon Sep 17 00:00:00 2001 From: Deb Mukherjee <debargha@google.com> Date: Wed, 24 Sep 2014 06:36:34 -0700 Subject: [PATCH] Adds various high bit-depth encode functions Change-Id: I6f67b171022bbc8199c6d674190b57f6bab1b62f --- vp9/decoder/vp9_decodeframe.c | 81 ++++++++- vp9/encoder/vp9_aq_variance.c | 29 +++ vp9/encoder/vp9_bitstream.c | 18 +- vp9/encoder/vp9_encodeframe.c | 138 +++++++++++++- vp9/encoder/vp9_encodemb.c | 329 +++++++++++++++++++++++++++++++++- vp9/encoder/vp9_extend.c | 66 +++++++ vp9/encoder/vp9_firstpass.c | 119 +++++++++++- vp9/encoder/vp9_mcomp.c | 57 ++++-- vp9/encoder/vp9_picklpf.c | 27 +++ vp9/encoder/vp9_pickmode.c | 76 +++++++- vp9/encoder/vp9_rd.c | 27 ++- vp9/encoder/vp9_rd.h | 4 + vp9/encoder/vp9_rdopt.c | 324 ++++++++++++++++++++++++++++++++- vp9/encoder/vp9_rdopt.h | 1 - vp9/encoder/vp9_tokenize.h | 6 + 15 files changed, 1247 insertions(+), 55 deletions(-) diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 4e85caf45c..0ec2710827 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -196,6 +196,64 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, if (eob > 0) { TX_TYPE tx_type = DCT_DCT; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_high_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_high_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_high_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_high_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + break; + default: + assert(0 && "Invalid transform size"); + } + } + } else { + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + return; + } + } + } +#else if (xd->lossless) { tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); @@ -220,8 +278,10 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, break; default: assert(0 && "Invalid transform size"); + return; } } +#endif // CONFIG_VP9_HIGHBITDEPTH if (eob == 1) { vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); @@ -599,6 +659,9 @@ static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; +#if CONFIG_VP9_HIGHBITDEPTH + xd->bd = (int)cm->bit_depth; +#endif } static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer 
*rb) { @@ -1139,8 +1202,17 @@ BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { static void read_bitdepth_colorspace_sampling( VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (cm->profile >= PROFILE_2) + if (cm->profile >= PROFILE_2) { cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 1; +#endif + } else { + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif + } cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); if (cm->color_space != SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range @@ -1244,6 +1316,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, // case (normative). cm->color_space = BT_601; cm->subsampling_y = cm->subsampling_x = 1; + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif } pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); @@ -1284,6 +1360,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } } } +#if CONFIG_VP9_HIGHBITDEPTH + get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; +#endif if (pbi->need_resync) { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c index b96f00fd19..15de5c473b 100644 --- a/vp9/encoder/vp9_aq_variance.c +++ b/vp9/encoder/vp9_aq_variance.c @@ -34,6 +34,9 @@ static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 }; #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN] DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0}; +#if CONFIG_VP9_HIGHBITDEPTH +DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0}; +#endif unsigned int vp9_vaq_segment_id(int energy) { ENERGY_IN_BOUNDS(energy); @@ -126,14 +129,40 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow; const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow; int avg; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_variance(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, &sse, + &avg); + sse >>= 2 * (xd->bd - 8); + avg >>= (xd->bd - 8); + } else { + variance(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, bw, bh, &sse, &avg); + } +#else variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, bw, bh, &sse, &avg); +#endif // CONFIG_VP9_HIGHBITDEPTH var = sse - (((int64_t)avg * avg) / (bw * bh)); return (256 * var) / (bw * bh); } else { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), + 0, &sse); + } else { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, &sse); +#endif // CONFIG_VP9_HIGHBITDEPTH return (256 * var) >> num_pels_log2_lookup[bs]; } } diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index f658ddafb1..5114fc55f4 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -120,16 +120,28 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { } static void pack_mb_tokens(vp9_writer *w, - TOKENEXTRA **tp, const TOKENEXTRA *const stop) { + TOKENEXTRA **tp, const 
TOKENEXTRA *const stop, + vpx_bit_depth_t bit_depth) { TOKENEXTRA *p = *tp; while (p < stop && p->token != EOSB_TOKEN) { const int t = p->token; const struct vp9_token *const a = &vp9_coef_encodings[t]; - const vp9_extra_bit *const b = &vp9_extra_bits[t]; int i = 0; int v = a->value; int n = a->len; +#if CONFIG_VP9_HIGHBITDEPTH + const vp9_extra_bit *b; + if (bit_depth == VPX_BITS_12) + b = &vp9_extra_bits_high12[t]; + else if (bit_depth == VPX_BITS_10) + b = &vp9_extra_bits_high10[t]; + else + b = &vp9_extra_bits[t]; +#else + const vp9_extra_bit *const b = &vp9_extra_bits[t]; + (void) bit_depth; +#endif // CONFIG_VP9_HIGHBITDEPTH /* skip one or two nodes */ if (p->skip_eob_node) { @@ -387,7 +399,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, } assert(*tok < tok_end); - pack_mb_tokens(w, tok, tok_end); + pack_mb_tokens(w, tok, tok_end, cm->bit_depth); } static void write_partition(const VP9_COMMON *const cm, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index be5ee7b729..6b1a2597ce 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -61,16 +61,51 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, // Eventually this should be replaced by custom no-reference routines, // which will be faster. static const uint8_t VP9_VAR_OFFS[64] = { - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128 + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 }; +#if CONFIG_VP9_HIGHBITDEPTH +static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = { + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, + 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4 +}; + +static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, + 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 
128*16, 128*16 +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs) { @@ -80,6 +115,32 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } +#if CONFIG_VP9_HIGHBITDEPTH +static unsigned int high_get_sby_perpixel_variance( + VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) { + unsigned int var, sse; + switch (bd) { + case 10: + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10), + 0, &sse); + break; + case 12: + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12), + 0, &sse); + break; + case 8: + default: + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), + 0, &sse); + break; + } + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, const struct buf_2d *ref, int mi_row, int mi_col, @@ -419,6 +480,22 @@ static void choose_partitioning(VP9_COMP *cpi, } else { d = VP9_VAR_OFFS; dp = 0; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (xd->bd) { + case 10: + d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10); + break; + case 12: + d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12); + break; + case 8: + default: + d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8); + break; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH } // Fill in the entire tree of 8x8 variances for splits. @@ -734,7 +811,17 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, // Set to zero to make sure we do not use the previous encoded frame stats mbmi->skip = 0; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->source_variance = + high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } +#else x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_VP9_HIGHBITDEPTH // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; @@ -3170,9 +3257,34 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { for (i = 0; i < cm->mb_rows; i++) { for (j = 0; j < cm->mb_cols; j++) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: + vp9_high_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + break; + case VPX_BITS_10: + vp9_high_10_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + break; + case VPX_BITS_12: + vp9_high_12_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + break; + default: + assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" + " or VPX_BITS_12"); + return -1; + } + } else { + vp9_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + } +#else vp9_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); - +#endif // CONFIG_VP9_HIGHBITDEPTH var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); @@ -3314,7 +3426,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { cm->tx_mode = select_tx_mode(cpi); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + x->fwd_txm4x4 = xd->lossless ? 
vp9_high_fwht4x4 : vp9_high_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+  x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add : vp9_high_idct4x4_add;
+#else
   x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 
   if (xd->lossless) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 2eae149700..c413f10f61 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -51,6 +51,29 @@ void vp9_subtract_block_c(int rows, int cols,
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_subtract_block_c(int rows, int cols,
+                               int16_t *diff, ptrdiff_t diff_stride,
+                               const uint8_t *src8, ptrdiff_t src_stride,
+                               const uint8_t *pred8, ptrdiff_t pred_stride,
+                               int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -58,6 +81,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_high_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                            pd->dst.buf, pd->dst.stride, x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
@@ -124,6 +154,8 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   int64_t rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt, i, final_eob;
+  const TOKENVALUE *dct_value_tokens;
+  const int16_t *dct_value_cost;
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -140,9 +172,24 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   tokens[eob][0].qc = 0;
   tokens[eob][1] = tokens[eob][0];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd == 12) {
+    dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
+    dct_value_cost = vp9_dct_value_cost_high12_ptr;
+  } else if (xd->bd == 10) {
+    dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
+    dct_value_cost = vp9_dct_value_cost_high10_ptr;
+  } else {
+    dct_value_tokens = vp9_dct_value_tokens_ptr;
+    dct_value_cost = vp9_dct_value_cost_ptr;
+  }
+#else
+  dct_value_tokens = vp9_dct_value_tokens_ptr;
+  dct_value_cost = vp9_dct_value_cost_ptr;
+#endif
   for (i = 0; i < eob; i++)
     token_cache[scan[i]] =
-        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
+        vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
@@ -156,7 +203,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->token;
+      t0 = (dct_value_tokens + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -169,8 +216,13 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
       dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -203,7 +255,7 @@
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
+        t0 = t1 = (dct_value_tokens + x)->token;
       }
       if (next < default_eob) {
         band = band_translate[i + 1];
@@ -222,10 +274,18 @@
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = vp9_dct_value_cost_ptr[x];
+      base_bits = dct_value_cost[x];
 
       if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         d2 = dx * dx;
       }
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
@@ -310,7 +371,7 @@ static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
   else
     vp9_high_fdct32x32(src, dst, src_stride);
 }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
@@ -328,6 +389,44 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                   p->round_fp, p->quant_fp, p->quant_shift,
+                                   qcoeff, dqcoeff, pd->dequant, p->zbin_extra,
+                                   eob, scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vp9_high_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vp9_high_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_high_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob,
+                             scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
@@ -379,6 +478,40 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); src_diff = &p->src_diff[4 * (j * diff_stride + i)]; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + vp9_high_fdct32x32_1(src_diff, coeff, diff_stride); + vp9_high_quantize_dc_32x32(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_16X16: + vp9_high_fdct16x16_1(src_diff, coeff, diff_stride); + vp9_high_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_8X8: + vp9_high_fdct8x8_1(src_diff, coeff, diff_stride); + vp9_high_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_high_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + default: + assert(0); + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + switch (tx_size) { case TX_32X32: vp9_fdct32x32_1(src_diff, coeff, diff_stride); @@ -426,6 +559,44 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); src_diff = &p->src_diff[4 * (j * diff_stride + i)]; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_16X16: + vp9_high_fdct16x16(src_diff, coeff, diff_stride); + vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_8X8: + vp9_high_fdct8x8(src_diff, coeff, diff_stride); + vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + default: + assert(0); + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); @@ -520,6 +691,34 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, if (x->skip_encode || p->eobs[block] == 0) return; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + vp9_high_idct32x32_add(dqcoeff, dst, pd->dst.stride, + p->eobs[block], xd->bd); + break; + case TX_16X16: + vp9_high_idct16x16_add(dqcoeff, dst, pd->dst.stride, + p->eobs[block], xd->bd); + break; + case TX_8X8: + vp9_high_idct8x8_add(dqcoeff, dst, pd->dst.stride, + p->eobs[block], xd->bd); + break; + case TX_4X4: + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
+ x->high_itxm_add(dqcoeff, dst, pd->dst.stride, + p->eobs[block], xd->bd); + break; + default: + assert(0 && "Invalid transform size"); + } + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH switch (tx_size) { case TX_32X32: @@ -557,8 +756,15 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, vp9_xform_quant(x, plane, block, plane_bsize, tx_size); - if (p->eobs[block] > 0) + if (p->eobs[block] > 0) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + x->high_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + } } void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -622,6 +828,115 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, src = &p->src.buf[4 * (j * src_stride + i)]; src_diff = &p->src_diff[4 * (j * diff_stride + i)]; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + switch (tx_size) { + case TX_32X32: + scan_order = &vp9_default_scan_orders[TX_32X32]; + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); + if (!x->skip_recode) { + vp9_high_subtract_block(32, 32, src_diff, diff_stride, + src, src_stride, dst, dst_stride, xd->bd); + high_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_high_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff, + dqcoeff, pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + } + if (!x->skip_encode && *eob) { + vp9_high_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + } + break; + case TX_16X16: + tx_type = get_tx_type(pd->plane_type, xd); + scan_order = &vp9_scan_orders[TX_16X16][tx_type]; + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); + if (!x->skip_recode) { + vp9_high_subtract_block(16, 16, src_diff, diff_stride, + src, src_stride, dst, dst_stride, xd->bd); + vp9_high_fht16x16(src_diff, coeff, diff_stride, tx_type); + vp9_high_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + } + if (!x->skip_encode && *eob) { + vp9_high_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, + *eob, xd->bd); + } + break; + case TX_8X8: + tx_type = get_tx_type(pd->plane_type, xd); + scan_order = &vp9_scan_orders[TX_8X8][tx_type]; + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode, + x->skip_encode ? src : dst, + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, i, j, plane); + if (!x->skip_recode) { + vp9_high_subtract_block(8, 8, src_diff, diff_stride, + src, src_stride, dst, dst_stride, xd->bd); + vp9_high_fht8x8(src_diff, coeff, diff_stride, tx_type); + vp9_high_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + } + if (!x->skip_encode && *eob) { + vp9_high_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + xd->bd); + } + break; + case TX_4X4: + tx_type = get_tx_type_4x4(pd->plane_type, xd, block); + scan_order = &vp9_scan_orders[TX_4X4][tx_type]; + mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode; + vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); + + if (!x->skip_recode) { + vp9_high_subtract_block(4, 4, src_diff, diff_stride, + src, src_stride, dst, dst_stride, xd->bd); + if (tx_type != DCT_DCT) + vp9_high_fht4x4(src_diff, coeff, diff_stride, tx_type); + else + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_high_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + } + + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + // this is like vp9_short_idct4x4 but has a special case around + // eob<=1 which is significant (not just an optimization) for the + // lossless case. + x->high_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + else + vp9_high_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + } + break; + default: + assert(0); + return; + } + if (*eob) + *(args->skip) = 0; + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + switch (tx_size) { case TX_32X32: scan_order = &vp9_default_scan_orders[TX_32X32]; diff --git a/vp9/encoder/vp9_extend.c b/vp9/encoder/vp9_extend.c index e8517c8892..5b01bc9f2a 100644 --- a/vp9/encoder/vp9_extend.c +++ b/vp9/encoder/vp9_extend.c @@ -55,6 +55,52 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, } } +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, + int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + vpx_memset16(dst_ptr1, src_ptr1[0], extend_left); + vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t)); + vpx_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h) - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + 
vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); + dst_ptr2 += dst_pitch; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { // Extend src frame in buffer @@ -75,6 +121,26 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; +#if CONFIG_VP9_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_width, src->y_height, + et_y, el_y, eb_y, er_y); + + highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); + + highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_width, src->y_height, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 0282e9f9a3..9b1fa65081 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -281,6 +281,60 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize, return sse; } +#if CONFIG_VP9_HIGHBITDEPTH +static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: + return vp9_high_mse8x8; + case BLOCK_16X8: + return vp9_high_mse16x8; + case BLOCK_8X16: + return vp9_high_mse8x16; + default: + return vp9_high_mse16x16; + } + break; + case 10: + switch (bsize) { + case BLOCK_8X8: + return vp9_high_10_mse8x8; + case BLOCK_16X8: + return vp9_high_10_mse16x8; + case BLOCK_8X16: + return vp9_high_10_mse8x16; + default: + return vp9_high_10_mse16x16; + } + break; + case 12: + switch (bsize) { + case BLOCK_8X8: + return vp9_high_12_mse8x8; + case BLOCK_16X8: + return vp9_high_12_mse16x8; + case BLOCK_8X16: + return vp9_high_12_mse8x16; + default: + return vp9_high_12_mse16x16; + } + break; + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // Refine the motion search range according to the frame dimension // for first pass test. static int get_search_range(const VP9_COMMON *cm) { @@ -311,6 +365,11 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); + } +#endif // CONFIG_VP9_HIGHBITDEPTH // Center the initial step/diamond search on best mv. tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, @@ -562,6 +621,24 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { (bsize >= BLOCK_16X16 ? 
TX_16X16 : TX_8X8) : TX_4X4; vp9_encode_intra_block_plane(x, bsize, 0); this_error = vp9_get_mb_ss(x->plane[0].src_diff); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: + break; + case VPX_BITS_10: + this_error >>= 4; + break; + case VPX_BITS_12: + this_error >>= 8; + break; + default: + assert(0 && "cm->bit_depth should be VPX_BITS_8, " + "VPX_BITS_10 or VPX_BITS_12"); + return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); @@ -601,8 +678,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + motion_error = get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + motion_error = get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on @@ -611,8 +698,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); +#endif // CONFIG_VP9_HIGHBITDEPTH // TODO(pengchong): Replace the hard-coded threshold if (raw_motion_error > 25 || lc != NULL) { @@ -648,8 +745,18 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 5366c3c1eb..89c37d98ec 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -284,16 +284,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { int tc = bc; \ \ bestmv->row *= 8; \ - bestmv->col *= 8; \ - if (second_pred != NULL) { \ - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \ - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \ - besterr = 
vfp->vf(comp_pred, w, z, src_stride, sse1); \ - } else { \ - besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \ - } \ - *distortion = besterr; \ - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + bestmv->col *= 8; int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, @@ -309,6 +300,29 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, const uint8_t *second_pred, int w, int h) { SETUP_SUBPEL_SEARCH; + if (second_pred != NULL) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64); + vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride, + sse1); + } else { + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + } +#else + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); +#endif // CONFIG_VP9_HIGHBITDEPTH + } else { + besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); if (sad_list && sad_list[0] != INT_MAX && sad_list[1] != INT_MAX && @@ -401,6 +415,29 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, const uint8_t *second_pred, int w, int h) { SETUP_SUBPEL_SEARCH; + if (second_pred != NULL) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64); + vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride, + sse1); + } else { + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + } +#else + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); +#endif // CONFIG_VP9_HIGHBITDEPTH + } else { + besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); (void) sad_list; // to silence compiler warning // Each subsequent iteration checks at least one point in diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 2fc05e7fe2..85984fd7ef 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -40,7 +40,15 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi, vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1, partial_frame); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth); + } else { + filt_err = vp9_get_y_sse(sd, cm->frame_to_show); + } +#else filt_err = vp9_get_y_sse(sd, cm->frame_to_show); +#endif // CONFIG_VP9_HIGHBITDEPTH // Re-instate the unfiltered frame vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); @@ -145,7 +153,26 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, const int q = 
vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index a788c1d8e3..428767a441 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -241,13 +241,44 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+  }
+#else
   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum = rate >> 1;
   *out_dist_sum = dist << 3;
 
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5),
+                                 &rate,
+                                 &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var,
+                                 1 << num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3,
+                                 &rate,
+                                 &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var,
+                               1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3,
+                               &rate,
+                               &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   *out_rate_sum += rate;
   *out_dist_sum += dist << 4;
 }
@@ -293,9 +324,17 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
     // The encode_breakout input
     const unsigned int min_thresh =
         MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int shift = 2 * xd->bd - 16;
+#endif
 
     // Calculate threshold according to dequant value.
     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust ac threshold according to partition size.
@@ -303,6 +342,11 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, 8 - (b_width_log2(bsize) + b_height_log2(bsize)); thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); +#if CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) { + thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); + } +#endif // CONFIG_VP9_HIGHBITDEPTH } else { thresh_ac = 0; thresh_dc = 0; @@ -438,9 +482,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - - const int intra_cost_penalty = - 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = vp9_get_intra_cost_penalty( + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int intra_mode_cost = 50; @@ -461,14 +504,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // tmp[3] points to dst buffer, and the other 3 point to allocated buffers. PRED_BUFFER tmp[4]; DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64); +#endif struct buf_2d orig_dst = pd->dst; PRED_BUFFER *best_pred = NULL; PRED_BUFFER *this_mode_pred = NULL; + const int pixels_in_block = bh * bw; if (cpi->sf.reuse_inter_pred_sby) { int i; for (i = 0; i < 3; i++) { - tmp[i].data = &pred_buf[bw * bh * i]; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]); + else + tmp[i].data = &pred_buf[pixels_in_block * i]; +#else + tmp[i].data = &pred_buf[pixels_in_block * i]; +#endif // CONFIG_VP9_HIGHBITDEPTH tmp[i].stride = bw; tmp[i].in_use = 0; } @@ -703,8 +757,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) { pd->dst = orig_dst; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + vp9_high_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, + NULL, 0, NULL, 0, bw, bh, xd->bd); + } else { + vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, + NULL, 0, NULL, 0, bw, bh); + } +#else vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); +#endif // CONFIG_VP9_HIGHBITDEPTH } mbmi->mode = best_mode; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 17369d4c73..a32776ac7d 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -155,7 +155,7 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { } #else int rdmult = 88 * q * q / 24; -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -187,7 +187,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { #else (void) bit_depth; q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH // TODO(debargha): Adjust the function below. 
return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
@@ -213,7 +213,7 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 #else
   cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
   cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
@@ -598,3 +598,24 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
     if (sf->disable_split_mask & (1 << i))
       rd->thresh_mult_sub8x8[i] = INT_MAX;
 }
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return 20 * q;
+    case VPX_BITS_10:
+      return 5 * q;
+    case VPX_BITS_12:
+      return ROUND_POWER_OF_TWO(5 * q, 2);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  return 20 * q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index 5dcb2f8d75..214f6965b7 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -162,6 +162,10 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
                           int mi_row, int mi_col,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0f3009ddc5..d4d8bbfff9 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -228,9 +228,13 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
       // Fast approximate the modelling function.
       if (cpi->oxcf.speed > 4) {
         int64_t rate;
-        int64_t dist;
         int64_t square_error = sse;
         int quantizer = (pd->dequant[1] >> 3);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          quantizer >>= (xd->bd - 8);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
         if (quantizer < 120)
           rate = (square_error * (280 - quantizer)) >> 8;
@@ -240,8 +244,19 @@
         rate_sum += rate;
         dist_sum += dist;
       } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                       pd->dequant[1] >> (xd->bd - 5),
+                                       &rate, &dist);
+        } else {
+          vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+                                       pd->dequant[1] >> 3, &rate, &dist);
+        }
+#else
         vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
                                      pd->dequant[1] >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         rate_sum += rate;
         dist_sum += dist;
       }
@@ -266,6 +281,31 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
 
   return error;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_high_block_error_c(const tran_low_t *coeff,
+                               const tran_low_t *dqcoeff,
+                               intptr_t block_size,
+                               int64_t *ssz, int bd) {
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  int shift = 2 * (bd - 8);
+  int rounding = shift > 0 ?
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + /* The trailing '0' is a terminator which is used inside cost_coeffs() to * decide whether to include cost of a trailing EOB node or not (i.e. we * can skip this if the last coefficient in this transform block, e.g. the @@ -351,8 +391,14 @@ static INLINE int cost_coeffs(MACROBLOCK *x, return cost; } + +#if CONFIG_VP9_HIGHBITDEPTH +static void dist_block(int plane, int block, TX_SIZE tx_size, + struct rdcost_block_args* args, int bd) { +#else static void dist_block(int plane, int block, TX_SIZE tx_size, struct rdcost_block_args* args) { +#endif // CONFIG_VP9_HIGHBITDEPTH const int ss_txfrm_size = tx_size << 1; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; @@ -362,14 +408,24 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, int shift = tx_size == TX_32X32 ? 0 : 2; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); +#if CONFIG_VP9_HIGHBITDEPTH + args->dist = vp9_high_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse, bd) >> shift; +#else args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; +#endif // CONFIG_VP9_HIGHBITDEPTH args->sse = this_sse >> shift; if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> (shift + 2); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + p >>= ((xd->bd - 8) * 2); + } +#endif // CONFIG_VP9_HIGHBITDEPTH args->dist += (p >> 4); args->sse += p; } @@ -399,12 +455,28 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (!is_inter_block(mbmi)) { vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dist_block(plane, block, tx_size, args, xd->bd); + } else { + dist_block(plane, block, tx_size, args, 8); + } +#else dist_block(plane, block, tx_size, args); +#endif // CONFIG_VP9_HIGHBITDEPTH } else if (max_txsize_lookup[plane_bsize] == tx_size) { if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) { // full forward transform and quantization vp9_xform_quant(x, plane, block, plane_bsize, tx_size); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dist_block(plane, block, tx_size, args, xd->bd); + } else { + dist_block(plane, block, tx_size, args, 8); + } +#else dist_block(plane, block, tx_size, args); +#endif // CONFIG_VP9_HIGHBITDEPTH } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); @@ -424,7 +496,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } else { // full forward transform and quantization vp9_xform_quant(x, plane, block, plane_bsize, tx_size); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dist_block(plane, block, tx_size, args, xd->bd); + } else { + dist_block(plane, block, tx_size, args, 8); + } 
+#else dist_block(plane, block, tx_size, args); +#endif // CONFIG_VP9_HIGHBITDEPTH } rate_block(plane, block, plane_bsize, tx_size, args); @@ -659,6 +739,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; uint8_t best_dst[8 * 8]; +#if CONFIG_VP9_HIGHBITDEPTH + uint16_t best_dst16[8 * 8]; +#endif assert(ib < 4); @@ -666,6 +749,108 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(tl, l, sizeof(tl)); xd->mi[0].src_mi->mbmi.tx_size = TX_4X4; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + int64_t this_rd; + int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; + + if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) + continue; + + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) + continue; + } + + vpx_memcpy(tempa, ta, sizeof(ta)); + vpx_memcpy(templ, tl, sizeof(tl)); + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { + const int block = ib + idy * 2 + idx; + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block, + p->src_diff); + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + xd->mi[0].src_mi->bmi[block].as_mode = mode; + vp9_predict_intra_block(xd, block, 1, + TX_4X4, mode, + x->skip_encode ? src : dst, + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, idx, idy, 0); + vp9_high_subtract_block(4, 4, src_diff, 8, src, src_stride, + dst, dst_stride, xd->bd); + if (xd->lossless) { + const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + vp9_high_fwht4x4(src_diff, coeff, 8); + vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; + vp9_high_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), + dst, dst_stride, + p->eobs[block], xd->bd); + } else { + int64_t unused; + const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); + const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + vp9_high_fht4x4(src_diff, coeff, 8, tx_type); + vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); + distortion += vp9_high_block_error(coeff, + BLOCK_OFFSET(pd->dqcoeff, block), + 16, &unused, xd->bd) >> 2; + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next_highbd; + vp9_high_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), + dst, dst_stride, p->eobs[block], xd->bd); + } + } + } + + rate += ratey; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + + if (this_rd < best_rd) { + *bestrate = rate; + *bestratey = ratey; + *bestdistortion = distortion; + best_rd = this_rd; + *best_mode = mode; + vpx_memcpy(a, tempa, sizeof(tempa)); + vpx_memcpy(l, templ, sizeof(templ)); + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) { + vpx_memcpy(best_dst16 + idy * 8, + CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + num_4x4_blocks_wide * 4 * sizeof(uint16_t)); + } + } + next_highbd: + {} + } + if (best_rd >= rd_thresh || x->skip_encode) + return best_rd; + + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) { + vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride), + best_dst16 + idy * 8, + num_4x4_blocks_wide * 4 * sizeof(uint16_t)); + } + + return best_rd; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; @@ -1118,6 +1303,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, for (ref = 0; ref < 1 + is_compound; ++ref) { const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i, pd->pre[ref].stride)]; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_high_build_inter_predictor(pre, pd->pre[ref].stride, + dst, pd->dst.stride, + &mi->bmi[i].as_mv[ref].as_mv, + &xd->block_refs[ref]->sf, width, height, ref, + kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), + mi_row * MI_SIZE + 4 * (i / 2), xd->bd); + } else { vp9_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, @@ -1126,11 +1321,32 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2)); } +#else + vp9_build_inter_predictor(pre, pd->pre[ref].stride, + dst, pd->dst.stride, + &mi->bmi[i].as_mv[ref].as_mv, + &xd->block_refs[ref]->sf, width, height, ref, + kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), + mi_row * MI_SIZE + 4 * (i / 2)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_high_subtract_block( + height, width, 
@@ -1901,7 +2128,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv ref_mv[2];
   int ite, ref;
   // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t *second_pred;
+  uint8_t *second_pred_alloc;
+#else
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -1912,6 +2144,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   };
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
+    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
+  } else {
+    second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+    second_pred = second_pred_alloc;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   for (ref = 0; ref < 2; ++ref) {
     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
@@ -1950,6 +2191,28 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
       ref_yv12[1] = xd->plane[0].pre[1];
 
       // Get pred block from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_high_build_inter_predictor(ref_yv12[!id].buf,
+                                       ref_yv12[!id].stride,
+                                       second_pred, pw,
+                                       &frame_mv[refs[!id]].as_mv,
+                                       &xd->block_refs[!id]->sf,
+                                       pw, ph, 0,
+                                       kernel, MV_PRECISION_Q3,
+                                       mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                       xd->bd);
+      } else {
+        vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                  ref_yv12[!id].stride,
+                                  second_pred, pw,
+                                  &frame_mv[refs[!id]].as_mv,
+                                  &xd->block_refs[!id]->sf,
+                                  pw, ph, 0,
+                                  kernel, MV_PRECISION_Q3,
+                                  mi_col * MI_SIZE, mi_row * MI_SIZE);
+      }
+#else
       vp9_build_inter_predictor(ref_yv12[!id].buf,
                                 ref_yv12[!id].stride,
                                 second_pred, pw,
@@ -1958,6 +2221,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 pw, ph, 0,
                                 kernel, MV_PRECISION_Q3,
                                 mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       // Compound motion search on first ref frame.
       if (id)
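The second_pred handling above is the recurring allocation pattern for high bit-depth builds: the real storage is uint16_t, a punned uint8_t handle is what flows through the common prediction interfaces, and only the raw allocation is ever freed. A self-contained sketch of the pattern, with malloc standing in for vpx_memalign and the helper name purely illustrative:

#include <stdint.h>
#include <stdlib.h>

#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* Illustrative pattern: pick the element size by bit depth, keep the raw
 * pointer for later free(), and hand out the (possibly punned) handle. */
static uint8_t *alloc_pred_buffer(int use_highbitdepth, int pw, int ph,
                                  void **raw_out) {
  if (use_highbitdepth) {
    uint16_t *raw = malloc(pw * ph * sizeof(uint16_t));
    *raw_out = raw;
    return CONVERT_TO_BYTEPTR(raw);  /* punned handle for common code */
  } else {
    uint8_t *raw = malloc(pw * ph * sizeof(uint8_t));
    *raw_out = raw;
    return raw;                      /* identity in 8-bit builds */
  }
}

This is why the cleanup hunk below frees second_pred_alloc rather than second_pred: once shifted, the punned address is no longer a valid heap pointer.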
@@ -2026,7 +2290,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_free(second_pred_alloc);
+#else
   vpx_free(second_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
@@ -2068,12 +2336,26 @@ static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
 
     // Calculate threshold according to dequant value.
     thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const int shift = 2 * xd->bd - 16;
+      if (shift > 0)
+        thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
     // Adjust threshold according to partition size.
     thresh_ac >>= 8 - (b_width_log2(bsize) + b_height_log2(bsize));
 
     thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      const int shift = 2 * xd->bd - 16;
+      if (shift > 0)
+        thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     thresh_ac = 0;
     thresh_dc = 0;
@@ -2145,7 +2427,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+  uint8_t *tmp_buf = tmp_buf8;
+#else
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   int pred_exists = 0;
   int intpel_mv;
   int64_t rd, tmp_rd, best_rd = INT64_MAX;
@@ -2162,6 +2450,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = tmp_buf8;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
     if (xd->up_available)
@@ -2575,8 +2871,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
@@ -3011,9 +3307,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       // based on qp, activity mask and history
       if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
           (mode_index > MIN_EARLY_TERM_INDEX)) {
-        const int qstep = xd->plane[0].dequant[1];
+        int qstep = xd->plane[0].dequant[1];
         // TODO(debargha): Enhance this by specializing for each mode_index
         int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          qstep >>= (xd->bd - 8);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         if (x->source_variance < UINT_MAX) {
           const int var_adjust = (x->source_variance < 16);
           scale -= var_adjust;
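Note the two scales in play above: the dequant step is a linear pixel-domain quantity, so the early-termination test divides it by 2^(bd-8), while the breakout thresholds are squared quantities and shift by 2*bd-16. A short numeric check of that bookkeeping; the ROUND_POWER_OF_TWO body is assumed to match the usual libvpx helper, and the values are illustrative:

#include <assert.h>

/* Assumed to mirror libvpx's rounding right-shift helper. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  const int bd = 10;            /* a 10-bit build */
  /* Linear quantities scale by (bd - 8) bits... */
  int qstep = 64 << (bd - 8);   /* a hypothetical 10-bit dequant step */
  qstep >>= (bd - 8);           /* back on the 8-bit scale */
  assert(qstep == 64);
  /* ...squared quantities (e.g. thresh_ac = dequant^2 / 9) by twice that. */
  {
    const int shift = 2 * bd - 16;  /* = 4 for 10-bit */
    int thresh_ac = (64 << (bd - 8)) * (64 << (bd - 8)) / 9;
    thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
    assert(thresh_ac == (64 * 64) / 9);  /* matches the 8-bit threshold */
  }
  return 0;
}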
@@ -3329,8 +3630,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  const int intra_cost_penalty =
-      20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
@@ -3748,9 +4049,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       // based on qp, activity mask and history
       if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
           (ref_index > MIN_EARLY_TERM_INDEX)) {
-        const int qstep = xd->plane[0].dequant[1];
+        int qstep = xd->plane[0].dequant[1];
         // TODO(debargha): Enhance this by specializing for each mode_index
         int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          qstep >>= (xd->bd - 8);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         if (x->source_variance < UINT_MAX) {
           const int var_adjust = (x->source_variance < 16);
           scale -= var_adjust;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 52c603fb64..50cb108ddd 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -54,7 +54,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                       BLOCK_SIZE bsize,
                                       PICK_MODE_CONTEXT *ctx,
                                       int64_t best_rd_so_far);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 063c0bafe7..da2b6857ca 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -53,6 +53,12 @@ extern const int16_t *vp9_dct_value_cost_ptr;
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t *vp9_dct_value_cost_high10_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
+extern const int16_t *vp9_dct_value_cost_high12_ptr;
+extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
--
GitLab
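The four new extern pointers in vp9_tokenize.h exist because coefficient magnitudes, and therefore token extra-bit costs, grow with bit depth, so 10- and 12-bit streams need their own precomputed tables. How an encoder-side caller might select the matching pair is sketched below; the selection helper is hypothetical and the TOKENVALUE layout is shown schematically, with only the declarations themselves coming from the patch.

#include <stdint.h>

/* Schematic stand-in for the real TOKENVALUE struct in vp9_tokenize.h. */
typedef struct { int16_t token; int16_t extra; } TOKENVALUE;

/* Declarations as exposed by vp9_tokenize.h (definitions live in libvpx). */
extern const int16_t *vp9_dct_value_cost_ptr;
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
extern const int16_t *vp9_dct_value_cost_high10_ptr;
extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
extern const int16_t *vp9_dct_value_cost_high12_ptr;
extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;

/* Hypothetical helper: pick the cost/token tables for a given bit depth. */
static void get_dct_value_tables(int bit_depth,
                                 const int16_t **cost,
                                 const TOKENVALUE **tokens) {
  switch (bit_depth) {
    case 12:
      *cost = vp9_dct_value_cost_high12_ptr;
      *tokens = vp9_dct_value_tokens_high12_ptr;
      break;
    case 10:
      *cost = vp9_dct_value_cost_high10_ptr;
      *tokens = vp9_dct_value_tokens_high10_ptr;
      break;
    default:  /* 8-bit */
      *cost = vp9_dct_value_cost_ptr;
      *tokens = vp9_dct_value_tokens_ptr;
      break;
  }
}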