diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8e568f83c222693d702a1df82e2729ec4c1ac753..892ad561597360fc1142a5ffbc53a0faea79463c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -529,9 +529,8 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *" specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE -prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" -specialize vp9_block_error mmx sse2 -vp9_block_error_sse2=vp9_block_error_xmm +prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size" +specialize vp9_block_error sse2 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" specialize vp9_subtract_block sse2 diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 35e1d646b9d3665c2188cee8dd961edeaeacba65..f655d456ba5610958a4c14bc4f38b13ffc6c17e8 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -582,7 +582,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, } static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, - TOKENEXTRA **tp, int *totalrate, int *totaldist, + TOKENEXTRA **tp, int *totalrate, int64_t *totaldist, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -1195,7 +1195,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, } static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { + int *rate, int64_t *dist) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1211,7 +1211,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, BLOCK_SIZE_TYPE subsize; 
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - int r = 0, d = 0; + int r = 0; + int64_t d = 0; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -1252,7 +1253,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, get_block_context(x, subsize)); if (mi_row + (bh >> 1) <= cm->mi_rows) { - int rt, dt; + int rt; + int64_t dt; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; @@ -1270,7 +1272,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, get_block_context(x, subsize)); if (mi_col + (bs >> 1) <= cm->mi_cols) { - int rt, dt; + int rt; + int64_t dt; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; @@ -1289,7 +1292,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int x_idx = (i & 1) * (bs >> 2); int y_idx = (i >> 1) * (bs >> 2); int jj = i >> 1, ii = i & 0x01; - int rt, dt; + int rt; + int64_t dt; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -1323,7 +1327,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // results, for encoding speed-up. 
static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, - int *dist) { + int64_t *dist) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; @@ -1334,7 +1338,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, TOKENEXTRA *tp_orig = *tp; int i, pl; BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX, sdist = INT_MAX; + int srate = INT_MAX; + int64_t sdist = INT_MAX; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) { @@ -1351,14 +1356,16 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, || (cpi->sf.use_partitions_greater_than && bsize > cpi->sf.greater_than_block_size)) { if (bsize >= BLOCK_SIZE_SB8X8) { - int r4 = 0, d4 = 0; + int r4 = 0; + int64_t d4 = 0; subsize = get_subsize(bsize, PARTITION_SPLIT); *(get_sb_partitioning(x, bsize)) = subsize; for (i = 0; i < 4; ++i) { int x_idx = (i & 1) * (ms >> 1); int y_idx = (i >> 1) * (ms >> 1); - int r = 0, d = 0; + int r = 0; + int64_t d = 0; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -1386,8 +1393,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, && bsize <= cpi->sf.less_than_block_size)) { // PARTITION_HORZ if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, d2; - int r = 0, d = 0; + int r2, r = 0; + int64_t d2, d = 0; subsize = get_subsize(bsize, PARTITION_HORZ); *(get_sb_index(xd, subsize)) = 0; pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, @@ -1418,13 +1425,15 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, // PARTITION_VERT if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2, d2; + int r2; + int64_t d2; subsize = get_subsize(bsize, PARTITION_VERT); *(get_sb_index(xd, subsize)) = 0; pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, get_block_context(x, subsize)); if (mi_col + 
(ms >> 1) < cm->mi_cols) { - int r = 0, d = 0; + int r = 0; + int64_t d = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); @@ -1450,7 +1459,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, // PARTITION_NONE if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { - int r, d; + int r; + int64_t d; pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, get_block_context(x, bsize)); if (bsize >= BLOCK_SIZE_SB8X8) { @@ -1497,7 +1507,8 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, // Code each SB in the row for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) { - int dummy_rate, dummy_dist; + int dummy_rate; + int64_t dummy_dist; if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || cpi->sf.use_one_partition_size_always ) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index be6c191eb95577e2cef1160704235ccbfd54b096..a48e7dbb38a57f7e2e4b272325d09d526eead70e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -274,12 +274,14 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } } -int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) { - int i, error = 0; +int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, + intptr_t block_size) { + int i; + int64_t error = 0; for (i = 0; i < block_size; i++) { int this_diff = coeff[i] - dqcoeff[i]; - error += this_diff * this_diff; + error += (unsigned)this_diff * this_diff; } return error; @@ -417,7 +419,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, - int *d, int *distortion, + int64_t *d, int64_t *distortion, int *s, int *skip, int64_t txfm_cache[NB_TXFM_MODES], TX_SIZE 
max_txfm_size) { @@ -496,27 +498,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_4X4][1] : rd[TX_8X8][1]; } -static int block_error(int16_t *coeff, int16_t *dqcoeff, - int block_size, int shift) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - int this_diff = coeff[i] - dqcoeff[i]; - error += (unsigned)this_diff * this_diff; - } - error >>= shift; - - return error > INT_MAX ? INT_MAX : (int)error; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { +static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), shift); + return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, + 16 << (bwl + bhl)) >> shift; } -static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { +static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); int64_t sum = 0; int plane; @@ -524,11 +514,10 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { for (plane = 1; plane < MAX_MB_PLANE; plane++) { const int subsampling = x->e_mbd.plane[plane].subsampling_x + x->e_mbd.plane[plane].subsampling_y; - sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl - subsampling), 0); + sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, + 16 << (bwl + bhl - subsampling)); } - sum >>= shift; - return sum > INT_MAX ? 
INT_MAX : (int)sum; + return sum >> shift; } struct rdcost_block_args { @@ -586,7 +575,8 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, + int *rate, int64_t *distortion, + int *skippable, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; xd->mode_info_context->mbmi.txfm_size = tx_size; @@ -602,11 +592,12 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int *distortion, + MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, BLOCK_SIZE_TYPE bs, int64_t txfm_cache[NB_TXFM_MODES]) { VP9_COMMON *const cm = &cpi->common; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; + int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB]; + int64_t d[TX_SIZE_MAX_SB]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -651,13 +642,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, - int *bestdistortion, + int64_t *bestdistortion, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = INT64_MAX; int rate = 0; - int distortion; + int64_t distortion; VP9_COMMON *const cm = &cpi->common; const int src_stride = x->plane[0].src.stride; uint8_t *src, *dst; @@ -777,7 +768,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, - int *Distortion, int64_t best_rd) { + int64_t *Distortion, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; @@ -785,7 +776,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int bh = 
1 << b_height_log2(bsize); int idx, idy; int cost = 0; - int distortion = 0; + int64_t distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; @@ -802,7 +793,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int UNINITIALIZED_IS_SAFE(d); + int64_t UNINITIALIZED_IS_SAFE(d); i = idy * 2 + idx; if (xd->frame_type == KEY_FRAME) { @@ -844,14 +835,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[NB_TXFM_MODES]) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); MACROBLOCKD *const xd = &x->e_mbd; - int this_rate, this_rate_tokenonly; - int this_distortion, s; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion; int64_t best_rd = INT64_MAX, this_rd; TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); int i; @@ -912,7 +903,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, + int *rate, int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, TX_SIZE uv_tx_size) { MACROBLOCKD *const xd = &x->e_mbd; @@ -927,7 +918,7 @@ static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, + int *rate, int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -952,13 +943,13 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, 
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); int64_t best_rd = INT64_MAX, this_rd; - int this_rate_tokenonly, this_rate; - int this_distortion, s; + int this_rate_tokenonly, this_rate, s; + int64_t this_distortion; for (mode = DC_PRED; mode <= TM_PRED; mode++) { x->e_mbd.mode_info_context->mbmi.uv_mode = mode; @@ -1101,7 +1092,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, MACROBLOCK *x, int i, int *labelyrate, - int *distortion, + int64_t *distortion, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl) { int k; @@ -1126,7 +1117,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, xd->plane[0].dst.buf, xd->plane[0].dst.stride); - int thisdistortion = 0; + int64_t thisdistortion = 0; int thisrate = 0; *labelyrate = 0; @@ -1189,7 +1180,7 @@ typedef struct { int64_t segment_rd; int r; - int d; + int64_t d; int segment_yrate; MB_PREDICTION_MODE modes[4]; int_mv mvs[4], second_mvs[4]; @@ -1281,21 +1272,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { - int i, j; - int br = 0, bd = 0; + int i, j, br = 0, rate = 0, sbr = 0, idx, idy; + int64_t bd = 0, sbd = 0; MB_PREDICTION_MODE this_mode; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; const int label_count = 4; int64_t this_segment_rd = 0, other_segment_rd; int label_mv_thresh; - int rate = 0; - int sbr = 0, sbd = 0; int segmentyrate = 0; int best_eobs[4] = { 0 }; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; int bwl = b_width_log2(bsize), bw = 1 << bwl; int bhl = b_height_log2(bsize), bh = 1 << bhl; - int idx, idy; vp9_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT t_above[4], t_left[4]; ENTROPY_CONTEXT 
t_above_b[4], t_left_b[4]; @@ -1340,7 +1328,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int64_t this_rd; - int distortion; + int64_t distortion; int labelyrate; ENTROPY_CONTEXT t_above_s[4], t_left_s[4]; const struct buf_2d orig_src = x->plane[0].src; @@ -1527,7 +1515,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_rd, int *returntotrate, int *returnyrate, - int *returndistortion, + int64_t *returndistortion, int *skippable, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { @@ -1921,7 +1909,7 @@ static double model_dist_norm(double x) { } static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int *dist) { + int *rate, int64_t *dist) { // This function models the rate and distortion for a Laplacian // source with given variance when quantized with a uniform quantizer // with given stepsize. The closed form expression is: @@ -1958,12 +1946,13 @@ static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int *out_dist_sum) { + int *out_rate_sum, int64_t *out_dist_sum) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
unsigned int sse; - int i, rate_sum = 0, dist_sum = 0; + int i, rate_sum = 0; + int64_t dist_sum = 0; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; @@ -1973,7 +1962,8 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, const int bw = plane_block_width(bsize, pd); const int bh = plane_block_height(bsize, pd); const enum BlockSize bs = get_block_size(bw, bh); - int rate, dist; + int rate; + int64_t dist; cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist); @@ -2238,9 +2228,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[], - int *rate2, int *distortion, int *skippable, - int *rate_y, int *distortion_y, - int *rate_uv, int *distortion_uv, + int *rate2, int64_t *distortion, + int *skippable, + int *rate_y, int64_t *distortion_y, + int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, INTERPOLATIONFILTERTYPE *best_filter, int_mv *frame_mv, @@ -2344,7 +2335,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *best_filter = EIGHTTAP; } else { int i, newbest; - int tmp_rate_sum = 0, tmp_dist_sum = 0; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { int rs = 0; const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; @@ -2359,7 +2351,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (interpolating_intpel_seen && is_intpel_interp) { rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum); } else { - int rate_sum = 0, dist_sum = 0; + int rate_sum = 0; + int64_t dist_sum = 0; vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum); @@ -2503,19 +2496,20 @@ 
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *returnrate, int *returndist, + int *returnrate, int64_t *returndist, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - int rate_y = 0, rate_uv; - int rate_y_tokenonly = 0, rate_uv_tokenonly; - int dist_y = 0, dist_uv; - int y_skip = 0, uv_skip; + int rate_y = 0, rate_uv = 0; + int rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int64_t dist_y = 0, dist_uv = 0; + int y_skip = 0, uv_skip = 0; int64_t txfm_cache[NB_TXFM_MODES], err; MB_PREDICTION_MODE mode; TX_SIZE txfm_size; - int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y; + int rate4x4_y, rate4x4_y_tokenonly; + int64_t dist4x4_y; int64_t err4x4 = INT64_MAX; int i; @@ -2566,7 +2560,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, - int *returndistortion, + int64_t *returndistortion, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *cm = &cpi->common; @@ -2601,7 +2595,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB]; - int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB]; + int64_t dist_uv[TX_SIZE_MAX_SB]; + int skip_uv[TX_SIZE_MAX_SB]; MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB]; struct scale_factors scale_factor[4]; unsigned int ref_frame_mask = 0; @@ -2704,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int disable_skip = 0; int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable; int64_t txfm_cache[NB_TXFM_MODES]; int i; @@ -2891,11 
+2886,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 = distortion_y + distortion_uv; } else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->ref_frame[1] > 0; - int rate, distortion; + int rate; + int64_t distortion; int64_t this_rd_thresh; int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; - int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int64_t tmp_best_distortion = INT_MAX; + int tmp_best_skippable = 0; int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index dcf5d00e9f04d97b3b8b49f4476e94815d3b6ace..67ef73db760ec561d8f06c6c435f008f6deeae34 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -20,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx); void vp9_init_me_luts(); diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm deleted file mode 100644 index 734cb61cae9fb43acec2a4eaafd2ec586631527f..0000000000000000000000000000000000000000 --- a/vp9/encoder/x86/vp9_encodeopt.asm +++ /dev/null @@ -1,125 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_xmm) PRIVATE -sym(vp9_block_error_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prologue - - mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - - movdqa xmm0, [rsi] - movdqa xmm1, [rdi] - - movdqa xmm2, [rsi+16] - movdqa xmm3, [rdi+16] - - psubw xmm0, xmm1 - psubw xmm2, xmm3 - - pmaddwd xmm0, xmm0 - pmaddwd xmm2, xmm2 - - paddd xmm0, xmm2 - - pxor xmm5, xmm5 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - psrldq xmm0, 8 - paddd xmm0, xmm1 - - movq rax, xmm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_mmx) PRIVATE -sym(vp9_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 
0000000000000000000000000000000000000000..bb1ea71b9b54fb831646ec279961aaa51fc043f5 --- /dev/null +++ b/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,57 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 6, uqc, dqc, size + pxor m4, m4 ; accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m0, [uqcq+sizeq*2] + mova m2, [dqcq+sizeq*2] + mova m1, [uqcq+sizeq*2+mmsize] + mova m3, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 
15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + ; accumulate in 64bit + punpckldq m2, m0, m5 + punpckhdq m0, m5 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + paddq m4, m2 + paddq m4, m0 + paddq m4, m3 + paddq m4, m1 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + paddq m4, m5 +%if ARCH_X86_64 + movq rax, m4 +%else + pshufd m5, m4, 0x1 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index b9afe96b2f6a1baabf379d4364aa49cbf4ed18e8..a1e93753de5f0a17812eef73ced5fab757a7bc01 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -85,12 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c