/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
12
#include <math.h>
13

14
#include "./vp9_rtcd.h"
15
#include "./vpx_dsp_rtcd.h"
16

17
#include "vpx_dsp/vpx_dsp_common.h"
18
#include "vpx_mem/vpx_mem.h"
19
#include "vpx_ports/mem.h"
20
#include "vpx_ports/system_state.h"
21 22 23

#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
24
#include "vp9/common/vp9_entropymode.h"
25 26 27 28
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
29 30
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
Scott LaVarnway's avatar
Scott LaVarnway committed
31
#include "vp9/common/vp9_scan.h"
32 33
#include "vp9/common/vp9_seg_common.h"

Dmitry Kovalev's avatar
Dmitry Kovalev committed
34
#include "vp9/encoder/vp9_cost.h"
35
#include "vp9/encoder/vp9_encodemb.h"
36
#include "vp9/encoder/vp9_encodemv.h"
Dmitry Kovalev's avatar
Dmitry Kovalev committed
37
#include "vp9/encoder/vp9_encoder.h"
38
#include "vp9/encoder/vp9_mcomp.h"
39
#include "vp9/encoder/vp9_quantize.h"
40
#include "vp9/encoder/vp9_ratectrl.h"
41
#include "vp9/encoder/vp9_rd.h"
42
#include "vp9/encoder/vp9_rdopt.h"
43
#include "vp9/encoder/vp9_aq_variance.h"
Paul Wilkins's avatar
Paul Wilkins committed
44

clang-format's avatar
clang-format committed
45 46 47 48 49 50
// Reference-frame bitmasks: bit N corresponds to the reference frame whose
// enum value is N. Of LAST/GOLDEN/ALTREF/INTRA, each *_MODE_MASK sets the bit
// of every frame except the one the mask is named after.
#define LAST_FRAME_MODE_MASK \
  ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
#define GOLDEN_FRAME_MODE_MASK \
  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
#define ALT_REF_MODE_MASK \
  ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))

// ALTREF_FRAME bit plus bit 0.
// NOTE(review): bit 0 presumably corresponds to INTRA_FRAME - confirm against
// the MV_REFERENCE_FRAME definition.
#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)

// Tuning constants; their consumers are outside this part of the file.
#define MIN_EARLY_TERM_INDEX 3
#define NEW_MV_DISCOUNT_FACTOR 8
Paul Wilkins's avatar
Paul Wilkins committed
56

57
typedef struct {
58
  PREDICTION_MODE mode;
59 60 61
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;

clang-format's avatar
clang-format committed
62
typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
63

Alex Converse's avatar
Alex Converse committed
64
struct rdcost_block_args {
65
  const VP9_COMP *cpi;
Alex Converse's avatar
Alex Converse committed
66 67 68 69 70 71 72 73
  MACROBLOCK *x;
  ENTROPY_CONTEXT t_above[16];
  ENTROPY_CONTEXT t_left[16];
  int this_rate;
  int64_t this_dist;
  int64_t this_sse;
  int64_t this_rd;
  int64_t best_rd;
Alex Converse's avatar
Alex Converse committed
74
  int exit_early;
75
  int use_fast_coef_costing;
76
  const scan_order *so;
77
  uint8_t skippable;
Alex Converse's avatar
Alex Converse committed
78 79
};

80
#define LAST_NEW_MV_INDEX 6
81
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
clang-format's avatar
clang-format committed
82 83 84
  { NEARESTMV, { LAST_FRAME, NONE } },
  { NEARESTMV, { ALTREF_FRAME, NONE } },
  { NEARESTMV, { GOLDEN_FRAME, NONE } },
85

clang-format's avatar
clang-format committed
86
  { DC_PRED, { INTRA_FRAME, NONE } },
87

clang-format's avatar
clang-format committed
88 89 90
  { NEWMV, { LAST_FRAME, NONE } },
  { NEWMV, { ALTREF_FRAME, NONE } },
  { NEWMV, { GOLDEN_FRAME, NONE } },
91

clang-format's avatar
clang-format committed
92 93 94
  { NEARMV, { LAST_FRAME, NONE } },
  { NEARMV, { ALTREF_FRAME, NONE } },
  { NEARMV, { GOLDEN_FRAME, NONE } },
Jingning Han's avatar
Jingning Han committed
95

clang-format's avatar
clang-format committed
96 97 98
  { ZEROMV, { LAST_FRAME, NONE } },
  { ZEROMV, { GOLDEN_FRAME, NONE } },
  { ZEROMV, { ALTREF_FRAME, NONE } },
Jingning Han's avatar
Jingning Han committed
99

clang-format's avatar
clang-format committed
100 101
  { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
102

clang-format's avatar
clang-format committed
103
  { TM_PRED, { INTRA_FRAME, NONE } },
104

clang-format's avatar
clang-format committed
105 106 107 108
  { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
  { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
  { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
109

clang-format's avatar
clang-format committed
110 111
  { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
  { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
112

clang-format's avatar
clang-format committed
113 114 115 116 117 118 119 120
  { H_PRED, { INTRA_FRAME, NONE } },
  { V_PRED, { INTRA_FRAME, NONE } },
  { D135_PRED, { INTRA_FRAME, NONE } },
  { D207_PRED, { INTRA_FRAME, NONE } },
  { D153_PRED, { INTRA_FRAME, NONE } },
  { D63_PRED, { INTRA_FRAME, NONE } },
  { D117_PRED, { INTRA_FRAME, NONE } },
  { D45_PRED, { INTRA_FRAME, NONE } },
121 122
};

123
static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
clang-format's avatar
clang-format committed
124 125 126
  { { LAST_FRAME, NONE } },           { { GOLDEN_FRAME, NONE } },
  { { ALTREF_FRAME, NONE } },         { { LAST_FRAME, ALTREF_FRAME } },
  { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } },
John Koleszar's avatar
John Koleszar committed
127 128
};

clang-format's avatar
clang-format committed
129 130
// Exchanges coefficient buffer slots m and n of ctx for every plane in
// [min_plane, max_plane). As a side effect the macroblock's working pointers
// (coeff, qcoeff, dqcoeff, eobs) are left pointing at the buffers that were in
// slot m on entry, i.e. the ones that end up in slot n.
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
                           int min_plane, int max_plane) {
  int plane;

  for (plane = min_plane; plane < max_plane; ++plane) {
    struct macroblock_plane *const enc_plane = &x->plane[plane];
    struct macroblockd_plane *const dec_plane = &x->e_mbd.plane[plane];

    // Each buffer kind is rotated independently, using the macroblock's own
    // pointer as the temporary.
    enc_plane->coeff = ctx->coeff_pbuf[plane][m];
    ctx->coeff_pbuf[plane][m] = ctx->coeff_pbuf[plane][n];
    ctx->coeff_pbuf[plane][n] = enc_plane->coeff;

    enc_plane->qcoeff = ctx->qcoeff_pbuf[plane][m];
    ctx->qcoeff_pbuf[plane][m] = ctx->qcoeff_pbuf[plane][n];
    ctx->qcoeff_pbuf[plane][n] = enc_plane->qcoeff;

    dec_plane->dqcoeff = ctx->dqcoeff_pbuf[plane][m];
    ctx->dqcoeff_pbuf[plane][m] = ctx->dqcoeff_pbuf[plane][n];
    ctx->dqcoeff_pbuf[plane][n] = dec_plane->dqcoeff;

    enc_plane->eobs = ctx->eobs_pbuf[plane][m];
    ctx->eobs_pbuf[plane][m] = ctx->eobs_pbuf[plane][n];
    ctx->eobs_pbuf[plane][n] = enc_plane->eobs;
  }
}

clang-format's avatar
clang-format committed
154 155 156 157
// Estimates rate and distortion for a whole block from prediction-error
// statistics, without running the transform/quantization pipeline.
//
// For every plane the block is split into units of the largest transform size
// for that plane; the variance function of that unit size yields var and sse
// per unit. Per-unit sse goes to x->bsse[], a per-unit transform-skip hint goes
// to x->skip_txfm[] (only when x->select_tx_size is 0), and the luma sse is
// accumulated into x->pred_sse[ref].
//
// Outputs:
//   *out_rate_sum  - modelled rate summed over planes.
//   *out_dist_sum  - modelled distortion summed over planes, scaled by 16.
//   *skip_txfm_sb  - 1 only if every unit of every plane met the low-error
//                    test, otherwise 0.
//   *skip_sse_sb   - total sse over all planes, scaled by 16.
static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                            MACROBLOCKD *xd, int *out_rate_sum,
                            int64_t *out_dist_sum, int *skip_txfm_sb,
                            int64_t *skip_sse_sb) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  int i;
  int64_t rate_sum = 0;
  int64_t dist_sum = 0;
  const int ref = xd->mi[0]->ref_frame[0];
  unsigned int sse;
  unsigned int var = 0;
  int64_t total_sse = 0;
  int skip_flag = 1;
  const int shift = 6;
  int64_t dist;
  const int dequant_shift =
#if CONFIG_VP9_HIGHBITDEPTH
      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
#endif  // CONFIG_VP9_HIGHBITDEPTH
                                                    3;
  unsigned int qstep_vec[MAX_MB_PLANE];
  unsigned int nlog2_vec[MAX_MB_PLANE];
  unsigned int sum_sse_vec[MAX_MB_PLANE];
  int any_zero_sum_sse = 0;

  x->pred_sse[ref] = 0;

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    struct macroblock_plane *const p = &x->plane[i];
    struct macroblockd_plane *const pd = &xd->plane[i];
    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
    const int64_t dc_thr = p->quant_thred[0] >> shift;
    const int64_t ac_thr = p->quant_thred[1] >> shift;
    unsigned int sum_sse = 0;
    // The low thresholds are used to measure if the prediction errors are
    // low enough so that we can skip the mode search.
    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
    const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
    // Number of units across / down the plane block, and log2 of the unit
    // width / height in pixels. Width quantities use the width lookup and
    // height quantities the height lookup.
    // NOTE(review): units come from txsize_to_bsize and are expected to be
    // square (sum_squares_visible asserts this for transform blocks), in which
    // case this is numerically identical to mixing the two lookups.
    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
    int bh = 1 << (b_height_log2_lookup[bs] - b_height_log2_lookup[unit_size]);
    int idx, idy;
    int lw = b_width_log2_lookup[unit_size] + 2;
    int lh = b_height_log2_lookup[unit_size] + 2;

    for (idy = 0; idy < bh; ++idy) {
      for (idx = 0; idx < bw; ++idx) {
        // Row offset scales with the unit height, column offset with the unit
        // width, for both source and prediction.
        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lw);
        // Up to 2x2 units per plane; slot (i << 2) + block_idx in the
        // per-unit arrays.
        int block_idx = (idy << 1) + idx;
        int low_err_skip = 0;

        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride, dst, pd->dst.stride,
                                        &sse);
        x->bsse[(i << 2) + block_idx] = sse;
        sum_sse += sse;

        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
        if (!x->select_tx_size) {
          // Check if all ac coefficients can be quantized to zero.
          if (var < ac_thr || var == 0) {
            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;

            // Check if dc coefficient can be quantized to zero.
            if (sse - var < dc_thr || sse == var) {
              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;

              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
                low_err_skip = 1;
            }
          }
        }

        // A single unit that fails the low-error test clears the flag for the
        // whole block.
        if (skip_flag && !low_err_skip) skip_flag = 0;

        if (i == 0) x->pred_sse[ref] += sse;
      }
    }

    total_sse += sum_sse;
    sum_sse_vec[i] = sum_sse;
    any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
    qstep_vec[i] = pd->dequant[1] >> dequant_shift;
    nlog2_vec[i] = num_pels_log2_lookup[bs];
  }

  // Fast approximate the modelling function.
  if (cpi->sf.simple_model_rd_from_var) {
    for (i = 0; i < MAX_MB_PLANE; ++i) {
      int64_t rate;
      const int64_t square_error = sum_sse_vec[i];
      int quantizer = qstep_vec[i];

      if (quantizer < 120)
        rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
      else
        rate = 0;
      dist = (square_error * quantizer) >> 8;
      rate_sum += rate;
      dist_sum += dist;
    }
  } else {
    // The per-plane scalar model is used when any plane has zero sse;
    // otherwise all planes go through the vector variant in one call.
    if (any_zero_sum_sse) {
      for (i = 0; i < MAX_MB_PLANE; ++i) {
        int rate;
        vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
                                     &rate, &dist);
        rate_sum += rate;
        dist_sum += dist;
      }
    } else {
      vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
                                       &rate_sum, &dist_sum);
    }
  }

  *skip_txfm_sb = skip_flag;
  *skip_sse_sb = total_sse << 4;
  *out_rate_sum = (int)rate_sum;
  *out_dist_sum = dist_sum << 4;
}

279 280
#if CONFIG_VP9_HIGHBITDEPTH
// Reference implementation of the high-bitdepth block error.
//
// Returns the sum over block_size coefficients of (coeff - dqcoeff)^2 and
// stores the sum of coeff^2 in *ssz. Both sums are rounded and shifted right
// by 2 * (bd - 8) bits before being reported.
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
                                 const tran_low_t *dqcoeff, intptr_t block_size,
                                 int64_t *ssz, int bd) {
  const int shift = 2 * (bd - 8);
  // Half of the divisor, so that the shift rounds to nearest; zero when no
  // shift is applied.
  const int rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
  int64_t err_sum = 0;
  int64_t sq_sum = 0;
  int k;

  for (k = 0; k < block_size; k++) {
    const int64_t delta = coeff[k] - dqcoeff[k];
    const int64_t value = coeff[k];
    err_sum += delta * delta;
    sq_sum += value * value;
  }
  assert(err_sum >= 0 && sq_sum >= 0);

  *ssz = (sq_sum + rounding) >> shift;
  return (err_sum + rounding) >> shift;
}

301 302 303 304 305
// Picks the block-error routine by bit depth: bd == 8 goes to
// vp9_block_error(), every other depth to vp9_highbd_block_error().
// Arguments and the return value are passed through unchanged.
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bd) {
  if (bd != 8)
    return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);

  return vp9_block_error(coeff, dqcoeff, block_size, ssz);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

// Reference implementation of the block error.
//
// Returns the sum over block_size coefficients of (coeff - dqcoeff)^2 and
// stores the sum of coeff^2 in *ssz.
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz) {
  int i;
  int64_t error = 0, sqcoeff = 0;

  for (i = 0; i < block_size; i++) {
    // Square in 64 bits: squaring in int could overflow (undefined behavior)
    // for large differences before the value reaches the 64-bit accumulator.
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
  }

  *ssz = sqcoeff;
  return error;
}
327

328
// Reference implementation of the block error without the source-energy
// output: returns the sum over block_size coefficients of
// (coeff - dqcoeff)^2.
int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                             int block_size) {
  int i;
  int64_t error = 0;

  for (i = 0; i < block_size; i++) {
    // Square in 64 bits: squaring in int could overflow (undefined behavior)
    // for large differences before the value reaches the 64-bit accumulator.
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }

  return error;
}
340

341 342 343 344 345
/* The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 * were non-zero). */
346
static const int16_t band_counts[TX_SIZES][8] = {
clang-format's avatar
clang-format committed
347 348 349
  { 1, 2, 3, 4, 3, 16 - 13, 0 },
  { 1, 2, 3, 4, 11, 64 - 21, 0 },
  { 1, 2, 3, 4, 11, 256 - 21, 0 },
350
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
351
};
clang-format's avatar
clang-format committed
352 353
// Returns the cost of coding the quantized coefficients of one transform
// block, as the sum of per-token costs from x->token_costs plus the value
// returned by vp9_get_token_cost() for each coefficient.
//
//   x        - macroblock holding qcoeff, eobs and the token cost tables.
//   plane    - plane index; selects the plane type and coefficient buffers.
//   block    - block index used to locate eob and qcoeff within the plane.
//   tx_size  - transform size; must match the mode info (asserted).
//   pt       - context used for the first (dc) token.
//   scan     - scan order mapping coding position to coefficient index.
//   nb       - neighbor table passed to get_coef_context().
//   use_fast_coef_costing - when non-zero, contexts for ac tokens are derived
//              from the previous token only instead of get_coef_context().
static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
                       int pt, const int16_t *scan, const int16_t *nb,
                       int use_fast_coef_costing) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MODE_INFO *mi = xd->mi[0];
  const struct macroblock_plane *p = &x->plane[plane];
  const PLANE_TYPE type = get_plane_type(plane);
  // Start at entry 1: the dc token is costed on its own below before the band
  // countdown begins.
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = p->eobs[block];
  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
      x->token_costs[tx_size][type][is_inter_block(mi)];
  uint8_t token_cache[32 * 32];
  int cost;
#if CONFIG_VP9_HIGHBITDEPTH
  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
#else
  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
#endif

  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y
             ? mi->tx_size == tx_size
             : get_uv_tx_size(mi, &xd->plane[plane]) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][EOB_TOKEN];
  } else {
    if (use_fast_coef_costing) {
      int band_left = *band_count++;
      int c;

      // dc token
      int v = qcoeff[0];
      int16_t prev_t;
      cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
      cost += (*token_costs)[0][pt][prev_t];

      token_cache[0] = vp9_pt_energy_class[prev_t];
      ++token_costs;

      // ac tokens
      for (c = 1; c < eob; c++) {
        const int rc = scan[c];
        int16_t t;

        v = qcoeff[rc];
        cost += vp9_get_token_cost(v, &t, cat6_high_cost);
        cost += (*token_costs)[!prev_t][!prev_t][t];
        prev_t = t;
        // Move to the next band's cost table once this band is exhausted.
        if (!--band_left) {
          band_left = *band_count++;
          ++token_costs;
        }
      }

      // eob token
      // band_left is 0 only when the terminator entry of band_counts was
      // reached, i.e. the block was filled to its last coefficient.
      if (band_left) cost += (*token_costs)[0][!prev_t][EOB_TOKEN];

    } else {  // !use_fast_coef_costing
      int band_left = *band_count++;
      int c;

      // dc token
      int v = qcoeff[0];
      int16_t tok;
      unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
      cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
      cost += (*token_costs)[0][pt][tok];

      token_cache[0] = vp9_pt_energy_class[tok];
      ++token_costs;

      tok_cost_ptr = &((*token_costs)[!tok]);

      // ac tokens
      for (c = 1; c < eob; c++) {
        const int rc = scan[c];

        v = qcoeff[rc];
        cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
        pt = get_coef_context(nb, token_cache, c);
        cost += (*tok_cost_ptr)[pt][tok];
        token_cache[rc] = vp9_pt_energy_class[tok];
        // Move to the next band's cost table once this band is exhausted.
        if (!--band_left) {
          band_left = *band_count++;
          ++token_costs;
        }
        tok_cost_ptr = &((*token_costs)[!tok]);
      }

      // eob token
      // band_left is 0 only when the terminator entry of band_counts was
      // reached, i.e. the block was filled to its last coefficient.
      if (band_left) {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[0][pt][EOB_TOKEN];
      }
    }
  }

  return cost;
}
454

455 456 457 458 459 460 461 462 463 464
// Returns plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim:
// the count of 4x4 columns (or rows) from position blk_dim to the frame edge.
// NOTE(review): the shift by 5 presumably converts 1/8-pel luma units into
// 4x4-block units - confirm against the definition of mb_to_*_edge.
static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
                                  int subsampling_dim, int blk_dim) {
  const int edge_offset_4x4 = mb_to_edge_dim >> (5 + subsampling_dim);
  return plane_4x4_dim + edge_offset_4x4 - blk_dim;
}

// Compute the pixel domain sum square error on all visible 4x4s in the
// transform block.
static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
                          const struct macroblockd_plane *const pd,
                          const uint8_t *src, const int src_stride,
clang-format's avatar
clang-format committed
465 466
                          const uint8_t *dst, const int dst_stride, int blk_row,
                          int blk_col, const BLOCK_SIZE plane_bsize,
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491
                          const BLOCK_SIZE tx_bsize) {
  unsigned int sse = 0;
  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
                                            pd->subsampling_x, blk_col);
  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
                                             pd->subsampling_y, blk_row);
  if (tx_bsize == BLOCK_4X4 ||
      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
  } else {
    const vpx_variance_fn_t vf_4x4 = cpi->fn_ptr[BLOCK_4X4].vf;
    int r, c;
    unsigned this_sse = 0;
    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
    sse = 0;
    // if we are in the unrestricted motion border.
    for (r = 0; r < max_r; ++r) {
      // Skip visiting the sub blocks that are wholly within the UMV.
      for (c = 0; c < max_c; ++c) {
        vf_4x4(src + r * src_stride * 4 + c * 4, src_stride,
clang-format's avatar
clang-format committed
492
               dst + r * dst_stride * 4 + c * 4, dst_stride, &this_sse);
493 494 495 496 497 498 499 500
        sse += this_sse;
      }
    }
  }
  return sse;
}

// Compute the squares sum squares on all visible 4x4s in the transform block.
501 502 503 504 505 506 507
static int64_t sum_squares_visible(const MACROBLOCKD *xd,
                                   const struct macroblockd_plane *const pd,
                                   const int16_t *diff, const int diff_stride,
                                   int blk_row, int blk_col,
                                   const BLOCK_SIZE plane_bsize,
                                   const BLOCK_SIZE tx_bsize) {
  int64_t sse;
508 509 510 511 512 513 514 515 516 517
  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
                                            pd->subsampling_x, blk_col);
  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
                                             pd->subsampling_y, blk_row);
  if (tx_bsize == BLOCK_4X4 ||
      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
518 519
    assert(tx_4x4_w == tx_4x4_h);
    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
520 521 522 523 524 525 526 527 528
  } else {
    int r, c;
    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
    sse = 0;
    // if we are in the unrestricted motion border.
    for (r = 0; r < max_r; ++r) {
      // Skip visiting the sub blocks that are wholly within the UMV.
      for (c = 0; c < max_c; ++c) {
529 530
        sse += (int64_t)vpx_sum_squares_2d_i16(
            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
531 532 533 534 535 536 537
      }
    }
  }
  return sse;
}

static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
clang-format's avatar
clang-format committed
538 539 540 541
                       BLOCK_SIZE plane_bsize, int block, int blk_row,
                       int blk_col, TX_SIZE tx_size, int64_t *out_dist,
                       int64_t *out_sse) {
  MACROBLOCKD *const xd = &x->e_mbd;
542 543
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
544

545
  if (x->block_tx_domain) {
546 547 548 549 550
    const int ss_txfrm_size = tx_size << 1;
    int64_t this_sse;
    const int shift = tx_size == TX_32X32 ? 0 : 2;
    const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
    const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
551
#if CONFIG_VP9_HIGHBITDEPTH
552 553 554 555
    const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
    *out_dist = vp9_highbd_block_error_dispatch(
                    coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >>
                shift;
556
#else
557 558 559
    *out_dist =
        vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >>
        shift;
560
#endif  // CONFIG_VP9_HIGHBITDEPTH
561
    *out_sse = this_sse >> shift;
562

563 564 565 566 567
    if (x->skip_encode && !is_inter_block(xd->mi[0])) {
      // TODO(jingning): tune the model to better capture the distortion.
      const int64_t p =
          (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
#if CONFIG_VP9_HIGHBITDEPTH
clang-format's avatar
clang-format committed
568
          (shift + 2 + (bd - 8) * 2);
569
#else
clang-format's avatar
clang-format committed
570
          (shift + 2);
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
#endif  // CONFIG_VP9_HIGHBITDEPTH
      *out_dist += (p >> 4);
      *out_sse += p;
    }
  } else {
    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
    const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
    const int src_stride = p->src.stride;
    const int dst_stride = pd->dst.stride;
    const int src_idx = 4 * (blk_row * src_stride + blk_col);
    const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
    const uint8_t *src = &p->src.buf[src_idx];
    const uint8_t *dst = &pd->dst.buf[dst_idx];
    const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    const uint16_t *eob = &p->eobs[block];
    unsigned int tmp;

clang-format's avatar
clang-format committed
588 589
    tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
                    blk_col, plane_bsize, tx_bsize);
590 591 592
    *out_sse = (int64_t)tmp * 16;

    if (*eob) {
593
#if CONFIG_VP9_HIGHBITDEPTH
594 595
      DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
      uint8_t *recon = (uint8_t *)recon16;
Alex Converse's avatar
Alex Converse committed
596
#else
597
      DECLARE_ALIGNED(16, uint8_t, recon[1024]);
598
#endif  // CONFIG_VP9_HIGHBITDEPTH
599 600 601

#if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
602
        vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
603
                                 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
604
        if (xd->lossless) {
605
          vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
606 607 608
        } else {
          switch (tx_size) {
            case TX_4X4:
609
              vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
610 611
              break;
            case TX_8X8:
612
              vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
613 614
              break;
            case TX_16X16:
615
              vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
616 617
              break;
            case TX_32X32:
618
              vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
619
              break;
clang-format's avatar
clang-format committed
620
            default: assert(0 && "Invalid transform size");
621 622
          }
        }
623
        recon = CONVERT_TO_BYTEPTR(recon16);
624
      } else {
clang-format's avatar
clang-format committed
625
#endif  // CONFIG_VP9_HIGHBITDEPTH
626
        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
627
        switch (tx_size) {
clang-format's avatar
clang-format committed
628 629 630
          case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
          case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
          case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break;
631 632 633 634
          case TX_4X4:
            // this is like vp9_short_idct4x4 but has a special case around
            // eob<=1, which is significant (not just an optimization) for
            // the lossless case.
635
            x->inv_txfm_add(dqcoeff, recon, 32, *eob);
636
            break;
clang-format's avatar
clang-format committed
637
          default: assert(0 && "Invalid transform size"); break;
638 639 640 641 642
        }
#if CONFIG_VP9_HIGHBITDEPTH
      }
#endif  // CONFIG_VP9_HIGHBITDEPTH

clang-format's avatar
clang-format committed
643 644
      tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col,
                      plane_bsize, tx_bsize);
645 646 647
    }

    *out_dist = (int64_t)tmp * 16;
648
  }
Deb Mukherjee's avatar
Deb Mukherjee committed
649 650
}

Jingning Han's avatar
Jingning Han committed
651
// Returns the coefficient cost of one transform block by forwarding to
// cost_coeffs() with the macroblock, scan tables and costing mode taken from
// args.
static int rate_block(int plane, int block, TX_SIZE tx_size, int coeff_ctx,
                      struct rdcost_block_args *args) {
  const scan_order *const order = args->so;

  return cost_coeffs(args->x, plane, block, tx_size, coeff_ctx, order->scan,
                     order->neighbors, args->use_fast_coef_costing);
}

657 658
static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
Deb Mukherjee's avatar
Deb Mukherjee committed
659 660 661
  struct rdcost_block_args *args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
Scott LaVarnway's avatar
Scott LaVarnway committed
662
  MODE_INFO *const mi = xd->mi[0];
663
  int64_t rd1, rd2, rd;
Alex Converse's avatar
Alex Converse committed
664 665 666
  int rate;
  int64_t dist;
  int64_t sse;
clang-format's avatar
clang-format committed
667 668
  const int coeff_ctx =
      combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]);
Deb Mukherjee's avatar
Deb Mukherjee committed
669

clang-format's avatar
clang-format committed
670
  if (args->exit_early) return;
671

Scott LaVarnway's avatar
Scott LaVarnway committed
672
  if (!is_inter_block(mi)) {
673 674
    struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above,
                                       args->t_left, &mi->skip };
675
    vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
676
                           &intra_arg);
677
    if (x->block_tx_domain) {
678 679
      dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
                 tx_size, &dist, &sse);
680 681 682 683 684 685 686 687 688 689 690
    } else {
      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
      const struct macroblock_plane *const p = &x->plane[plane];
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int src_stride = p->src.stride;
      const int dst_stride = pd->dst.stride;
      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
      unsigned int tmp;
clang-format's avatar
clang-format committed
691 692
      sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
                                plane_bsize, tx_bsize);
693 694
#if CONFIG_VP9_HIGHBITDEPTH
      if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
Yaowu Xu's avatar
Yaowu Xu committed
695
        sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
696 697
#endif  // CONFIG_VP9_HIGHBITDEPTH
      sse = sse * 16;
698 699
      tmp = pixel_sse(args->cpi, xd, pd, src, src_stride, dst, dst_stride,
                      blk_row, blk_col, plane_bsize, tx_bsize);
700 701
      dist = (int64_t)tmp * 16;
    }
702
  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
703 704
    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
        SKIP_TXFM_NONE) {
705
      // full forward transform and quantization
706
      vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
707
      if (x->block_qcoeff_opt)
708
        vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
709 710
      dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
                 tx_size, &dist, &sse);
711 712
    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
               SKIP_TXFM_AC_ONLY) {
713
      // compute DC coefficient
clang-format's avatar
clang-format committed
714
      tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
715
      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
716 717
      vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize,
                         tx_size);
clang-format's avatar
clang-format committed
718
      sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
Alex Converse's avatar
Alex Converse committed
719
      dist = sse;
720
      if (x->plane[plane].eobs[block]) {
Jingning Han's avatar
Jingning Han committed
721 722 723
        const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
        const int64_t resd_sse = coeff[0] - dqcoeff[0];
        int64_t dc_correct = orig_sse - resd_sse * resd_sse;
724 725 726
#if CONFIG_VP9_HIGHBITDEPTH
        dc_correct >>= ((xd->bd - 8) * 2);
#endif
clang-format's avatar
clang-format committed
727
        if (tx_size != TX_32X32) dc_correct >>= 2;
728

729
        dist = VPXMAX(0, sse - dc_correct);
730
      }
731
    } else {
732
      // SKIP_TXFM_AC_DC
Johann's avatar
Johann committed
733 734
      // Skip the forward transform. Because the skip is handled here, the
      // quantization step does not need to handle it.
735
      x->plane[plane].eobs[block] = 0;
clang-format's avatar
clang-format committed
736
      sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
Alex Converse's avatar
Alex Converse committed
737
      dist = sse;
738
    }
739 740
  } else {
    // full forward transform and quantization
741
    vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
742
    if (x->block_qcoeff_opt)
743
      vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
744 745
    dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
               tx_size, &dist, &sse);
746
  }
Deb Mukherjee's avatar
Deb Mukherjee committed
747

748 749 750 751 752 753
  rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
  if (args->this_rd + rd > args->best_rd) {
    args->exit_early = 1;
    return;
  }

Jingning Han's avatar
Jingning Han committed
754 755 756
  rate = rate_block(plane, block, tx_size, coeff_ctx, args);
  args->t_above[blk_col] = (x->plane[plane].eobs[block] > 0) ? 1 : 0;
  args->t_left[blk_row] = (x->plane[plane].eobs[block] > 0) ? 1 : 0;
Alex Converse's avatar
Alex Converse committed
757 758
  rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
  rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
759 760

  // TODO(jingning): temporarily enabled only for luma component
761
  rd = VPXMIN(rd1, rd2);
762
  if (plane == 0) {
clang-format's avatar
clang-format committed
763 764
    x->zcoeff_blk[tx_size][block] =
        !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless);
765 766
    x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block];
  }
767

Alex Converse's avatar
Alex Converse committed
768 769 770
  args->this_rate += rate;
  args->this_dist += dist;
  args->this_sse += sse;
771 772 773
  args->this_rd += rd;

  if (args->this_rd > args->best_rd) {
Alex Converse's avatar
Alex Converse committed
774
    args->exit_early = 1;
775 776
    return;
  }
777 778

  args->skippable &= !x->plane[plane].eobs[block];
Deb Mukherjee's avatar
Deb Mukherjee committed
779 780
}

781 782 783 784
static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int64_t *distortion, int *skippable, int64_t *sse,
                             int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
                             TX_SIZE tx_size, int use_fast_coef_casting) {
Deb Mukherjee's avatar
Deb Mukherjee committed
785
  MACROBLOCKD *const xd = &x->e_mbd;
786
  const struct macroblockd_plane *const pd = &xd->plane[plane];
787 788
  struct rdcost_block_args args;
  vp9_zero(args);
789
  args.cpi = cpi;
790 791