vp9_mcomp.c 85.4 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
2
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
John Koleszar's avatar
John Koleszar committed
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5 6
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
John Koleszar's avatar
John Koleszar committed
9 10
 */

Dmitry Kovalev's avatar
Dmitry Kovalev committed
11 12
#include <limits.h>
#include <math.h>
Dmitry Kovalev's avatar
Dmitry Kovalev committed
13
#include <stdio.h>
John Koleszar's avatar
John Koleszar committed
14

15
#include "./vpx_config.h"
Johann's avatar
Johann committed
16
#include "./vpx_dsp_rtcd.h"
Dmitry Kovalev's avatar
Dmitry Kovalev committed
17 18

#include "vpx_mem/vpx_mem.h"
19
#include "vpx_ports/mem.h"
Dmitry Kovalev's avatar
Dmitry Kovalev committed
20

Ronald S. Bultje's avatar
Ronald S. Bultje committed
21
#include "vp9/common/vp9_common.h"
22
#include "vp9/common/vp9_reconinter.h"
John Koleszar's avatar
John Koleszar committed
23

Dmitry Kovalev's avatar
Dmitry Kovalev committed
24
#include "vp9/encoder/vp9_encoder.h"
Dmitry Kovalev's avatar
Dmitry Kovalev committed
25 26
#include "vp9/encoder/vp9_mcomp.h"

27 28
// #define NEW_DIAMOND_SEARCH

29 30 31 32 33
static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
                                             const MV *mv) {
  return &buf->buf[mv->row * buf->stride + mv->col];
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
34
void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
35 36 37 38 39 40 41 42 43
  int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
  int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
  int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
  int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;

  col_min = MAX(col_min, (MV_LOW >> 3) + 1);
  row_min = MAX(row_min, (MV_LOW >> 3) + 1);
  col_max = MIN(col_max, (MV_UPP >> 3) - 1);
  row_max = MIN(row_max, (MV_UPP >> 3) - 1);
44 45 46

  // Get intersection of UMV window and valid MV window to reduce # of checks
  // in diamond search.
47 48 49 50 51 52 53 54 55 56
  if (x->mv_col_min < col_min)
    x->mv_col_min = col_min;
  if (x->mv_col_max > col_max)
    x->mv_col_max = col_max;
  if (x->mv_row_min < row_min)
    x->mv_row_min = row_min;
  if (x->mv_row_max > row_max)
    x->mv_row_max = row_max;
}

57
int vp9_init_search_range(int size) {
58
  int sr = 0;
Paul Wilkins's avatar
Paul Wilkins committed
59 60 61
  // Minimum search size no matter what the passed in value.
  size = MAX(16, size);

62
  while ((size << sr) < MAX_FULL_PEL_VAL)
63 64
    sr++;

65
  sr = MIN(sr, MAX_MVSEARCH_STEPS - 2);
66 67 68
  return sr;
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
69
static INLINE int mv_cost(const MV *mv,
70
                          const int *joint_cost, int *const comp_cost[2]) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
71 72
  return joint_cost[vp9_get_mv_joint(mv)] +
             comp_cost[0][mv->row] + comp_cost[1][mv->col];
John Koleszar's avatar
John Koleszar committed
73 74
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
75 76 77 78 79 80 81 82 83
int vp9_mv_bit_cost(const MV *mv, const MV *ref,
                    const int *mvjcost, int *mvcost[2], int weight) {
  const MV diff = { mv->row - ref->row,
                    mv->col - ref->col };
  return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
}

static int mv_err_cost(const MV *mv, const MV *ref,
                       const int *mvjcost, int *mvcost[2],
84
                       int error_per_bit) {
85
  if (mvcost) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
86 87 88 89
    const MV diff = { mv->row - ref->row,
                      mv->col - ref->col };
    return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) *
                                  error_per_bit, 13);
90
  }
John Koleszar's avatar
John Koleszar committed
91
  return 0;
92 93
}

94
static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
95
                          int error_per_bit) {
Johann's avatar
Johann committed
96 97 98 99
  const MV diff = { mv->row - ref->row,
                    mv->col - ref->col };
  return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
                                    x->nmvsadcost) * error_per_bit, 8);
John Koleszar's avatar
John Koleszar committed
100 101
}

102
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
103
  int len, ss_count = 1;
John Koleszar's avatar
John Koleszar committed
104

105 106
  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
  cfg->ss[0].offset = 0;
John Koleszar's avatar
John Koleszar committed
107

Dmitry Kovalev's avatar
Dmitry Kovalev committed
108
  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
109 110 111 112
    // Generate offsets for 4 search sites per step.
    const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
    int i;
    for (i = 0; i < 4; ++i) {
113
      search_site *const ss = &cfg->ss[ss_count++];
114 115 116
      ss->mv = ss_mvs[i];
      ss->offset = ss->mv.row * stride + ss->mv.col;
    }
John Koleszar's avatar
John Koleszar committed
117 118
  }

119 120
  cfg->ss_count = ss_count;
  cfg->searches_per_step = 4;
John Koleszar's avatar
John Koleszar committed
121 122
}

123
void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
124
  int len, ss_count = 1;
John Koleszar's avatar
John Koleszar committed
125

126 127
  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
  cfg->ss[0].offset = 0;
John Koleszar's avatar
John Koleszar committed
128

Dmitry Kovalev's avatar
Dmitry Kovalev committed
129
  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
130 131
    // Generate offsets for 8 search sites per step.
    const MV ss_mvs[8] = {
132 133
      {-len,  0  }, {len,  0  }, { 0,   -len}, {0,    len},
      {-len, -len}, {-len, len}, {len,  -len}, {len,  len}
134 135 136
    };
    int i;
    for (i = 0; i < 8; ++i) {
137
      search_site *const ss = &cfg->ss[ss_count++];
138 139 140
      ss->mv = ss_mvs[i];
      ss->offset = ss->mv.row * stride + ss->mv.col;
    }
John Koleszar's avatar
John Koleszar committed
141 142
  }

143 144
  cfg->ss_count = ss_count;
  cfg->searches_per_step = 8;
John Koleszar's avatar
John Koleszar committed
145 146
}

147 148 149 150 151 152 153 154 155
/*
 * To avoid the penalty for crossing cache-line read, preload the reference
 * area in a small buffer, which is aligned to make sure there won't be crossing
 * cache-line read while reading from this buffer. This reduced the cpu
 * cycles spent on reading ref data in sub-pixel filter functions.
 * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
 * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
 * could reduce the area.
 */
156

157
/* estimated cost of a motion vector (r,c) */
158 159 160 161
#define MVC(r, c)                                       \
    (mvcost ?                                           \
     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
162 163
      error_per_bit + 4096) >> 13 : 0)

164

Johann's avatar
Johann committed
165
// convert motion vector component to offset for sv[a]f calc
166
static INLINE int sp(int x) {
Johann's avatar
Johann committed
167
  return x & 7;
168
}
169

170 171
static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
  return &buf[(r >> 3) * stride + (c >> 3)];
172
}
173

174 175
/* checks if (r, c) has better score than previous best */
#define CHECK_BETTER(v, r, c) \
Dmitry Kovalev's avatar
Dmitry Kovalev committed
176
  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
177 178 179 180 181 182
    if (second_pred == NULL)                                           \
      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
                             src_stride, &sse);                        \
    else                                                               \
      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
                              z, src_stride, &sse, second_pred);       \
Dmitry Kovalev's avatar
Dmitry Kovalev committed
183 184 185 186 187 188 189 190 191 192
    if ((v = MVC(r, c) + thismse) < besterr) {                         \
      besterr = v;                                                     \
      br = r;                                                          \
      bc = c;                                                          \
      *distortion = thismse;                                           \
      *sse1 = sse;                                                     \
    }                                                                  \
  } else {                                                             \
    v = INT_MAX;                                                       \
  }
193

194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
#define FIRST_LEVEL_CHECKS                              \
  {                                                     \
    unsigned int left, right, up, down, diag;           \
    CHECK_BETTER(left, tr, tc - hstep);                 \
    CHECK_BETTER(right, tr, tc + hstep);                \
    CHECK_BETTER(up, tr - hstep, tc);                   \
    CHECK_BETTER(down, tr + hstep, tc);                 \
    whichdir = (left < right ? 0 : 1) +                 \
               (up < down ? 0 : 2);                     \
    switch (whichdir) {                                 \
      case 0:                                           \
        CHECK_BETTER(diag, tr - hstep, tc - hstep);     \
        break;                                          \
      case 1:                                           \
        CHECK_BETTER(diag, tr - hstep, tc + hstep);     \
        break;                                          \
      case 2:                                           \
        CHECK_BETTER(diag, tr + hstep, tc - hstep);     \
        break;                                          \
      case 3:                                           \
        CHECK_BETTER(diag, tr + hstep, tc + hstep);     \
        break;                                          \
    }                                                   \
  }

#define SECOND_LEVEL_CHECKS                             \
  {                                                     \
    int kr, kc;                                         \
    unsigned int second;                                \
    if (tr != br && tc != bc) {                         \
      kr = br - tr;                                     \
      kc = bc - tc;                                     \
      CHECK_BETTER(second, tr + kr, tc + 2 * kc);       \
      CHECK_BETTER(second, tr + 2 * kr, tc + kc);       \
    } else if (tr == br && tc != bc) {                  \
      kc = bc - tc;                                     \
      CHECK_BETTER(second, tr + hstep, tc + 2 * kc);    \
      CHECK_BETTER(second, tr - hstep, tc + 2 * kc);    \
      switch (whichdir) {                               \
        case 0:                                         \
        case 1:                                         \
          CHECK_BETTER(second, tr + hstep, tc + kc);    \
          break;                                        \
        case 2:                                         \
        case 3:                                         \
          CHECK_BETTER(second, tr - hstep, tc + kc);    \
          break;                                        \
      }                                                 \
    } else if (tr != br && tc == bc) {                  \
      kr = br - tr;                                     \
      CHECK_BETTER(second, tr + 2 * kr, tc + hstep);    \
      CHECK_BETTER(second, tr + 2 * kr, tc - hstep);    \
      switch (whichdir) {                               \
        case 0:                                         \
        case 2:                                         \
          CHECK_BETTER(second, tr + kr, tc + hstep);    \
          break;                                        \
        case 1:                                         \
        case 3:                                         \
          CHECK_BETTER(second, tr + kr, tc - hstep);    \
          break;                                        \
      }                                                 \
    }                                                   \
  }

259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
#define SETUP_SUBPEL_SEARCH                                                \
  const uint8_t *const z = x->plane[0].src.buf;                            \
  const int src_stride = x->plane[0].src.stride;                           \
  const MACROBLOCKD *xd = &x->e_mbd;                                       \
  unsigned int besterr = INT_MAX;                                          \
  unsigned int sse;                                                        \
  unsigned int whichdir;                                                   \
  int thismse;                                                             \
  const unsigned int halfiters = iters_per_step;                           \
  const unsigned int quarteriters = iters_per_step;                        \
  const unsigned int eighthiters = iters_per_step;                         \
  const int y_stride = xd->plane[0].pre[0].stride;                         \
  const int offset = bestmv->row * y_stride + bestmv->col;                 \
  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
                                                                           \
  int rr = ref_mv->row;                                                    \
  int rc = ref_mv->col;                                                    \
  int br = bestmv->row * 8;                                                \
  int bc = bestmv->col * 8;                                                \
  int hstep = 4;                                                           \
  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);           \
  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);           \
  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);           \
  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);           \
  int tr = br;                                                             \
  int tc = bc;                                                             \
                                                                           \
  bestmv->row *= 8;                                                        \
287
  bestmv->col *= 8;
288

289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
                                              const MV *bestmv,
                                              const MV *ref_mv,
                                              int error_per_bit,
                                              const vp9_variance_fn_ptr_t *vfp,
                                              const uint8_t *const src,
                                              const int src_stride,
                                              const uint8_t *const y,
                                              int y_stride,
                                              const uint8_t *second_pred,
                                              int w, int h, int offset,
                                              int *mvjcost, int *mvcost[2],
                                              unsigned int *sse1,
                                              int *distortion) {
  unsigned int besterr;
304
#if CONFIG_VP9_HIGHBITDEPTH
305 306
  if (second_pred != NULL) {
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
307
      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
Johann's avatar
Johann committed
308
      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
309 310 311 312
                               y_stride);
      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                        sse1);
    } else {
313
      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
Johann's avatar
Johann committed
314
      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
315 316 317 318 319 320
      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
    }
  } else {
    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
  }
  *distortion = besterr;
321 322
  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
#else
323 324
  (void) xd;
  if (second_pred != NULL) {
325
    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
Johann's avatar
Johann committed
326
    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
327 328 329 330 331
    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
  } else {
    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
  }
  *distortion = besterr;
332 333
  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
#endif  // CONFIG_VP9_HIGHBITDEPTH
334 335
  return besterr;
}
336 337 338 339 340

static INLINE int divide_and_round(const int n, const int d) {
  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
}

341 342 343 344 345
static INLINE int is_cost_list_wellbehaved(int *cost_list) {
  return cost_list[0] < cost_list[1] &&
         cost_list[0] < cost_list[2] &&
         cost_list[0] < cost_list[3] &&
         cost_list[0] < cost_list[4];
346 347 348 349 350 351 352 353 354 355
}

// Returns surface minima estimate at given precision in 1/2^n bits.
// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
// For a given set of costs S0, S1, S2, S3, S4 at points
// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
// the solution for the location of the minima (x0, y0) is given by:
// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
// The code below is an integerized version of that.
356
static void get_cost_surf_min(int *cost_list, int *ir, int *ic,
357
                              int bits) {
358 359 360 361
  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
362 363
}

364 365 366 367 368 369 370 371 372 373 374 375 376 377
int vp9_find_best_sub_pixel_tree_pruned_evenmore(
    const MACROBLOCK *x,
    MV *bestmv, const MV *ref_mv,
    int allow_hp,
    int error_per_bit,
    const vp9_variance_fn_ptr_t *vfp,
    int forced_stop,
    int iters_per_step,
    int *cost_list,
    int *mvjcost, int *mvcost[2],
    int *distortion,
    unsigned int *sse1,
    const uint8_t *second_pred,
    int w, int h) {
378
  SETUP_SUBPEL_SEARCH;
379 380 381 382
  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                               z, src_stride, y, y_stride, second_pred,
                               w, h, offset, mvjcost, mvcost,
                               sse1, distortion);
383 384 385 386 387 388 389 390
  (void) halfiters;
  (void) quarteriters;
  (void) eighthiters;
  (void) whichdir;
  (void) allow_hp;
  (void) forced_stop;
  (void) hstep;

391 392 393 394 395
  if (cost_list &&
      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
      cost_list[4] != INT_MAX &&
      is_cost_list_wellbehaved(cost_list)) {
396 397
    int ir, ic;
    unsigned int minpt;
398
    get_cost_surf_min(cost_list, &ir, &ic, 2);
399
    if (ir != 0 || ic != 0) {
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
    }
  } else {
    FIRST_LEVEL_CHECKS;
    if (halfiters > 1) {
      SECOND_LEVEL_CHECKS;
    }

    tr = br;
    tc = bc;

    // Each subsequent iteration checks at least one point in common with
    // the last iteration could be 2 ( if diag selected) 1/4 pel
    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
    if (forced_stop != 2) {
      hstep >>= 1;
      FIRST_LEVEL_CHECKS;
      if (quarteriters > 1) {
        SECOND_LEVEL_CHECKS;
      }
    }
  }

  tr = br;
  tc = bc;

  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
    hstep >>= 1;
    FIRST_LEVEL_CHECKS;
    if (eighthiters > 1) {
      SECOND_LEVEL_CHECKS;
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
    }
  }

  bestmv->row = br;
  bestmv->col = bc;

  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
    return INT_MAX;

  return besterr;
}

int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                             MV *bestmv, const MV *ref_mv,
                                             int allow_hp,
                                             int error_per_bit,
                                             const vp9_variance_fn_ptr_t *vfp,
                                             int forced_stop,
                                             int iters_per_step,
451
                                             int *cost_list,
452 453 454 455 456 457
                                             int *mvjcost, int *mvcost[2],
                                             int *distortion,
                                             unsigned int *sse1,
                                             const uint8_t *second_pred,
                                             int w, int h) {
  SETUP_SUBPEL_SEARCH;
458 459 460 461
  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                               z, src_stride, y, y_stride, second_pred,
                               w, h, offset, mvjcost, mvcost,
                               sse1, distortion);
462 463 464 465 466
  if (cost_list &&
      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
      cost_list[4] != INT_MAX &&
      is_cost_list_wellbehaved(cost_list)) {
467 468
    unsigned int minpt;
    int ir, ic;
469
    get_cost_surf_min(cost_list, &ir, &ic, 1);
470 471
    if (ir != 0 || ic != 0) {
      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
472 473
    }
  } else {
474 475 476 477
    FIRST_LEVEL_CHECKS;
    if (halfiters > 1) {
      SECOND_LEVEL_CHECKS;
    }
478
  }
479

480 481 482 483 484
  // Each subsequent iteration checks at least one point in common with
  // the last iteration could be 2 ( if diag selected) 1/4 pel

  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
  if (forced_stop != 2) {
485 486
    tr = br;
    tc = bc;
487 488 489 490 491 492 493 494
    hstep >>= 1;
    FIRST_LEVEL_CHECKS;
    if (quarteriters > 1) {
      SECOND_LEVEL_CHECKS;
    }
  }

  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
495 496
    tr = br;
    tc = bc;
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
    hstep >>= 1;
    FIRST_LEVEL_CHECKS;
    if (eighthiters > 1) {
      SECOND_LEVEL_CHECKS;
    }
  }
  // These lines insure static analysis doesn't warn that
  // tr and tc aren't used after the above point.
  (void) tr;
  (void) tc;

  bestmv->row = br;
  bestmv->col = bc;

  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
    return INT_MAX;

  return besterr;
}

int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                        MV *bestmv, const MV *ref_mv,
                                        int allow_hp,
                                        int error_per_bit,
                                        const vp9_variance_fn_ptr_t *vfp,
                                        int forced_stop,
                                        int iters_per_step,
525
                                        int *cost_list,
526 527 528 529 530 531
                                        int *mvjcost, int *mvcost[2],
                                        int *distortion,
                                        unsigned int *sse1,
                                        const uint8_t *second_pred,
                                        int w, int h) {
  SETUP_SUBPEL_SEARCH;
532 533 534 535
  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                               z, src_stride, y, y_stride, second_pred,
                               w, h, offset, mvjcost, mvcost,
                               sse1, distortion);
536 537 538 539
  if (cost_list &&
      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
      cost_list[4] != INT_MAX) {
540
    unsigned int left, right, up, down, diag;
541 542
    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
               (cost_list[2] < cost_list[4] ? 0 : 2);
543 544 545
    switch (whichdir) {
      case 0:
        CHECK_BETTER(left, tr, tc - hstep);
546 547
        CHECK_BETTER(down, tr + hstep, tc);
        CHECK_BETTER(diag, tr + hstep, tc - hstep);
548 549 550
        break;
      case 1:
        CHECK_BETTER(right, tr, tc + hstep);
551 552
        CHECK_BETTER(down, tr + hstep, tc);
        CHECK_BETTER(diag, tr + hstep, tc + hstep);
553 554 555
        break;
      case 2:
        CHECK_BETTER(left, tr, tc - hstep);
556 557
        CHECK_BETTER(up, tr - hstep, tc);
        CHECK_BETTER(diag, tr - hstep, tc - hstep);
558 559 560
        break;
      case 3:
        CHECK_BETTER(right, tr, tc + hstep);
561 562
        CHECK_BETTER(up, tr - hstep, tc);
        CHECK_BETTER(diag, tr - hstep, tc + hstep);
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
        break;
    }
  } else {
    FIRST_LEVEL_CHECKS;
    if (halfiters > 1) {
      SECOND_LEVEL_CHECKS;
    }
  }

  tr = br;
  tc = bc;

  // Each subsequent iteration checks at least one point in common with
  // the last iteration could be 2 ( if diag selected) 1/4 pel

  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
  if (forced_stop != 2) {
    hstep >>= 1;
    FIRST_LEVEL_CHECKS;
    if (quarteriters > 1) {
      SECOND_LEVEL_CHECKS;
    }
    tr = br;
    tc = bc;
  }

  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
    hstep >>= 1;
    FIRST_LEVEL_CHECKS;
    if (eighthiters > 1) {
      SECOND_LEVEL_CHECKS;
    }
    tr = br;
    tc = bc;
  }
  // These lines insure static analysis doesn't warn that
  // tr and tc aren't used after the above point.
  (void) tr;
  (void) tc;

  bestmv->row = br;
  bestmv->col = bc;

  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
    return INT_MAX;

  return besterr;
}

613 614 615 616 617 618 619
const MV search_step_table[12] = {
    // left, right, up, down
    {0, -4}, {0, 4}, {-4, 0}, {4, 0},
    {0, -2}, {0, 2}, {-2, 0}, {2, 0},
    {0, -1}, {0, 1}, {-1, 0}, {1, 0}
};

Dmitry Kovalev's avatar
Dmitry Kovalev committed
620
int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
621
                                 MV *bestmv, const MV *ref_mv,
622
                                 int allow_hp,
623 624 625 626
                                 int error_per_bit,
                                 const vp9_variance_fn_ptr_t *vfp,
                                 int forced_stop,
                                 int iters_per_step,
627
                                 int *cost_list,
628 629
                                 int *mvjcost, int *mvcost[2],
                                 int *distortion,
630 631 632
                                 unsigned int *sse1,
                                 const uint8_t *second_pred,
                                 int w, int h) {
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
  const uint8_t *const z = x->plane[0].src.buf;
  const uint8_t *const src_address = z;
  const int src_stride = x->plane[0].src.stride;
  const MACROBLOCKD *xd = &x->e_mbd;
  unsigned int besterr = INT_MAX;
  unsigned int sse;
  unsigned int whichdir = 0;
  int thismse;
  const int y_stride = xd->plane[0].pre[0].stride;
  const int offset = bestmv->row * y_stride + bestmv->col;
  const uint8_t *const y = xd->plane[0].pre[0].buf;

  int rr = ref_mv->row;
  int rc = ref_mv->col;
  int br = bestmv->row * 8;
  int bc = bestmv->col * 8;
  int hstep = 4;
  int iter, round = 3 - forced_stop;
  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
  int tr = br;
  int tc = bc;
  const MV *search_step = search_step_table;
  int idx, best_idx = -1;
  unsigned int cost_array[5];

  if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
    if (round == 3)
      round = 2;

  bestmv->row *= 8;
  bestmv->col *= 8;
667

668 669 670 671
  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                               z, src_stride, y, y_stride, second_pred,
                               w, h, offset, mvjcost, mvcost,
                               sse1, distortion);
672

673
  (void) cost_list;  // to silence compiler warning
674

675 676 677 678 679 680 681 682 683 684 685
  for (iter = 0; iter < round; ++iter) {
    // Check vertical and horizontal sub-pixel positions.
    for (idx = 0; idx < 4; ++idx) {
      tr = br + search_step[idx].row;
      tc = bc + search_step[idx].col;
      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
        MV this_mv;
        this_mv.row = tr;
        this_mv.col = tc;
        if (second_pred == NULL)
Johann's avatar
Johann committed
686
          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
687 688
                             src_address, src_stride, &sse);
        else
Johann's avatar
Johann committed
689
          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
690 691 692 693 694 695 696 697 698 699 700 701 702
                              src_address, src_stride, &sse, second_pred);
        cost_array[idx] = thismse +
            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);

        if (cost_array[idx] < besterr) {
          best_idx = idx;
          besterr = cost_array[idx];
          *distortion = thismse;
          *sse1 = sse;
        }
      } else {
        cost_array[idx] = INT_MAX;
      }
703 704
    }

705 706 707 708 709 710 711
    // Check diagonal sub-pixel position
    tc = bc + (cost_array[0] < cost_array[1] ? -hstep : hstep);
    tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep);
    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
      MV this_mv = {tr, tc};
      if (second_pred == NULL)
Johann's avatar
Johann committed
712
        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
713 714
                           src_address, src_stride, &sse);
      else
Johann's avatar
Johann committed
715
        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735
                            src_address, src_stride, &sse, second_pred);
      cost_array[4] = thismse +
          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);

      if (cost_array[4] < besterr) {
        best_idx = 4;
        besterr = cost_array[4];
        *distortion = thismse;
        *sse1 = sse;
      }
    } else {
      cost_array[idx] = INT_MAX;
    }

    if (best_idx < 4 && best_idx >= 0) {
      br += search_step[best_idx].row;
      bc += search_step[best_idx].col;
    } else if (best_idx == 4) {
      br = tr;
      bc = tc;
736
    }
737 738 739 740

    if (iters_per_step > 1)
      SECOND_LEVEL_CHECKS;

741 742
    tr = br;
    tc = bc;
743 744 745 746

    search_step += 4;
    hstep >>= 1;
    best_idx = -1;
747
  }
748 749 750 751

  // Each subsequent iteration checks at least one point in common with
  // the last iteration could be 2 ( if diag selected) 1/4 pel

752 753 754 755 756
  // These lines insure static analysis doesn't warn that
  // tr and tc aren't used after the above point.
  (void) tr;
  (void) tc;

757 758
  bestmv->row = br;
  bestmv->col = bc;
759

760 761
  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
762 763 764 765
    return INT_MAX;

  return besterr;
}
766

John Koleszar's avatar
John Koleszar committed
767 768 769
#undef MVC
#undef PRE
#undef CHECK_BETTER
770

771 772
static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
                               int range) {
773 774 775 776 777
  return ((row - range) >= x->mv_row_min) &
         ((row + range) <= x->mv_row_max) &
         ((col - range) >= x->mv_col_min) &
         ((col + range) <= x->mv_col_max);
}
Yunqing Wang's avatar
Yunqing Wang committed
778

Dmitry Kovalev's avatar
Dmitry Kovalev committed
779 780 781
static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
  return (mv->col >= x->mv_col_min) && (mv->col <= x->mv_col_max) &&
         (mv->row >= x->mv_row_min) && (mv->row <= x->mv_row_max);
782
}
Yunqing Wang's avatar
Yunqing Wang committed
783 784

#define CHECK_BETTER \
John Koleszar's avatar
John Koleszar committed
785
  {\
Dmitry Kovalev's avatar
Dmitry Kovalev committed
786
    if (thissad < bestsad) {\
787
      if (use_mvcost) \
788
        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\
Dmitry Kovalev's avatar
Dmitry Kovalev committed
789
      if (thissad < bestsad) {\
John Koleszar's avatar
John Koleszar committed
790 791 792
        bestsad = thissad;\
        best_site = i;\
      }\
Yunqing Wang's avatar
Yunqing Wang committed
793
    }\
John Koleszar's avatar
John Koleszar committed
794 795
  }

796 797 798 799
#define MAX_PATTERN_SCALES         11
#define MAX_PATTERN_CANDIDATES      8  // max number of canddiates per scale
#define PATTERN_CANDIDATES_REF      3  // number of refinement candidates

800
// Calculate and return a sad+mvcost list around an integer best pel.
801 802 803 804 805 806
static INLINE void calc_int_cost_list(const MACROBLOCK *x,
                                      const MV *ref_mv,
                                      int sadpb,
                                      const vp9_variance_fn_ptr_t *fn_ptr,
                                      const MV *best_mv,
                                      int *cost_list) {
807 808 809 810 811 812 813 814
  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
  const struct buf_2d *const what = &x->plane[0].src;
  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
  int br = best_mv->row;
  int bc = best_mv->col;
  MV this_mv;
  int i;
815
  unsigned int sse;
816 817 818

  this_mv.row = br;
  this_mv.col = bc;
819 820 821
  cost_list[0] = fn_ptr->vf(what->buf, what->stride,
                            get_buf_from_mv(in_what, &this_mv),
                            in_what->stride, &sse) +
822 823 824 825 826
      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
  if (check_bounds(x, br, bc, 1)) {
    for (i = 0; i < 4; i++) {
      const MV this_mv = {br + neighbors[i].row,
        bc + neighbors[i].col};
827 828 829 830 831 832
      cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                    get_buf_from_mv(in_what, &this_mv),
                                    in_what->stride, &sse) +
          // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
          mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
                      x->errorperbit);
833 834 835 836 837 838 839 840
    }
  } else {
    for (i = 0; i < 4; i++) {
      const MV this_mv = {br + neighbors[i].row,
        bc + neighbors[i].col};
      if (!is_mv_in(x, &this_mv))
        cost_list[i + 1] = INT_MAX;
      else
841 842 843 844 845 846
        cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                      get_buf_from_mv(in_what, &this_mv),
                                      in_what->stride, &sse) +
            // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
            mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
                        x->errorperbit);
847 848 849 850
    }
  }
}

851 852 853 854
// Generic pattern search function that searches over multiple scales.
// Each scale can have a different number of candidates and shape of
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
855
//
856
static int vp9_pattern_search(const MACROBLOCK *x,
857
                              MV *ref_mv,
858 859
                              int search_param,
                              int sad_per_bit,
860
                              int do_init_search,
861
                              int *cost_list,
862 863
                              const vp9_variance_fn_ptr_t *vfp,
                              int use_mvcost,
864 865
                              const MV *center_mv,
                              MV *best_mv,
866 867 868
                              const int num_candidates[MAX_PATTERN_SCALES],
                              const MV candidates[MAX_PATTERN_SCALES]
                                                 [MAX_PATTERN_CANDIDATES]) {
869
  const MACROBLOCKD *const xd = &x->e_mbd;
870 871 872
  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
873
  int i, s, t;
874 875
  const struct buf_2d *const what = &x->plane[0].src;
  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
John Koleszar's avatar
John Koleszar committed
876
  int br, bc;
877 878
  int bestsad = INT_MAX;
  int thissad;
John Koleszar's avatar
John Koleszar committed
879
  int k = -1;
Dmitry Kovalev's avatar
Dmitry Kovalev committed
880
  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
881
  int best_init_s = search_param_to_steps[search_param];
John Koleszar's avatar
John Koleszar committed
882
  // adjust ref_mv to make sure it is within MV range
883 884 885
  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
  br = ref_mv->row;
  bc = ref_mv->col;
John Koleszar's avatar
John Koleszar committed
886 887

  // Work out the start point for the search
888
  bestsad = vfp->sdf(what->buf, what->stride,
889 890
                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
John Koleszar's avatar
John Koleszar committed
891

892 893 894 895 896 897 898
  // Search all possible scales upto the search param around the center point
  // pick the scale of the point that is best as the starting scale of
  // further steps around it.
  if (do_init_search) {
    s = best_init_s;
    best_init_s = -1;
    for (t = 0; t <= s; ++t) {
899
      int best_site = -1;
900
      if (check_bounds(x, br, bc, 1 << t)) {
901
        for (i = 0; i < num_candidates[t]; i++) {
902 903 904 905
          const MV this_mv = {br + candidates[t][i].row,
                              bc + candidates[t][i].col};
          thissad = vfp->sdf(what->buf, what->stride,
                             get_buf_from_mv(in_what, &this_mv),
906
                             in_what->stride);
907 908 909 910
          CHECK_BETTER
        }
      } else {
        for (i = 0; i < num_candidates[t]; i++) {
911 912
          const MV this_mv = {br + candidates[t][i].row,
                              bc + candidates[t][i].col};
Dmitry Kovalev's avatar
Dmitry Kovalev committed
913
          if (!is_mv_in(x, &this_mv))
914
            continue;
915 916
          thissad = vfp->sdf(what->buf, what->stride,
                             get_buf_from_mv(in_what, &this_mv),
917
                             in_what->stride);
918 919 920 921 922 923 924 925 926
          CHECK_BETTER
        }
      }
      if (best_site == -1) {
        continue;
      } else {
        best_init_s = t;
        k = best_site;
      }
John Koleszar's avatar
John Koleszar committed
927
    }
928 929 930
    if (best_init_s != -1) {
      br += candidates[best_init_s][k].row;
      bc += candidates[best_init_s][k].col;
John Koleszar's avatar
John Koleszar committed
931 932 933
    }
  }

934 935 936
  // If the center point is still the best, just skip this and move to
  // the refinement step.
  if (best_init_s != -1) {
937
    int best_site = -1;
938
    s = best_init_s;
939

940 941 942
    do {
      // No need to search all 6 points the 1st time if initial search was used
      if (!do_init_search || s != best_init_s) {
943
        if (check_bounds(x, br, bc, 1 << s)) {
944
          for (i = 0; i < num_candidates[s]; i++) {
945 946 947 948
            const MV this_mv = {br + candidates[s][i].row,
                                bc + candidates[s][i].col};
            thissad = vfp->sdf(what->buf, what->stride,
                               get_buf_from_mv(in_what, &this_mv),
949
                               in_what->stride);
950 951 952 953
            CHECK_BETTER
          }
        } else {
          for (i = 0; i < num_candidates[s]; i++) {
954 955
            const MV this_mv = {br + candidates[s][i].row,
                                bc + candidates[s][i].col};
Dmitry Kovalev's avatar
Dmitry Kovalev committed
956
            if (!is_mv_in(x, &this_mv))
957
              continue;
958 959
            thissad = vfp->sdf(what->buf, what->stride,
                               get_buf_from_mv(in_what, &this_mv),
960
                               in_what->stride);
961 962 963 964 965 966 967 968 969 970 971
            CHECK_BETTER
          }
        }

        if (best_site == -1) {
          continue;
        } else {
          br += candidates[s][best_site].row;
          bc += candidates[s][best_site].col;
          k = best_site;
        }
John Koleszar's avatar
John Koleszar committed
972
      }
973

974 975 976
      do {
        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
        best_site = -1;
977 978 979 980
        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
        next_chkpts_indices[1] = k;
        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;

981
        if (check_bounds(x, br, bc, 1 << s)) {
982
          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
983 984 985 986
            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
                                bc + candidates[s][next_chkpts_indices[i]].col};
            thissad = vfp->sdf(what->buf, what->stride,
                               get_buf_from_mv(in_what, &this_mv),
987
                               in_what->stride);
988 989 990 991
            CHECK_BETTER
          }
        } else {
          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
992 993
            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
                                bc + candidates[s][next_chkpts_indices[i]].col};
Dmitry Kovalev's avatar
Dmitry Kovalev committed
994
            if (!is_mv_in(x, &this_mv))
995
              continue;
996 997
            thissad = vfp->sdf(what->buf, what->stride,
                               get_buf_from_mv(in_what, &this_mv),
998
                               in_what->stride);
999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
            CHECK_BETTER
          }
        }

        if (best_site != -1) {
          k = next_chkpts_indices[best_site];
          br += candidates[s][k].row;
          bc += candidates[s][k].col;
        }
      } while (best_site != -1);
    } while (s--);
John Koleszar's avatar
John Koleszar committed
1010
  }
1011

1012
  // Returns the one-away integer pel sad values around the best as follows:
1013 1014 1015 1016 1017 1018 1019 1020
  // cost_list[0]: cost at the best integer pel
  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
  if (cost_list) {
    const MV best_mv = { br, bc };
    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
1021 1022 1023 1024 1025 1026 1027
  }
  best_mv->row = br;
  best_mv->col = bc;
  return bestsad;
}

// A specialized function where the smallest scale search candidates
1028
// are 4 1-away neighbors, and cost_list is non-null
1029 1030 1031 1032 1033 1034 1035
// TODO(debargha): Merge this function with the one above. Also remove
// use_mvcost option since it is always 1, to save unnecessary branches.
static int vp9_pattern_search_sad(const MACROBLOCK *x,
                                  MV *ref_mv,
                                  int search_param,
                                  int sad_per_bit,
                                  int do_init_search,
1036
                                  int *cost_list,
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060
                                  const vp9_variance_fn_ptr_t *vfp,