vp9_encodeframe.c

    FILE *statsfile;
    statsfile = fopen("segmap2.stt", "a");
    fprintf(statsfile, "\n");
    fclose(statsfile);
  }
#endif
}

static void encode_sb64(VP9_COMP *cpi,
                        int mb_row,
                        int mb_col,
                        TOKENEXTRA **tp, int is_sb[4]) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->mb;
  MACROBLOCKD *const xd = &x->e_mbd;

  cpi->sb64_count[is_sb[0] == 2]++;
  if (is_sb[0] == 2) {
    set_offsets(cpi, mb_row, mb_col, 64);
    update_state(cpi, &x->sb64_context, 64, 1);
    encode_superblock64(cpi, tp,
                        1, mb_row, mb_col);
    update_stats(cpi, mb_row, mb_col);

    (*tp)->Token = EOSB_TOKEN;
    (*tp)++;
    if (mb_row < cm->mb_rows)
      cpi->tplist[mb_row].stop = *tp;
  } else {
    int i;

    for (i = 0; i < 4; i++) {
      const int x_idx = i & 1, y_idx = i >> 1;

      if (mb_row + y_idx * 2 >= cm->mb_rows ||
          mb_col + x_idx * 2 >= cm->mb_cols) {
        // MB lies outside frame, move on
        continue;
      }
      xd->sb_index = i;
      encode_sb(cpi, mb_row + 2 * y_idx, mb_col + 2 * x_idx, 1, tp,
                is_sb[i]);
    }
  }
}

static void encode_sb_row(VP9_COMP *cpi,
                          int mb_row,
                          TOKENEXTRA **tp,
                          int *totalrate) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->mb;
  MACROBLOCKD *const xd = &x->e_mbd;
  int mb_col;

  // Initialize the left context for the new SB row
  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));

  // Code each SB in the row
  for (mb_col = cm->cur_tile_mb_col_start;
       mb_col < cm->cur_tile_mb_col_end; mb_col += 4) {
    int i;
    int sb32_rate = 0, sb32_dist = 0;
    int is_sb[4];
    int sb64_rate = INT_MAX, sb64_dist;
    int sb64_skip = 0;
    ENTROPY_CONTEXT_PLANES l[4], a[4];
    TOKENEXTRA *tp_orig = *tp;

    memcpy(&a, cm->above_context + mb_col, sizeof(a));
    memcpy(&l, cm->left_context, sizeof(l));
    for (i = 0; i < 4; i++) {
      const int x_idx = (i & 1) << 1, y_idx = i & 2;
      int mb_rate = 0, mb_dist = 0;
      int sb_rate = INT_MAX, sb_dist;
      int splitmodes_used = 0;
      int sb32_skip = 0;

      if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
        continue;

      xd->sb_index = i;

      splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
                                      tp, &mb_rate, &mb_dist);

      mb_rate += vp9_cost_bit(cm->sb32_coded, 0);

      if (cpi->sf.splitmode_breakout) {
        sb32_skip = splitmodes_used;
        sb64_skip += splitmodes_used;
      }

      if ( !sb32_skip &&
           !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) ||
             ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
        pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
                      tp, &sb_rate, &sb_dist);
        sb_rate += vp9_cost_bit(cm->sb32_coded, 1);
      }

      /* Decide whether to encode as a SB or 4xMBs */
      if (sb_rate < INT_MAX &&
          RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
              RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
        is_sb[i] = 1;
        sb32_rate += sb_rate;
        sb32_dist += sb_dist;
      } else {
        is_sb[i] = 0;
        sb32_rate += mb_rate;
        sb32_dist += mb_dist;

        // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
        if (cpi->sf.mb16_breakout) {
          ++sb64_skip;
        }
      }

      /* Encode SB using best computed mode(s) */
      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
      // for each level that we go up, we can just keep tokens and recon
      // pixels of the lower level; also, inverting SB/MB order (big->small
      // instead of small->big) means we can use as threshold for small, which
      // may enable breakouts if RD is not good enough (i.e. faster)
      encode_sb(cpi, mb_row + y_idx, mb_col + x_idx, 0, tp, is_sb[i]);
    }

    memcpy(cm->above_context + mb_col, &a, sizeof(a));
    memcpy(cm->left_context, &l, sizeof(l));
    sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);

    if (!sb64_skip &&
        !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) ||
          ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
      pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);
      sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
    }

    /* Decide whether to encode as a SB or 4xMBs */
    if (sb64_rate < INT_MAX &&
        RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist) <
            RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
      is_sb[0] = 2;
      *totalrate += sb64_rate;
    } else {
      *totalrate += sb32_rate;
    }

    assert(tp_orig == *tp);
    encode_sb64(cpi, mb_row, mb_col, tp, is_sb);
    assert(tp_orig < *tp);
  }
}

static void init_encode_frame_mb_context(VP9_COMP *cpi) {
  MACROBLOCK *const x = &cpi->mb;
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;

  x->act_zbin_adj = 0;
  cpi->seg0_idx = 0;
  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));

  xd->mode_info_stride = cm->mode_info_stride;
  xd->frame_type = cm->frame_type;

  xd->frames_since_golden = cm->frames_since_golden;
  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

  // reset intra mode contexts
  if (cm->frame_type == KEY_FRAME)
    vp9_init_mbmode_probs(cm);

  // Copy data over into macro block data structures.
  x->src = *cpi->Source;
  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
  xd->dst = cm->yv12_fb[cm->new_fb_idx];

  // set up frame for intra coded blocks
  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);

  vp9_build_block_offsets(x);

  vp9_setup_block_dptrs(&x->e_mbd);

  vp9_setup_block_ptrs(x);

  xd->mode_info_context->mbmi.mode = DC_PRED;
  xd->mode_info_context->mbmi.uv_mode = DC_PRED;

  vp9_zero(cpi->count_mb_ref_frame_usage)
  vp9_zero(cpi->bmode_count)
  vp9_zero(cpi->ymode_count)
  vp9_zero(cpi->i8x8_mode_count)
  vp9_zero(cpi->y_uv_mode_count)
  vp9_zero(cpi->sub_mv_ref_count)
  vp9_zero(cpi->mbsplit_count)
  vp9_zero(cpi->common.fc.mv_ref_ct)
  vp9_zero(cpi->sb_ymode_count)
  vp9_zero(cpi->sb32_count);
  vp9_zero(cpi->sb64_count);
#if CONFIG_COMP_INTERINTRA_PRED
  vp9_zero(cpi->interintra_count);
  vp9_zero(cpi->interintra_select_count);
#endif

  vpx_memset(cm->above_context, 0,
             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);

  xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
}

static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
  if (lossless) {
    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
    cpi->mb.optimize              = 0;
    cpi->common.filter_level      = 0;
    cpi->zbin_mode_boost_enabled  = FALSE;
    cpi->common.txfm_mode         = ONLY_4X4;
  } else {
    cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
    cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
  }
}


static void encode_frame_internal(VP9_COMP *cpi) {
  int mb_row;
  MACROBLOCK *const x = &cpi->mb;
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  int totalrate;

//   fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
//            cpi->common.current_video_frame, cpi->common.show_frame,
//            cm->frame_type);

  // Compute a modified set of reference frame probabilities to use when
  // prediction fails. These are based on the current general estimates for
  // this frame which may be updated with each iteration of the recode loop.
  vp9_compute_mod_refprobs(cm);

// debug output
#if DBG_PRNT_SEGMAP
  {
    FILE *statsfile;
    statsfile = fopen("segmap2.stt", "a");
    fprintf(statsfile, "\n");
    fclose(statsfile);
  }
#endif

  totalrate = 0;

  // Reset frame count of inter 0,0 motion vector usage.
  cpi->inter_zz_count = 0;

  cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
  cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;

  vp9_zero(cpi->switchable_interp_count);
  vp9_zero(cpi->best_switchable_interp_count);

  xd->mode_info_context = cm->mi;
  xd->prev_mode_info_context = cm->prev_mi;

  vp9_zero(cpi->NMVcount);
  vp9_zero(cpi->coef_counts_4x4);
  vp9_zero(cpi->coef_counts_8x8);
  vp9_zero(cpi->coef_counts_16x16);
  vp9_zero(cpi->coef_counts_32x32);
#if CONFIG_CODE_NONZEROCOUNT
  vp9_zero(cm->fc.nzc_counts_4x4);
  vp9_zero(cm->fc.nzc_counts_8x8);
  vp9_zero(cm->fc.nzc_counts_16x16);
  vp9_zero(cm->fc.nzc_counts_32x32);
  vp9_zero(cm->fc.nzc_pcat_counts);
#endif
#if CONFIG_NEW_MVREF
  vp9_zero(cpi->mb_mv_ref_count);
#endif

  // force lossless mode
  if (cm->base_qindex <= 4)
    cm->base_qindex = 0;
  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
                            cm->y1dc_delta_q == 0 &&
                            cm->uvdc_delta_q == 0 &&
                            cm->uvac_delta_q == 0);
  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);

  vp9_frame_init_quantizer(cpi);

  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
  vp9_initialize_me_consts(cpi, cm->base_qindex);

  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
    // Initialize encode frame context.
    init_encode_frame_mb_context(cpi);

    // Build a frame level activity map
    build_activity_map(cpi);
  }

  // re-initencode frame context.
  init_encode_frame_mb_context(cpi);

  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
  {
    struct vpx_usec_timer  emr_timer;
    vpx_usec_timer_start(&emr_timer);

    {
      // Take tiles into account and give start/end MB
      int tile_col, tile_row;
      TOKENEXTRA *tp = cpi->tok;

      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
        vp9_get_tile_row_offsets(cm, tile_row);

        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
          TOKENEXTRA *tp_old = tp;

          // For each row of SBs in the frame
          vp9_get_tile_col_offsets(cm, tile_col);
          for (mb_row = cm->cur_tile_mb_row_start;
               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {
            encode_sb_row(cpi, mb_row, &tp, &totalrate);
          }
          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
        }
      }
    }

    vpx_usec_timer_mark(&emr_timer);
    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
  }

  // 256 rate units to the bit,
  // projected_frame_size in units of BYTES
  cpi->projected_frame_size = totalrate >> 8;

#if 0
  // Keep record of the total distortion this time around for future use
  cpi->last_frame_distortion = cpi->frame_distortion;
#endif

}

static int check_dual_ref_flags(VP9_COMP *cpi) {
  MACROBLOCKD *xd = &cpi->mb.e_mbd;
  int ref_flags = cpi->ref_frame_flags;

  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
        vp9_check_segref(xd, 1, LAST_FRAME))
      return 1;
    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
        vp9_check_segref(xd, 1, GOLDEN_FRAME))
      return 1;
    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
        vp9_check_segref(xd, 1, ALTREF_FRAME))
      return 1;
    return 0;
  } else {
    return (!!(ref_flags & VP9_GOLD_FLAG) +
            !!(ref_flags & VP9_LAST_FLAG) +
            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
  }
}

static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
                                    MODE_INFO *mi, TX_SIZE txfm_max) {
  MB_MODE_INFO *const mbmi = &mi->mbmi;

  if (mbmi->txfm_size > txfm_max) {
    VP9_COMMON *const cm = &cpi->common;
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
    const int segment_id = mbmi->segment_id;

    xd->mode_info_context = mi;
    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
           (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
    mbmi->txfm_size = txfm_max;
  }
}

static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
  int x, y;

  for (y = 0; y < ymbs; y++) {
    for (x = 0; x < xmbs; x++) {
      if (!mi[y * mis + x].mbmi.mb_skip_coeff)
        return 0;
    }
  }

  return 1;
}

static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
                          TX_SIZE txfm_size) {
  int x, y;

  for (y = 0; y < ymbs; y++) {
    for (x = 0; x < xmbs; x++)
      mi[y * mis + x].mbmi.txfm_size = txfm_size;
  }
}

static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
                                      int mis, TX_SIZE txfm_max,
                                      int mb_rows_left, int mb_cols_left) {
  MB_MODE_INFO *const mbmi = &mi->mbmi;

  if (mbmi->txfm_size > txfm_max) {
    VP9_COMMON *const cm = &cpi->common;
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
    const int segment_id = mbmi->segment_id;
    const int ymbs = MIN(2, mb_rows_left);
    const int xmbs = MIN(2, mb_cols_left);

    xd->mode_info_context = mi;
    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
  }
}

static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
                                      int mis, TX_SIZE txfm_max,
                                      int mb_rows_left, int mb_cols_left) {
  MB_MODE_INFO *const mbmi = &mi->mbmi;

  if (mbmi->txfm_size > txfm_max) {
    VP9_COMMON *const cm = &cpi->common;
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
    const int segment_id = mbmi->segment_id;
    const int ymbs = MIN(4, mb_rows_left);
    const int xmbs = MIN(4, mb_cols_left);

    xd->mode_info_context = mi;
    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
  }
}

static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
  VP9_COMMON *const cm = &cpi->common;
  int mb_row, mb_col;
  const int mis = cm->mode_info_stride;
  MODE_INFO *mi, *mi_ptr = cm->mi;

  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
    mi = mi_ptr;
    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
        reset_skip_txfm_size_sb64(cpi, mi, mis, txfm_max,
                                  cm->mb_rows - mb_row, cm->mb_cols - mb_col);
      } else {
        int i;

        for (i = 0; i < 4; i++) {
          const int x_idx_sb = (i & 1) << 1, y_idx_sb = i & 2;
          MODE_INFO *sb_mi = mi + y_idx_sb * mis + x_idx_sb;

          if (mb_row + y_idx_sb >= cm->mb_rows ||
              mb_col + x_idx_sb >= cm->mb_cols)
            continue;

          if (sb_mi->mbmi.sb_type) {
            reset_skip_txfm_size_sb32(cpi, sb_mi, mis, txfm_max,
                                      cm->mb_rows - mb_row - y_idx_sb,
                                      cm->mb_cols - mb_col - x_idx_sb);
          } else {
            int m;

            for (m = 0; m < 4; m++) {
              const int x_idx = x_idx_sb + (m & 1), y_idx = y_idx_sb + (m >> 1);
              MODE_INFO *mb_mi;

              if (mb_col + x_idx >= cm->mb_cols ||
                  mb_row + y_idx >= cm->mb_rows)
                continue;

              mb_mi = mi + y_idx * mis + x_idx;
              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
              reset_skip_txfm_size_mb(cpi, mb_mi, txfm_max);
            }
          }
        }
      }
    }
  }
}

void vp9_encode_frame(VP9_COMP *cpi) {
  if (cpi->sf.RD) {
    int i, frame_type, pred_type;
    TXFM_MODE txfm_type;

    /*
     * This code does a single RD pass over the whole frame assuming
     * either compound, single or hybrid prediction as per whatever has
     * worked best for that type of frame in the past.
     * It also predicts whether another coding mode would have worked
     * better that this coding mode. If that is the case, it remembers
     * that for subsequent frames.
     * It does the same analysis for transform size selection also.
     */
    if (cpi->common.frame_type == KEY_FRAME)
      frame_type = 0;
    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
      frame_type = 3;
    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
      frame_type = 1;
    else
      frame_type = 2;

    /* prediction (compound, single or hybrid) mode selection */
    if (frame_type == 3)
      pred_type = SINGLE_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][0] &&
             cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][2] &&
             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
      pred_type = COMP_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
                 cpi->rd_prediction_type_threshes[frame_type][2])
      pred_type = SINGLE_PREDICTION_ONLY;
    else
      pred_type = HYBRID_PREDICTION;

    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */

    cpi->mb.e_mbd.lossless = 0;
    if (cpi->oxcf.lossless) {
      txfm_type = ONLY_4X4;
      cpi->mb.e_mbd.lossless = 1;
    } else
#if 0
    /* FIXME (rbultje): this code is disabled until we support cost updates
     * while a frame is being encoded; the problem is that each time we
     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
     * further behind and not being chosen for subsequent frames either. This
     * is essentially a local minimum problem that we can probably fix by
     * estimating real costs more closely within a frame, perhaps by re-
     * calculating costs on-the-fly as frame encoding progresses. */
    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = TX_MODE_SELECT;
    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
               ) {
      txfm_type = ONLY_4X4;
    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = ALLOW_16X16;
    } else
      txfm_type = ALLOW_8X8;
#else
    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                    ALLOW_32X32 : TX_MODE_SELECT;
#endif
    cpi->common.txfm_mode = txfm_type;
    if (txfm_type != TX_MODE_SELECT) {
      cpi->common.prob_tx[0] = 128;
      cpi->common.prob_tx[1] = 128;
    }
    cpi->common.comp_pred_mode = pred_type;
    encode_frame_internal(cpi);

    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
      const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
    }

    for (i = 0; i < NB_TXFM_MODES; ++i) {
      int64_t pd = cpi->rd_tx_select_diff[i];
      int diff;
      if (i == TX_MODE_SELECT)
        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
                     2048 * (TX_SIZE_MAX_SB - 1), 0);
      diff = (int)(pd / cpi->common.MBs);
      cpi->rd_tx_select_threshes[frame_type][i] += diff;
      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
    }

    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
      int single_count_zero = 0;
      int comp_count_zero = 0;

      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
        single_count_zero += cpi->single_pred_count[i];
        comp_count_zero += cpi->comp_pred_count[i];
      }

      if (comp_count_zero == 0) {
        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
      } else if (single_count_zero == 0) {
        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
      }
    }

    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
                           cpi->txfm_count_32x32p[TX_4X4] +
                           cpi->txfm_count_8x8p[TX_4X4];
      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
                              cpi->txfm_count_16x16p[TX_8X8];
      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];

      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
          count32x32 == 0) {
        cpi->common.txfm_mode = ALLOW_8X8;
        reset_skip_txfm_size(cpi, TX_8X8);
      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
        cpi->common.txfm_mode = ONLY_4X4;
        reset_skip_txfm_size(cpi, TX_4X4);
      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
        cpi->common.txfm_mode = ALLOW_32X32;
      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
        cpi->common.txfm_mode = ALLOW_16X16;
        reset_skip_txfm_size(cpi, TX_16X16);
      }
    }

    // Update interpolation filter strategy for next frame.
    if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
      vp9_select_interp_filter_type(cpi);
  } else {
    encode_frame_internal(cpi);
  }

}

void vp9_setup_block_ptrs(MACROBLOCK *x) {
  int r, c;
  int i;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
  }

  for (r = 0; r < 2; r++) {
    for (c = 0; c < 2; c++)
      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
  }


  for (r = 0; r < 2; r++) {
    for (c = 0; c < 2; c++)
      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
  }

  for (i = 0; i < 24; i++)
    x->block[i].coeff = x->coeff + i * 16;
}

void vp9_build_block_offsets(MACROBLOCK *x) {
  int block = 0;
  int br, bc;

  vp9_build_block_doffsets(&x->e_mbd);

  for (br = 0; br < 4; br++) {
    for (bc = 0; bc < 4; bc++) {
      BLOCK *this_block = &x->block[block];
      // this_block->base_src = &x->src.y_buffer;
      // this_block->src_stride = x->src.y_stride;
      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
      this_block->base_src = &x->src.y_buffer;
      this_block->src_stride = x->src.y_stride;
      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
      ++block;
    }
  }

  // u blocks
  for (br = 0; br < 2; br++) {
    for (bc = 0; bc < 2; bc++) {
      BLOCK *this_block = &x->block[block];
      this_block->base_src = &x->src.u_buffer;
      this_block->src_stride = x->src.uv_stride;
      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
      ++block;
    }
  }

  // v blocks
  for (br = 0; br < 2; br++) {
    for (bc = 0; bc < 2; bc++) {
      BLOCK *this_block = &x->block[block];
      this_block->base_src = &x->src.v_buffer;
      this_block->src_stride = x->src.uv_stride;
      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
      ++block;
    }
  }
}

static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
  const MACROBLOCKD *xd = &x->e_mbd;
  const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
  const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;

#ifdef MODE_STATS
  const int is_key = cpi->common.frame_type == KEY_FRAME;

  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
  ++ uv_modes_y[m][uvm];

  if (m == B_PRED) {
    unsigned int *const bct = is_key ? b_modes : inter_b_modes;

    int b = 0;

    do {
      ++ bct[xd->block[b].bmi.as_mode.first];
    } while (++b < 16);
  }

  if (m == I8X8_PRED) {
    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
  }
#endif

  if (xd->mode_info_context->mbmi.sb_type) {
    ++cpi->sb_ymode_count[m];
  } else {
    ++cpi->ymode_count[m];
  }
  if (m != I8X8_PRED)
    ++cpi->y_uv_mode_count[m][uvm];
  else {
    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
  }
  if (m == B_PRED) {
    int b = 0;
    do {
      int m = xd->block[b].bmi.as_mode.first;
#if CONFIG_NEWBINTRAMODES
      if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
#endif
      ++cpi->bmode_count[m];
    } while (++b < 16);
  }
}

// Experimental stub function to create a per MB zbin adjustment based on
// some previously calculated measure of MB activity.
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
#if USE_ACT_INDEX
  x->act_zbin_adj = *(x->mb_activity_ptr);
#else
  int64_t a;
  int64_t b;
  int64_t act = *(x->mb_activity_ptr);

  // Apply the masking to the RD multiplier.
  a = act + 4 * cpi->activity_avg;
  b = 4 * act + cpi->activity_avg;

  if (act > cpi->activity_avg)
    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
  else
    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
#endif
}

static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
                                         ENTROPY_CONTEXT_PLANES ta[16],
                                         ENTROPY_CONTEXT_PLANES tl[16],
                                         TOKENEXTRA *t[16],
                                         TOKENEXTRA **tp,
                                         int skip[16], int output_enabled) {
  MACROBLOCK *const x = &cpi->mb;

  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
    TOKENEXTRA tokens[4][1024+512];
    int n_tokens[4], n;

    // if there were no skips, we don't need to do anything
    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
      return;

    // if we don't do coeff skipping for this frame, we don't
    // need to do anything here
    if (!cpi->common.mb_no_coeff_skip)
      return;

    // if all 4 MBs skipped coeff coding, nothing to be done
    if (skip[0] && skip[1] && skip[2] && skip[3])
      return;

    // so the situation now is that we want to skip coeffs
    // for some MBs, but not all, and we didn't code EOB
    // coefficients for them. However, the skip flag for this
    // SB will be 0 overall, so we need to insert EOBs in the
    // middle of the token tree. Do so here.
    for (n = 0; n < 4; n++) {
      if (n < 3) {
        n_tokens[n] = t[n + 1] - t[n];
      } else {
        n_tokens[n] = *tp - t[3];
      }
      if (n_tokens[n]) {
        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
      }
    }

    // reset pointer, stuff EOBs where necessary
    *tp = t[0];
    for (n = 0; n < 4; n++) {
      if (skip[n]) {
        x->e_mbd.above_context = &ta[n * 2];
        x->e_mbd.left_context  = &tl[n * 2];
        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
      } else {
        if (n_tokens[n]) {
          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
        }
        (*tp) += n_tokens[n];
      }
    }
  } else {
    TOKENEXTRA tokens[16][16 * 25];
    int n_tokens[16], n;

    // if there were no skips, we don't need to do anything
    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
        !skip[12] && !skip[13] && !skip[14] && !skip[15])
      return;

    // if we don't do coeff skipping for this frame, we don't
    // need to do anything here
    if (!cpi->common.mb_no_coeff_skip)
      return;

    // if all 4 MBs skipped coeff coding, nothing to be done
    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
        skip[12] && skip[13] && skip[14] && skip[15])
      return;

    // so the situation now is that we want to skip coeffs
    // for some MBs, but not all, and we didn't code EOB
    // coefficients for them. However, the skip flag for this
    // SB will be 0 overall, so we need to insert EOBs in the
    // middle of the token tree. Do so here.
    for (n = 0; n < 16; n++) {
      if (n < 15) {
        n_tokens[n] = t[n + 1] - t[n];
      } else {
        n_tokens[n] = *tp - t[15];
      }
      if (n_tokens[n]) {
        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
      }
    }

    // reset pointer, stuff EOBs where necessary
    *tp = t[0];
    for (n = 0; n < 16; n++) {
      if (skip[n]) {
        x->e_mbd.above_context = &ta[n];
        x->e_mbd.left_context  = &tl[n];
        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
      } else {
        if (n_tokens[n]) {
          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
        }
        (*tp) += n_tokens[n];
      }
    }
  }
}

#if CONFIG_CODE_NONZEROCOUNT
static void gather_nzcs_mb16(VP9_COMMON *const cm,
                             MACROBLOCKD *xd) {
  int i;
  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
  switch (xd->mode_info_context->mbmi.txfm_size) {
    case TX_4X4:
      for (i = 0; i < 24; ++i) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    case TX_8X8:
      for (i = 0; i < 16; i += 4) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
          xd->mode_info_context->mbmi.mode == SPLITMV) {
        for (i = 16; i < 24; ++i) {
          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
        }
      } else {
        for (i = 16; i < 24; i += 4) {
          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
        }
      }
      break;

    case TX_16X16:
      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
      for (i = 16; i < 24; i += 4) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    default:
      break;
  }
}

static void gather_nzcs_sb32(VP9_COMMON *const cm,
                             MACROBLOCKD *xd) {
  int i, j;
  MODE_INFO *m = xd->mode_info_context;
  int mis = cm->mode_info_stride;
  vpx_memset(m->mbmi.nzcs, 0,
             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
  switch (xd->mode_info_context->mbmi.txfm_size) {
    case TX_4X4:
      for (i = 0; i < 96; ++i) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    case TX_8X8:
      for (i = 0; i < 96; i += 4) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    case TX_16X16:
      for (i = 0; i < 96; i += 16) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    case TX_32X32:
      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
      for (i = 64; i < 96; i += 16) {
        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
      }
      break;

    default:
      break;
  }
  for (i = 0; i < 2; ++i)
    for (j = 0; j < 2; ++j) {
      if (i == 0 && j == 0) continue;
      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,