vp9_encodeframe.c

                      tp, &r, &d, BLOCK_SIZE_SB8X16,
                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                            [xd->b_index],
                     BLOCK_SIZE_SB8X16, 0);
        encode_superblock(cpi, tp,
                          0, mi_row + y_idx_m, mi_col + x_idx_m,
                          BLOCK_SIZE_SB8X16);
        xd->b_index = 1;
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
                      tp, &r, &d, BLOCK_SIZE_SB8X16,
                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r2 += x->partition_cost[pl][PARTITION_VERT];
        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r2;
          mb16_dist = d2;
          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
        }
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                     l3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                     a3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        // try 16x8 coding
        r2 = 0;
        d2 = 0;
        xd->b_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_SB16X8,
                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                            [xd->b_index],
                     BLOCK_SIZE_SB16X8, 0);
        encode_superblock(cpi, tp,
                          0, mi_row + y_idx_m, mi_col + x_idx_m,
                          BLOCK_SIZE_SB16X8);
        xd->b_index = 1;
        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_SB16X8,
                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r2 += x->partition_cost[pl][PARTITION_HORZ];
        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r2;
          mb16_dist = d2;
          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
        }
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                     l3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                     a3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        // try as 16x16
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_MB16X16,
                      &x->mb_context[xd->sb_index][xd->mb_index]);
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r += x->partition_cost[pl][PARTITION_NONE];
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r;
          mb16_dist = d;
          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
        }
        sb32_rate += mb16_rate;
        sb32_dist += mb16_dist;

        // Dummy encode, do not do the tokenization
        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
      }

      /* Restore L & A coding context to those in place on entry */
      for (p = 0; p < MAX_MB_PLANE; p++) {
        vpx_memcpy(cm->left_context[p] +
                       (y_idx * 2 >> xd->plane[p].subsampling_y),
                   l2 + 8 * p,
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
        vpx_memcpy(cm->above_context[p] +
                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                   a2 + 8 * p,
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
      }
      // restore partition information context
      vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32));
      vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32));

      set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];

      if (cpi->sf.splitmode_breakout) {
        sb32_skip = splitmodes_used;
        sb64_skip += splitmodes_used;
      }

      // check 32x16
      if (mi_col + x_idx + 4 <= cm->mi_cols) {
        int r, d;

        xd->mb_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB32X16,
                      &x->sb32x16_context[xd->sb_index][xd->mb_index]);
        if (mi_row + y_idx + 2 < cm->mi_rows) {
          int r2, d2;

          update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
                       BLOCK_SIZE_SB32X16, 0);
          encode_superblock(cpi, tp,
                            0, mi_row + y_idx, mi_col + x_idx,
                            BLOCK_SIZE_SB32X16);
          xd->mb_index = 1;
          pick_sb_modes(cpi, mi_row + y_idx + 2,
                        mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
                        &x->sb32x16_context[xd->sb_index][xd->mb_index]);
          r += r2;
          d += d2;
        }

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_HORZ];

        /* is this better than MB coding? */
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB32X16;
        }

        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                     l2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                     a2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
        }
      }

      // check 16x32
      if (mi_row + y_idx + 4 <= cm->mi_rows) {
        int r, d;

        xd->mb_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB16X32,
                      &x->sb16x32_context[xd->sb_index][xd->mb_index]);
        if (mi_col + x_idx + 2 < cm->mi_cols) {
          int r2, d2;

          update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
                       BLOCK_SIZE_SB16X32, 0);
          encode_superblock(cpi, tp,
                            0, mi_row + y_idx, mi_col + x_idx,
                            BLOCK_SIZE_SB16X32);
          xd->mb_index = 1;
          pick_sb_modes(cpi, mi_row + y_idx,
                        mi_col + x_idx + 2,
                        tp, &r2, &d2, BLOCK_SIZE_SB16X32,
                        &x->sb16x32_context[xd->sb_index][xd->mb_index]);
          r += r2;
          d += d2;
        }

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_VERT];

        /* is this better than MB coding? */
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB16X32;
        }

        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                     l2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                     a2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
        }
      }

      if (!sb32_skip &&
          mi_col + x_idx + 4 <= cm->mi_cols &&
          mi_row + y_idx + 4 <= cm->mi_rows) {
        int r, d;

        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB32X32,
                      &x->sb32_context[xd->sb_index]);

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_NONE];

        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB32X32;
        }
      }

      // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
      if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
        ++sb64_skip;
      }

      sb64_rate += sb32_rate;
      sb64_dist += sb32_dist;

      /* Encode SB using best computed mode(s) */
      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
      // for each level that we go up, we can just keep tokens and recon
      // pixels of the lower level; also, inverting SB/MB order (big->small
      // instead of small->big) means we can use as threshold for small, which
      // may enable breakouts if RD is not good enough (i.e. faster)
      encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
                NULL);
    }

    for (p = 0; p < MAX_MB_PLANE; p++) {
      memcpy(cm->above_context[p] +
                 (mi_col * 2 >> xd->plane[p].subsampling_x),
             a + 16 * p,
             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
      memcpy(cm->left_context[p], l + 16 * p,
             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
    }
    memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a));
    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));

    set_partition_seg_context(cpi, mi_row, mi_col);
    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];

    // check 64x32
    if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
      int r, d;

      xd->sb_index = 0;
      pick_sb_modes(cpi, mi_row, mi_col,
                    tp, &r, &d, BLOCK_SIZE_SB64X32,
                    &x->sb64x32_context[xd->sb_index]);
      if (mi_row + 4 != cm->mi_rows) {
        int r2, d2;

        update_state(cpi, &x->sb64x32_context[xd->sb_index],
                     BLOCK_SIZE_SB64X32, 0);
        encode_superblock(cpi, tp,
                          0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
        xd->sb_index = 1;
        pick_sb_modes(cpi, mi_row + 4, mi_col,
                      tp, &r2, &d2, BLOCK_SIZE_SB64X32,
                      &x->sb64x32_context[xd->sb_index]);
        r += r2;
        d += d2;
      }

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_HORZ];

      /* is this better than MB coding? */
      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB64X32;
      }

      for (p = 0; p < MAX_MB_PLANE; p++) {
        memcpy(cm->above_context[p] +
                   (mi_col * 2 >> xd->plane[p].subsampling_x),
               a + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
        memcpy(cm->left_context[p], l + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
      }
    }

    // check 32x64
    if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
      int r, d;

      xd->sb_index = 0;
      pick_sb_modes(cpi, mi_row, mi_col,
                    tp, &r, &d, BLOCK_SIZE_SB32X64,
                    &x->sb32x64_context[xd->sb_index]);
      if (mi_col + 4 != cm->mi_cols) {
        int r2, d2;

        update_state(cpi, &x->sb32x64_context[xd->sb_index],
                     BLOCK_SIZE_SB32X64, 0);
        encode_superblock(cpi, tp,
                          0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
        xd->sb_index = 1;
        pick_sb_modes(cpi, mi_row, mi_col + 4,
                      tp, &r2, &d2, BLOCK_SIZE_SB32X64,
                      &x->sb32x64_context[xd->sb_index]);
        r += r2;
        d += d2;
      }

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_VERT];

      /* is this better than MB coding? */
      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB32X64;
      }

      for (p = 0; p < MAX_MB_PLANE; p++) {
        memcpy(cm->above_context[p] +
                   (mi_col * 2 >> xd->plane[p].subsampling_x),
               a + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
        memcpy(cm->left_context[p], l + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
      }
    }

    if (!sb64_skip &&
        mi_col + 8 <= cm->mi_cols &&
        mi_row + 8 <= cm->mi_rows) {
      int r, d;

      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
                    BLOCK_SIZE_SB64X64, &x->sb64_context);

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_NONE];

      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB64X64;
      }
    }

    assert(tp_orig == *tp);
    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
              sb64_partitioning, sb_partitioning, mb_partitioning);
    assert(tp_orig < *tp);
  }
}

// Resets the per-frame macroblock encoding state stored in cpi->mb and
// cpi->common before a frame is (re-)encoded: copies frame-level fields into
// the block descriptor, points the source / prediction / destination planes
// at offset (0, 0), zeroes the mode statistics and clears the above entropy
// and partition contexts.
static void init_encode_frame_mb_context(VP9_COMP *cpi) {
  VP9_COMMON *const common = &cpi->common;
  MACROBLOCK *const mb = &cpi->mb;
  MACROBLOCKD *const mbd = &mb->e_mbd;

  // Per-frame adjustment state and reference prediction counters.
  mb->act_zbin_adj = 0;
  cpi->seg0_idx = 0;
  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));

  // Mirror frame-level fields from the common state into the block descriptor.
  mbd->mode_info_stride = common->mode_info_stride;
  mbd->frame_type = common->frame_type;
  mbd->frames_since_golden = common->frames_since_golden;
  mbd->frames_till_alt_ref_frame = common->frames_till_alt_ref_frame;

  // Key frames reset the intra mode contexts.
  if (common->frame_type == KEY_FRAME) {
    vp9_init_mbmode_probs(common);
  }

  // Copy data over into macro block data structures.
  vp9_setup_src_planes(mb, cpi->Source, 0, 0);

  // TODO(jkoleszar): are these initializations required?
  setup_pre_planes(mbd,
                   &common->yv12_fb[common->ref_frame_map[cpi->lst_fb_idx]],
                   NULL, 0, 0, NULL, NULL);
  setup_dst_planes(mbd, &common->yv12_fb[common->new_fb_idx], 0, 0);

  vp9_build_block_offsets(mb);
  vp9_setup_block_dptrs(mbd);

  // Start the first mode info entry from DC prediction for luma and chroma.
  mbd->mode_info_context->mbmi.mode = DC_PRED;
  mbd->mode_info_context->mbmi.uv_mode = DC_PRED;

  // Clear the mode / reference / partition statistics gathered per frame.
  vp9_zero(cpi->count_mb_ref_frame_usage);
  vp9_zero(cpi->bmode_count);
  vp9_zero(cpi->ymode_count);
  vp9_zero(cpi->y_uv_mode_count);
  vp9_zero(cpi->sub_mv_ref_count);
  vp9_zero(common->fc.mv_ref_ct);
  vp9_zero(cpi->sb_ymode_count);
  vp9_zero(cpi->partition_count);

  // Note: this memset assumes above_context[0], [1] and [2]
  // are allocated as part of the same buffer.
  vpx_memset(common->above_context[0], 0,
             sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE *
                 mi_cols_aligned_to_sb(common));
  vpx_memset(common->above_seg_context, 0,
             sizeof(PARTITION_CONTEXT) * mi_cols_aligned_to_sb(common));
}

// Selects the 4x4 forward / inverse transform function pointers on cpi->mb.
// With `lossless` non-zero the Walsh variants are installed and coefficient
// optimization, the loop filter level and zbin mode boost are switched off,
// with the transform mode forced to ONLY_4X4. Otherwise only the DCT
// variants are installed and the remaining settings are left untouched.
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
  MACROBLOCK *const mb = &cpi->mb;
  MACROBLOCKD *const mbd = &mb->e_mbd;

  if (!lossless) {
    mb->fwd_txm8x4 = vp9_short_fdct8x4;
    mb->fwd_txm4x4 = vp9_short_fdct4x4;
    mbd->inv_txm4x4_1 = vp9_short_idct4x4_1;
    mbd->inv_txm4x4 = vp9_short_idct4x4;
    return;
  }

  mb->fwd_txm8x4 = vp9_short_walsh8x4;
  mb->fwd_txm4x4 = vp9_short_walsh4x4;
  mbd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
  mbd->inv_txm4x4 = vp9_short_iwalsh4x4;
  mb->optimize = 0;
  cpi->common.filter_level = 0;
  cpi->zbin_mode_boost_enabled = 0;
  cpi->common.txfm_mode = ONLY_4X4;
}


// Runs one encoding pass over the current frame.
//
// Resets the per-frame counters, derives the lossless flag from the
// quantizer settings, initializes the quantizer and RD / motion-estimation
// constants, then walks every tile (row-major) and calls encode_sb_row() for
// each group of 8 mode-info rows in the tile. The accumulated rate estimate
// is stored in cpi->projected_frame_size and the time spent in the tile loop
// is added to cpi->time_encode_mb_row.
static void encode_frame_internal(VP9_COMP *cpi) {
  int mi_row;
  MACROBLOCK *const x = &cpi->mb;
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  int totalrate;

//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
//           cpi->common.current_video_frame, cpi->common.show_frame,
//           cm->frame_type);

  // Compute a modified set of reference frame probabilities to use when
  // prediction fails. These are based on the current general estimates for
  // this frame which may be updated with each iteration of the recode loop.
  vp9_compute_mod_refprobs(cm);

// debug output
#if DBG_PRNT_SEGMAP
  {
    FILE *statsfile;
    statsfile = fopen("segmap2.stt", "a");
    fprintf(statsfile, "\n");
    fclose(statsfile);
  }
#endif

  totalrate = 0;

  // Reset frame count of inter 0,0 motion vector usage.
  cpi->inter_zz_count = 0;

  // Reset the skip / no-skip counters (three entries each).
  cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
  cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;

  vp9_zero(cpi->switchable_interp_count);
  vp9_zero(cpi->best_switchable_interp_count);

  // Start from the first mode info entry of the current and previous frame.
  xd->mode_info_context = cm->mi;
  xd->prev_mode_info_context = cm->prev_mi;

  // Clear motion vector, coefficient and end-of-block branch counters.
  vp9_zero(cpi->NMVcount);
  vp9_zero(cpi->coef_counts_4x4);
  vp9_zero(cpi->coef_counts_8x8);
  vp9_zero(cpi->coef_counts_16x16);
  vp9_zero(cpi->coef_counts_32x32);
  vp9_zero(cm->fc.eob_branch_counts);

  // The frame is coded lossless only when the base quantizer index and all
  // delta-q values are zero.
  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
                            cm->y_dc_delta_q == 0 &&
                            cm->uv_dc_delta_q == 0 &&
                            cm->uv_ac_delta_q == 0);
  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);

  vp9_frame_init_quantizer(cpi);

  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
  vp9_initialize_me_consts(cpi, cm->base_qindex);

  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
    // Initialize encode frame context.
    init_encode_frame_mb_context(cpi);

    // Build a frame level activity map
    build_activity_map(cpi);
  }

  // Re-initialize the encode frame context (always done, even when it was
  // already initialized above for the activity map).
  init_encode_frame_mb_context(cpi);

  // Clear the RD difference accumulators and prediction / transform counters
  // that are filled in while the frame is encoded.
  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
  {
    struct vpx_usec_timer  emr_timer;
    vpx_usec_timer_start(&emr_timer);

    {
      // Take tiles into account and give start/end MB
      int tile_col, tile_row;
      TOKENEXTRA *tp = cpi->tok;

      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
        vp9_get_tile_row_offsets(cm, tile_row);

        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
          // Token write position at the start of this tile.
          TOKENEXTRA *tp_old = tp;

          // For each row of SBs in the frame
          vp9_get_tile_col_offsets(cm, tile_col);
          for (mi_row = cm->cur_tile_mi_row_start;
               mi_row < cm->cur_tile_mi_row_end;
               mi_row += 8) {
            encode_sb_row(cpi, mi_row, &tp, &totalrate);
          }
          // Number of tokens produced for this tile.
          // NOTE(review): indexed by tile_col only, so with more than one
          // tile row a later row overwrites the earlier row's entry --
          // confirm this is intended.
          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
          assert(tp - cpi->tok <=
                 get_token_alloc(cm->mb_rows, cm->mb_cols));
        }
      }
    }

    vpx_usec_timer_mark(&emr_timer);
    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
  }

  // 256 rate units to the bit,
  // projected_frame_size in units of BYTES
  // NOTE(review): with 256 rate units per bit, a right shift by 8 yields
  // bits rather than bytes -- confirm which unit consumers expect.
  cpi->projected_frame_size = totalrate >> 8;

#if 0
  // Keep record of the total distortion this time around for future use
  cpi->last_frame_distortion = cpi->frame_distortion;
#endif

}

// Examines cpi->ref_frame_flags and returns 1 or 0.
//
// Without the SEG_LVL_REF_FRAME feature on segment 1, the result is whether
// at least two of the LAST / GOLD / ALT flags are set. With the feature
// active, the result is 1 only if some pair of flags (LAST+GOLD, GOLD+ALT,
// ALT+LAST) is fully set and vp9_check_segref() accepts the first frame of
// that pair for segment 1.
static int check_dual_ref_flags(VP9_COMP *cpi) {
  MACROBLOCKD *const mbd = &cpi->mb.e_mbd;
  const int flags = cpi->ref_frame_flags;
  const int last_gold = VP9_LAST_FLAG | VP9_GOLD_FLAG;
  const int gold_alt = VP9_GOLD_FLAG | VP9_ALT_FLAG;
  const int alt_last = VP9_ALT_FLAG | VP9_LAST_FLAG;

  if (!vp9_segfeature_active(mbd, 1, SEG_LVL_REF_FRAME)) {
    // Count how many reference flags are enabled.
    int enabled = 0;

    if (flags & VP9_GOLD_FLAG)
      ++enabled;
    if (flags & VP9_LAST_FLAG)
      ++enabled;
    if (flags & VP9_ALT_FLAG)
      ++enabled;

    return enabled >= 2;
  }

  // Segment-level reference feature active: test the pairs in order.
  if ((flags & last_gold) == last_gold &&
      vp9_check_segref(mbd, 1, LAST_FRAME)) {
    return 1;
  }
  if ((flags & gold_alt) == gold_alt &&
      vp9_check_segref(mbd, 1, GOLDEN_FRAME)) {
    return 1;
  }
  if ((flags & alt_last) == alt_last &&
      vp9_check_segref(mbd, 1, ALTREF_FRAME)) {
    return 1;
  }

  return 0;
}

// Returns 1 when every entry in the ymbs x xmbs rectangle of mode info
// starting at `mi` (row stride `mis`) has mb_skip_coeff set, and 0 as soon as
// one entry without it is found. An empty rectangle yields 1.
static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
  int row;

  for (row = 0; row < ymbs; ++row) {
    const int row_offset = row * mis;
    int col;

    for (col = 0; col < xmbs; ++col) {
      if (mi[row_offset + col].mbmi.mb_skip_coeff == 0) {
        return 0;
      }
    }
  }

  return 1;
}

// Writes `txfm_size` into the txfm_size field of every entry in the
// ymbs x xmbs rectangle of mode info starting at `mi` (row stride `mis`).
static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
                          TX_SIZE txfm_size) {
  int row;

  for (row = 0; row < ymbs; ++row) {
    const int row_offset = row * mis;
    int col;

    for (col = 0; col < xmbs; ++col) {
      mi[row_offset + col].mbmi.txfm_size = txfm_size;
    }
  }
}

// Clamps the transform size of one block to `txfm_max`.
//
// Nothing happens when (mi_row, mi_col) lies outside the frame or when the
// block's transform size does not exceed txfm_max. Otherwise
// xd->mode_info_context is pointed at `mi` and txfm_max is written to every
// mode info entry of the block, with the bw x bh extent clipped to the frame
// edge. The assert records the expectation that such a block is skipped,
// either through the segment-level SKIP feature or through its skip flags.
// `bsize` is accepted but not used here.
static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
                                   int mis, TX_SIZE txfm_max,
                                   int bw, int bh, int mi_row, int mi_col,
                                   BLOCK_SIZE_TYPE bsize) {
  VP9_COMMON *const cm = &cpi->common;
  MB_MODE_INFO *const mbmi = &mi->mbmi;

  // The bounds test comes first so that `mi` is only read for in-frame blocks.
  if (mi_row < cm->mi_rows && mi_col < cm->mi_cols &&
      mbmi->txfm_size > txfm_max) {
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
    const int segment_id = mbmi->segment_id;
    const int ymbs = MIN(bh, cm->mi_rows - mi_row);
    const int xmbs = MIN(bw, cm->mi_cols - mi_col);

    xd->mode_info_context = mi;
    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
           get_skip_flag(mi, mis, ymbs, xmbs));
    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
  }
}

// Recursively walks the partitioning of the square block `bsize` located at
// (mi_row, mi_col) and clamps the transform size of each coded block inside
// it to `txfm_max` via reset_skip_txfm_size_b().
//
// The shape stored in mi->mbmi.sb_type decides how the square is covered:
//   - same width and height as bsize: one block;
//   - full width, smaller height:     top and bottom halves;
//   - smaller width, full height:     left and right halves;
//   - smaller in both directions:     recurse into the four quadrants using
//                                     the next smaller square size.
// Positions outside the frame are ignored.
static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
                                    TX_SIZE txfm_max,
                                    int mi_row, int mi_col,
                                    BLOCK_SIZE_TYPE bsize) {
  VP9_COMMON *const cm = &cpi->common;
  const int mis = cm->mode_info_stride;
  int bwl, bhl;
  const int bsl = mi_width_log2(bsize);
  // Half of the block width, in mode-info units. Computed as (1 << bsl) >> 1
  // rather than 1 << (bsl - 1): the recursion below reaches
  // BLOCK_SIZE_SB8X8, for which bsl is presumably 0, and a shift by a
  // negative count is undefined behaviour in C. Both forms agree for
  // bsl >= 1.
  const int bs = (1 << bsl) >> 1;

  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
    return;

  bwl = mi_width_log2(mi->mbmi.sb_type);
  bhl = mi_height_log2(mi->mbmi.sb_type);

  if (bwl == bsl && bhl == bsl) {
    // The coded block fills the whole square.
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
                           mi_row, mi_col, bsize);
  } else if (bwl == bsl && bhl < bsl) {
    // Two full-width blocks stacked vertically.
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
                           mi_row, mi_col, bsize);
    reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
                           mi_row + bs, mi_col, bsize);
  } else if (bwl < bsl && bhl == bsl) {
    // Two full-height blocks side by side.
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
                           mi_row, mi_col, bsize);
    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
                           mi_row, mi_col + bs, bsize);
  } else {
    BLOCK_SIZE_TYPE subsize;
    int n;

    // Split into four quadrants, each handled at the next smaller square size.
    assert(bwl < bsl && bhl < bsl);
    if (bsize == BLOCK_SIZE_SB64X64) {
      subsize = BLOCK_SIZE_SB32X32;
    } else if (bsize == BLOCK_SIZE_SB32X32) {
      subsize = BLOCK_SIZE_MB16X16;
    } else {
      assert(bsize == BLOCK_SIZE_MB16X16);
      subsize = BLOCK_SIZE_SB8X8;
    }

    for (n = 0; n < 4; n++) {
      // Quadrant order: top-left, top-right, bottom-left, bottom-right.
      const int y_idx = n >> 1, x_idx = n & 0x01;

      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
                              txfm_max, mi_row + y_idx * bs,
                              mi_col + x_idx * bs, subsize);
    }
  }
}

// Visits the frame in steps of 8 mode-info units in both directions and
// calls reset_skip_txfm_size_sb() with BLOCK_SIZE_SB64X64 at each position,
// so that transform sizes above `txfm_max` are clamped.
static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
  VP9_COMMON *const cm = &cpi->common;
  const int mis = cm->mode_info_stride;
  int mi_row;

  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) {
    int mi_col;

    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) {
      // Mode info entry at the top-left corner of this 8x8 group.
      MODE_INFO *const sb_mi = cm->mi + mi_row * mis + mi_col;

      reset_skip_txfm_size_sb(cpi, sb_mi, txfm_max,
                              mi_row, mi_col, BLOCK_SIZE_SB64X64);
    }
  }
}

// Encodes the current frame.
//
// Without cpi->sf.RD this is a plain call to encode_frame_internal(). With
// it, the function first chooses the frame-level prediction mode and
// transform mode from the running thresholds kept per frame-type bucket,
// encodes the frame, folds the RD differences measured during the pass back
// into those thresholds, and finally narrows the prediction and transform
// modes when the gathered counts show that some options were never used.
void vp9_encode_frame(VP9_COMP *cpi) {
  if (cpi->sf.RD) {
    int i, frame_type, pred_type;
    TXFM_MODE txfm_type;

    /*
     * This code does a single RD pass over the whole frame assuming
     * either compound, single or hybrid prediction as per whatever has
     * worked best for that type of frame in the past.
     * It also predicts whether another coding mode would have worked
     * better than this coding mode. If that is the case, it remembers
     * that for subsequent frames.
     * It does the same analysis for transform size selection also.
     */
    /* Threshold bucket: 0 = key frame, 3 = alt-ref source frame that also
     * refreshes golden, 1 = golden or alt-ref refresh, 2 = everything else. */
    if (cpi->common.frame_type == KEY_FRAME)
      frame_type = 0;
    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
      frame_type = 3;
    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
      frame_type = 1;
    else
      frame_type = 2;

    /* prediction (compound, single or hybrid) mode selection */
    if (frame_type == 3)
      pred_type = SINGLE_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][0] &&
             cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][2] &&
             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
      pred_type = COMP_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
                 cpi->rd_prediction_type_threshes[frame_type][2])
      pred_type = SINGLE_PREDICTION_ONLY;
    else
      pred_type = HYBRID_PREDICTION;

    /* transform size selection: ONLY_4X4 when lossless is configured;
     * otherwise (in the enabled #else branch below) either ALLOW_32X32 or
     * TX_MODE_SELECT */

    cpi->mb.e_mbd.lossless = 0;
    if (cpi->oxcf.lossless) {
      txfm_type = ONLY_4X4;
      cpi->mb.e_mbd.lossless = 1;
    } else
    /* The `else` above governs whichever statement the preprocessor keeps
     * below. */
#if 0
    /* FIXME (rbultje): this code is disabled until we support cost updates
     * while a frame is being encoded; the problem is that each time we
     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
     * further behind and not being chosen for subsequent frames either. This
     * is essentially a local minimum problem that we can probably fix by
     * estimating real costs more closely within a frame, perhaps by re-
     * calculating costs on-the-fly as frame encoding progresses. */
    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = TX_MODE_SELECT;
    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
               ) {
      txfm_type = ONLY_4X4;
    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = ALLOW_16X16;
    } else
      txfm_type = ALLOW_8X8;
#else
    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                    ALLOW_32X32 : TX_MODE_SELECT;
#endif
    cpi->common.txfm_mode = txfm_type;
    /* With a fixed transform mode both prob_tx entries are set to 128. */
    if (txfm_type != TX_MODE_SELECT) {
      cpi->common.prob_tx[0] = 128;
      cpi->common.prob_tx[1] = 128;
    }
    cpi->common.comp_pred_mode = pred_type;
    encode_frame_internal(cpi);

    /* Average each prediction-type threshold with the per-MB RD difference
     * measured during this pass.
     * NOTE(review): this loop halves with `>>= 1` while the transform loop
     * below uses `/= 2`; the two differ for negative values -- confirm the
     * difference is intended. */
    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
      const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
    }

    /* Same running average for the transform-mode thresholds; the
     * TX_MODE_SELECT entry is first reduced by a fixed RD cost term. */
    for (i = 0; i < NB_TXFM_MODES; ++i) {
      int64_t pd = cpi->rd_tx_select_diff[i];
      int diff;
      if (i == TX_MODE_SELECT)
        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
                     2048 * (TX_SIZE_MAX_SB - 1), 0);
      diff = (int)(pd / cpi->common.MBs);
      cpi->rd_tx_select_threshes[frame_type][i] += diff;
      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
    }

    /* If hybrid prediction ended up using only one kind of prediction,
     * switch the frame to that kind. */
    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
      /* Despite their names these hold the summed counts over all contexts;
       * they are compared against zero below. */
      int single_count_zero = 0;
      int comp_count_zero = 0;

      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
        single_count_zero += cpi->single_pred_count[i];
        comp_count_zero += cpi->comp_pred_count[i];
      }

      if (comp_count_zero == 0) {
        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
      } else if (single_count_zero == 0) {
        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
      }
    }

    /* If per-block transform selection never used some sizes, fall back to
     * the matching fixed transform mode. */
    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
                           cpi->txfm_count_32x32p[TX_4X4] +
                           cpi->txfm_count_8x8p[TX_4X4];
      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
                              cpi->txfm_count_16x16p[TX_8X8];
      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];

      /* reset_skip_txfm_size() is called for every fallback except
       * ALLOW_32X32. */
      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
          count32x32 == 0) {
        cpi->common.txfm_mode = ALLOW_8X8;
        reset_skip_txfm_size(cpi, TX_8X8);
      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
        cpi->common.txfm_mode = ONLY_4X4;
        reset_skip_txfm_size(cpi, TX_4X4);
      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
        cpi->common.txfm_mode = ALLOW_32X32;
      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
        cpi->common.txfm_mode = ALLOW_16X16;
        reset_skip_txfm_size(cpi, TX_16X16);
      }
    }

    // Update interpolation filter strategy for next frame.
    if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
      vp9_select_interp_filter_type(cpi);
  } else {
    encode_frame_internal(cpi);
  }

}

// Currently a no-op: the body performs no work on `x`.
void vp9_build_block_offsets(MACROBLOCK *x) {
  (void)x;  // Parameter is unused.
}

// Accumulates intra prediction mode counters for the block described by
// x->e_mbd.mode_info_context.
//
// Counters updated on `cpi`:
//   - sb_ymode_count[] when sb_type is larger than BLOCK_SIZE_SB8X8,
//     otherwise ymode_count[], indexed by the luma mode;
//   - y_uv_mode_count[][] indexed by (luma mode, chroma mode);
//   - bmode_count[] for each of the four sub-block modes when the luma mode
//     is I4X4_PRED.
// When MODE_STATS is defined, the file-level statistics tables are updated
// as well.
static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
  const MACROBLOCKD *xd = &x->e_mbd;
  const MB_PREDICTION_MODE luma_mode = xd->mode_info_context->mbmi.mode;
  const MB_PREDICTION_MODE chroma_mode = xd->mode_info_context->mbmi.uv_mode;

#ifdef MODE_STATS
  const int is_key = cpi->common.frame_type == KEY_FRAME;

  // Key frames and non-key frames keep separate chroma mode tallies.
  if (is_key)
    ++uv_modes[chroma_mode];
  else
    ++inter_uv_modes[chroma_mode];
  ++uv_modes_y[luma_mode][chroma_mode];

  if (luma_mode == I4X4_PRED) {
    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
    int blk;

    for (blk = 0; blk < 4; ++blk)
      ++bct[xd->block[blk].bmi.as_mode.first];
  }
#endif

  // Luma mode counts are split by block size.
  if (xd->mode_info_context->mbmi.sb_type <= BLOCK_SIZE_SB8X8)
    ++cpi->ymode_count[luma_mode];
  else
    ++cpi->sb_ymode_count[luma_mode];

  // The joint luma/chroma count is updated for every block size.
  ++cpi->y_uv_mode_count[luma_mode][chroma_mode];

  if (luma_mode == I4X4_PRED) {
    int blk;

    // Count the mode of each of the four sub-blocks.
    for (blk = 0; blk < 4; ++blk) {
      const int sub_mode = xd->mode_info_context->bmi[blk].as_mode.first;
      ++cpi->bmode_count[sub_mode];
    }
  }
}

// Experimental stub that derives a per-MB zbin adjustment
// (x->act_zbin_adj) from the previously computed activity measure
// referenced by x->mb_activity_ptr.
//
// With USE_ACT_INDEX the stored activity value is used directly. Otherwise
// the adjustment is the rounded ratio of two weighted sums of the MB
// activity and cpi->activity_avg: (ratio - 1) when the MB activity exceeds
// the average, (1 - inverse ratio) when it does not.
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
#if USE_ACT_INDEX
  x->act_zbin_adj = *(x->mb_activity_ptr);
#else
  const int64_t act = *(x->mb_activity_ptr);

  // Weighted sums: one favours the average, the other the MB activity.
  const int64_t avg_weighted = act + 4 * cpi->activity_avg;
  const int64_t act_weighted = 4 * act + cpi->activity_avg;

  // Adding half the divisor before dividing rounds to nearest.
  x->act_zbin_adj =
      (act > cpi->activity_avg)
          ? (int)((act_weighted + (avg_weighted >> 1)) / avg_weighted) - 1
          : 1 - (int)((avg_weighted + (act_weighted >> 1)) / act_weighted);
#endif
}

static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                              int output_enabled, int mi_row, int mi_col,
                              BLOCK_SIZE_TYPE bsize) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->mb;
  MACROBLOCKD *const xd = &x->e_mbd;
  int n;
  MODE_INFO *mi = x->e_mbd.mode_info_context;
  unsigned int segment_id = mi->mbmi.segment_id;
  const int mis = cm->mode_info_stride;
  const int bwl = mi_width_log2(bsize);
  const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);

  if (cm->frame_type == KEY_FRAME) {
    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
      adjust_act_zbin(cpi, x);
      vp9_update_zbin_extra(cpi, x);
    }
  } else {
    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);

    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
      // Adjust the zbin based on this MB rate.
      adjust_act_zbin(cpi, x);
    }

    // Experimental code. Special case for gf and arf zeromv modes.
    // Increase zbin size to suppress noise
    cpi->zbin_mode_boost = 0;
    if (cpi->zbin_mode_boost_enabled) {
      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
          else
            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
        } else {
          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
        }
      } else {
        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
      }
    }

    vp9_update_zbin_extra(cpi, x);
  }

  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
    assert(bsize == BLOCK_SIZE_SB8X8 &&
           xd->mode_info_context->mbmi.txfm_size == TX_4X4);

    vp9_encode_intra4x4mby(x, bsize);
    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
    vp9_encode_sbuv(cm, x, bsize);

    if (output_enabled)
      sum_intra_stats(cpi, x);
  } else if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
    vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);