vp9_encodeframe.c

    memcpy(&seg_a, cm->above_seg_context + (mi_col >> CONFIG_SB8X8),
           sizeof(seg_a));
    memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));

    // FIXME(rbultje): this function should probably be rewritten to be
    // recursive at some point in the future.
    for (i = 0; i < 4; i++) {
      const int x_idx = (i & 1) << (1 + CONFIG_SB8X8);
      const int y_idx = (i & 2) << CONFIG_SB8X8;
      int sb32_rate = 0, sb32_dist = 0;
      int splitmodes_used = 0;
      int sb32_skip = 0;
      int j;
      ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];

      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
        continue;

      xd->sb_index = i;

      /* Function should not modify L & A contexts; save and restore on exit */
      for (p = 0; p < MAX_MB_PLANE; p++) {
        vpx_memcpy(l2 + 8 * p,
                   cm->left_context[p] +
                       (y_idx * 4 >> (CONFIG_SB8X8 +
                                      xd->plane[p].subsampling_y)),
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
        vpx_memcpy(a2 + 8 * p,
                   cm->above_context[p] +
                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
                                                 xd->plane[p].subsampling_x)),
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
      }

      /* Encode MBs in raster order within the SB */
      for (j = 0; j < 4; j++) {
        const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
        const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
        int r, d;
#if CONFIG_SB8X8
        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];

        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
#endif

        if (mi_row + y_idx_m >= cm->mi_rows ||
            mi_col + x_idx_m >= cm->mi_cols) {
          // MB lies outside frame, move on
          continue;
        }

        // Index of the MB in the SB 0..3
        xd->mb_index = j;

#if CONFIG_SB8X8
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(l3 + 4 * p,
                     cm->left_context[p] +
                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
                                          xd->plane[p].subsampling_y)),
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(a3 + 4 * p,
                     cm->above_context[p] +
                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        for (k = 0; k < 4; k++) {
          xd->b_index = k;

          // try 8x8 coding
          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
                        mi_col + x_idx_m + (k & 1),
                        tp, &r, &d, BLOCK_SIZE_SB8X8,
                        &x->sb8_context[xd->sb_index][xd->mb_index]
                                       [xd->b_index]);
          mb16_rate += r;
          mb16_dist += d;
          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
                                           [xd->b_index],
                       BLOCK_SIZE_SB8X8, 0);
          encode_superblock(cpi, tp,
                            0, mi_row + y_idx_m + (k >> 1),
                            mi_col + x_idx_m + (k & 1),
                            BLOCK_SIZE_SB8X8);
        }
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
                                          xd->plane[p].subsampling_y)),
                     l3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     a3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        // try 8x16 coding
        r2 = 0;
        d2 = 0;
        xd->b_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_SB8X16,
                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                            [xd->b_index],
                     BLOCK_SIZE_SB8X16, 0);
        encode_superblock(cpi, tp,
                          0, mi_row + y_idx_m, mi_col + x_idx_m,
                          BLOCK_SIZE_SB8X16);
        xd->b_index = 1;
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
                      tp, &r, &d, BLOCK_SIZE_SB8X16,
                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r2 += x->partition_cost[pl][PARTITION_VERT];
        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r;
          mb16_dist = d;
          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
        }
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
                                          xd->plane[p].subsampling_y)),
                     l3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     a3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        // try 16x8 coding
        r2 = 0;
        d2 = 0;
        xd->b_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_SB16X8,
                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                            [xd->b_index],
                     BLOCK_SIZE_SB16X8, 0);
        encode_superblock(cpi, tp,
                          0, mi_row + y_idx_m, mi_col + x_idx_m,
                          BLOCK_SIZE_SB16X8);
        xd->b_index = 1;
        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_SB16X8,
                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
                                        [xd->b_index]);
        r2 += r;
        d2 += d;
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r2 += x->partition_cost[pl][PARTITION_HORZ];
        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r;
          mb16_dist = d;
          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
        }
        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
                                          xd->plane[p].subsampling_y)),
                     l3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     a3 + 4 * p,
                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
        }

        // try as 16x16
        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
                      tp, &r, &d, BLOCK_SIZE_MB16X16,
                      &x->mb_context[xd->sb_index][xd->mb_index]);
        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
        r += x->partition_cost[pl][PARTITION_NONE];
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
          mb16_rate = r;
          mb16_dist = d;
          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
        }
        sb32_rate += mb16_rate;
        sb32_dist += mb16_dist;
#else
        splitmodes_used += pick_mb_mode(cpi, mi_row + y_idx_m,
                                        mi_col + x_idx_m, tp, &r, &d);
        sb32_rate += r;
        sb32_dist += d;
#endif

        // Dummy encode, do not do the tokenization
#if CONFIG_SB8X8
        encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
#else
        encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
                          mi_col + x_idx_m);
#endif
      }

      /* Restore L & A coding context to those in place on entry */
      for (p = 0; p < MAX_MB_PLANE; p++) {
        vpx_memcpy(cm->left_context[p] +
                       (y_idx * 4 >> (CONFIG_SB8X8 +
                                      xd->plane[p].subsampling_y)),
                   l2 + 8 * p,
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
        vpx_memcpy(cm->above_context[p] +
                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
                                                 xd->plane[p].subsampling_x)),
                   a2 + 8 * p,
                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
      }

      set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];

      if (cpi->sf.splitmode_breakout) {
        sb32_skip = splitmodes_used;
        sb64_skip += splitmodes_used;
      }

      // check 32x16
      if (mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols) {
        int r, d;

        xd->mb_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB32X16,
                      &x->sb32x16_context[xd->sb_index][xd->mb_index]);
        if (mi_row + y_idx + (1 << CONFIG_SB8X8) < cm->mi_rows) {
          int r2, d2;

          update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
                       BLOCK_SIZE_SB32X16, 0);
          encode_superblock(cpi, tp,
                            0, mi_row + y_idx, mi_col + x_idx,
                            BLOCK_SIZE_SB32X16);
          xd->mb_index = 1;
          pick_sb_modes(cpi, mi_row + y_idx + (1 << CONFIG_SB8X8),
                        mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
                        &x->sb32x16_context[xd->sb_index][xd->mb_index]);
          r += r2;
          d += d2;
        }

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_HORZ];

        /* is this better than MB coding? */
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB32X16;
        }

        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx * 4 >> (CONFIG_SB8X8 +
                                        xd->plane[p].subsampling_y)),
                     l2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     a2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
        }
      }

      // check 16x32
      if (mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
        int r, d;

        xd->mb_index = 0;
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB16X32,
                      &x->sb16x32_context[xd->sb_index][xd->mb_index]);
        if (mi_col + x_idx + (1 << CONFIG_SB8X8) < cm->mi_cols) {
          int r2, d2;

          update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
                       BLOCK_SIZE_SB16X32, 0);
          encode_superblock(cpi, tp,
                            0, mi_row + y_idx, mi_col + x_idx,
                            BLOCK_SIZE_SB16X32);
          xd->mb_index = 1;
          pick_sb_modes(cpi, mi_row + y_idx,
                        mi_col + x_idx + (1 << CONFIG_SB8X8),
                        tp, &r2, &d2, BLOCK_SIZE_SB16X32,
                        &x->sb16x32_context[xd->sb_index][xd->mb_index]);
          r += r2;
          d += d2;
        }

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_VERT];

        /* is this better than MB coding? */
        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB16X32;
        }

        for (p = 0; p < MAX_MB_PLANE; p++) {
          vpx_memcpy(cm->left_context[p] +
                         (y_idx * 4 >> (CONFIG_SB8X8 +
                                        xd->plane[p].subsampling_y)),
                     l2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
          vpx_memcpy(cm->above_context[p] +
                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
                                                   xd->plane[p].subsampling_x)),
                     a2 + 8 * p,
                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
        }
      }

      if (!sb32_skip &&
          mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols &&
          mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
        int r, d;

        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                      tp, &r, &d, BLOCK_SIZE_SB32X32,
                      &x->sb32_context[xd->sb_index]);

        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
        r += x->partition_cost[pl][PARTITION_NONE];

        if (RDCOST(x->rdmult, x->rddiv, r, d) <
                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
          sb32_rate = r;
          sb32_dist = d;
          sb_partitioning[i] = BLOCK_SIZE_SB32X32;
        }
      }

      // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
      if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
        ++sb64_skip;
      }

      sb64_rate += sb32_rate;
      sb64_dist += sb32_dist;

      /* Encode SB using best computed mode(s) */
      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
      // for each level that we go up, we can just keep tokens and recon
      // pixels of the lower level; also, inverting SB/MB order (big->small
      // instead of small->big) means we can use as threshold for small, which
      // may enable breakouts if RD is not good enough (i.e. faster)
      encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
#if CONFIG_SB8X8
                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
                NULL);
#else
                BLOCK_SIZE_SB32X32, sb_partitioning[i], NULL);
#endif
    }

    for (p = 0; p < MAX_MB_PLANE; p++) {
      memcpy(cm->above_context[p] +
                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
             a + 16 * p,
             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
      memcpy(cm->left_context[p], l + 16 * p,
             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
    }
    memcpy(cm->above_seg_context + (mi_col >> CONFIG_SB8X8), &seg_a,
           sizeof(seg_a));
    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));

    set_partition_seg_context(cpi, mi_row, mi_col);
    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];

    // check 64x32
    if (mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols && !(cm->mb_rows & 1)) {
      int r, d;

      xd->sb_index = 0;
      pick_sb_modes(cpi, mi_row, mi_col,
                    tp, &r, &d, BLOCK_SIZE_SB64X32,
                    &x->sb64x32_context[xd->sb_index]);
      if (mi_row + (2 << CONFIG_SB8X8) != cm->mi_rows) {
        int r2, d2;

        update_state(cpi, &x->sb64x32_context[xd->sb_index],
                     BLOCK_SIZE_SB64X32, 0);
        encode_superblock(cpi, tp,
                          0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
        xd->sb_index = 1;
        pick_sb_modes(cpi, mi_row + (2 << CONFIG_SB8X8), mi_col,
                      tp, &r2, &d2, BLOCK_SIZE_SB64X32,
                      &x->sb64x32_context[xd->sb_index]);
        r += r2;
        d += d2;
      }

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_HORZ];

      /* is this better than MB coding? */
      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB64X32;
      }

      for (p = 0; p < MAX_MB_PLANE; p++) {
        memcpy(cm->above_context[p] +
                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
               a + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
        memcpy(cm->left_context[p], l + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
      }
    }

    // check 32x64
    if (mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows && !(cm->mb_cols & 1)) {
      int r, d;

      xd->sb_index = 0;
      pick_sb_modes(cpi, mi_row, mi_col,
                    tp, &r, &d, BLOCK_SIZE_SB32X64,
                    &x->sb32x64_context[xd->sb_index]);
      if (mi_col + (2 << CONFIG_SB8X8) != cm->mi_cols) {
        int r2, d2;

        update_state(cpi, &x->sb32x64_context[xd->sb_index],
                     BLOCK_SIZE_SB32X64, 0);
        encode_superblock(cpi, tp,
                          0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
        xd->sb_index = 1;
        pick_sb_modes(cpi, mi_row, mi_col + (2 << CONFIG_SB8X8),
                      tp, &r2, &d2, BLOCK_SIZE_SB32X64,
                      &x->sb32x64_context[xd->sb_index]);
        r += r2;
        d += d2;
      }

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_VERT];

      /* is this better than MB coding? */
      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB32X64;
      }

      for (p = 0; p < MAX_MB_PLANE; p++) {
        memcpy(cm->above_context[p] +
                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
               a + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
        memcpy(cm->left_context[p], l + 16 * p,
               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
      }
    }

    if (!sb64_skip &&
        mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols &&
        mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows) {
      int r, d;

      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
                    BLOCK_SIZE_SB64X64, &x->sb64_context);

      set_partition_seg_context(cpi, mi_row, mi_col);
      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
      r += x->partition_cost[pl][PARTITION_NONE];

      if (RDCOST(x->rdmult, x->rddiv, r, d) <
              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
        sb64_rate = r;
        sb64_dist = d;
        sb64_partitioning = BLOCK_SIZE_SB64X64;
      }
    }

    assert(tp_orig == *tp);
    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
#if CONFIG_SB8X8
              sb64_partitioning, sb_partitioning, mb_partitioning);
#else
              sb64_partitioning, sb_partitioning);
#endif
    assert(tp_orig < *tp);
  }
}

static void init_encode_frame_mb_context(VP9_COMP *cpi) {
  MACROBLOCK *const x = &cpi->mb;
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;

  x->act_zbin_adj = 0;
  cpi->seg0_idx = 0;
  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));

  xd->mode_info_stride = cm->mode_info_stride;
  xd->frame_type = cm->frame_type;

  xd->frames_since_golden = cm->frames_since_golden;
  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

  // reset intra mode contexts
  if (cm->frame_type == KEY_FRAME)
    vp9_init_mbmode_probs(cm);

  // Copy data over into macro block data structures.
  vp9_setup_src_planes(x, cpi->Source, 0, 0);

  // TODO(jkoleszar): are these initializations required?
  setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
                   0, 0, NULL, NULL);
  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);

  // set up frame for intra coded blocks
  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);

  vp9_build_block_offsets(x);

  vp9_setup_block_dptrs(&x->e_mbd);

  xd->mode_info_context->mbmi.mode = DC_PRED;
  xd->mode_info_context->mbmi.uv_mode = DC_PRED;

  vp9_zero(cpi->count_mb_ref_frame_usage)
  vp9_zero(cpi->bmode_count)
  vp9_zero(cpi->ymode_count)
#if !CONFIG_SB8X8
  vp9_zero(cpi->i8x8_mode_count)
#endif
  vp9_zero(cpi->y_uv_mode_count)
  vp9_zero(cpi->sub_mv_ref_count)
#if !CONFIG_SB8X8
  vp9_zero(cpi->mbsplit_count)
#endif
  vp9_zero(cpi->common.fc.mv_ref_ct)
  vp9_zero(cpi->sb_ymode_count)
  vp9_zero(cpi->partition_count);

#if CONFIG_COMP_INTERINTRA_PRED
  vp9_zero(cpi->interintra_count);
  vp9_zero(cpi->interintra_select_count);
#endif

  // Note: this memset assumes above_context[0], [1] and [2]
  // are allocated as part of the same buffer.
  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 4 *
                                      MAX_MB_PLANE * mb_cols_aligned_to_sb(cm));
  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
                                       mb_cols_aligned_to_sb(cm));
}

static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
  if (lossless) {
    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
    cpi->mb.optimize              = 0;
    cpi->common.filter_level      = 0;
    cpi->zbin_mode_boost_enabled  = 0;
    cpi->common.txfm_mode         = ONLY_4X4;
  } else {
    cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
    cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
  }
}


static void encode_frame_internal(VP9_COMP *cpi) {
  int mi_row;
  MACROBLOCK *const x = &cpi->mb;
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  int totalrate;

//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
//           cpi->common.current_video_frame, cpi->common.show_frame,
//           cm->frame_type);

  // Compute a modified set of reference frame probabilities to use when
  // prediction fails. These are based on the current general estimates for
  // this frame which may be updated with each iteration of the recode loop.
  vp9_compute_mod_refprobs(cm);

// debug output
#if DBG_PRNT_SEGMAP
  {
    FILE *statsfile;
    statsfile = fopen("segmap2.stt", "a");
    fprintf(statsfile, "\n");
    fclose(statsfile);
  }
#endif

  totalrate = 0;

  // Reset frame count of inter 0,0 motion vector usage.
  cpi->inter_zz_count = 0;

  cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
  cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;

  vp9_zero(cpi->switchable_interp_count);
  vp9_zero(cpi->best_switchable_interp_count);

  xd->mode_info_context = cm->mi;
  xd->prev_mode_info_context = cm->prev_mi;

  vp9_zero(cpi->NMVcount);
  vp9_zero(cpi->coef_counts_4x4);
  vp9_zero(cpi->coef_counts_8x8);
  vp9_zero(cpi->coef_counts_16x16);
  vp9_zero(cpi->coef_counts_32x32);
  vp9_zero(cm->fc.eob_branch_counts);
#if CONFIG_CODE_ZEROGROUP
  vp9_zero(cm->fc.zpc_counts_4x4);
  vp9_zero(cm->fc.zpc_counts_8x8);
  vp9_zero(cm->fc.zpc_counts_16x16);
  vp9_zero(cm->fc.zpc_counts_32x32);
#endif

  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
                            cm->y_dc_delta_q == 0 &&
                            cm->uv_dc_delta_q == 0 &&
                            cm->uv_ac_delta_q == 0);
  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);

  vp9_frame_init_quantizer(cpi);

  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
  vp9_initialize_me_consts(cpi, cm->base_qindex);

  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
    // Initialize encode frame context.
    init_encode_frame_mb_context(cpi);

    // Build a frame level activity map
    build_activity_map(cpi);
  }

  // re-initencode frame context.
  init_encode_frame_mb_context(cpi);

  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
  {
    struct vpx_usec_timer  emr_timer;
    vpx_usec_timer_start(&emr_timer);

    {
      // Take tiles into account and give start/end MB
      int tile_col, tile_row;
      TOKENEXTRA *tp = cpi->tok;

      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
        vp9_get_tile_row_offsets(cm, tile_row);

        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
          TOKENEXTRA *tp_old = tp;

          // For each row of SBs in the frame
          vp9_get_tile_col_offsets(cm, tile_col);
          for (mi_row = cm->cur_tile_mi_row_start;
               mi_row < cm->cur_tile_mi_row_end;
               mi_row += (4 << CONFIG_SB8X8)) {
            encode_sb_row(cpi, mi_row, &tp, &totalrate);
          }
          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
          assert(tp - cpi->tok <=
                 get_token_alloc(cm->mb_rows, cm->mb_cols));
        }
      }
    }

    vpx_usec_timer_mark(&emr_timer);
    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
  }

  // 256 rate units to the bit,
  // projected_frame_size in units of BYTES
  cpi->projected_frame_size = totalrate >> 8;

#if 0
  // Keep record of the total distortion this time around for future use
  cpi->last_frame_distortion = cpi->frame_distortion;
#endif

}

static int check_dual_ref_flags(VP9_COMP *cpi) {
  MACROBLOCKD *xd = &cpi->mb.e_mbd;
  int ref_flags = cpi->ref_frame_flags;

  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
        vp9_check_segref(xd, 1, LAST_FRAME))
      return 1;
    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
        vp9_check_segref(xd, 1, GOLDEN_FRAME))
      return 1;
    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
        vp9_check_segref(xd, 1, ALTREF_FRAME))
      return 1;
    return 0;
  } else {
    return (!!(ref_flags & VP9_GOLD_FLAG) +
            !!(ref_flags & VP9_LAST_FLAG) +
            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
  }
}

static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
  int x, y;

  for (y = 0; y < ymbs; y++) {
    for (x = 0; x < xmbs; x++) {
      if (!mi[y * mis + x].mbmi.mb_skip_coeff)
        return 0;
    }
  }

  return 1;
}

static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
                          TX_SIZE txfm_size) {
  int x, y;

  for (y = 0; y < ymbs; y++) {
    for (x = 0; x < xmbs; x++)
      mi[y * mis + x].mbmi.txfm_size = txfm_size;
  }
}

static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
                                   int mis, TX_SIZE txfm_max,
                                   int bw, int bh, int mi_row, int mi_col,
                                   BLOCK_SIZE_TYPE bsize) {
  VP9_COMMON *const cm = &cpi->common;
  MB_MODE_INFO *const mbmi = &mi->mbmi;

  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
    return;

  if (mbmi->txfm_size > txfm_max) {
    MACROBLOCK *const x = &cpi->mb;
    MACROBLOCKD *const xd = &x->e_mbd;
    const int segment_id = mbmi->segment_id;
    const int ymbs = MIN(bh, cm->mi_rows - mi_row);
    const int xmbs = MIN(bw, cm->mi_cols - mi_col);

    xd->mode_info_context = mi;
    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
           get_skip_flag(mi, mis, ymbs, xmbs));
    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
  }
}

static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
                                    TX_SIZE txfm_max,
                                    int mi_row, int mi_col,
                                    BLOCK_SIZE_TYPE bsize) {
  VP9_COMMON *const cm = &cpi->common;
  const int mis = cm->mode_info_stride;
  int bwl, bhl;
  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);

  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
    return;

  bwl = mi_width_log2(mi->mbmi.sb_type);
  bhl = mi_height_log2(mi->mbmi.sb_type);

  if (bwl == bsl && bhl == bsl) {
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
                           mi_row, mi_col, bsize);
  } else if (bwl == bsl && bhl < bsl) {
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
                           mi_row, mi_col, bsize);
    reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
                           mi_row + bs, mi_col, bsize);
  } else if (bwl < bsl && bhl == bsl) {
    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
                           mi_row, mi_col, bsize);
    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
                           mi_row, mi_col + bs, bsize);
  } else {
    BLOCK_SIZE_TYPE subsize;
    int n;

    assert(bwl < bsl && bhl < bsl);
    if (bsize == BLOCK_SIZE_SB64X64) {
      subsize = BLOCK_SIZE_SB32X32;
#if CONFIG_SB8X8
    } else if (bsize == BLOCK_SIZE_SB32X32) {
      subsize = BLOCK_SIZE_MB16X16;
    } else {
      assert(bsize == BLOCK_SIZE_MB16X16);
      subsize = BLOCK_SIZE_SB8X8;
#else
    } else {
      assert(bsize == BLOCK_SIZE_SB32X32);
      subsize = BLOCK_SIZE_MB16X16;
#endif
    }

    for (n = 0; n < 4; n++) {
      const int y_idx = n >> 1, x_idx = n & 0x01;

      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
                              txfm_max, mi_row + y_idx * bs,
                              mi_col + x_idx * bs, subsize);
    }
  }
}

static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
  VP9_COMMON *const cm = &cpi->common;
  int mi_row, mi_col;
  const int mis = cm->mode_info_stride;
  MODE_INFO *mi, *mi_ptr = cm->mi;

  for (mi_row = 0; mi_row < cm->mi_rows;
       mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
    mi = mi_ptr;
    for (mi_col = 0; mi_col < cm->mi_cols;
         mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
      reset_skip_txfm_size_sb(cpi, mi, txfm_max,
                              mi_row, mi_col, BLOCK_SIZE_SB64X64);
    }
  }
}

void vp9_encode_frame(VP9_COMP *cpi) {
  if (cpi->sf.RD) {
    int i, frame_type, pred_type;
    TXFM_MODE txfm_type;

    /*
     * This code does a single RD pass over the whole frame assuming
     * either compound, single or hybrid prediction as per whatever has
     * worked best for that type of frame in the past.
     * It also predicts whether another coding mode would have worked
     * better that this coding mode. If that is the case, it remembers
     * that for subsequent frames.
     * It does the same analysis for transform size selection also.
     */
    if (cpi->common.frame_type == KEY_FRAME)
      frame_type = 0;
    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
      frame_type = 3;
    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
      frame_type = 1;
    else
      frame_type = 2;

    /* prediction (compound, single or hybrid) mode selection */
    if (frame_type == 3)
      pred_type = SINGLE_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][0] &&
             cpi->rd_prediction_type_threshes[frame_type][1] >
                 cpi->rd_prediction_type_threshes[frame_type][2] &&
             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
      pred_type = COMP_PREDICTION_ONLY;
    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
                 cpi->rd_prediction_type_threshes[frame_type][2])
      pred_type = SINGLE_PREDICTION_ONLY;
    else
      pred_type = HYBRID_PREDICTION;

    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */

    cpi->mb.e_mbd.lossless = 0;
    if (cpi->oxcf.lossless) {
      txfm_type = ONLY_4X4;
      cpi->mb.e_mbd.lossless = 1;
    } else
#if 0
    /* FIXME (rbultje): this code is disabled until we support cost updates
     * while a frame is being encoded; the problem is that each time we
     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
     * further behind and not being chosen for subsequent frames either. This
     * is essentially a local minimum problem that we can probably fix by
     * estimating real costs more closely within a frame, perhaps by re-
     * calculating costs on-the-fly as frame encoding progresses. */
    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = TX_MODE_SELECT;
    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
               ) {
      txfm_type = ONLY_4X4;
    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
      txfm_type = ALLOW_16X16;
    } else
      txfm_type = ALLOW_8X8;
#else
    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                    ALLOW_32X32 : TX_MODE_SELECT;
#endif
    cpi->common.txfm_mode = txfm_type;
    if (txfm_type != TX_MODE_SELECT) {
      cpi->common.prob_tx[0] = 128;
      cpi->common.prob_tx[1] = 128;
    }
    cpi->common.comp_pred_mode = pred_type;
    encode_frame_internal(cpi);

    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
      const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
    }

    for (i = 0; i < NB_TXFM_MODES; ++i) {
      int64_t pd = cpi->rd_tx_select_diff[i];
      int diff;
      if (i == TX_MODE_SELECT)
        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
                     2048 * (TX_SIZE_MAX_SB - 1), 0);
      diff = (int)(pd / cpi->common.MBs);
      cpi->rd_tx_select_threshes[frame_type][i] += diff;
      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
    }

    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
      int single_count_zero = 0;
      int comp_count_zero = 0;

      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
        single_count_zero += cpi->single_pred_count[i];
        comp_count_zero += cpi->comp_pred_count[i];
      }

      if (comp_count_zero == 0) {
        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;