From 118ccdcd309c6bdaa967ffe1e13ba6df8aad64db Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rbultje@google.com>
Date: Wed, 24 Jul 2013 15:13:58 -0700
Subject: [PATCH] Invert dimension order in token_cost array.

This allows us to increment the position at the band level only when
we go from one band to the next. More importantly, it allows us to
use an add instead of a multiply instruction, and to omit the
instruction altogether if the band doesn't change from one coefficient
to the next. The result is slightly faster code (probably more
noticeable on systems where a multiply is expensive, such as ARM).

Change-Id: I4343fe35b9f9a47fa00b217bdcbf5f91ff96c381
---
 vp9/encoder/vp9_block.h    |  7 ++++++-
 vp9/encoder/vp9_encodemb.c | 16 +++++++--------
 vp9/encoder/vp9_rdopt.c    | 41 ++++++++++++++++++++++----------------
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 4b49b17a29..a171b7971c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -72,6 +72,11 @@ struct macroblock_plane {
   int16_t zbin_extra;
 };
 
+/* The [2] dimension is for whether we skip the EOB node (i.e. if the previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int vp9_coeff_cost[BLOCK_TYPES][REF_TYPES][COEF_BANDS][2]
+                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -133,7 +138,7 @@ struct macroblock {
   unsigned char *active_ptr;
 
   // note that token_costs is the cost when eob node is skipped
-  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2];
+  vp9_coeff_cost token_costs[TX_SIZE_MAX_SB];
 
   int optimize;
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4c04a1caa9..e3f1f1f475 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -222,10 +222,10 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
         band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
         rate0 +=
-          mb->token_costs[tx_size][type][ref][0][band][pt]
+          mb->token_costs[tx_size][type][ref][band][0][pt]
                          [tokens[next][0].token];
         rate1 +=
-          mb->token_costs[tx_size][type][ref][0][band][pt]
+          mb->token_costs[tx_size][type][ref][band][0][pt]
                          [tokens[next][1].token];
       }
       UPDATE_RD_COST();
@@ -273,12 +273,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
         band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-          rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                   [tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
-          rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
+          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                   [tokens[next][1].token];
         }
       }
@@ -311,12 +311,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
       /* Update the cost of each path if we're past the EOB token. */
       if (t0 != DCT_EOB_TOKEN) {
         tokens[next][0].rate +=
-            mb->token_costs[tx_size][type][ref][1][band][0][t0];
+            mb->token_costs[tx_size][type][ref][band][1][0][t0];
         tokens[next][0].token = ZERO_TOKEN;
       }
       if (t1 != DCT_EOB_TOKEN) {
         tokens[next][1].rate +=
-            mb->token_costs[tx_size][type][ref][1][band][0][t1];
+            mb->token_costs[tx_size][type][ref][band][1][0][t1];
         tokens[next][1].token = ZERO_TOKEN;
       }
       best_index[i][0] = best_index[i][1] = 0;
@@ -333,8 +333,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1];
+  rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
+  rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = i0 - 1;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1db4c5f770..2cedcebf4e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -109,7 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
 #define MAX_RD_THRESH_FREQ_FACT 32
 #define MAX_RD_THRESH_FREQ_INC 1
 
-static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
+static void fill_token_costs(vp9_coeff_cost *c,
                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
@@ -120,12 +120,12 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
             vp9_prob probs[ENTROPY_NODES];
             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
-            vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs,
+            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp9_coef_tree);
-            vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs,
+            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                  vp9_coef_tree);
-            assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] ==
-                   c[t][i][j][1][k][l][DCT_EOB_TOKEN]);
+            assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
+                   c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
           }
 }
 
@@ -513,11 +513,16 @@ int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
   return error;
 }
 
+/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+ * decide whether to include the cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
+ * was non-zero). */
 static const int16_t band_counts[TX_SIZE_MAX_SB][8] = {
-  { 1, 2, 3, 4,  3,   16 - 13 },
-  { 1, 2, 3, 4, 11,   64 - 21 },
-  { 1, 2, 3, 4, 11,  256 - 21 },
-  { 1, 2, 3, 4, 11, 1024 - 21 },
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
 
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
@@ -528,11 +533,11 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   MACROBLOCKD *const xd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt, c, cost;
-  const int16_t *band_count = band_counts[tx_size];
+  const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = xd->plane[plane].eobs[block];
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
-  unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+  unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS]
                     [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
   ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
   uint8_t token_cache[1024];
@@ -552,13 +557,14 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
     cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
     c = 0;
   } else {
-    int v, prev_t, band = 1, band_left = band_count[1];
+    int v, prev_t, band_left = *band_count++;
 
     // dc token
     v = qcoeff_ptr[0];
     prev_t = vp9_dct_value_tokens_ptr[v].token;
-    cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
     token_cache[0] = vp9_pt_energy_class[prev_t];
+    ++token_costs;
 
     // ac tokens
     for (c = 1; c < eob; c++) {
@@ -568,18 +574,19 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
       v = qcoeff_ptr[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
       pt = get_coef_context(nb, token_cache, c);
-      cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v];
+      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
       token_cache[rc] = vp9_pt_energy_class[t];
       prev_t = t;
       if (!--band_left) {
-        band_left = band_count[++band];
+        band_left = *band_count++;
+        ++token_costs;
       }
     }
 
     // eob token
-    if (band < 6) {
+    if (band_left) {
       pt = get_coef_context(nb, token_cache, c);
-      cost += token_costs[0][band][pt][DCT_EOB_TOKEN];
+      cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
     }
   }
 
-- 
GitLab