From 516ea8460b325500f44ce79f55ff2e2c3bab6787 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Tue, 28 Dec 2010 14:51:46 -0500
Subject: [PATCH] Use the fast quantizer for inter mode selection

Use the fast quantizer for inter mode selection and the
regular quantizer for the rest of the encode for good quality,
speed 1.  Both performance and quality were improved.  The
quality gains will make up for the quality loss mentioned in
I9dc089007ca08129fb6c11fe7692777ebb8647b0.

Change-Id: Ia90bc9cf326a7c65d60d31fa32f6465ab6984d21
---
 vp8/encoder/arm/quantize_arm.c         |  2 +-
 vp8/encoder/block.h                    |  1 +
 vp8/encoder/encodeframe.c              | 20 ++++++++++++++++++++
 vp8/encoder/onyx_if.c                  | 13 ++++++++++---
 vp8/encoder/onyx_int.h                 |  5 +++++
 vp8/encoder/quantize.c                 |  4 ++--
 vp8/encoder/x86/x86_csystemdependent.c |  6 +++---
 7 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
index 65c6166140..225feaca63 100644
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor
 
 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
 {
-    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant);
+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
 }
 
 /*
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 90b42c35ca..bf94e508b5 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -33,6 +33,7 @@ typedef struct
 
     // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
     short *quant;
+    short *quant_fast;
     short *quant_shift;
     short *zbin;
     short *zrun_zbin_boost;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 1528389468..cb7cc65d7c 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -179,6 +179,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
     {
         // dc values
         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+        cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
                            cpi->Y1quant_shift[Q] + 0, quant_val);
         cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
@@ -187,6 +188,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
         cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+        cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
                            cpi->Y2quant_shift[Q] + 0, quant_val);
         cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
@@ -195,6 +197,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
         cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+        cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
         vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
                            cpi->UVquant_shift[Q] + 0, quant_val);
         cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
@@ -208,6 +211,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             int rc = vp8_default_zig_zag1d[i];
 
             quant_val = vp8_ac_yquant(Q);
+            cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val;
             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc,
                                cpi->Y1quant_shift[Q] + rc, quant_val);
             cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
@@ -216,6 +220,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+            cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val;
             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc,
                                cpi->Y2quant_shift[Q] + rc, quant_val);
             cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
@@ -224,6 +229,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+            cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val;
             vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc,
                                cpi->UVquant_shift[Q] + rc, quant_val);
             cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
@@ -325,6 +331,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
     for (i = 0; i < 16; i++)
     {
         x->block[i].quant = cpi->Y1quant[QIndex];
+        x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
         x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
         x->block[i].zbin = cpi->Y1zbin[QIndex];
         x->block[i].round = cpi->Y1round[QIndex];
@@ -339,6 +346,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
     for (i = 16; i < 24; i++)
     {
         x->block[i].quant = cpi->UVquant[QIndex];
+        x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
         x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
         x->block[i].zbin = cpi->UVzbin[QIndex];
         x->block[i].round = cpi->UVround[QIndex];
@@ -349,6 +357,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
 
     // Y2
     zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+    x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
     x->block[24].quant = cpi->Y2quant[QIndex];
     x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
     x->block[24].zbin = cpi->Y2zbin[QIndex];
@@ -1270,7 +1279,18 @@ int vp8cx_encode_inter_macroblock
 
     if (cpi->sf.RD)
     {
+        /* Are we using the fast quantizer for the mode selection? */
+        if(cpi->sf.use_fastquant_for_pick)
+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+
         inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+
+        /* switch back to the regular quantizer for the encode */
+        if (cpi->sf.improved_quant)
+        {
+            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        }
+
     }
     else
 #endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 1f890790c3..05a1338dca 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -591,6 +591,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     sf->max_fs_radius = 32;
     sf->iterative_sub_pixel = 1;
     sf->optimize_coefficients = 1;
+    sf->use_fastquant_for_pick = 0;
 
     sf->first_step = 0;
     sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -758,7 +759,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
             cpi->mode_check_freq[THR_SPLITG] = 4;
             cpi->mode_check_freq[THR_SPLITA] = 4;
-            cpi->mode_check_freq[THR_SPLITMV] = 0;
+            cpi->mode_check_freq[THR_SPLITMV] = 2;
 
             sf->thresh_mult[THR_TM       ] = 1500;
             sf->thresh_mult[THR_V_PRED   ] = 1500;
@@ -789,8 +790,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                 sf->thresh_mult[THR_SPLITA   ] = 20000;
             }
 
-            sf->improved_quant = 0;
-            sf->improved_dct = 0;
+            sf->use_fastquant_for_pick = 1;
 
             sf->first_step = 1;
             sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -798,6 +798,8 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
         if (Speed > 1)
         {
+            sf->use_fastquant_for_pick = 0;
+
             cpi->mode_check_freq[THR_SPLITG] = 15;
             cpi->mode_check_freq[THR_SPLITA] = 15;
             cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -831,6 +833,11 @@ void vp8_set_speed_features(VP8_COMP *cpi)
                 sf->thresh_mult[THR_SPLITA   ] = 50000;
             }
 
+            sf->first_step = 1;
+
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;
+
             // Only do recode loop on key frames, golden frames and
             // alt ref frames
             sf->recode_loop = 2;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 990ae1d9e4..ab270ca5f4 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -182,6 +182,8 @@ typedef struct
     int first_step;
     int optimize_coefficients;
 
+    int use_fastquant_for_pick;
+
 } SPEED_FEATURES;
 
 typedef struct
@@ -269,6 +271,9 @@ typedef struct
     DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
     DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);
 
 
     MACROBLOCK mb;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index a1be6614b4..a672994876 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -27,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *coeff_ptr       = b->coeff;
     short *zbin_ptr        = b->zbin;
     short *round_ptr       = b->round;
-    short *quant_ptr       = b->quant;
+    short *quant_ptr       = b->quant_fast;
     short *quant_shift_ptr = b->quant_shift;
     short *qcoeff_ptr      = d->qcoeff;
     short *dqcoeff_ptr     = d->dqcoeff;
@@ -74,7 +74,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
     int x, y, z, sz;
     short *coeff_ptr   = b->coeff;
     short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
+    short *quant_ptr   = b->quant_fast;
     short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = d->dequant;
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index d2199a4990..6e317e2a2f 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -32,7 +32,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
     short *coeff_ptr   = b->coeff;
     short *zbin_ptr    = b->zbin;
     short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
+    short *quant_ptr   = b->quant_fast;
     short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = d->dequant;
@@ -90,7 +90,7 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
     short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
     short *coeff_ptr   = b->coeff;
     short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant;
+    short *quant_ptr   = b->quant_fast;
     short *qcoeff_ptr  = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = d->dequant;
@@ -183,7 +183,7 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
                     d->qcoeff,
                     d->dequant,
                     b->round,
-                    b->quant,
+                    b->quant_fast,
                     d->dqcoeff
                );
 }
-- 
GitLab