diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c
index 4521bfc3159b75d8ff26cca60145edd16c762675..bfceab16c0f17ce3389553ad10276f84c0d7d10b 100644
--- a/vp8/encoder/arm/csystemdependent.c
+++ b/vp8/encoder/arm/csystemdependent.c
@@ -63,7 +63,7 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
     cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;
 
     cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+    /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;*/
 #elif HAVE_ARMV6
     cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
     cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index b55bc51cb177dcb44e97277c75e7176e1fc362ca..19d307d26f090fb141b3feb8ca4ceaa205a298ca 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -33,6 +33,7 @@ typedef struct
 
     // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
     short(*quant)[4];
+    short(*quant_shift)[4];
     short(*zbin)[4];
     short(*zrun_zbin_boost);
     short(*round)[4];
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 32cef1db1e1212726747a629f13b6897a73ef78e..a05b33268dec5d64149ef9aeb9748832e3b54a8d 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -103,6 +103,18 @@ static const int qzbin_factors[129] =
     80,
 };
 
+static void vp8cx_invert_quant(short *quant, short *shift, short d)
+{   /* Derive a multiplier (*quant) and shift (*shift) from divisor d. */
+    unsigned t;                     /* holds d, then the scaled reciprocal */
+    int l;                          /* shift count computed from d */
+    t = d;
+    for(l = 0; t > 1; l++)          /* l = halvings needed to bring t to <= 1 */
+        t>>=1;
+    t = 1 + (1<<(16+l))/d;          /* 2^(16+l)/d, plus 1; requires d != 0 */
+    *quant = (short)(t - (1<<16));  /* t minus 2^16, narrowed to short */
+    *shift = l;
+}
+
 void vp8cx_init_quantizer(VP8_COMP *cpi)
 {
     int r, c;
@@ -116,21 +128,24 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
     {
         // dc values
         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->Y1quant[Q][0] + 0,
+                           cpi->Y1quant_shift[Q][0] + 0, quant_val);
         cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
         cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
         cpi->common.Y1dequant[Q][0][0] = quant_val;
         cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->Y2quant[Q][0] + 0,
+                           cpi->Y2quant_shift[Q][0] + 0, quant_val);
         cpi->Y2zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
         cpi->Y2round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
         cpi->common.Y2dequant[Q][0][0] = quant_val;
         cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
         quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        cpi->UVquant[Q][0][0] = (1 << 16) / quant_val;
+        vp8cx_invert_quant(cpi->UVquant[Q][0] + 0,
+                           cpi->UVquant_shift[Q][0] + 0, quant_val);
         cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
         cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
         cpi->common.UVdequant[Q][0][0] = quant_val;
@@ -144,21 +159,24 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
             c = (rc & 3);
 
             quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->Y1quant[Q][r] + c,
+                               cpi->Y1quant_shift[Q][r] + c, quant_val);
             cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
             cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
             cpi->common.Y1dequant[Q][r][c] = quant_val;
             cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->Y2quant[Q][r] + c,
+                               cpi->Y2quant_shift[Q][r] + c, quant_val);
             cpi->Y2zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
             cpi->Y2round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
             cpi->common.Y2dequant[Q][r][c] = quant_val;
             cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
 
             quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant[Q][r][c] = (1 << 16) / quant_val;
+            vp8cx_invert_quant(cpi->UVquant[Q][r] + c,
+                               cpi->UVquant_shift[Q][r] + c, quant_val);
             cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
             cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
             cpi->common.UVdequant[Q][r][c] = quant_val;
@@ -198,6 +216,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
     for (i = 0; i < 16; i++)
     {
         x->block[i].quant = cpi->Y1quant[QIndex];
+        x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
         x->block[i].zbin = cpi->Y1zbin[QIndex];
         x->block[i].round = cpi->Y1round[QIndex];
         x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
@@ -211,6 +230,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
     for (i = 16; i < 24; i++)
     {
         x->block[i].quant = cpi->UVquant[QIndex];
+        x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
         x->block[i].zbin = cpi->UVzbin[QIndex];
         x->block[i].round = cpi->UVround[QIndex];
         x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
@@ -221,6 +241,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
     // Y2
     zbin_extra = (cpi->common.Y2dequant[QIndex][0][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
     x->block[24].quant = cpi->Y2quant[QIndex];
+    x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
     x->block[24].zbin = cpi->Y2zbin[QIndex];
     x->block[24].round = cpi->Y2round[QIndex];
     x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index dd98a09d19a048448a66a0249264bad79aa577e8..54646f421bbb79ec7c4cc0dd76c891f69ae87160 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -286,6 +286,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     for (i = 0; i < 25; i++)
     {
         z->block[i].quant           = x->block[i].quant;
+        z->block[i].quant_shift     = x->block[i].quant_shift;
         z->block[i].zbin            = x->block[i].zbin;
         z->block[i].zrun_zbin_boost   = x->block[i].zrun_zbin_boost;
         z->block[i].round           = x->block[i].round;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index fcde2205db6ee170a1a771b8c4e4b62908c08be8..f76d2efcdf463938165f590f032520441de61138 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -234,14 +234,17 @@ typedef struct
 {
 
     DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][4][4]);
 
     DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][4][4]);
 
     DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][4][4]);
     DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][4][4]);
 
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 181870c1130a8119e85133c48f0c89a662537ebb..877002b08cf5c60778fe1ee24621e79f0b035734 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -25,6 +25,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
     short *zbin_ptr   = &b->zbin[0][0];
     short *round_ptr  = &b->round[0][0];
     short *quant_ptr  = &b->quant[0][0];
+    short *quant_shift_ptr = &b->quant_shift[0][0];
     short *qcoeff_ptr = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = &d->dequant[0][0];
@@ -45,7 +46,9 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 
         if (x >= zbin)
         {
-            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+            x += round_ptr[rc];
+            y  = (((x * quant_ptr[rc]) >> 16) + x)
+                 >> quant_shift_ptr[rc];                // quantize (x)
             x  = (y ^ sz) - sz;                         // get the sign back
             qcoeff_ptr[rc] = x;                          // write to destination
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
@@ -69,6 +72,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
     short *zbin_ptr   = &b->zbin[0][0];
     short *round_ptr  = &b->round[0][0];
     short *quant_ptr  = &b->quant[0][0];
+    short *quant_shift_ptr = &b->quant_shift[0][0];
     short *qcoeff_ptr = d->qcoeff;
     short *dqcoeff_ptr = d->dqcoeff;
     short *dequant_ptr = &d->dequant[0][0];
@@ -95,7 +99,9 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 
         if (x >= zbin)
         {
-            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+            x += round_ptr[rc];
+            y  = (((x * quant_ptr[rc]) >> 16) + x)
+                 >> quant_shift_ptr[rc];                // quantize (x)
             x  = (y ^ sz) - sz;                         // get the sign back
             qcoeff_ptr[rc]  = x;                         // write to destination
             dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 11ef4197b4987174bfeed9383c56b1c2263e6af3..be226e040fccd11c1669fd84a9020775e7339738 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -238,7 +238,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;
 
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;
+        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/
     }
 
 #endif
@@ -285,8 +285,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
         /* cpi->rtcd.encodemb.sub* not implemented for wmt */
 
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse;
-        cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;
+        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse;
+        cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
     }
 
 #endif