diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 2979daf6d1197e3283db962e05c7275916a53c8d..d075443edc54d7eda69d6ebc51b999a07746c0b3 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -701,7 +701,7 @@ prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_b specialize vp9_quantize_b $ssse3_x86_64 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" -specialize vp9_quantize_b_32x32 +specialize vp9_quantize_b_32x32 $ssse3_x86_64 # # Structured Similarity (SSIM) diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 02c06855231dabf9e934d5802680f1bb7cbb74f4..fb0e4707acfbf9fc8beb65eec1ee974b4fad61c2 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -84,7 +84,6 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -// This function works well for large transform size. void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, @@ -105,8 +104,8 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, eob = -1; // Base ZBIN - zbins[0] = zbin_ptr[0] + zbin_oq_value; - zbins[1] = zbin_ptr[1] + zbin_oq_value; + zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); + zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); nzbins[0] = zbins[0] * -1; nzbins[1] = zbins[1] * -1; @@ -114,7 +113,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, // Pre-scan pass for (i = 0; i < n_coeffs; i++) { rc = scan[i]; - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. @@ -130,14 +129,14 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, // Calculate ZBIN zbin = (zbins[rc != 0]); - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; sz = (z >> 31); // sign of z x = (z ^ sz) - sz; // x = abs(z) if (x >= zbin) { - x += (round_ptr[rc != 0]); + x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * - quant_shift_ptr[rc != 0]) >> 16; // quantize (x) + quant_shift_ptr[rc != 0]) >> 15; // quantize (x) x = (y ^ sz) - sz; // get the sign back qcoeff_ptr[rc] = x; // write to destination diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm index 60f79919550c885e0868214376ac7a71b3a44245..7deb9815a3094aa3d98235e9f0eebeb797c2aa84 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -36,6 +36,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshufd m4, m4, 0 mova m2, [quantq] ; m2 = quant paddw m0, m4 ; m0 = zbin + zbin_oq +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif mova m3, [r2q] ; m3 = dequant psubw m0, [pw_1] mov r2, shiftmp @@ -43,6 +51,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob lea coeffq, [ coeffq+ncoeffq*2] @@ -56,10 +67,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin @@ -112,10 +119,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 @@ -164,6 +167,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop + %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: