Merge "Update quantize SSSE3 SIMD to cover 32x32 transform case also."

9df24b41 · Ronald S. Bultje · Gerrit Code Review · b7cd01ed · c8defcfd · 9df24b41
Commit 9df24b41, authored 11 years ago by Ronald S. Bultje; committed by Gerrit Code Review 11 years ago.
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -569,6 +569,9 @@ specialize vp9_subtract_block sse2
 prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64

+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
+
 #
 # Structured Similarity (SSIM)
 #

--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -85,18 +85,19 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
 }

 // This function works well for large transform size.
-static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block,
                            int16_t *zbin_ptr, int16_t *round_ptr,
                            int16_t *quant_ptr, int16_t *quant_shift_ptr,
                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                            int16_t *dequant_ptr, int zbin_oq_value,
                            uint16_t *eob_ptr, const int16_t *scan,
-                            int *idx_arr) {
+                            const int16_t *iscan) {
  int i, rc, eob;
  int zbins[2], nzbins[2], zbin;
  int x, y, z, sz;
  int idx = 0;
+  int idx_arr[1024];

  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -179,20 +180,18 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
  // Call different quantization for different transform size.
  if (n_coeffs >= 1024) {
    // Save index of picked coefficient in pre-scan pass.
-    int idx_arr[1024];
-
-    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-                    n_coeffs, mb->skip_block,
-                    mb->plane[plane].zbin,
-                    mb->plane[plane].round,
-                    mb->plane[plane].quant,
-                    mb->plane[plane].quant_shift,
-                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                    xd->plane[plane].dequant,
-                    mb->plane[plane].zbin_extra,
-                    &xd->plane[plane].eobs[block],
-                    scan, idx_arr);
+    vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                         n_coeffs, mb->skip_block,
+                         mb->plane[plane].zbin,
+                         mb->plane[plane].round,
+                         mb->plane[plane].quant,
+                         mb->plane[plane].quant_shift,
+                         BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                         BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         xd->plane[plane].dequant,
+                         mb->plane[plane].zbin_extra,
+                         &xd->plane[plane].eobs[block],
+                         scan, iscan);
  }
  else {
    vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),

--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@ pw_1: times 8 dw 1

 SECTION .text

-INIT_XMM ssse3
-cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                              shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                              eob, scan, iscan
+%macro QUANTIZE_FN 1
+cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                               shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                               eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

@@ -57,6 +57,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
@@ -77,9 +81,19 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
@@ -99,6 +113,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddw                           m6, m1                   ; m6 += round
@@ -115,8 +133,18 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
@@ -163,3 +191,8 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
  jl .blank_loop
  mov                    word [eobq], 0
  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b
+QUANTIZE_FN b_32x32