From a7c4de22e106f005a21efd0e73f3e5ff31d8152e Mon Sep 17 00:00:00 2001
From: Jingning Han <jingning@google.com>
Date: Mon, 29 Jul 2013 13:55:22 -0700
Subject: [PATCH] 16x16 inverse 2D-DCT with DC only

This commit adds special handling for the 16x16 inverse 2D-DCT when
only the DC coefficient is quantized to a non-zero value.

Change-Id: I7bf71be7fa13384fab453dc8742b5b50e77a277c
---
 vp9/common/vp9_idct.c                 | 12 ++++++++--
 vp9/common/vp9_rtcd_defs.sh           |  6 ++---
 vp9/common/x86/vp9_idct_intrin_sse2.c | 32 +++++++++++++++++++++++++++
 vp9/decoder/vp9_idct_blk.c            |  8 +------
 vp9/encoder/vp9_encodemb.c            |  4 +++-
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index df9ff3bea4..38fec3e47c 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -864,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
   }
 }
 
-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {  /* DC-only inverse DCT */
+  int i, j;  /* i: column within a row, j: row */
+  int a1;  /* constant residual added to every pixel of the block */
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 6);
+  a1 = ROUND_POWER_OF_TWO(out, 6);  /* derived from input[0] alone */
+  for (j = 0; j < 16; ++j) {  /* 16 rows */
+    for (i = 0; i < 16; ++i)  /* 16 pixels per row */
+      dest[i] = clip_pixel(dest[i] + a1);  /* in place; presumably clamps */
+    dest += dest_stride;  /* step to the next row */
+  }
 }
 
 static void idct32_1d(int16_t *input, int16_t *output) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 18da330ef5..f004d1c59f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -306,15 +306,15 @@ specialize vp9_short_idct8x8_add sse2 neon
 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_8x8_add sse2
 
+prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_1_add sse2
+
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2
 
 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_16x16_add sse2
 
-prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_16x16
-
 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct32x32_add sse2
 
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 726c83f43a..e175fd2bef 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1470,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;  /* DC residual replicated in every 16-bit lane */
+  const __m128i zero = _mm_setzero_si128();  /* presumably used by the macro */
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);  /* reads input[0] only */
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);  /* same steps as vp9_short_idct16x16_1_add_c */
+
+  dc_value = _mm_set1_epi16(a);  /* broadcast to all eight lanes */
+
+  for (i = 0; i < 2; ++i) {  /* two passes, 8 bytes apart (see loop end) */
+    RECON_AND_STORE(dest, dc_value);  /* x16: assumed to do one row per call */
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    dest += 8 - (stride * 16);  /* back 16 rows, right 8 bytes */
+  }
+}
+
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
   array_transpose_8x8(res0, res0);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 42b805f8ec..395e636b84 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -123,14 +123,8 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob) {
     if (eob == 1) {
       /* DC only DCT coefficient. */
-      int16_t in = input[0];
-      int16_t out;
-      /* Note: the idct1 will need to be modified accordingly whenever
-       * vp9_short_idct16x16() is modified. */
-      vp9_short_idct1_16x16_c(&in, &out);
+      vp9_short_idct16x16_1_add(input, dest, stride);
       input[0] = 0;
-
-      vp9_add_constant_residual_16x16(out, dest, stride);
     } else if (eob <= 10) {
       vp9_short_idct10_16x16_add(input, dest, stride);
       vpx_memset(input, 0, 512);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3597e73e63..a92ecf2270 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -61,7 +61,9 @@ static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,
 static void inverse_transform_b_16x16_add(MACROBLOCKD *xd, int eob,
                                           int16_t *dqcoeff, uint8_t *dest,
                                           int stride) {
-  if (eob <= 10)
+  if (eob <= 1)
+    vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
+  else if (eob <= 10)
     vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
   else
     vp9_short_idct16x16_add(dqcoeff, dest, stride);
-- 
GitLab