From 5780c4cbd592c92da567609f737dbf823b055cd6 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Mon, 4 Feb 2013 16:49:17 -0800
Subject: [PATCH] Added vp9_short_idct1_32x32_c

The new function is called from vp9_dequant_idct_add_32x32_c when
eob == 1.  For the test clip used, the decoder performance improved
by 21+%.  Based on Yaowu's 16-point idct work.

Change-Id: Ib579a90fed531d45777980e04bf0c9b23c093c43
---
 vp9/common/vp9_idctllm.c     | 10 ++++++++++
 vp9/common/vp9_rtcd_defs.sh  |  3 +++
 vp9/decoder/vp9_dequantize.c | 23 ++++++++++++++++-------
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 7dd2776f65..85f8fd7db2 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -1644,6 +1644,16 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
+void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
+  int tmp;      // holds each product before its rounding shift
+  int16_t out;  // 16-bit result of each rounding shift
+  tmp = input[0] * cospi_16_64;  // only input[0] is read
+  out = dct_const_round_shift(tmp);
+  tmp = out * cospi_16_64;  // same scale again (2nd dimension? TODO confirm)
+  out = dct_const_round_shift(tmp);
+  *output = (out + 32) >> 6;  // add 32, shift by 6; writes output[0] only
+}
+
 #else  // !CONFIG_DWTDCTHYBRID
 
 #if DWT_TYPE == 53
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 7822ee8572..8f66e06c83 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -408,6 +408,9 @@ specialize vp9_short_idct1_16x16
 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct32x32
 
+prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
+specialize vp9_short_idct1_32x32
+
 prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm
 
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index e46be3ac7e..18d4e59c71 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -349,13 +349,22 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
   int i;
 
   if (eob) {
-    input[0]= input[0] * dq[0] / 2;
-    for (i = 1; i < 1024; i++)
-      input[i] = input[i] * dq[1] / 2;
-    vp9_short_idct32x32_c(input, output, 64);
-    vpx_memset(input, 0, 2048);
-
-    add_residual(output, pred, pitch, dest, stride, 32, 32);
+    input[0] = input[0] * dq[0] / 2;
+#if !CONFIG_DWTDCTHYBRID
+    if (eob == 1) {
+      vp9_short_idct1_32x32_c(input, output);
+      add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
+      input[0] = 0;
+    } else {
+#endif
+      for (i = 1; i < 1024; i++)
+        input[i] = input[i] * dq[1] / 2;
+      vp9_short_idct32x32_c(input, output, 64);
+      vpx_memset(input, 0, 2048);
+      add_residual(output, pred, pitch, dest, stride, 32, 32);
+#if !CONFIG_DWTDCTHYBRID
+    }
+#endif
   }
 }
 
-- 
GitLab