diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 21a597c108c6aa116ed44d69705cc0723756f2d4..673abd7b1ee0db1ce3287a23cf5f3cd03baf9530 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -1292,3 +1292,30 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
   out = dct_const_round_shift(out * cospi_16_64);
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
+
+void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t out[32 * 32];  // intermediate 32x32 block between the two passes
+  int16_t *outptr = out;
+  const int half_pitch = pitch >> 1;  // input row stride, in int16_t elements
+  int i, j;
+  int16_t temp_in[32], temp_out[32];  // one column fed to / read from idct32_1d
+
+  /* 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the
+   * upper-left 4x4 corner: only rows 0-3 get the row pass, while rows 4-31
+   * of out[] keep the zeros written by the vpx_memset just below. */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct32_1d(input, outptr);
+    input += half_pitch;  // advance to the next input row
+    outptr += 32;  // advance to the next row of out[]
+  }
+
+  // Column pass. Note output uses a fixed stride of 32; pitch is input-only.
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j * 32 + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+  }
+}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2bd26c83e0a0a3a8ae7483ae10238708cc1053f5..9cbf44c464d2ec6f3906368ba65265764d878bdc 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -281,6 +281,9 @@ specialize vp9_short_idct32x32
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
+prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct10_32x32  # NOTE(review): no variant suffixes listed; presumably C-only -- confirm
+
 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
 
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 46e5656bd76f86f23c7a5880b34592113a4291a9..5a98b11504b836569c5c6c7dc0af054ce981efef 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -314,14 +314,34 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
   if (eob) {
     input[0] = input[0] * dq[0] / 2;
     if (eob == 1) {
-      vp9_short_idct1_32x32_c(input, output);
+      vp9_short_idct1_32x32(input, output);
       add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
       input[0] = 0;
+    } else if (eob <= 10) {
+      input[1] = input[1] * dq[1] / 2;
+      input[2] = input[2] * dq[1] / 2;
+      input[3] = input[3] * dq[1] / 2;
+      input[32] = input[32] * dq[1] / 2;
+      input[33] = input[33] * dq[1] / 2;
+      input[34] = input[34] * dq[1] / 2;
+      input[64] = input[64] * dq[1] / 2;
+      input[65] = input[65] * dq[1] / 2;
+      input[96] = input[96] * dq[1] / 2;
+
+      // the idct halves ( >> 1) the pitch
+      vp9_short_idct10_32x32(input, output, 64);
+
+      input[0] = input[1] = input[2] = input[3] = 0;
+      input[32] = input[33] = input[34] = 0;
+      input[64] = input[65] = 0;
+      input[96] = 0;
+
+      add_residual(output, pred, pitch, dest, stride, 32, 32);
     } else {
       int i;
       for (i = 1; i < 1024; i++)
         input[i] = input[i] * dq[1] / 2;
-      vp9_short_idct32x32_c(input, output, 64);
+      vp9_short_idct32x32(input, output, 64);
       vpx_memset(input, 0, 2048);
       add_residual(output, pred, pitch, dest, stride, 32, 32);
     }