diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index a95560a557162dc04e2aa7f3f0a1c5968db37443..df9ff3bea4fd93c8dd62d5a47dab22faffdff005 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+  int i, j;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += dest_stride;
+  }
+}
+
 static void iadst4_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
   }
 }
 
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 5);
-}
-
 static void idct16_1d(int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c357ef62a570098855b8c9d9e493c641505fde6c..976b200ef6505149e583211afbabd51b28802a50 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -297,15 +297,15 @@ specialize vp9_short_idct4x4_1_add sse2
 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct4x4_add sse2
 
+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_1_add sse2
+
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2 neon
 
 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_8x8_add sse2
 
-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_8x8
-
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2
 
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index a1e14b482ab9603238ffc878d62f9f9eb5ab752c..726c83f43a69cbfe7bf7ab0380355810d66a0378 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
   {                                                     \
      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
       d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
       dest += stride; \
   }
 
@@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+}
+
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 0217919da56b5977d4d114949022dfd6857d0ff4..70eb77f8a18c0afc4ad788fe9fda1fa67b70224c 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -93,15 +93,8 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob) {
     if (eob == 1) {
       // DC only DCT coefficient
-      int16_t in = input[0];
-      int16_t out;
-
-      // Note: the idct1 will need to be modified accordingly whenever
-      // vp9_short_idct8x8_c() is modified.
-      vp9_short_idct1_8x8_c(&in, &out);
+      vp9_short_idct8x8_1_add(input, dest, stride);
       input[0] = 0;
-
-      vp9_add_constant_residual_8x8(out, dest, stride);
     } else {
       vp9_short_idct8x8_add(input, dest, stride);
       vpx_memset(input, 0, 128);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d16f4f606fb6c41b9690bb6e5d1883e5c8556305..4c04a1caa92bdd99aaf7757283f760e6f597bf5a 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -47,6 +47,14 @@ static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
     xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
 
+static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,
+                                        int16_t *dqcoeff, uint8_t *dest,
+                                        int stride) {
+  if (eob <= 1)
+    vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
+  else
+    vp9_short_idct8x8_add(dqcoeff, dest, stride);
+}
 
 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
@@ -533,7 +541,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_8X8:
-      vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      inverse_transform_b_8x8_add(xd, pd->eobs[block], dqcoeff,
+                                  dst, pd->dst.stride);
       break;
     case TX_4X4:
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -711,7 +720,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
-          vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+          inverse_transform_b_8x8_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
         else
           vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }
@@ -746,8 +755,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          inverse_transform_b_4x4_add(xd, *eob, dqcoeff,
-                                      dst, pd->dst.stride);
+          inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
         else
           vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }