diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index df9ff3bea4fd93c8dd62d5a47dab22faffdff005..38fec3e47c6fb6508eb562f4849272cb7d2cc364 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -864,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int i, j; + int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } } static void idct32_1d(int16_t *input, int16_t *output) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 18da330ef53b7061571f4d444d5d53dd9b91b4ab..f004d1c59f825c28deae320d6377e64c6ab673ab 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -306,15 +306,15 @@ specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 +prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_1_add sse2 + prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_16x16_add sse2 -prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_16x16 - prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 726c83f43a69cbfe7bf7ab0380355810d66a0378..e175fd2bef37f81e3dab22e95c829c562c7e31e8 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -1470,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } +void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 16); + } +} + static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; array_transpose_8x8(res0, res0); diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 42b805f8ec0791dd269b2ff2ec1f1b4fe79a6bb7..395e636b8477022faae3f0389257cd618cf0cb37 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -123,14 +123,8 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { if (eob) { if (eob == 1) { /* DC only DCT coefficient. */ - int16_t in = input[0]; - int16_t out; - /* Note: the idct1 will need to be modified accordingly whenever - * vp9_short_idct16x16() is modified. */ - vp9_short_idct1_16x16_c(&in, &out); + vp9_short_idct16x16_1_add(input, dest, stride); input[0] = 0; - - vp9_add_constant_residual_16x16(out, dest, stride); } else if (eob <= 10) { vp9_short_idct10_16x16_add(input, dest, stride); vpx_memset(input, 0, 512); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3597e73e630756b4a4f9db3562baab8d4d016ec7..a92ecf22703abf45e8f12ae6a408f0289202d3a3 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -61,7 +61,9 @@ static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob, static void inverse_transform_b_16x16_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff, uint8_t *dest, int stride) { - if (eob <= 10) + if (eob <= 1) + vp9_short_idct16x16_1_add(dqcoeff, dest, stride); + else if (eob <= 10) vp9_short_idct10_16x16_add(dqcoeff, dest, stride); else vp9_short_idct16x16_add(dqcoeff, dest, stride);