diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index a95560a557162dc04e2aa7f3f0a1c5968db37443..df9ff3bea4fd93c8dd62d5a47dab22faffdff005 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+  int i, j;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += dest_stride;
+  }
+}
+
 static void iadst4_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
   }
 }
 
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 5);
-}
-
 static void idct16_1d(int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c357ef62a570098855b8c9d9e493c641505fde6c..976b200ef6505149e583211afbabd51b28802a50 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -297,15 +297,15 @@ specialize vp9_short_idct4x4_1_add sse2
 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct4x4_add sse2
 
+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_1_add sse2
+
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2 neon
 
 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_8x8_add sse2
 
-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_8x8
-
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2
 
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index a1e14b482ab9603238ffc878d62f9f9eb5ab752c..726c83f43a69cbfe7bf7ab0380355810d66a0378 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
   { \
       __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
       d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
       dest += stride; \
   }
 
@@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+}
+
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 0217919da56b5977d4d114949022dfd6857d0ff4..70eb77f8a18c0afc4ad788fe9fda1fa67b70224c 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -93,15 +93,8 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob) {
     if (eob == 1) {
       // DC only DCT coefficient
-      int16_t in = input[0];
-      int16_t out;
-
-      // Note: the idct1 will need to be modified accordingly whenever
-      // vp9_short_idct8x8_c() is modified.
-      vp9_short_idct1_8x8_c(&in, &out);
+      vp9_short_idct8x8_1_add(input, dest, stride);
       input[0] = 0;
-
-      vp9_add_constant_residual_8x8(out, dest, stride);
     } else {
       vp9_short_idct8x8_add(input, dest, stride);
       vpx_memset(input, 0, 128);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index d16f4f606fb6c41b9690bb6e5d1883e5c8556305..4c04a1caa92bdd99aaf7757283f760e6f597bf5a 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -47,6 +47,14 @@ static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
     xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
 
+static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,
+                                        int16_t *dqcoeff, uint8_t *dest,
+                                        int stride) {
+  if (eob <= 1)
+    vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
+  else
+    vp9_short_idct8x8_add(dqcoeff, dest, stride);
+}
 
 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
@@ -533,7 +541,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
       vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_8X8:
-      vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      inverse_transform_b_8x8_add(xd, pd->eobs[block], dqcoeff,
+                                  dst, pd->dst.stride);
       break;
     case TX_4X4:
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -711,7 +720,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
-          vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+          inverse_transform_b_8x8_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
         else
           vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }
@@ -746,8 +755,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
          // case.
-          inverse_transform_b_4x4_add(xd, *eob, dqcoeff,
-                                      dst, pd->dst.stride);
+          inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
         else
           vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }