Commit 0084e61d authored by Jingning Han
Browse files

Tune the rounding operations in 8x8 ADST/DCT sse2

Improve the round-trip precision to meet the unit test settings.

Change-Id: I303febae56b4b990ea3798b8ebed94c0510ecf79
parent cd6932db
...@@ -33,7 +33,13 @@ void idct8x8_add(int16_t *in, int16_t *out, uint8_t *dst, ...@@ -33,7 +33,13 @@ void idct8x8_add(int16_t *in, int16_t *out, uint8_t *dst,
vp9_short_idct8x8_add_c(out, dst, stride >> 1); vp9_short_idct8x8_add_c(out, dst, stride >> 1);
} }
void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) { void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
// TODO(jingning): need to refactor this to test both _c and _sse2 functions,
// when we have all inverse dct functions done sse2.
#if HAVE_SSE2
vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
#else
vp9_short_fht8x8_c(in, out, stride >> 1, tx_type); vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
#endif
} }
void iht8x8_add(int16_t *in, int16_t *out, uint8_t *dst, void iht8x8_add(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) { int stride, int tx_type) {
......
...@@ -397,6 +397,24 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) { ...@@ -397,6 +397,24 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
// write 8x8 array // write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) { static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
__m128i sign0 = _mm_srai_epi16(res[0], 15);
__m128i sign1 = _mm_srai_epi16(res[1], 15);
__m128i sign2 = _mm_srai_epi16(res[2], 15);
__m128i sign3 = _mm_srai_epi16(res[3], 15);
__m128i sign4 = _mm_srai_epi16(res[4], 15);
__m128i sign5 = _mm_srai_epi16(res[5], 15);
__m128i sign6 = _mm_srai_epi16(res[6], 15);
__m128i sign7 = _mm_srai_epi16(res[7], 15);
res[0] = _mm_sub_epi16(res[0], sign0);
res[1] = _mm_sub_epi16(res[1], sign1);
res[2] = _mm_sub_epi16(res[2], sign2);
res[3] = _mm_sub_epi16(res[3], sign3);
res[4] = _mm_sub_epi16(res[4], sign4);
res[5] = _mm_sub_epi16(res[5], sign5);
res[6] = _mm_sub_epi16(res[6], sign6);
res[7] = _mm_sub_epi16(res[7], sign7);
res[0] = _mm_srai_epi16(res[0], 1); res[0] = _mm_srai_epi16(res[0], 1);
res[1] = _mm_srai_epi16(res[1], 1); res[1] = _mm_srai_epi16(res[1], 1);
res[2] = _mm_srai_epi16(res[2], 1); res[2] = _mm_srai_epi16(res[2], 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment