diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index e1b2a07b8193c30ca7dce04e5897336a87faa0f7..90b4ecd64ca6f1668247171b12b8f7666d3488b0 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -16,6 +16,7 @@ extern "C" { #include "vp9_rtcd.h" +void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch); } #include "acm_random.h" @@ -100,11 +101,15 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[64]; int16_t test_temp_block[64]; - int16_t test_output_block[64]; + uint8_t dst[64], src[64]; + for (int j = 0; j < 64; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 64; ++j) - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; const int pitch = 16; vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); @@ -119,10 +124,10 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { test_temp_block[j] *= 4; } } - vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch); + vp9_short_idct8x8_add_c(test_temp_block, dst, 8); for (int j = 0; j < 64; ++j) { - const int diff = test_input_block[j] - test_output_block[j]; + const int diff = dst[j] - src[j]; const int error = diff * diff; if (max_error < error) max_error = error; @@ -145,18 +150,22 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) { for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[64]; int16_t test_temp_block[64]; - int16_t test_output_block[64]; + uint8_t dst[64], src[64]; - // Initialize a test block with input range {-255, 255}. + for (int j = 0; j < 64; ++j) { + src[j] = rnd.Rand8() % 2 ? 255 : 0; + dst[j] = src[j] > 0 ? 0 : 255; + } + // Initialize a test block with input range {-255, 255}. for (int j = 0; j < 64; ++j) - test_input_block[j] = rnd.Rand8() % 2 ? 
255 : -256; + test_input_block[j] = src[j] - dst[j]; const int pitch = 16; vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); - vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch); + vp9_short_idct8x8_add_c(test_temp_block, dst, 8); for (int j = 0; j < 64; ++j) { - const int diff = test_input_block[j] - test_output_block[j]; + const int diff = dst[j] - src[j]; const int error = diff * diff; if (max_error < error) max_error = error; diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc index 30a1ac3998006df794b38cd235e076b2081c8621..67db78b4617a57a75f03a8eb48de8ca7008eca6f 100644 --- a/test/idct8x8_test.cc +++ b/test/idct8x8_test.cc @@ -112,20 +112,23 @@ TEST(VP9Idct8x8Test, AccuracyCheck) { const int count_test_block = 10000; for (int i = 0; i < count_test_block; ++i) { int16_t input[64], coeff[64]; - int16_t output_c[64]; double output_r[64]; + uint8_t dst[64], src[64]; + for (int j = 0; j < 64; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } // Initialize a test block with input range [-255, 255]. 
for (int j = 0; j < 64; ++j) - input[j] = rnd.Rand8() - rnd.Rand8(); + input[j] = src[j] - dst[j]; - const int pitch = 16; reference_dct_2d(input, output_r); for (int j = 0; j < 64; ++j) coeff[j] = round(output_r[j]); - vp9_short_idct8x8_c(coeff, output_c, pitch); + vp9_short_idct8x8_add_c(coeff, dst, 8); for (int j = 0; j < 64; ++j) { - const int diff = output_c[j] -input[j]; + const int diff = dst[j] - src[j]; const int error = diff * diff; EXPECT_GE(1, error) << "Error: 8x8 FDCT/IDCT has error " << error diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index b166fcbba3b6383e5b63eece97a87baedc7f506d..2ff7696f8140241e1f4a3426f67944e388ffbdb5 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -219,27 +219,27 @@ static void idct8_1d(int16_t *input, int16_t *output) { output[7] = step1[0] - step1[7]; } -void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[8 * 8]; int16_t *outptr = out; - const int half_pitch = pitch >> 1; int i, j; int16_t temp_in[8], temp_out[8]; - // Rows + // First transform rows for (i = 0; i < 8; ++i) { idct8_1d(input, outptr); input += 8; outptr += 8; } - // Columns + // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); } } @@ -400,8 +400,8 @@ static const transform_2d IHT_8[] = { { iadst8_1d, iadst8_1d } // ADST_ADST = 3 }; -void vp9_short_iht8x8_c(int16_t *input, int16_t *output, - int pitch, int tx_type) { +void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, + int tx_type) { int i, j; int16_t out[8 * 8]; int16_t *outptr = out; @@ -421,14 +421,14 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output, 
temp_in[j] = out[j * 8 + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) - output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); - } + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); } } -void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t out[8 * 8]; int16_t *outptr = out; - const int half_pitch = pitch >> 1; int i, j; int16_t temp_in[8], temp_out[8]; @@ -447,7 +447,8 @@ void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) { temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); } } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index ea60fbb10f3f968b8df51e42a2fbe808b1844560..5ecb0af44cc0c565c74dd65299c7c9e578db1127 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -88,9 +88,6 @@ if [ "$CONFIG_VP9_DECODER" = "yes" ]; then prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride" specialize vp9_add_residual_4x4 sse2 -prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride" -specialize vp9_add_residual_8x8 sse2 - prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" specialize vp9_add_constant_residual_8x8 sse2 @@ -188,11 +185,11 @@ specialize vp9_short_idct4x4_1 prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch" specialize vp9_short_idct4x4 sse2 -prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct8x8 sse2 +prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_add sse2 -prototype 
void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct10_8x8 sse2 +prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct10_8x8_add sse2 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" specialize vp9_short_idct1_8x8 @@ -215,8 +212,8 @@ specialize vp9_short_idct1_32x32 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_32x32_add -prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" -specialize vp9_short_iht8x8 +prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_short_iht8x8_add prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht4x4 diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 667da33696cc95c73613aaba2b537393514299b9..ab8604c75e05ce247a45e672b206349f8a90c44f 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -403,8 +403,18 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { in6 = _mm_subs_epi16(stp1_1, stp1_6); \ in7 = _mm_subs_epi16(stp1_0, stp2_7); -void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { - const int half_pitch = pitch >> 1; +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + in_x = _mm_add_epi16(in_x, d0); \ + in_x = _mm_packus_epi16(in_x, in_x); \ + _mm_storel_epi64((__m128i *)(dest), in_x); \ + dest += stride; \ + } + +void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); const __m128i stg1_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); @@ -461,19 +471,17 @@ void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { in6 = _mm_srai_epi16(in6, 5); in7 = _mm_srai_epi16(in7, 5); - // Store results - _mm_store_si128((__m128i *)output, in0); - _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); - _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); - _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); - _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); - _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); - _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); - _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); } -void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { - const int half_pitch = pitch >> 1; +void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -612,15 +620,14 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { in6 = _mm_srai_epi16(in6, 5); in7 = _mm_srai_epi16(in7, 5); - // Store results - _mm_store_si128((__m128i *)output, in0); - _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); - _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); - _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); - _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); - _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); - _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); - _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, 
in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); } #define IDCT16x16_1D \ @@ -752,16 +759,6 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { stp2_10, stp2_13, stp2_11, stp2_12) \ } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - in_x = _mm_add_epi16(in_x, d0); \ - in_x = _mm_packus_epi16(in_x, in_x); \ - _mm_storel_epi64((__m128i *)(dest), in_x); \ - dest += stride; \ - } - void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index bc943fa85dd30823e622338ab9fc7c6f425de429..10b585b3ffdb6bc9dd5f3405653f52ffeee01592 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -101,10 +101,6 @@ void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) { add_residual(diff, dest, stride, 4, 4); } -void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) { - add_residual(diff, dest, stride, 8, 8); -} - static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, int width, int height) { int r, c; @@ -151,11 +147,8 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, vp9_idct_add_8x8(input, dest, stride, eob); } else { if (eob > 0) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64); - - vp9_short_iht8x8(input, output, 8, tx_type); + vp9_short_iht8x8_add(input, dest, stride, tx_type); vpx_memset(input, 0, 128); - vp9_add_residual_8x8(output, dest, stride); } } } @@ -210,8 +203,6 @@ void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest, } void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { - 
DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64); - // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -233,20 +224,15 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_add_constant_residual_8x8(out, dest, stride); #if !CONFIG_SCATTERSCAN } else if (eob <= 10) { - vp9_short_idct10_8x8(input, output, 16); - + vp9_short_idct10_8x8_add(input, dest, stride); input[0] = input[1] = input[2] = input[3] = 0; input[8] = input[9] = input[10] = 0; input[16] = input[17] = 0; input[24] = 0; - - vp9_add_residual_8x8(output, dest, stride); #endif } else { - // the idct halves ( >> 1) the pitch - vp9_short_idct8x8(input, output, 8 << 1); + vp9_short_idct8x8_add(input, dest, stride); vpx_memset(input, 0, 128); - vp9_add_residual_8x8(output, dest, stride); } } } diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c index 796fc123c7df01fb31b87eedf4ce391503ccdf07..72036c2d4064ab08c5c1c2e58d9bdb2ff3623230 100644 --- a/vp9/decoder/x86/vp9_dequantize_sse2.c +++ b/vp9/decoder/x86/vp9_dequantize_sse2.c @@ -58,70 +58,6 @@ void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) { *(int *)dest = _mm_cvtsi128_si32(p2); } -void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) { - const int width = 8; - const __m128i zero = _mm_setzero_si128(); - - // Diff data - const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); - const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width)); - const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width)); - const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width)); - const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width)); - const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width)); - const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width)); - const 
__m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width)); - - // Prediction data. - __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); - __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); - __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); - __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); - __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride)); - __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride)); - __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride)); - __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride)); - - p0 = _mm_unpacklo_epi8(p0, zero); - p1 = _mm_unpacklo_epi8(p1, zero); - p2 = _mm_unpacklo_epi8(p2, zero); - p3 = _mm_unpacklo_epi8(p3, zero); - p4 = _mm_unpacklo_epi8(p4, zero); - p5 = _mm_unpacklo_epi8(p5, zero); - p6 = _mm_unpacklo_epi8(p6, zero); - p7 = _mm_unpacklo_epi8(p7, zero); - - p0 = _mm_add_epi16(p0, d0); - p1 = _mm_add_epi16(p1, d1); - p2 = _mm_add_epi16(p2, d2); - p3 = _mm_add_epi16(p3, d3); - p4 = _mm_add_epi16(p4, d4); - p5 = _mm_add_epi16(p5, d5); - p6 = _mm_add_epi16(p6, d6); - p7 = _mm_add_epi16(p7, d7); - - p0 = _mm_packus_epi16(p0, p1); - p2 = _mm_packus_epi16(p2, p3); - p4 = _mm_packus_epi16(p4, p5); - p6 = _mm_packus_epi16(p6, p7); - - _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); - - _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); - p2 = _mm_srli_si128(p2, 8); - _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); - - _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); - p4 = _mm_srli_si128(p4, 8); - _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); - - _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); - p6 = _mm_srli_si128(p6, 8); - _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); -} - void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, int stride) { uint8_t 
abs_diff; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 221de7426e5ca2e436c09dc7b81b870c5df6f946..bbc97da61eeb2056f72c9de3c4d01a00053b380e 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -534,11 +534,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, case TX_8X8: tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; if (tx_type == DCT_DCT) { - vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - diff, bw * 2); + vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride); } else { - vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - diff, bw, tx_type); + vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride, + tx_type); } *wip_txfrm_size = 8; break; @@ -589,7 +590,7 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); - if (wip_txfrm_size < 32) + if (wip_txfrm_size < 8) vp9_recon_sby(xd, bsize); } @@ -606,7 +607,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, foreach_transformed_block_uv(xd, bsize, encode_block, &arg); - if (wip_txfrm_size < 16) + if (wip_txfrm_size < 8) vp9_recon_sbuv(xd, bsize); } @@ -628,13 +629,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, // wip version... will use foreach_transformed_block when done foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); - if (wip_txfrm_size < 16) + if (wip_txfrm_size < 8) vp9_recon_sby(xd, bsize); wip_txfrm_size = 0; foreach_transformed_block_uv(xd, bsize, encode_block, &arg); - if (wip_txfrm_size < 16) + if (wip_txfrm_size < 8) vp9_recon_sbuv(xd, bsize); #endif }