diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index a565270993d822146181561d64a7bce5f56831dc..e05d482b6503c6f3585fd9f2a4ef1cabeb7045b0 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -18,7 +18,7 @@ extern "C" { #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch); - void vp9_short_idct32x32_c(short *input, short *output, int pitch); + void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch); } #include "test/acm_random.h" @@ -91,28 +91,31 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) { } } - TEST(VP9Idct32x32Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; for (int i = 0; i < count_test_block; ++i) { int16_t in[1024], coeff[1024]; - int16_t out_c[1024]; + uint8_t dst[1024], src[1024]; double out_r[1024]; + for (int j = 0; j < 1024; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 1024; ++j) - in[j] = rnd.Rand8() - rnd.Rand8(); + in[j] = src[j] - dst[j]; reference_32x32_dct_2d(in, out_r); for (int j = 0; j < 1024; j++) coeff[j] = round(out_r[j]); - vp9_short_idct32x32_c(coeff, out_c, 64); + vp9_short_idct32x32_add_c(coeff, dst, 32); for (int j = 0; j < 1024; ++j) { - const int diff = out_c[j] - in[j]; + const int diff = dst[j] - src[j]; const int error = diff * diff; EXPECT_GE(1, error) - << "Error: 3x32 IDCT has error " << error + << "Error: 32x32 IDCT has error " << error << " at index " << j; } } @@ -126,18 +129,22 @@ TEST(VP9Fdct32x32Test, AccuracyCheck) { for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[1024]; int16_t test_temp_block[1024]; - int16_t test_output_block[1024]; + uint8_t dst[1024], src[1024]; + for (int j = 0; j < 1024; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 1024; ++j) - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; const int pitch = 64; vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch); - vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch); + vp9_short_idct32x32_add_c(test_temp_block, dst, 32); for (int j = 0; j < 1024; ++j) { - const unsigned diff = test_input_block[j] - test_output_block[j]; + const unsigned diff = dst[j] - src[j]; const unsigned error = diff * diff; if (max_error < error) max_error = error; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 3ec093f735a7a2ecc8eff6f99737392c36678e95..5e6384c7a343891f4cde87a7332f41a532b307af 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1249,10 +1249,9 @@ static void idct32_1d(int16_t *input, int16_t *output) { output[31] = step1[0] - step1[31]; } -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[32 * 32]; int16_t *outptr = out; - const int half_pitch = pitch >> 1; int i, j; int16_t temp_in[32], temp_out[32]; @@ -1269,7 +1268,8 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { temp_in[j] = out[j * 32 + i]; idct32_1d(temp_in, temp_out); for (j = 0; j < 32; ++j) - output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); } } @@ -1279,10 +1279,10 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { output[0] = ROUND_POWER_OF_TWO(out, 6); } -void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t out[32 * 32]; int16_t *outptr = out; - const int half_pitch = pitch >> 1; int i, j; int16_t temp_in[32], temp_out[32]; @@ -1302,6 +1302,7 @@ void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) { temp_in[j] = out[j * 32 + i]; idct32_1d(temp_in, temp_out); for (j = 0; j < 32; ++j) - output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); } } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 02d32530a07f84ec1ffa0a2d1d71c2beacbc2bec..c45d03084c8f1805ec3ac1b4cbeddf21c2ecf4f5 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -94,9 +94,6 @@ specialize vp9_add_residual_8x8 sse2 prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride" specialize vp9_add_residual_16x16 sse2 -prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride" -specialize vp9_add_residual_32x32 sse2 - prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" specialize vp9_add_constant_residual_8x8 sse2 @@ -212,15 +209,14 @@ specialize vp9_short_idct10_16x16 sse2 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" specialize vp9_short_idct1_16x16 - -prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct32x32 sse2 +prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct32x32_add sse2 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 -prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct10_32x32 +prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct10_32x32_add prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht8x8 diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index dd7e68aa3be68227bfc3b6ef3e8d8797775c6822..e53a937f43258a241ea2b0d0bc655d4ddb4260ce 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -1319,8 +1319,7 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { } } -void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { - const int half_pitch = pitch >> 1; +void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -1832,6 +1831,8 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); } else { + const __m128i zero = _mm_setzero_si128(); + // 2_D: Calculate the results and store them to destination. in0 = _mm_add_epi16(stp1_0, stp1_31); in1 = _mm_add_epi16(stp1_1, stp1_30); @@ -1933,41 +1934,50 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { in30 = _mm_srai_epi16(in30, 6); in31 = _mm_srai_epi16(in31, 6); - // Store results - _mm_store_si128((__m128i *)output, in0); - _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); - _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); - _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); - _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); - _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); - _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); - _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); - _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); - _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); - _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); - _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); - _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); - _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); - _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); - _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); - _mm_store_si128((__m128i *)(output + half_pitch * 16), in16); - _mm_store_si128((__m128i *)(output + half_pitch * 17), in17); - _mm_store_si128((__m128i *)(output + half_pitch * 18), in18); - _mm_store_si128((__m128i *)(output + half_pitch * 19), in19); - _mm_store_si128((__m128i *)(output + half_pitch * 20), in20); - _mm_store_si128((__m128i *)(output + half_pitch * 21), in21); - _mm_store_si128((__m128i *)(output + half_pitch * 22), in22); - _mm_store_si128((__m128i *)(output + half_pitch * 23), in23); - _mm_store_si128((__m128i *)(output + half_pitch * 24), in24); - _mm_store_si128((__m128i *)(output + half_pitch * 25), in25); - _mm_store_si128((__m128i *)(output + half_pitch * 26), in26); - _mm_store_si128((__m128i *)(output + half_pitch * 27), in27); - _mm_store_si128((__m128i *)(output + half_pitch * 28), in28); - _mm_store_si128((__m128i *)(output + half_pitch * 29), in29); - _mm_store_si128((__m128i *)(output + half_pitch * 30), in30); - _mm_store_si128((__m128i *)(output + half_pitch * 31), in31); +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + in_x = _mm_add_epi16(in_x, d0); \ + in_x = _mm_packus_epi16(in_x, in_x); \ + _mm_storel_epi64((__m128i *)(dest), in_x); \ + dest += stride; \ + } - output += 8; + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in16); + RECON_AND_STORE(dest, in17); + RECON_AND_STORE(dest, in18); + RECON_AND_STORE(dest, in19); + RECON_AND_STORE(dest, in20); + RECON_AND_STORE(dest, in21); + RECON_AND_STORE(dest, in22); + RECON_AND_STORE(dest, in23); + RECON_AND_STORE(dest, in24); + RECON_AND_STORE(dest, in25); + RECON_AND_STORE(dest, in26); + RECON_AND_STORE(dest, in27); + RECON_AND_STORE(dest, in28); + RECON_AND_STORE(dest, in29); + RECON_AND_STORE(dest, in30); + RECON_AND_STORE(dest, in31); + + dest += 8 - (stride * 32); } } } diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 3480df25efd60237c17e3b5ddabe03be18693cb6..faaee7378134fa208b2779b91d96cdc915692543 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -109,10 +109,6 @@ void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) { add_residual(diff, dest, stride, 16, 16); } -void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) { - add_residual(diff, dest, stride, 32, 32); -} - static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, int width, int height) { int r, c; @@ -321,20 +317,16 @@ void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) { input[0] = 0; #if !CONFIG_SCATTERSCAN } else if (eob <= 10) { - // the idct halves ( >> 1) the pitch - vp9_short_idct10_32x32(input, output, 64); - + vp9_short_idct10_32x32_add_c(input, dest, stride); input[0] = input[1] = input[2] = input[3] = 0; input[32] = input[33] = input[34] = 0; input[64] = input[65] = 0; input[96] = 0; - vp9_add_residual_32x32(output, dest, stride); #endif } else { - vp9_short_idct32x32(input, output, 64); + vp9_short_idct32x32_add(input, dest, stride); vpx_memset(input, 0, 2048); - vp9_add_residual_32x32(output, dest, stride); } } } diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c index 1296b704a102fe9fea98bab6daecd66070380a0c..38fd5aaa449a0699f1443a01075fe297e0e77d3e 100644 --- a/vp9/decoder/x86/vp9_dequantize_sse2.c +++ b/vp9/decoder/x86/vp9_dequantize_sse2.c @@ -181,65 +181,6 @@ void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest, } while (--i); } -void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest, - int stride) { - const int width = 32; - int i = 16; - const __m128i zero = _mm_setzero_si128(); - - // Diff data - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - - do { - d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); - d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8)); - d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16)); - d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24)); - d4 = _mm_load_si128((const __m128i *)(diff + 1 * width)); - d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8)); - d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16)); - d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24)); - - // Prediction data. - p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride)); - p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16)); - p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride)); - p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16)); - - p0 = _mm_unpacklo_epi8(p1, zero); - p1 = _mm_unpackhi_epi8(p1, zero); - p2 = _mm_unpacklo_epi8(p3, zero); - p3 = _mm_unpackhi_epi8(p3, zero); - p4 = _mm_unpacklo_epi8(p5, zero); - p5 = _mm_unpackhi_epi8(p5, zero); - p6 = _mm_unpacklo_epi8(p7, zero); - p7 = _mm_unpackhi_epi8(p7, zero); - - p0 = _mm_add_epi16(p0, d0); - p1 = _mm_add_epi16(p1, d1); - p2 = _mm_add_epi16(p2, d2); - p3 = _mm_add_epi16(p3, d3); - p4 = _mm_add_epi16(p4, d4); - p5 = _mm_add_epi16(p5, d5); - p6 = _mm_add_epi16(p6, d6); - p7 = _mm_add_epi16(p7, d7); - - p0 = _mm_packus_epi16(p0, p1); - p1 = _mm_packus_epi16(p2, p3); - p2 = _mm_packus_epi16(p4, p5); - p3 = _mm_packus_epi16(p6, p7); - - _mm_store_si128((__m128i *)(dest + 0 * stride), p0); - _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1); - _mm_store_si128((__m128i *)(dest + 1 * stride), p2); - _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3); - - diff += 2 * width; - dest += 2 * stride; - } while (--i); -} - void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, int stride) { uint8_t abs_diff; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index e4002d689f0909f2cba2b0b5ffb432d10e964b81..db18555f9d1b929c68ee323d257496553e9acb61 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -425,6 +425,7 @@ struct encode_b_args { VP9_COMMON *cm; MACROBLOCK *x; struct optimize_ctx *ctx; + int *wip_txfrm_size; // for "work in progress" only... will remove once done }; static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -493,6 +494,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; + int *wip_txfrm_size = args->wip_txfrm_size; MACROBLOCKD* const xd = &x->e_mbd; const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x); const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, @@ -500,6 +502,10 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int16_t* const diff = raster_block_offset_int16(xd, bsize, plane, raster_block, xd->plane[plane].diff); + uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, + raster_block, + xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride); TX_TYPE tx_type = DCT_DCT; xform_quant(plane, block, bsize, ss_txfrm_size, arg); @@ -509,8 +515,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, switch (ss_txfrm_size / 2) { case TX_32X32: - vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - diff, bw * 2); + vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, + block, 16), dst, xd->plane[plane].dst.stride); + *wip_txfrm_size = 32; break; case TX_16X16: tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; @@ -521,6 +528,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw, tx_type); } + *wip_txfrm_size = 16; break; case TX_8X8: tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; @@ -531,6 +539,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw, tx_type); } + *wip_txfrm_size = 8; break; case TX_4X4: tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; @@ -544,6 +553,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw, tx_type); } + *wip_txfrm_size = 4; break; } } @@ -551,7 +561,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; + struct encode_b_args arg = {cm, x, NULL, NULL}; foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg); @@ -560,7 +570,7 @@ void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x, void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; + struct encode_b_args arg = {cm, x, NULL, NULL}; foreach_transformed_block_uv(xd, bsize, xform_quant, &arg); } @@ -569,7 +579,8 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + int wip_txfrm_size = 0; + struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; vp9_subtract_sby(x, bsize); if (x->optimize) @@ -577,15 +588,16 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); - - vp9_recon_sby(xd, bsize); + if (wip_txfrm_size < 32) + vp9_recon_sby(xd, bsize); } void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + int wip_txfrm_size = 0; + struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; vp9_subtract_sbuv(x, bsize); if (x->optimize) @@ -593,20 +605,35 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, foreach_transformed_block_uv(xd, bsize, encode_block, &arg); - vp9_recon_sbuv(xd, bsize); + if (wip_txfrm_size < 32) + vp9_recon_sbuv(xd, bsize); } void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + int wip_txfrm_size = 0; + struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size}; vp9_subtract_sb(x, bsize); if (x->optimize) vp9_optimize_init(xd, bsize, &ctx); - +#if 0 foreach_transformed_block(xd, bsize, encode_block, &arg); vp9_recon_sb(xd, bsize); +#else + // wip version... will use foreach_transformed_block when done + foreach_transformed_block_in_plane(xd, bsize, 0, + encode_block, &arg); + if (wip_txfrm_size < 32) + vp9_recon_sby(xd, bsize); + wip_txfrm_size = 0; + + foreach_transformed_block_uv(xd, bsize, encode_block, &arg); + + if (wip_txfrm_size < 32) + vp9_recon_sbuv(xd, bsize); +#endif }