diff --git a/configure b/configure
index d1d25b3d5725a9a4d3775c8e2cb6fd74dddfbd32..c93ffd75fcc614ec2a64f77a42dafdf879b6a7bc 100755
--- a/configure
+++ b/configure
@@ -247,6 +247,8 @@ EXPERIMENT_LIST="
     implicit_segmentation
     newbintramodes
     comp_interintra_pred
+    tx32x32
+    dwt32x32hybrid
 "
 CONFIG_LIST="
     external_build
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f089c6839b02a36997c1cb3d69f6c5ea3f4288f
--- /dev/null
+++ b/test/dct32x32_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+extern "C" {
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
+  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+}
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+#if !CONFIG_DWT32X32HYBRID
+static const double kPi = 3.141592653589793238462643383279502884;
+static void reference2_32x32_idct_2d(double *input, double *output) {
+  double x;
+  for (int l = 0; l < 32; ++l) {
+    for (int k = 0; k < 32; ++k) {
+      double s = 0;
+      for (int i = 0; i < 32; ++i) {
+        for (int j = 0; j < 32; ++j) {
+          x = cos(kPi * j * (l + 0.5) / 32.0) *
+              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
+          if (i != 0)
+            x *= sqrt(2.0);
+          if (j != 0)
+            x *= sqrt(2.0);
+          s += x;
+        }
+      }
+      output[k * 32 + l] = s / 4;
+    }
+  }
+}
+
+static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 32; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 32; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+  // First transform columns
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = input[j*32 + i];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 32; ++j)
+      output[j * 32 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i*32];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    // Scale down by 4 so the float reference matches the output
+    // scaling of the integer forward transform
+    for (int j = 0; j < 32; ++j)
+      output[j + i * 32] = temp_out[j] / 4;
+  }
+}
+
+
+TEST(VP9Idct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t in[1024], coeff[1024];
+    int16_t out_c[1024];
+    double out_r[1024];
+
+    // Initialize a test block with input range [-255, 255].
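+    // Rand8() returns a value in [0, 255], so the difference of two draws
+    // covers [-255, 255] with a triangular distribution centered on zero.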
+    for (int j = 0; j < 1024; ++j)
+      in[j] = rnd.Rand8() - rnd.Rand8();
+
+    reference_32x32_dct_2d(in, out_r);
+    for (int j = 0; j < 1024; j++)
+      coeff[j] = round(out_r[j]);
+    vp9_short_idct32x32_c(coeff, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const int diff = out_c[j] - in[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 32x32 IDCT has error " << error
+          << " at index " << j;
+    }
+
+    vp9_short_fdct32x32_c(in, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const double diff = coeff[j] - out_c[j];
+      const double error = diff * diff;
+      EXPECT_GE(1.0, error)
+          << "Error: 32x32 FDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+#else // CONFIG_DWT32X32HYBRID
+  // TODO(rbultje/debargha): add DWT-specific tests
+#endif // CONFIG_DWT32X32HYBRID
+TEST(VP9Fdct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  unsigned int max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[1024];
+    int16_t test_temp_block[1024];
+    int16_t test_output_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+
+    for (int j = 0; j < 1024; ++j) {
+      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block/10, total_error)
+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
+}
+
+TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input_block[1024], input_extreme_block[1024];
+    int16_t output_block[1024], output_extreme_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 1024; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_block, pitch);
+    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
+
+    // The minimum quant value is 4.
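+    // So coefficients must stay within 4 * DCT_MAX_VALUE for even the
+    // smallest quantizer to map them into the DCT_MAX_VALUE range,
+    // which is the bound the checks below enforce.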
+    for (int j = 0; j < 1024; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 32x32 FDCT extreme has coefficient larger than "
+             "4*DCT_MAX_VALUE";
+    }
+  }
+}
+} // namespace
diff --git a/test/test.mk b/test/test.mk
index 4fb464e643e93fabb2f3b7b7e9c8d04561907a44..919cf04387bbea5a08115f673bf3302c8b53b808 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -64,6 +64,9 @@ endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_TX32X32),yesyes)
+LIBVPX_TEST_SRCS-yes += dct32x32_test.cc
+endif
 LIBVPX_TEST_SRCS-yes += idct8x8_test.cc
 LIBVPX_TEST_SRCS-yes += variance_test.cc
 endif # VP9
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 27ef22fffd3fe956d37919da701c210ea2c7d20c..11efd44759a25b4d497cdc54bdb3ae6a902471ed 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -129,7 +129,13 @@ typedef enum {
   TX_4X4,    // 4x4 dct transform
   TX_8X8,    // 8x8 dct transform
   TX_16X16,  // 16x16 dct transform
-  TX_SIZE_MAX  // Number of different transforms available
+  TX_SIZE_MAX_MB,  // Number of transforms available to MBs
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  TX_32X32 = TX_SIZE_MAX_MB,  // 32x32 dct transform
+  TX_SIZE_MAX_SB,  // Number of transforms available to SBs
+#else
+  TX_SIZE_MAX_SB = TX_SIZE_MAX_MB,
+#endif
 } TX_SIZE;
 
 typedef enum {
@@ -302,6 +308,15 @@ typedef struct blockd {
   union b_mode_info bmi;
 } BLOCKD;
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+typedef struct superblockd {
+  /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
+  DECLARE_ALIGNED(16, short, diff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, short, qcoeff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, short, dqcoeff[32*32+16*16*2]);
+} SUPERBLOCKD;
+#endif
+
 typedef struct macroblockd {
   DECLARE_ALIGNED(16, short, diff[400]);  /* from idct diff */
   DECLARE_ALIGNED(16, unsigned char, predictor[384]);
@@ -309,6 +324,10 @@
   DECLARE_ALIGNED(16, short, dqcoeff[400]);
   DECLARE_ALIGNED(16, unsigned short, eobs[25]);
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  SUPERBLOCKD sb_coeff_data;
+#endif
+
   /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
*/ BLOCKD block[25]; int fullpixel_mask; diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h index 52fb02f36af34ef5fbcfb8c421fb992544264eb3..31103adb7fe7cd9906d7681eea55a65a6b674c3b 100644 --- a/vp9/common/vp9_default_coef_probs.h +++ b/vp9/common/vp9_default_coef_probs.h @@ -1375,3 +1375,5 @@ static const vp9_prob } } }; + +#define default_coef_probs_32x32 default_coef_probs_16x16 diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 4832b4e9c7b973a1b96d9df74c5bbf91e92090fe..321fa8c57504a6e335c6db9dca0ece3ee0880d61 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -132,6 +132,109 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255, }; +DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { + 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, + 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; +DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, 37, 6, 7, 38, 69, 100, + 131, 162, 193, 224, 256, 225, 194, 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, + 41, 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 13, 44, 75, 106, 137, + 168, 199, 230, 261, 292, 323, 354, 385, 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, + 263, 294, 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, 110, 141, 172, 203, + 234, 265, 296, 327, 358, 389, 420, 451, 482, 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, + 81, 112, 143, 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, 299, 268, 237, + 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, 517, 548, 579, 610, 641, 672, 704, 673, 642, + 611, 580, 549, 518, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, 333, 364, + 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, + 148, 117, 86, 55, 24, 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, 832, + 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 27, 58, 89, 120, 151, 182, + 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, + 432, 463, 494, 525, 556, 587, 618, 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, 465, 496, + 527, 558, 589, 620, 651, 682, 713, 744, 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, + 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, 498, 529, 560, 591, + 622, 653, 684, 715, 746, 777, 808, 839, 870, 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, 716, 685, 654, 623, 592, 
561, 530, 499, 468, 437, + 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, 748, 779, 810, + 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, + 191, 223, 254, 285, 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, 906, 875, + 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, 411, 442, 473, 504, 535, 566, 597, 628, + 659, 690, 721, 752, 783, 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, 412, + 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, 1003, 972, 941, 910, 879, 848, 817, + 786, 755, 724, 693, 662, 631, 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, 634, 665, 696, 727, 758, 789, + 820, 851, 882, 913, 944, 975, 1006, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, + 791, 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, 731, 762, 793, 824, 855, + 886, 917, 948, 979, 1010, 1011, 980, 949, 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, + 951, 920, 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 799, 830, 861, 892, + 923, 954, 985, 1016, 1017, 986, 955, 924, 893, 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023, +}; /* Array indices are identical to previously-existing CONTEXT_NODE indices */ @@ -160,10 +263,11 @@ static const Prob Pcat2[] = { 165, 145}; static const Prob Pcat3[] = { 173, 148, 140}; static const Prob Pcat4[] = { 176, 155, 140, 135}; static const Prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const Prob Pcat6[] = -{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129}; +static const Prob Pcat6[] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 +}; -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26]; +static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; static void init_bit_tree(vp9_tree_index *p, int n) { int i = 0; @@ -182,7 +286,7 @@ static void init_bit_trees() { init_bit_tree(cat3, 3); init_bit_tree(cat4, 4); init_bit_tree(cat5, 5); - init_bit_tree(cat6, 13); + init_bit_tree(cat6, 14); } vp9_extra_bit_struct vp9_extra_bits[12] = { @@ -196,7 +300,7 @@ vp9_extra_bit_struct vp9_extra_bits[12] = { { cat3, Pcat3, 3, 11}, { cat4, Pcat4, 4, 19}, { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 13, 67}, + { cat6, Pcat6, 14, 67}, { 0, 0, 0, 0} }; @@ -218,6 +322,11 @@ void vp9_default_coef_probs(VP9_COMMON *pc) { vpx_memcpy(pc->fc.hybrid_coef_probs_16x16, default_hybrid_coef_probs_16x16, sizeof(pc->fc.hybrid_coef_probs_16x16)); + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + 
vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32, + sizeof(pc->fc.coef_probs_32x32)); +#endif } void vp9_coef_tree_initialize() { @@ -444,4 +553,28 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) { else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob; } } + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + for (i = 0; i < BLOCK_TYPES_32X32; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.coef_counts_32x32[i][j][k], 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_coef_probs_32x32[i][j][k][t] * + (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.coef_probs_32x32[i][j][k][t] = 1; + else if (prob > 255) cm->fc.coef_probs_32x32[i][j][k][t] = 255; + else cm->fc.coef_probs_32x32[i][j][k][t] = prob; + } + } +#endif } diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 3c74de7be9b73dabd5e9e8ddce10b6c45f1e9e1f..96d964448f143b42bf880566445863cb6c3e95a8 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -55,7 +55,7 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ #define PROB_UPDATE_BASELINE_COST 7 #define MAX_PROB 255 -#define DCT_MAX_VALUE 8192 +#define DCT_MAX_VALUE 16384 /* Coefficients are predicted via a 3-dimensional probability table. */ @@ -66,6 +66,10 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ #define BLOCK_TYPES_16X16 4 +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +#define BLOCK_TYPES_32X32 4 +#endif + /* Middle dimension is a coarsening of the coefficient's position within the 4x4 DCT. */ @@ -73,6 +77,9 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]); extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]); extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]); +#endif /* Inside dimension is 3-valued measure of nearby complexity, that is, the extent to which nearby coefficients are nonzero. 
For the first @@ -106,9 +113,13 @@ extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]); extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]); extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]); +extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]); +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]); +#endif + void vp9_coef_tree_initialize(void); -extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]); void vp9_adapt_coef_probs(struct VP9Common *); #endif diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 9622dfdee1335b596afa7b88f53b60b77b630903..cc685b99e68a3e52387b7c65c37a1827d43b31e5 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -1774,3 +1774,465 @@ void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { #undef RIGHT_SHIFT #undef RIGHT_ROUNDING #endif + +#if CONFIG_TX32X32 +#if !CONFIG_DWT32X32HYBRID +#define DownshiftMultiplyBy2(x) x * 2 +#define DownshiftMultiply(x) x +static void idct16(double *input, double *output, int stride) { + static const double C1 = 0.995184726672197; + static const double C2 = 0.98078528040323; + static const double C3 = 0.956940335732209; + static const double C4 = 0.923879532511287; + static const double C5 = 0.881921264348355; + static const double C6 = 0.831469612302545; + static const double C7 = 0.773010453362737; + static const double C8 = 0.707106781186548; + static const double C9 = 0.634393284163646; + static const double C10 = 0.555570233019602; + static const double C11 = 0.471396736825998; + static const double C12 = 0.38268343236509; + static const double C13 = 0.290284677254462; + static const double C14 = 0.195090322016128; + static const double C15 = 0.098017140329561; + + double step[16]; + double intermediate[16]; + double temp1, temp2; + + // step 1 and 2 + step[ 0] = input[stride*0] + input[stride*8]; + step[ 1] = input[stride*0] - input[stride*8]; + + temp1 = input[stride*4]*C12; + temp2 = input[stride*12]*C4; + + temp1 -= temp2; + temp1 = DownshiftMultiply(temp1); + temp1 *= C8; + + step[ 2] = DownshiftMultiplyBy2(temp1); + + temp1 = input[stride*4]*C4; + temp2 = input[stride*12]*C12; + temp1 += temp2; + temp1 = DownshiftMultiply(temp1); + temp1 *= C8; + step[ 3] = DownshiftMultiplyBy2(temp1); + + temp1 = input[stride*2]*C8; + temp1 = DownshiftMultiplyBy2(temp1); + temp2 = input[stride*6] + input[stride*10]; + + step[ 4] = temp1 + temp2; + step[ 5] = temp1 - temp2; + + temp1 = input[stride*14]*C8; + temp1 = DownshiftMultiplyBy2(temp1); + temp2 = input[stride*6] - input[stride*10]; + + step[ 6] = temp2 - temp1; + step[ 7] = temp2 + temp1; + + // for odd input + temp1 = input[stride*3]*C12; + temp2 = input[stride*13]*C4; + temp1 += temp2; + temp1 = DownshiftMultiply(temp1); + temp1 *= C8; + intermediate[ 8] = DownshiftMultiplyBy2(temp1); + + temp1 = input[stride*3]*C4; + temp2 = input[stride*13]*C12; + temp2 -= temp1; + temp2 = DownshiftMultiply(temp2); + temp2 *= C8; + intermediate[ 9] = DownshiftMultiplyBy2(temp2); + + intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8); + intermediate[11] = input[stride*15] - input[stride*1]; + intermediate[12] = input[stride*15] + input[stride*1]; + intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8)); + + temp1 = input[stride*11]*C12; + temp2 = input[stride*5]*C4; + temp2 -= temp1; + temp2 = DownshiftMultiply(temp2); + temp2 *= C8; + intermediate[14] = DownshiftMultiplyBy2(temp2); + + temp1 = 
input[stride*11]*C4; + temp2 = input[stride*5]*C12; + temp1 += temp2; + temp1 = DownshiftMultiply(temp1); + temp1 *= C8; + intermediate[15] = DownshiftMultiplyBy2(temp1); + + step[ 8] = intermediate[ 8] + intermediate[14]; + step[ 9] = intermediate[ 9] + intermediate[15]; + step[10] = intermediate[10] + intermediate[11]; + step[11] = intermediate[10] - intermediate[11]; + step[12] = intermediate[12] + intermediate[13]; + step[13] = intermediate[12] - intermediate[13]; + step[14] = intermediate[ 8] - intermediate[14]; + step[15] = intermediate[ 9] - intermediate[15]; + + // step 3 + output[stride*0] = step[ 0] + step[ 3]; + output[stride*1] = step[ 1] + step[ 2]; + output[stride*2] = step[ 1] - step[ 2]; + output[stride*3] = step[ 0] - step[ 3]; + + temp1 = step[ 4]*C14; + temp2 = step[ 7]*C2; + temp1 -= temp2; + output[stride*4] = DownshiftMultiply(temp1); + + temp1 = step[ 4]*C2; + temp2 = step[ 7]*C14; + temp1 += temp2; + output[stride*7] = DownshiftMultiply(temp1); + + temp1 = step[ 5]*C10; + temp2 = step[ 6]*C6; + temp1 -= temp2; + output[stride*5] = DownshiftMultiply(temp1); + + temp1 = step[ 5]*C6; + temp2 = step[ 6]*C10; + temp1 += temp2; + output[stride*6] = DownshiftMultiply(temp1); + + output[stride*8] = step[ 8] + step[11]; + output[stride*9] = step[ 9] + step[10]; + output[stride*10] = step[ 9] - step[10]; + output[stride*11] = step[ 8] - step[11]; + output[stride*12] = step[12] + step[15]; + output[stride*13] = step[13] + step[14]; + output[stride*14] = step[13] - step[14]; + output[stride*15] = step[12] - step[15]; + + // output 4 + step[ 0] = output[stride*0] + output[stride*7]; + step[ 1] = output[stride*1] + output[stride*6]; + step[ 2] = output[stride*2] + output[stride*5]; + step[ 3] = output[stride*3] + output[stride*4]; + step[ 4] = output[stride*3] - output[stride*4]; + step[ 5] = output[stride*2] - output[stride*5]; + step[ 6] = output[stride*1] - output[stride*6]; + step[ 7] = output[stride*0] - output[stride*7]; + + temp1 = output[stride*8]*C7; + temp2 = output[stride*15]*C9; + temp1 -= temp2; + step[ 8] = DownshiftMultiply(temp1); + + temp1 = output[stride*9]*C11; + temp2 = output[stride*14]*C5; + temp1 += temp2; + step[ 9] = DownshiftMultiply(temp1); + + temp1 = output[stride*10]*C3; + temp2 = output[stride*13]*C13; + temp1 -= temp2; + step[10] = DownshiftMultiply(temp1); + + temp1 = output[stride*11]*C15; + temp2 = output[stride*12]*C1; + temp1 += temp2; + step[11] = DownshiftMultiply(temp1); + + temp1 = output[stride*11]*C1; + temp2 = output[stride*12]*C15; + temp2 -= temp1; + step[12] = DownshiftMultiply(temp2); + + temp1 = output[stride*10]*C13; + temp2 = output[stride*13]*C3; + temp1 += temp2; + step[13] = DownshiftMultiply(temp1); + + temp1 = output[stride*9]*C5; + temp2 = output[stride*14]*C11; + temp2 -= temp1; + step[14] = DownshiftMultiply(temp2); + + temp1 = output[stride*8]*C9; + temp2 = output[stride*15]*C7; + temp1 += temp2; + step[15] = DownshiftMultiply(temp1); + + // step 5 + output[stride*0] = step[0] + step[15]; + output[stride*1] = step[1] + step[14]; + output[stride*2] = step[2] + step[13]; + output[stride*3] = step[3] + step[12]; + output[stride*4] = step[4] + step[11]; + output[stride*5] = step[5] + step[10]; + output[stride*6] = step[6] + step[ 9]; + output[stride*7] = step[7] + step[ 8]; + + output[stride*15] = step[0] - step[15]; + output[stride*14] = step[1] - step[14]; + output[stride*13] = step[2] - step[13]; + output[stride*12] = step[3] - step[12]; + output[stride*11] = step[4] - step[11]; + output[stride*10] = step[5] - step[10]; + 
output[stride*9] = step[6] - step[ 9]; + output[stride*8] = step[7] - step[ 8]; +} +static void butterfly_32_idct_1d(double *input, double *output, int stride) { + static const double C1 = 0.998795456205; // cos(pi * 1 / 64) + static const double C3 = 0.989176509965; // cos(pi * 3 / 64) + static const double C5 = 0.970031253195; // cos(pi * 5 / 64) + static const double C7 = 0.941544065183; // cos(pi * 7 / 64) + static const double C9 = 0.903989293123; // cos(pi * 9 / 64) + static const double C11 = 0.857728610000; // cos(pi * 11 / 64) + static const double C13 = 0.803207531481; // cos(pi * 13 / 64) + static const double C15 = 0.740951125355; // cos(pi * 15 / 64) + static const double C16 = 0.707106781187; // cos(pi * 16 / 64) + static const double C17 = 0.671558954847; // cos(pi * 17 / 64) + static const double C19 = 0.595699304492; // cos(pi * 19 / 64) + static const double C21 = 0.514102744193; // cos(pi * 21 / 64) + static const double C23 = 0.427555093430; // cos(pi * 23 / 64) + static const double C25 = 0.336889853392; // cos(pi * 25 / 64) + static const double C27 = 0.242980179903; // cos(pi * 27 / 64) + static const double C29 = 0.146730474455; // cos(pi * 29 / 64) + static const double C31 = 0.049067674327; // cos(pi * 31 / 64) + + double step1[32]; + double step2[32]; + + step1[ 0] = input[stride*0]; + step1[ 1] = input[stride*2]; + step1[ 2] = input[stride*4]; + step1[ 3] = input[stride*6]; + step1[ 4] = input[stride*8]; + step1[ 5] = input[stride*10]; + step1[ 6] = input[stride*12]; + step1[ 7] = input[stride*14]; + step1[ 8] = input[stride*16]; + step1[ 9] = input[stride*18]; + step1[10] = input[stride*20]; + step1[11] = input[stride*22]; + step1[12] = input[stride*24]; + step1[13] = input[stride*26]; + step1[14] = input[stride*28]; + step1[15] = input[stride*30]; + + step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16); + step1[17] = (input[stride*3] + input[stride*1]); + step1[18] = (input[stride*5] + input[stride*3]); + step1[19] = (input[stride*7] + input[stride*5]); + step1[20] = (input[stride*9] + input[stride*7]); + step1[21] = (input[stride*11] + input[stride*9]); + step1[22] = (input[stride*13] + input[stride*11]); + step1[23] = (input[stride*15] + input[stride*13]); + step1[24] = (input[stride*17] + input[stride*15]); + step1[25] = (input[stride*19] + input[stride*17]); + step1[26] = (input[stride*21] + input[stride*19]); + step1[27] = (input[stride*23] + input[stride*21]); + step1[28] = (input[stride*25] + input[stride*23]); + step1[29] = (input[stride*27] + input[stride*25]); + step1[30] = (input[stride*29] + input[stride*27]); + step1[31] = (input[stride*31] + input[stride*29]); + + idct16(step1, step2, 1); + idct16(step1 + 16, step2 + 16, 1); + + step2[16] = DownshiftMultiply(step2[16] / (2*C1)); + step2[17] = DownshiftMultiply(step2[17] / (2*C3)); + step2[18] = DownshiftMultiply(step2[18] / (2*C5)); + step2[19] = DownshiftMultiply(step2[19] / (2*C7)); + step2[20] = DownshiftMultiply(step2[20] / (2*C9)); + step2[21] = DownshiftMultiply(step2[21] / (2*C11)); + step2[22] = DownshiftMultiply(step2[22] / (2*C13)); + step2[23] = DownshiftMultiply(step2[23] / (2*C15)); + step2[24] = DownshiftMultiply(step2[24] / (2*C17)); + step2[25] = DownshiftMultiply(step2[25] / (2*C19)); + step2[26] = DownshiftMultiply(step2[26] / (2*C21)); + step2[27] = DownshiftMultiply(step2[27] / (2*C23)); + step2[28] = DownshiftMultiply(step2[28] / (2*C25)); + step2[29] = DownshiftMultiply(step2[29] / (2*C27)); + step2[30] = DownshiftMultiply(step2[30] / (2*C29)); + step2[31] = 
DownshiftMultiply(step2[31] / (2*C31)); + + output[stride* 0] = step2[ 0] + step2[16]; + output[stride* 1] = step2[ 1] + step2[17]; + output[stride* 2] = step2[ 2] + step2[18]; + output[stride* 3] = step2[ 3] + step2[19]; + output[stride* 4] = step2[ 4] + step2[20]; + output[stride* 5] = step2[ 5] + step2[21]; + output[stride* 6] = step2[ 6] + step2[22]; + output[stride* 7] = step2[ 7] + step2[23]; + output[stride* 8] = step2[ 8] + step2[24]; + output[stride* 9] = step2[ 9] + step2[25]; + output[stride*10] = step2[10] + step2[26]; + output[stride*11] = step2[11] + step2[27]; + output[stride*12] = step2[12] + step2[28]; + output[stride*13] = step2[13] + step2[29]; + output[stride*14] = step2[14] + step2[30]; + output[stride*15] = step2[15] + step2[31]; + output[stride*16] = step2[15] - step2[(31 - 0)]; + output[stride*17] = step2[14] - step2[(31 - 1)]; + output[stride*18] = step2[13] - step2[(31 - 2)]; + output[stride*19] = step2[12] - step2[(31 - 3)]; + output[stride*20] = step2[11] - step2[(31 - 4)]; + output[stride*21] = step2[10] - step2[(31 - 5)]; + output[stride*22] = step2[ 9] - step2[(31 - 6)]; + output[stride*23] = step2[ 8] - step2[(31 - 7)]; + output[stride*24] = step2[ 7] - step2[(31 - 8)]; + output[stride*25] = step2[ 6] - step2[(31 - 9)]; + output[stride*26] = step2[ 5] - step2[(31 - 10)]; + output[stride*27] = step2[ 4] - step2[(31 - 11)]; + output[stride*28] = step2[ 3] - step2[(31 - 12)]; + output[stride*29] = step2[ 2] - step2[(31 - 13)]; + output[stride*30] = step2[ 1] - step2[(31 - 14)]; + output[stride*31] = step2[ 0] - step2[(31 - 15)]; +} + +void vp9_short_idct32x32_c(short *input, short *output, int pitch) { + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + double out[32*32], out2[32*32]; + const int short_pitch = pitch >> 1; + int i, j; + // First transform rows + for (i = 0; i < 32; ++i) { + double temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = input[j + i*short_pitch]; + butterfly_32_idct_1d(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) + out[j + i*32] = temp_out[j]; + } + // Then transform columns + for (i = 0; i < 32; ++i) { + double temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = out[j*32 + i]; + butterfly_32_idct_1d(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) + out2[j*32 + i] = temp_out[j]; + } + for (i = 0; i < 32*32; ++i) + output[i] = round(out2[i]/128); + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} +#else // CONFIG_DWT32X32HYBRID + +#define MAX_BLOCK_LENGTH 64 +#define ENH_PRECISION_BITS 1 +#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2) + +// Note: block length must be even for this implementation +static void synthesis_53_row(int length, short *lowpass, short *highpass, + short *x) { + short r, * a, * b; + int n; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ -= (r + (*b) + 1) >> 1; + r = *b++; + } + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *x++ = ((r = *a++) + 1) >> 1; + *x++ = *b++ + ((r + (*a) + 2) >> 2); + } + *x++ = ((r = *a) + 1)>>1; + *x++ = *b + ((r+1)>>1); +} + +static void synthesis_53_col(int length, short *lowpass, short *highpass, + short *x) { + short r, * a, * b; + int n; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ -= (r + (*b) + 1) >> 1; + r = *b++; + } + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *x++ = r = *a++; + *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1); + } + *x++ = r = *a; + *x++ = 
((*b) << 1) + r; +} + +// NOTE: Using a 5/3 integer wavelet for now. Explore using a wavelet +// with a better response later +void dyadic_synthesize(int levels, int width, int height, short *c, int pitch_c, + short *x, int pitch_x) { + int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; + short buffer[2 * MAX_BLOCK_LENGTH]; + + th[0] = hh; + tw[0] = hw; + for (i = 1; i <= levels; i++) { + th[i] = (th[i - 1] + 1) >> 1; + tw[i] = (tw[i - 1] + 1) >> 1; + } + for (lv = levels - 1; lv >= 0; lv--) { + nh = th[lv]; + nw = tw[lv]; + hh = th[lv + 1]; + hw = tw[lv + 1]; + if ((nh < 2) || (nw < 2)) continue; + for (j = 0; j < nw; j++) { + for (i = 0; i < nh; i++) + buffer[i] = c[i * pitch_c + j]; + synthesis_53_col(nh, buffer, buffer + hh, buffer + nh); + for (i = 0; i < nh; i++) + c[i * pitch_c + j] = buffer[i + nh]; + } + for (i = 0; i < nh; i++) { + memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); + synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]); + } + } + for (i = 0; i < height; i++) + for (j = 0; j < width; j++) + x[i * pitch_x + j] = (c[i * pitch_c + j] + ENH_PRECISION_RND) >> + ENH_PRECISION_BITS; +} + +void vp9_short_idct32x32_c(short *input, short *output, int pitch) { + // assume out is a 32x32 buffer + short buffer[16 * 16]; + short buffer2[32 * 32]; + const int short_pitch = pitch >> 1; + int i; + // TODO(debargha): Implement more efficiently by adding output pitch + // argument to the idct16x16 function + vp9_short_idct16x16_c(input, buffer, pitch); + for (i = 0; i < 16; ++i) { + vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(short) * 16); + vpx_memcpy(buffer2 + i * 32 + 16, input + i * short_pitch + 16, + sizeof(short) * 16); + } + for (; i < 32; ++i) { + vpx_memcpy(buffer2 + i * 32, input + i * short_pitch, + sizeof(short) * 32); + } + dyadic_synthesize(1, 32, 32, buffer2, 32, output, 32); +} +#endif // CONFIG_DWT32X32HYBRID +#endif // CONFIG_TX32X32 diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index c78f1ad3c6ab26801d54627c7ad206e08787922e..3abf328940edcad0eede716a125a569f048b614b 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -143,3 +143,16 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) { vp9_inverse_transform_mby_16x16(xd); vp9_inverse_transform_mbuv_8x8(xd); } + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) { + vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64); +} + +void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) { + vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024, + xd_sb->diff + 1024, 32); + vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280, + xd_sb->diff + 1280, 32); +} +#endif diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h index b012834f37337e4d93111d2738aa1e7afbe16bf6..94593f8cc60eb440d6889bd76de273934daab4da 100644 --- a/vp9/common/vp9_invtrans.h +++ b/vp9/common/vp9_invtrans.h @@ -38,4 +38,9 @@ extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd); extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb); +extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb); +#endif + #endif // __INC_INVTRANS_H diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 8d4d014ba7cfc912697b3ae2739b5f3bc0b54bef..1139fb5d13161b0fb56fab80682758263eb8f0c8 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -192,6 +192,9 @@ void 
vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) { /* Point at base of Mb MODE_INFO list */ const MODE_INFO *mode_info_context = cm->mi; +#if CONFIG_SUPERBLOCKS + const int mis = cm->mode_info_stride; +#endif /* Initialize the loop filter for this frame. */ vp9_loop_filter_frame_init(cm, xd, cm->filter_level); @@ -226,14 +229,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) { if (mb_col > 0 #if CONFIG_SUPERBLOCKS && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-1].mbmi.mb_skip_coeff) + ((mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-1].mbmi.mb_skip_coeff) +#if CONFIG_TX32X32 + || mode_info_context[-1].mbmi.txfm_size == TX_32X32 +#endif + )) #endif ) vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); - if (!skip_lf && tx_type != TX_16X16) { + if (!skip_lf && tx_type < TX_16X16) { if (tx_type == TX_8X8) vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); @@ -247,14 +254,18 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) { if (mb_row > 0 #if CONFIG_SUPERBLOCKS && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) + ((mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-mis].mbmi.mb_skip_coeff) +#if CONFIG_TX32X32 + || mode_info_context[-mis].mbmi.txfm_size == TX_32X32 +#endif + )) #endif ) vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); - if (!skip_lf && tx_type != TX_16X16) { + if (!skip_lf && tx_type < TX_16X16) { if (tx_type == TX_8X8) vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 0b6de7f82a2360e742b2c902b64c8f725c7bd206..d80498df1ad4e18dd0fd32a46e9547e607adea50 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -58,6 +58,9 @@ typedef struct frame_contexts { vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_prob coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; +#endif nmv_context nmvc; nmv_context pre_nmvc; @@ -95,6 +98,11 @@ typedef struct frame_contexts { vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_prob pre_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; +#endif + unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] @@ -110,6 +118,11 @@ typedef struct frame_contexts { unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; +#endif + nmv_context_counts NMVcount; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS - 1]; @@ -139,8 +152,11 @@ typedef enum { ONLY_4X4 = 0, ALLOW_8X8 = 1, 
ALLOW_16X16 = 2, - TX_MODE_SELECT = 3, - NB_TXFM_MODES = 4, +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + ALLOW_32X32 = 3, +#endif + TX_MODE_SELECT = 3 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS), + NB_TXFM_MODES = 4 + (CONFIG_TX32X32 && CONFIG_SUPERBLOCKS), } TXFM_MODE; typedef struct VP9Common { @@ -268,7 +284,7 @@ typedef struct VP9Common { vp9_prob prob_comppred[COMP_PRED_CONTEXTS]; // FIXME contextualize - vp9_prob prob_tx[TX_SIZE_MAX - 1]; + vp9_prob prob_tx[TX_SIZE_MAX_SB - 1]; vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS]; diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c index 1f8dfce34b2f2e6be18293a2ba4d36e5b9d3fc69..e567bac8d54cec696bd7c52d09155ec8d5bf920b 100644 --- a/vp9/common/vp9_recon.c +++ b/vp9/common/vp9_recon.c @@ -168,6 +168,53 @@ void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { } } } + +#if CONFIG_TX32X32 +void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) { + int x, y, stride = xd->block[0].dst_stride; + short *diff = xd->sb_coeff_data.diff; + + for (y = 0; y < 32; y++) { + for (x = 0; x < 32; x++) { + int a = dst[x] + diff[x]; + if (a < 0) + a = 0; + else if (a > 255) + a = 255; + dst[x] = a; + } + dst += stride; + diff += 32; + } +} + +void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { + int x, y, stride = xd->block[16].dst_stride; + short *udiff = xd->sb_coeff_data.diff + 1024; + short *vdiff = xd->sb_coeff_data.diff + 1280; + + for (y = 0; y < 16; y++) { + for (x = 0; x < 16; x++) { + int u = udst[x] + udiff[x]; + int v = vdst[x] + vdiff[x]; + if (u < 0) + u = 0; + else if (u > 255) + u = 255; + if (v < 0) + v = 0; + else if (v > 255) + v = 255; + udst[x] = u; + vdst[x] = v; + } + udst += stride; + vdst += stride; + udiff += 16; + vdiff += 16; + } +} +#endif #endif void vp9_recon_mby_c(MACROBLOCKD *xd) { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 5b7af100b636f9efb2665cec789b45b63b79b5c1..49a3a8595ed7b02871b4c1b9e56edae7ec6aa80a 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -361,6 +361,9 @@ specialize vp9_short_idct16x16 prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch" specialize vp9_short_idct10_16x16 +prototype void vp9_short_idct32x32 "short *input, short *output, int pitch" +specialize vp9_short_idct32x32 + prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim" specialize vp9_ihtllm @@ -640,6 +643,9 @@ specialize vp9_short_fdct8x4 prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch" specialize vp9_short_walsh4x4 +prototype void vp9_short_fdct32x32 "short *InputData, short *OutputData, int pitch" +specialize vp9_short_fdct32x32 + prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch" specialize vp9_short_fdct16x16 diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 46a6ee45444f25203b7a6142f5e7ac6a72699fee..89c1e458dd851b3fc4aa80b247dc34f8b3470409 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c @@ -14,7 +14,7 @@ static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 }; static const int seg_feature_data_max[SEG_LVL_MAX] = - { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX - 1}; + { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1}; // These functions provide access to new segment level features. 
// Eventually these function may be "optimized out" but for the moment, diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 7e53884f78fd4282e27d7715c344e93a16d85a03..b9f411dd2959cbcb5ba65a8681168390980b9403 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -209,8 +209,17 @@ static void kfread_modes(VP9D_COMP *pbi, m->mbmi.mode <= I8X8_PRED) { // FIXME(rbultje) code ternary symbol once all experiments are merged m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]); - if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) + if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) { m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.encoded_as_sb) + m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]); +#endif + } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.encoded_as_sb) { + m->mbmi.txfm_size = TX_32X32; +#endif } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) { m->mbmi.txfm_size = TX_16X16; } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) { @@ -1219,8 +1228,17 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, // FIXME(rbultje) code ternary symbol once all experiments are merged mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]); if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED && - mbmi->mode != SPLITMV) + mbmi->mode != SPLITMV) { mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (mbmi->encoded_as_sb && mbmi->txfm_size != TX_8X8) + mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]); +#endif + } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + } else if (mbmi->encoded_as_sb && cm->txfm_mode >= ALLOW_32X32) { + mbmi->txfm_size = TX_32X32; +#endif } else if (cm->txfm_mode >= ALLOW_16X16 && ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) || (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) { diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 76349ad86737c01359808754a23656000acad37d..7f851a18a48cde74e24e7d73f458789a6c55f0d7 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -693,6 +693,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd, TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; VP9_COMMON *const pc = &pbi->common; MODE_INFO *orig_mi = xd->mode_info_context; + const int mis = pc->mode_info_stride; assert(xd->mode_info_context->mbmi.encoded_as_sb); @@ -733,6 +734,30 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd, } /* dequantization and idct */ +#if CONFIG_TX32X32 + if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { + eobtotal = vp9_decode_sb_tokens(pbi, xd, bc); + if (eobtotal == 0) { // skip loopfilter + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + if (mb_col + 1 < pc->mb_cols) + xd->mode_info_context[1].mbmi.mb_skip_coeff = 1; + if (mb_row + 1 < pc->mb_rows) { + xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1; + if (mb_col + 1 < pc->mb_cols) + xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1; + } + } else { + vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant, + xd->dst.y_buffer, xd->dst.y_buffer, + xd->dst.y_stride, xd->dst.y_stride, + xd->eobs[0]); + vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024, + xd->block[16].dequant, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs + 16); + } + } else { 
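+    // Not coded with a single 32x32 transform; fall through to the
+    // per-MB (16x16 and smaller) decode loop below.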
+#endif for (n = 0; n < 4; n++) { int x_idx = n & 1, y_idx = n >> 1; @@ -742,7 +767,7 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->above_context = pc->above_context + mb_col + x_idx; xd->left_context = pc->left_context + y_idx; - xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride; + xd->mode_info_context = orig_mi + x_idx + y_idx * mis; for (i = 0; i < 25; i++) { xd->block[i].eob = 0; xd->eobs[i] = 0; @@ -766,6 +791,9 @@ static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->above_context = pc->above_context + mb_col; xd->left_context = pc->left_context; xd->mode_info_context = orig_mi; +#if CONFIG_TX32X32 + } +#endif } #endif @@ -1244,6 +1272,11 @@ static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) { read_coef_probs_common(bc, pc->fc.coef_probs_16x16); read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16); } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (pbi->common.txfm_mode > ALLOW_16X16) { + read_coef_probs_common(bc, pc->fc.coef_probs_32x32); + } +#endif } int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { @@ -1433,9 +1466,16 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { /* Read the loop filter level and type */ pc->txfm_mode = vp9_read_literal(&header_bc, 2); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (pc->txfm_mode == 3) + pc->txfm_mode += vp9_read_bit(&header_bc); +#endif if (pc->txfm_mode == TX_MODE_SELECT) { pc->prob_tx[0] = vp9_read_literal(&header_bc, 8); pc->prob_tx[1] = vp9_read_literal(&header_bc, 8); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + pc->prob_tx[2] = vp9_read_literal(&header_bc, 8); +#endif } pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc); @@ -1591,6 +1631,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { pbi->common.fc.coef_probs_16x16); vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16, pbi->common.fc.hybrid_coef_probs_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_copy(pbi->common.fc.pre_coef_probs_32x32, + pbi->common.fc.coef_probs_32x32); +#endif vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob); #if CONFIG_SUPERBLOCKS vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob); @@ -1610,6 +1654,9 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8); vp9_zero(pbi->common.fc.coef_counts_16x16); vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_zero(pbi->common.fc.coef_counts_32x32); +#endif vp9_zero(pbi->common.fc.ymode_counts); #if CONFIG_SUPERBLOCKS vp9_zero(pbi->common.fc.sb_ymode_counts); diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 79114d58ca1a3cfb2e9ee426f5d43887f7410650..22a66716fd2de62325d43f80eba1276f88f3e1cb 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -352,3 +352,30 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16); } } + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +void vp9_dequant_idct_add_32x32(int16_t *input, const int16_t *dq, + uint8_t *pred, uint8_t *dest, int pitch, + int stride, uint16_t eobs) { + short output[1024]; + int i; + + input[0]= input[0] * dq[0] / 2; + for (i = 1; i < 1024; i++) + input[i] = input[i] * dq[1] / 2; + vp9_short_idct32x32_c(input, output, 64); + vpx_memset(input, 0, 2048); + + add_residual(output, pred, pitch, dest, stride, 32, 32); +} + 
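+// Dequantize and reconstruct both 16x16 chroma planes of a superblock;
+// q points at the U coefficients, with V following 256 entries later
+// (hence q + 256 below).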
+void vp9_dequant_idct_add_uv_block_16x16_c(short *q, const short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, + unsigned short *eobs) { + vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]); + vp9_dequant_idct_add_16x16_c(q + 256, dq, + dstv, dstv, stride, stride, eobs[4]); +} +#endif diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 897ad5204ac0dbbc162357c8a126a7695dea197c..35a26477ad6818eef10cf03e8aa41488f04a8484 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -55,8 +55,9 @@ #define CAT5_PROB3 157 #define CAT5_PROB4 180 -static const unsigned char cat6_prob[14] = -{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; +static const unsigned char cat6_prob[15] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 +}; void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) { /* Clear entropy contexts */ @@ -161,6 +162,12 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, coef_counts = fc->hybrid_coef_counts_16x16[type]; } break; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + case TX_32X32: + coef_probs = fc->coef_probs_32x32[type]; + coef_counts = fc->coef_counts_32x32[type]; + break; +#endif } VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l); @@ -256,6 +263,54 @@ static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) { return eob; } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +int vp9_decode_sb_tokens(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc) { + ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context; + ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context; + unsigned short* const eobs = xd->eobs; + const int segment_id = xd->mode_info_context->mbmi.segment_id; + int c, i, eobtotal = 0, seg_eob; + + // Luma block + eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC, + DCT_DCT, get_eob(xd, segment_id, 1024), + xd->sb_coeff_data.qcoeff, + vp9_default_zig_zag1d_32x32, + TX_32X32, vp9_coef_bands_32x32); + A[1] = A[2] = A[3] = A[0]; + L[1] = L[2] = L[3] = L[0]; + eobtotal += c; + + // 16x16 chroma blocks + seg_eob = get_eob(xd, segment_id, 256); + for (i = 16; i < 24; i += 4) { + ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i]; + ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i]; + + eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV, + DCT_DCT, seg_eob, + xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, + vp9_default_zig_zag1d_16x16, + TX_16X16, vp9_coef_bands_16x16); + a[1] = a[0]; + l[1] = l[0]; + eobtotal += c; + } + + // no Y2 block + vpx_memset(&A[8], 0, sizeof(A[8])); + vpx_memset(&L[8], 0, sizeof(L[8])); + + vpx_memcpy(xd->above_context + 1, xd->above_context, + sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(xd->left_context + 1, xd->left_context, + sizeof(ENTROPY_CONTEXT_PLANES)); + + return eobtotal; +} +#endif static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi, MACROBLOCKD* const xd, diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 9b319d4a951c856940a10ddffcc79b763d32c8c2..09d354ea6c5ddefdd151e6e86e60590948a63b33 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -23,6 +23,12 @@ int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const, BOOL_DECODER* const); +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +int vp9_decode_sb_tokens(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc); +#endif + int 
vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd, BOOL_DECODER* const bc); diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 0adaeee0a14d6ee008b1bef942a320dbd3cb60b2..847815f50849dbff0c37d9446d72f1c8c4410748 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1200,8 +1200,13 @@ static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) { TX_SIZE sz = mi->txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]); - if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) + if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) { vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (mi->encoded_as_sb && sz != TX_8X8) + vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]); +#endif + } } #ifdef ENTROPY_STATS @@ -1337,8 +1342,13 @@ static void write_mb_modes_kf(const VP9_COMMON *c, TX_SIZE sz = m->mbmi.txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, c->prob_tx[0]); - if (sz != TX_4X4 && ym <= TM_PRED) + if (sz != TX_4X4 && ym <= TM_PRED) { vp9_write(bc, sz != TX_8X8, c->prob_tx[1]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (m->mbmi.encoded_as_sb && sz != TX_8X8) + vp9_write(bc, sz != TX_16X16, c->prob_tx[2]); +#endif + } } } @@ -1547,29 +1557,54 @@ static void build_coeff_contexts(VP9_COMP *cpi) { if (!cpi->dummy_packing) for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t]; +#endif + } + } + } + for (i = 0; i < BLOCK_TYPES_16X16; ++i) { + for (j = 0; j < COEF_BANDS; ++j) { + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + cpi->frame_hybrid_coef_probs_16x16[i][j][k], + cpi->frame_hybrid_branch_ct_16x16[i][j][k], + cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1); +#ifdef ENTROPY_STATS + if (!cpi->dummy_packing) + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + hybrid_context_counters_16x16[i][j][k][t] += + cpi->hybrid_coef_counts_16x16[i][j][k][t]; #endif } } } } - for (i = 0; i < BLOCK_TYPES_16X16; ++i) { - for (j = 0; j < COEF_BANDS; ++j) { - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - cpi->frame_hybrid_coef_probs_16x16[i][j][k], - cpi->frame_hybrid_branch_ct_16x16[i][j][k], - cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1); + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (cpi->common.txfm_mode > ALLOW_16X16) { + for (i = 0; i < BLOCK_TYPES_32X32; ++i) { + for (j = 0; j < COEF_BANDS; ++j) { + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + cpi->frame_coef_probs_32x32[i][j][k], + cpi->frame_branch_ct_32x32[i][j][k], + cpi->coef_counts_32x32[i][j][k], 256, 1); #ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t]; + if (!cpi->dummy_packing) + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + context_counters_32x32[i][j][k][t] += + cpi->coef_counts_32x32[i][j][k][t]; #endif + } } } } 
+#endif } static void update_coef_probs_common( @@ -1714,6 +1749,15 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { cpi->common.fc.hybrid_coef_probs_16x16, cpi->frame_hybrid_branch_ct_16x16); } + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (cpi->common.txfm_mode > ALLOW_16X16) { + update_coef_probs_common(bc, + cpi->frame_coef_probs_32x32, + cpi->common.fc.coef_probs_32x32, + cpi->frame_branch_ct_32x32); + } +#endif } #ifdef PACKET_TESTING @@ -1955,18 +1999,53 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, { if (pc->txfm_mode == TX_MODE_SELECT) { - pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0], - cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] + - cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]); - pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]); + pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] + + cpi->txfm_count_16x16p[TX_4X4] + + cpi->txfm_count_8x8p[TX_4X4], + cpi->txfm_count_32x32p[TX_4X4] + + cpi->txfm_count_32x32p[TX_8X8] + + cpi->txfm_count_32x32p[TX_16X16] + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + cpi->txfm_count_32x32p[TX_32X32] + +#endif + cpi->txfm_count_16x16p[TX_4X4] + + cpi->txfm_count_16x16p[TX_8X8] + + cpi->txfm_count_16x16p[TX_16X16] + + cpi->txfm_count_8x8p[TX_4X4] + + cpi->txfm_count_8x8p[TX_8X8]); + pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] + + cpi->txfm_count_16x16p[TX_8X8], + cpi->txfm_count_32x32p[TX_8X8] + + cpi->txfm_count_32x32p[TX_16X16] + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + cpi->txfm_count_32x32p[TX_32X32] + +#endif + cpi->txfm_count_16x16p[TX_8X8] + + cpi->txfm_count_16x16p[TX_16X16]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16], + cpi->txfm_count_32x32p[TX_16X16] + + cpi->txfm_count_32x32p[TX_32X32]); +#endif } else { pc->prob_tx[0] = 128; pc->prob_tx[1] = 128; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + pc->prob_tx[2] = 128; +#endif + } + vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? 
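/* prob_tx[] backs a short prefix code over transform sizes, matching the
 * per-block writes in pack_inter_mode_mvs()/write_mb_modes_kf():
 *   bit 0 (prob_tx[0]): sz != TX_4X4
 *   bit 1 (prob_tx[1]): sz != TX_8X8,   sent only if bit 0 was one
 *   bit 2 (prob_tx[2]): sz != TX_16X16, sent only for superblocks under
 *                       CONFIG_TX32X32 if bit 1 was one
 * Each prob_tx[m] is thus the frequency of the zero branch (the size that
 * terminates at bit m) among blocks reaching that bit, which is exactly
 * what the get_prob() sums above count. The two-bit literal written here
 * is clamped to 3 because ALLOW_32X32 and TX_MODE_SELECT share the field;
 * the extra bit written under CONFIG_TX32X32 disambiguates them. */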
pc->txfm_mode : 3, 2); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (pc->txfm_mode > ALLOW_16X16) { + vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT); } - vp9_write_literal(&header_bc, pc->txfm_mode, 2); +#endif if (pc->txfm_mode == TX_MODE_SELECT) { vp9_write_literal(&header_bc, pc->prob_tx[0], 8); vp9_write_literal(&header_bc, pc->prob_tx[1], 8); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_write_literal(&header_bc, pc->prob_tx[2], 8); +#endif } } @@ -2150,6 +2229,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8); vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16); vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_copy(cpi->common.fc.pre_coef_probs_32x32, + cpi->common.fc.coef_probs_32x32); +#endif #if CONFIG_SUPERBLOCKS vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob); #endif diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 4669d2ed6945dbccff7d75213c748f30c699cf37..82dc5edc1670b227db75def2cca47b73b0a10d87 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -36,9 +36,15 @@ typedef struct block { short *zbin; short *zbin_8x8; short *zbin_16x16; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + short *zbin_32x32; +#endif short *zrun_zbin_boost; short *zrun_zbin_boost_8x8; short *zrun_zbin_boost_16x16; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + short *zrun_zbin_boost_32x32; +#endif short *round; // Zbin Over Quant value @@ -52,6 +58,9 @@ typedef struct block { int eob_max_offset; int eob_max_offset_8x8; int eob_max_offset_16x16; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + int eob_max_offset_32x32; +#endif } BLOCK; typedef struct { @@ -83,6 +92,13 @@ typedef struct { int64_t txfm_rd_diff[NB_TXFM_MODES]; } PICK_MODE_CONTEXT; +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +typedef struct superblock { + DECLARE_ALIGNED(16, short, src_diff[32*32+16*16*2]); + DECLARE_ALIGNED(16, short, coeff[32*32+16*16*2]); +} SUPERBLOCK; +#endif + typedef struct macroblock { DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y DECLARE_ALIGNED(16, short, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y @@ -95,6 +111,10 @@ typedef struct macroblock { // 1 DC 2nd order block each with 16 entries BLOCK block[25]; +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 + SUPERBLOCK sb_coeff_data; +#endif + YV12_BUFFER_CONFIG src; MACROBLOCKD e_mbd; @@ -153,9 +173,9 @@ typedef struct macroblock { unsigned char *active_ptr; - unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS] + unsigned int token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS]; - unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS] + unsigned int hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS]; int optimize; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 6753f2462883d9c9b272efb8387f79b999b5c5c4..0fc8fa35edb416c177395f372ea2151dc6e7ece5 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -1330,3 +1330,461 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { #undef RIGHT_SHIFT #undef ROUNDING #endif + +#if CONFIG_TX32X32 +#if !CONFIG_DWT32X32HYBRID +static void dct32_1d(double *input, double *output, int stride) { + static const double C1 = 0.998795456205; // cos(pi * 1 / 64) + static 
const double C2 = 0.995184726672; // cos(pi * 2 / 64) + static const double C3 = 0.989176509965; // cos(pi * 3 / 64) + static const double C4 = 0.980785280403; // cos(pi * 4 / 64) + static const double C5 = 0.970031253195; // cos(pi * 5 / 64) + static const double C6 = 0.956940335732; // cos(pi * 6 / 64) + static const double C7 = 0.941544065183; // cos(pi * 7 / 64) + static const double C8 = 0.923879532511; // cos(pi * 8 / 64) + static const double C9 = 0.903989293123; // cos(pi * 9 / 64) + static const double C10 = 0.881921264348; // cos(pi * 10 / 64) + static const double C11 = 0.857728610000; // cos(pi * 11 / 64) + static const double C12 = 0.831469612303; // cos(pi * 12 / 64) + static const double C13 = 0.803207531481; // cos(pi * 13 / 64) + static const double C14 = 0.773010453363; // cos(pi * 14 / 64) + static const double C15 = 0.740951125355; // cos(pi * 15 / 64) + static const double C16 = 0.707106781187; // cos(pi * 16 / 64) + static const double C17 = 0.671558954847; // cos(pi * 17 / 64) + static const double C18 = 0.634393284164; // cos(pi * 18 / 64) + static const double C19 = 0.595699304492; // cos(pi * 19 / 64) + static const double C20 = 0.555570233020; // cos(pi * 20 / 64) + static const double C21 = 0.514102744193; // cos(pi * 21 / 64) + static const double C22 = 0.471396736826; // cos(pi * 22 / 64) + static const double C23 = 0.427555093430; // cos(pi * 23 / 64) + static const double C24 = 0.382683432365; // cos(pi * 24 / 64) + static const double C25 = 0.336889853392; // cos(pi * 25 / 64) + static const double C26 = 0.290284677254; // cos(pi * 26 / 64) + static const double C27 = 0.242980179903; // cos(pi * 27 / 64) + static const double C28 = 0.195090322016; // cos(pi * 28 / 64) + static const double C29 = 0.146730474455; // cos(pi * 29 / 64) + static const double C30 = 0.098017140330; // cos(pi * 30 / 64) + static const double C31 = 0.049067674327; // cos(pi * 31 / 64) + + double step[32]; + + // Stage 1 + step[0] = input[stride*0] + input[stride*(32 - 1)]; + step[1] = input[stride*1] + input[stride*(32 - 2)]; + step[2] = input[stride*2] + input[stride*(32 - 3)]; + step[3] = input[stride*3] + input[stride*(32 - 4)]; + step[4] = input[stride*4] + input[stride*(32 - 5)]; + step[5] = input[stride*5] + input[stride*(32 - 6)]; + step[6] = input[stride*6] + input[stride*(32 - 7)]; + step[7] = input[stride*7] + input[stride*(32 - 8)]; + step[8] = input[stride*8] + input[stride*(32 - 9)]; + step[9] = input[stride*9] + input[stride*(32 - 10)]; + step[10] = input[stride*10] + input[stride*(32 - 11)]; + step[11] = input[stride*11] + input[stride*(32 - 12)]; + step[12] = input[stride*12] + input[stride*(32 - 13)]; + step[13] = input[stride*13] + input[stride*(32 - 14)]; + step[14] = input[stride*14] + input[stride*(32 - 15)]; + step[15] = input[stride*15] + input[stride*(32 - 16)]; + step[16] = -input[stride*16] + input[stride*(32 - 17)]; + step[17] = -input[stride*17] + input[stride*(32 - 18)]; + step[18] = -input[stride*18] + input[stride*(32 - 19)]; + step[19] = -input[stride*19] + input[stride*(32 - 20)]; + step[20] = -input[stride*20] + input[stride*(32 - 21)]; + step[21] = -input[stride*21] + input[stride*(32 - 22)]; + step[22] = -input[stride*22] + input[stride*(32 - 23)]; + step[23] = -input[stride*23] + input[stride*(32 - 24)]; + step[24] = -input[stride*24] + input[stride*(32 - 25)]; + step[25] = -input[stride*25] + input[stride*(32 - 26)]; + step[26] = -input[stride*26] + input[stride*(32 - 27)]; + step[27] = -input[stride*27] + input[stride*(32 - 28)]; + step[28] = 
-input[stride*28] + input[stride*(32 - 29)]; + step[29] = -input[stride*29] + input[stride*(32 - 30)]; + step[30] = -input[stride*30] + input[stride*(32 - 31)]; + step[31] = -input[stride*31] + input[stride*(32 - 32)]; + + // Stage 2 + output[stride*0] = step[0] + step[16 - 1]; + output[stride*1] = step[1] + step[16 - 2]; + output[stride*2] = step[2] + step[16 - 3]; + output[stride*3] = step[3] + step[16 - 4]; + output[stride*4] = step[4] + step[16 - 5]; + output[stride*5] = step[5] + step[16 - 6]; + output[stride*6] = step[6] + step[16 - 7]; + output[stride*7] = step[7] + step[16 - 8]; + output[stride*8] = -step[8] + step[16 - 9]; + output[stride*9] = -step[9] + step[16 - 10]; + output[stride*10] = -step[10] + step[16 - 11]; + output[stride*11] = -step[11] + step[16 - 12]; + output[stride*12] = -step[12] + step[16 - 13]; + output[stride*13] = -step[13] + step[16 - 14]; + output[stride*14] = -step[14] + step[16 - 15]; + output[stride*15] = -step[15] + step[16 - 16]; + + output[stride*16] = step[16]; + output[stride*17] = step[17]; + output[stride*18] = step[18]; + output[stride*19] = step[19]; + + output[stride*20] = (-step[20] + step[27])*C16; + output[stride*21] = (-step[21] + step[26])*C16; + output[stride*22] = (-step[22] + step[25])*C16; + output[stride*23] = (-step[23] + step[24])*C16; + + output[stride*24] = (step[24] + step[23])*C16; + output[stride*25] = (step[25] + step[22])*C16; + output[stride*26] = (step[26] + step[21])*C16; + output[stride*27] = (step[27] + step[20])*C16; + + output[stride*28] = step[28]; + output[stride*29] = step[29]; + output[stride*30] = step[30]; + output[stride*31] = step[31]; + + // Stage 3 + step[0] = output[stride*0] + output[stride*(8 - 1)]; + step[1] = output[stride*1] + output[stride*(8 - 2)]; + step[2] = output[stride*2] + output[stride*(8 - 3)]; + step[3] = output[stride*3] + output[stride*(8 - 4)]; + step[4] = -output[stride*4] + output[stride*(8 - 5)]; + step[5] = -output[stride*5] + output[stride*(8 - 6)]; + step[6] = -output[stride*6] + output[stride*(8 - 7)]; + step[7] = -output[stride*7] + output[stride*(8 - 8)]; + step[8] = output[stride*8]; + step[9] = output[stride*9]; + step[10] = (-output[stride*10] + output[stride*13])*C16; + step[11] = (-output[stride*11] + output[stride*12])*C16; + step[12] = (output[stride*12] + output[stride*11])*C16; + step[13] = (output[stride*13] + output[stride*10])*C16; + step[14] = output[stride*14]; + step[15] = output[stride*15]; + + step[16] = output[stride*16] + output[stride*23]; + step[17] = output[stride*17] + output[stride*22]; + step[18] = output[stride*18] + output[stride*21]; + step[19] = output[stride*19] + output[stride*20]; + step[20] = -output[stride*20] + output[stride*19]; + step[21] = -output[stride*21] + output[stride*18]; + step[22] = -output[stride*22] + output[stride*17]; + step[23] = -output[stride*23] + output[stride*16]; + step[24] = -output[stride*24] + output[stride*31]; + step[25] = -output[stride*25] + output[stride*30]; + step[26] = -output[stride*26] + output[stride*29]; + step[27] = -output[stride*27] + output[stride*28]; + step[28] = output[stride*28] + output[stride*27]; + step[29] = output[stride*29] + output[stride*26]; + step[30] = output[stride*30] + output[stride*25]; + step[31] = output[stride*31] + output[stride*24]; + + // Stage 4 + output[stride*0] = step[0] + step[3]; + output[stride*1] = step[1] + step[2]; + output[stride*2] = -step[2] + step[1]; + output[stride*3] = -step[3] + step[0]; + output[stride*4] = step[4]; + output[stride*5] = (-step[5] + step[6])*C16; 
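/* The stages of this 32-point transform interleave two fast-DCT
 * primitives: add/subtract butterflies, a' = a + b, b' = a - b, which need
 * no multiplies, and plane rotations, a' = a*cos(t) + b*sin(t),
 * b' = b*cos(t) - a*sin(t), where the Ck constants are cos(pi*k/64) (so
 * sin(pi*k/64) == C(32-k)). The bare C16 products are the 45-degree case,
 * where cos(t) == sin(t) == 1/sqrt(2). */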
+ output[stride*6] = (step[6] + step[5])*C16; + output[stride*7] = step[7]; + output[stride*8] = step[8] + step[11]; + output[stride*9] = step[9] + step[10]; + output[stride*10] = -step[10] + step[9]; + output[stride*11] = -step[11] + step[8]; + output[stride*12] = -step[12] + step[15]; + output[stride*13] = -step[13] + step[14]; + output[stride*14] = step[14] + step[13]; + output[stride*15] = step[15] + step[12]; + + output[stride*16] = step[16]; + output[stride*17] = step[17]; + output[stride*18] = step[18]*-C8 + step[29]*C24; + output[stride*19] = step[19]*-C8 + step[28]*C24; + output[stride*20] = step[20]*-C24 + step[27]*-C8; + output[stride*21] = step[21]*-C24 + step[26]*-C8; + output[stride*22] = step[22]; + output[stride*23] = step[23]; + output[stride*24] = step[24]; + output[stride*25] = step[25]; + output[stride*26] = step[26]*C24 + step[21]*-C8; + output[stride*27] = step[27]*C24 + step[20]*-C8; + output[stride*28] = step[28]*C8 + step[19]*C24; + output[stride*29] = step[29]*C8 + step[18]*C24; + output[stride*30] = step[30]; + output[stride*31] = step[31]; + + // Stage 5 + step[0] = (output[stride*0] + output[stride*1]) * C16; + step[1] = (-output[stride*1] + output[stride*0]) * C16; + step[2] = output[stride*2]*C24 + output[stride*3] * C8; + step[3] = output[stride*3]*C24 - output[stride*2] * C8; + step[4] = output[stride*4] + output[stride*5]; + step[5] = -output[stride*5] + output[stride*4]; + step[6] = -output[stride*6] + output[stride*7]; + step[7] = output[stride*7] + output[stride*6]; + step[8] = output[stride*8]; + step[9] = output[stride*9]*-C8 + output[stride*14]*C24; + step[10] = output[stride*10]*-C24 + output[stride*13]*-C8; + step[11] = output[stride*11]; + step[12] = output[stride*12]; + step[13] = output[stride*13]*C24 + output[stride*10]*-C8; + step[14] = output[stride*14]*C8 + output[stride*9]*C24; + step[15] = output[stride*15]; + + step[16] = output[stride*16] + output[stride*19]; + step[17] = output[stride*17] + output[stride*18]; + step[18] = -output[stride*18] + output[stride*17]; + step[19] = -output[stride*19] + output[stride*16]; + step[20] = -output[stride*20] + output[stride*23]; + step[21] = -output[stride*21] + output[stride*22]; + step[22] = output[stride*22] + output[stride*21]; + step[23] = output[stride*23] + output[stride*20]; + step[24] = output[stride*24] + output[stride*27]; + step[25] = output[stride*25] + output[stride*26]; + step[26] = -output[stride*26] + output[stride*25]; + step[27] = -output[stride*27] + output[stride*24]; + step[28] = -output[stride*28] + output[stride*31]; + step[29] = -output[stride*29] + output[stride*30]; + step[30] = output[stride*30] + output[stride*29]; + step[31] = output[stride*31] + output[stride*28]; + + // Stage 6 + output[stride*0] = step[0]; + output[stride*1] = step[1]; + output[stride*2] = step[2]; + output[stride*3] = step[3]; + output[stride*4] = step[4]*C28 + step[7]*C4; + output[stride*5] = step[5]*C12 + step[6]*C20; + output[stride*6] = step[6]*C12 + step[5]*-C20; + output[stride*7] = step[7]*C28 + step[4]*-C4; + output[stride*8] = step[8] + step[9]; + output[stride*9] = -step[9] + step[8]; + output[stride*10] = -step[10] + step[11]; + output[stride*11] = step[11] + step[10]; + output[stride*12] = step[12] + step[13]; + output[stride*13] = -step[13] + step[12]; + output[stride*14] = -step[14] + step[15]; + output[stride*15] = step[15] + step[14]; + + output[stride*16] = step[16]; + output[stride*17] = step[17]*-C4 + step[30]*C28; + output[stride*18] = step[18]*-C28 + step[29]*-C4; + 
output[stride*19] = step[19];
+ output[stride*20] = step[20];
+ output[stride*21] = step[21]*-C20 + step[26]*C12;
+ output[stride*22] = step[22]*-C12 + step[25]*-C20;
+ output[stride*23] = step[23];
+ output[stride*24] = step[24];
+ output[stride*25] = step[25]*C12 + step[22]*-C20;
+ output[stride*26] = step[26]*C20 + step[21]*C12;
+ output[stride*27] = step[27];
+ output[stride*28] = step[28];
+ output[stride*29] = step[29]*C28 + step[18]*-C4;
+ output[stride*30] = step[30]*C4 + step[17]*C28;
+ output[stride*31] = step[31];
+
+ // Stage 7
+ step[0] = output[stride*0];
+ step[1] = output[stride*1];
+ step[2] = output[stride*2];
+ step[3] = output[stride*3];
+ step[4] = output[stride*4];
+ step[5] = output[stride*5];
+ step[6] = output[stride*6];
+ step[7] = output[stride*7];
+ step[8] = output[stride*8]*C30 + output[stride*15]*C2;
+ step[9] = output[stride*9]*C14 + output[stride*14]*C18;
+ step[10] = output[stride*10]*C22 + output[stride*13]*C10;
+ step[11] = output[stride*11]*C6 + output[stride*12]*C26;
+ step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
+ step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
+ step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
+ step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
+
+ step[16] = output[stride*16] + output[stride*17];
+ step[17] = -output[stride*17] + output[stride*16];
+ step[18] = -output[stride*18] + output[stride*19];
+ step[19] = output[stride*19] + output[stride*18];
+ step[20] = output[stride*20] + output[stride*21];
+ step[21] = -output[stride*21] + output[stride*20];
+ step[22] = -output[stride*22] + output[stride*23];
+ step[23] = output[stride*23] + output[stride*22];
+ step[24] = output[stride*24] + output[stride*25];
+ step[25] = -output[stride*25] + output[stride*24];
+ step[26] = -output[stride*26] + output[stride*27];
+ step[27] = output[stride*27] + output[stride*26];
+ step[28] = output[stride*28] + output[stride*29];
+ step[29] = -output[stride*29] + output[stride*28];
+ step[30] = -output[stride*30] + output[stride*31];
+ step[31] = output[stride*31] + output[stride*30];
+
+ // Final stage --- output indices are bit-reversed. 
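/* Concretely: coefficient k of the 1-D transform is stored at index
 * bitrev5(k), the 5-bit bit-reversal (0->0, 1->16, 2->8, 3->24, 4->4, ...,
 * 31->31). That is why step[1] lands at output[stride*16] below (00001b
 * reversed is 10000b), and no separate reordering pass is needed. */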
+ output[stride*0] = step[0]; + output[stride*16] = step[1]; + output[stride*8] = step[2]; + output[stride*24] = step[3]; + output[stride*4] = step[4]; + output[stride*20] = step[5]; + output[stride*12] = step[6]; + output[stride*28] = step[7]; + output[stride*2] = step[8]; + output[stride*18] = step[9]; + output[stride*10] = step[10]; + output[stride*26] = step[11]; + output[stride*6] = step[12]; + output[stride*22] = step[13]; + output[stride*14] = step[14]; + output[stride*30] = step[15]; + + output[stride*1] = step[16]*C31 + step[31]*C1; + output[stride*17] = step[17]*C15 + step[30]*C17; + output[stride*9] = step[18]*C23 + step[29]*C9; + output[stride*25] = step[19]*C7 + step[28]*C25; + output[stride*5] = step[20]*C27 + step[27]*C5; + output[stride*21] = step[21]*C11 + step[26]*C21; + output[stride*13] = step[22]*C19 + step[25]*C13; + output[stride*29] = step[23]*C3 + step[24]*C29; + output[stride*3] = step[24]*C3 + step[23]*-C29; + output[stride*19] = step[25]*C19 + step[22]*-C13; + output[stride*11] = step[26]*C11 + step[21]*-C21; + output[stride*27] = step[27]*C27 + step[20]*-C5; + output[stride*7] = step[28]*C7 + step[19]*-C25; + output[stride*23] = step[29]*C23 + step[18]*-C9; + output[stride*15] = step[30]*C15 + step[17]*-C17; + output[stride*31] = step[31]*C31 + step[16]*-C1; +} + +void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + int shortpitch = pitch >> 1; + int i, j; + double output[1024]; + // First transform columns + for (i = 0; i < 32; i++) { + double temp_in[32], temp_out[32]; + for (j = 0; j < 32; j++) + temp_in[j] = input[j*shortpitch + i]; + dct32_1d(temp_in, temp_out, 1); + for (j = 0; j < 32; j++) + output[j*32 + i] = temp_out[j]; + } + // Then transform rows + for (i = 0; i < 32; ++i) { + double temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i*32]; + dct32_1d(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) + output[j + i*32] = temp_out[j]; + } + // Scale by some magic number + for (i = 0; i < 1024; i++) { + out[i] = (short)round(output[i]/4); + } + } + + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} + +#else // CONFIG_DWT32X32HYBRID + +#define MAX_BLOCK_LENGTH 64 +#define ENH_PRECISION_BITS 1 +#define ENH_PRECISION_RND ((1 << ENH_PRECISION_BITS) / 2) + +// Note: block length must be even for this implementation +static void analysis_53_row(int length, short *x, + short *lowpass, short *highpass) { + int n; + short r, * a, * b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++) << 1; + *b++ = *x - ((r + x[1] + 1) >> 1); + x++; + } + *a = (r = *x++) << 1; + *b = *x - r; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +static void analysis_53_col(int length, short *x, + short *lowpass, short *highpass) { + int n; + short r, * a, * b; + + n = length >> 1; + b = highpass; + a = lowpass; + while (--n) { + *a++ = (r = *x++); + *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2; + x++; + } + *a = (r = *x++); + *b = (*x - r + 1) >> 1; + + n = length >> 1; + b = highpass; + a = lowpass; + r = *highpass; + while (n--) { + *a++ += (r + (*b) + 1) >> 1; + r = *b++; + } +} + +// NOTE: Using a 5/3 integer wavelet for now. 
Explore using a wavelet +// with a better response later +static void dyadic_analyze(int levels, int width, int height, + short *x, int pitch_x, short *c, int pitch_c) { + int lv, i, j, nh, nw, hh = height, hw = width; + short buffer[2 * MAX_BLOCK_LENGTH]; + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + c[i * pitch_c + j] = x[i * pitch_x + j] << ENH_PRECISION_BITS; + } + } + for (lv = 0; lv < levels; lv++) { + nh = hh; + hh = (hh + 1) >> 1; + nw = hw; + hw = (hw + 1) >> 1; + if ((nh < 2) || (nw < 2)) return; + for (i = 0; i < nh; i++) { + memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); + analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); + } + for (j = 0; j < nw; j++) { + for (i = 0; i < nh; i++) + buffer[i + nh] = c[i * pitch_c + j]; + analysis_53_col(nh, buffer + nh, buffer, buffer + hh); + for (i = 0; i < nh; i++) + c[i * pitch_c + j] = buffer[i]; + } + } +} + +void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { + // assume out is a 32x32 buffer + short buffer[16 * 16]; + int i; + const int short_pitch = pitch >> 1; + dyadic_analyze(1, 32, 32, input, short_pitch, out, 32); + // TODO(debargha): Implement more efficiently by adding output pitch + // argument to the dct16x16 function + vp9_short_fdct16x16_c(out, buffer, 64); + for (i = 0; i < 16; ++i) + vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); +} +#endif // CONFIG_DWT32X32HYBRID +#endif // CONFIG_TX32X32 diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 6ab23cae0b2f43480dea341bd2e1ea362e5b1703..f504fc53c73bd47be8e252bb7ff6365291160d10 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -456,6 +456,10 @@ static void update_state(VP9_COMP *cpi, MACROBLOCK *x, if (xd->mb_to_right_edge >= 0) vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO)); } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + } else { + ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16]; +#endif } #endif @@ -1487,6 +1491,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cpi->hybrid_coef_counts_8x8); vp9_zero(cpi->coef_counts_16x16); vp9_zero(cpi->hybrid_coef_counts_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_zero(cpi->coef_counts_32x32); +#endif vp9_frame_init_quantizer(cpi); @@ -1507,7 +1514,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff)); vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count)); vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count)); - vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count)); + vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p)); + vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p)); vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p)); vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff)); { @@ -1700,7 +1708,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { * keyframe's probabilities as an estimate of what the current keyframe's * coefficient cost distributions may look like. 
*/ if (frame_type == 0) { +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + txfm_type = ALLOW_32X32; +#else txfm_type = ALLOW_16X16; +#endif } else #if 0 /* FIXME (rbultje) @@ -1731,9 +1743,15 @@ void vp9_encode_frame(VP9_COMP *cpi) { } else txfm_type = ALLOW_8X8; #else - txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >= cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? + ALLOW_32X32 : TX_MODE_SELECT; +#else + txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= + cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? ALLOW_16X16 : TX_MODE_SELECT; +#endif #endif cpi->common.txfm_mode = txfm_type; if (txfm_type != TX_MODE_SELECT) { @@ -1753,7 +1771,8 @@ void vp9_encode_frame(VP9_COMP *cpi) { int64_t pd = cpi->rd_tx_select_diff[i]; int diff; if (i == TX_MODE_SELECT) - pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0); + pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, + 2048 * (TX_SIZE_MAX_SB - 1), 0); diff = (int)(pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; @@ -1776,19 +1795,37 @@ void vp9_encode_frame(VP9_COMP *cpi) { } if (cpi->common.txfm_mode == TX_MODE_SELECT) { - const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4]; - const int count8x8 = cpi->txfm_count[TX_8X8]; + const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] + + cpi->txfm_count_32x32p[TX_4X4] + + cpi->txfm_count_8x8p[TX_4X4]; + const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] + + cpi->txfm_count_16x16p[TX_8X8]; const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8]; - const int count16x16 = cpi->txfm_count[TX_16X16]; + const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16]; + const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + const int count32x32 = cpi->txfm_count_32x32p[TX_32X32]; +#else + const int count32x32 = 0; +#endif - if (count4x4 == 0 && count16x16 == 0) { + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && + count32x32 == 0) { cpi->common.txfm_mode = ALLOW_8X8; reset_skip_txfm_size(cpi, TX_8X8); - } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) { + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && + count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { cpi->common.txfm_mode = ONLY_4X4; reset_skip_txfm_size(cpi, TX_4X4); - } else if (count8x8 == 0 && count4x4 == 0) { +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { + cpi->common.txfm_mode = ALLOW_32X32; +#endif + } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { cpi->common.txfm_mode = ALLOW_16X16; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + reset_skip_txfm_size(cpi, TX_16X16); +#endif } } } else { @@ -2087,6 +2124,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x, vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag); } + assert(mbmi->txfm_size <= TX_16X16); if (mbmi->ref_frame == INTRA_FRAME) { #ifdef ENC_DEBUG if (enc_debug) { @@ -2266,7 +2304,7 @@ static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) { if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV) { - cpi->txfm_count[mbmi->txfm_size]++; + cpi->txfm_count_16x16p[mbmi->txfm_size]++; } else if (mbmi->mode == I8X8_PRED || (mbmi->mode == SPLITMV && 
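/* The former single txfm_count[] is now split by the largest transform a
 * block may select: txfm_count_32x32p for superblocks, txfm_count_16x16p
 * for whole 16x16 macroblocks (incremented just above), and
 * txfm_count_8x8p for I8X8_PRED/SPLITMV partitions capped at 8x8. The
 * three tallies feed the prob_tx[] estimates in vp9_pack_bitstream(). */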
mbmi->partitioning != PARTITIONING_4X4)) { @@ -2308,6 +2346,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x, MODE_INFO *mi = x->e_mbd.mode_info_context; unsigned int segment_id = mi->mbmi.segment_id; ENTROPY_CONTEXT_PLANES ta[4], tl[4]; + const int mis = cm->mode_info_stride; x->skip = 0; @@ -2397,6 +2436,53 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x, xd->dst.y_stride, xd->dst.uv_stride); } +#if CONFIG_TX32X32 + if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { + vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride, + dst, dst_y_stride); + vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff, + usrc, vsrc, src_uv_stride, + udst, vdst, dst_uv_stride); + vp9_transform_sby_32x32(x); + vp9_transform_sbuv_16x16(x); + vp9_quantize_sby_32x32(x); + vp9_quantize_sbuv_16x16(x); + // TODO(rbultje): trellis optimize + vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data); + vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data); + vp9_recon_sby_s_c(&x->e_mbd, dst); + vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst); + + if (!x->skip) { + vp9_tokenize_sb(cpi, &x->e_mbd, t, 0); + } else { + int mb_skip_context = + cpi->common.mb_no_coeff_skip ? + (mi - 1)->mbmi.mb_skip_coeff + + (mi - mis)->mbmi.mb_skip_coeff : + 0; + mi->mbmi.mb_skip_coeff = 1; + if (cm->mb_no_coeff_skip) { + cpi->skip_true_count[mb_skip_context]++; + vp9_fix_contexts_sb(xd); + } else { + vp9_stuff_sb(cpi, xd, t, 0); + cpi->skip_false_count[mb_skip_context]++; + } + } + + // copy skip flag on all mb_mode_info contexts in this SB + // if this was a skip at this txfm size + if (mb_col < cm->mb_cols - 1) + mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + if (mb_row < cm->mb_rows - 1) { + mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + if (mb_col < cm->mb_cols - 1) + mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + } + skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff; + } else { +#endif for (n = 0; n < 4; n++) { int x_idx = n & 1, y_idx = n >> 1; @@ -2405,7 +2491,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x, memcpy(&ta[n], xd->above_context, sizeof(ta[n])); memcpy(&tl[n], xd->left_context, sizeof(tl[n])); tp[n] = *t; - xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride; + xd->mode_info_context = mi + x_idx + y_idx * mis; vp9_subtract_mby_s_c(x->src_diff, src + x_idx * 16 + y_idx * 16 * src_y_stride, @@ -2433,7 +2519,7 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x, int mb_skip_context = cpi->common.mb_no_coeff_skip ? (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff + - (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff : + (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0; xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1; if (cpi->common.mb_no_coeff_skip) { @@ -2450,20 +2536,29 @@ static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x, xd->mode_info_context = mi; update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip); +#if CONFIG_TX32X32 + } +#endif if (cm->txfm_mode == TX_MODE_SELECT && !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) || (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { - cpi->txfm_count[mi->mbmi.txfm_size]++; + cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++; } else { - TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode; + TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? 
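/* When the whole superblock is coded (or skipped) at one transform size,
 * the decision lives in a single mode_info entry, but loopfiltering and
 * entropy-context prediction still walk the grid per 16x16 macroblock, so
 * the skip flag above and the forced transform size below are mirrored
 * into all four mode_info entries the superblock covers
 * (mi[1], mi[mis], mi[mis + 1]). */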
+#if CONFIG_TX32X32 + TX_32X32 : +#else + TX_16X16 : +#endif + cm->txfm_mode; mi->mbmi.txfm_size = sz; if (mb_col < cm->mb_cols - 1) mi[1].mbmi.txfm_size = sz; if (mb_row < cm->mb_rows - 1) { - mi[cm->mode_info_stride].mbmi.txfm_size = sz; + mi[mis].mbmi.txfm_size = sz; if (mb_col < cm->mb_cols - 1) - mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz; + mi[mis + 1].mbmi.txfm_size = sz; } } } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 091f2f0fe9b09053cfa53e2a3f603938ababc6d8..46087c28e53e41ece34856bad43c3e64a416c1d3 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -108,6 +108,52 @@ void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride, } } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride, + const unsigned char *pred, int dst_stride) { + int r, c; + + for (r = 0; r < 32; r++) { + for (c = 0; c < 32; c++) { + diff[c] = src[c] - pred[c]; + } + + diff += 32; + pred += dst_stride; + src += src_stride; + } +} + +void vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc, + const unsigned char *vsrc, int src_stride, + const unsigned char *upred, + const unsigned char *vpred, int dst_stride) { + short *udiff = diff + 1024; + short *vdiff = diff + 1024 + 256; + int r, c; + + for (r = 0; r < 16; r++) { + for (c = 0; c < 16; c++) { + udiff[c] = usrc[c] - upred[c]; + } + + udiff += 16; + upred += dst_stride; + usrc += src_stride; + } + + for (r = 0; r < 16; r++) { + for (c = 0; c < 16; c++) { + vdiff[c] = vsrc[c] - vpred[c]; + } + + vdiff += 16; + vpred += dst_stride; + vsrc += src_stride; + } +} +#endif + void vp9_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) { vp9_subtract_mby_s_c(diff, src, stride, pred, 16); @@ -265,6 +311,22 @@ void vp9_transform_mb_16x16(MACROBLOCK *x) { vp9_transform_mbuv_8x8(x); } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +void vp9_transform_sby_32x32(MACROBLOCK *x) { + SUPERBLOCK * const x_sb = &x->sb_coeff_data; + vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64); +} + +void vp9_transform_sbuv_16x16(MACROBLOCK *x) { + SUPERBLOCK * const x_sb = &x->sb_coeff_data; + vp9_clear_system_state(); + x->vp9_short_fdct16x16(x_sb->src_diff + 1024, + x_sb->coeff + 1024, 32); + x->vp9_short_fdct16x16(x_sb->src_diff + 1280, + x_sb->coeff + 1280, 32); +} +#endif + #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) typedef struct vp9_token_state vp9_token_state; diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 4f49647a2c76127c52f14ddaaca513eef423ac4a..3c0a0a5a2376d711ae4f02fcbcb5e35a247b178f 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -47,6 +47,11 @@ void vp9_transform_mb_16x16(MACROBLOCK *mb); void vp9_transform_mby_16x16(MACROBLOCK *x); void vp9_optimize_mby_16x16(MACROBLOCK *x); +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +void vp9_transform_sby_32x32(MACROBLOCK *x); +void vp9_transform_sbuv_16x16(MACROBLOCK *x); +#endif + void vp9_fidct_mb(MACROBLOCK *x); void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch); @@ -59,6 +64,14 @@ void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc, void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride, const unsigned char *pred, int dst_stride); +#if CONFIG_TX32X32 +void vp9_subtract_sby_s_c(short *diff, const unsigned char *src, int src_stride, + const unsigned char *pred, int dst_stride); +void 
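/* The superblock residual buffer mirrors the TX_32X32 coefficient layout:
 * diff[0..1023] holds the 32x32 Y residual at a fixed pitch of 32 shorts,
 * diff[1024..1279] and diff[1280..1535] the 16x16 U and V residuals at
 * pitch 16. Hence the + 1024 and + 1024 + 256 offsets in
 * vp9_subtract_sbuv_s_c(), and the pitch argument of 32 (bytes, i.e. 16
 * shorts per row) that vp9_transform_sbuv_16x16() passes to the 16x16
 * forward transform. */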
vp9_subtract_sbuv_s_c(short *diff, const unsigned char *usrc, + const unsigned char *vsrc, int src_stride, + const unsigned char *upred, + const unsigned char *vpred, int dst_stride); +#endif #endif #endif diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 0fe7a14b6e1852a3efeabdb14167b35aa3496e0e..779534bac1b1d9ac5399fbda05729ac16beab90a 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1810,7 +1810,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { #endif for (i = 0; i < COMP_PRED_CONTEXTS; i++) cm->prob_comppred[i] = 128; - for (i = 0; i < TX_SIZE_MAX - 1; i++) + for (i = 0; i < TX_SIZE_MAX_SB - 1; i++) cm->prob_tx[i] = 128; // Prime the recent reference frame useage counters. @@ -3698,6 +3698,9 @@ static void encode_frame_to_data_rate vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16); vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16, cpi->hybrid_coef_counts_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32); +#endif vp9_adapt_coef_probs(&cpi->common); if (cpi->common.frame_type != KEY_FRAME) { #if CONFIG_SUPERBLOCKS diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 511e62f1c85776012cef5da467438502977f9465..28acc96d47a2a7a09446ef978db6707a66be2a2b 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -109,6 +109,11 @@ typedef struct { vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_prob coef_probs_32x32[BLOCK_TYPES_32X32] + [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; +#endif + #if CONFIG_SUPERBLOCKS vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; #endif @@ -435,6 +440,15 @@ typedef struct VP9_COMP { DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]); + DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]); + DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]); + DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]); +#endif + MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; @@ -483,8 +497,9 @@ typedef struct VP9_COMP { int comp_pred_count[COMP_PRED_CONTEXTS]; int single_pred_count[COMP_PRED_CONTEXTS]; // FIXME contextualize - int txfm_count[TX_SIZE_MAX]; - int txfm_count_8x8p[TX_SIZE_MAX - 1]; + int txfm_count_32x32p[TX_SIZE_MAX_SB]; + int txfm_count_16x16p[TX_SIZE_MAX_MB]; + int txfm_count_8x8p[TX_SIZE_MAX_MB - 1]; int64_t rd_tx_select_diff[NB_TXFM_MODES]; int rd_tx_select_threshes[4][NB_TXFM_MODES]; @@ -604,6 +619,12 @@ typedef struct VP9_COMP { vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2]; +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 + unsigned int coef_counts_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; /* for this frame */ + vp9_prob frame_coef_probs_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + unsigned int frame_branch_ct_32x32 [BLOCK_TYPES_32X32] [COEF_BANDS] 
[PREV_COEF_CONTEXTS] [ENTROPY_NODES][2]; +#endif + int gfu_boost; int last_boost; int kf_boost; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index f160edb565e112907345c9321e99b938e9de9587..fcc7d2948b00d011f6fe702e87d8e3d505159f0a 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -323,28 +323,25 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) { vp9_quantize_mbuv_8x8(x); } -void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) { +static void quantize(short *zbin_boost_orig_ptr, + short *coeff_ptr, int n_coeffs, int max_coeffs, + short *zbin_ptr, short *round_ptr, short *quant_ptr, + unsigned char *quant_shift_ptr, + short *qcoeff_ptr, short *dqcoeff_ptr, + short *dequant_ptr, short zbin_oq_value, + int *eob_ptr, const int *scan, int mul) { int i, rc, eob; int zbin; int x, y, z, sz; - short *zbin_boost_ptr = b->zrun_zbin_boost_16x16; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin_16x16; - short *round_ptr = b->round; - short *quant_ptr = b->quant; - unsigned char *quant_shift_ptr = b->quant_shift; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - short zbin_oq_value = b->zbin_extra; + short *zbin_boost_ptr = zbin_boost_orig_ptr; - vpx_memset(qcoeff_ptr, 0, 256*sizeof(short)); - vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short)); + vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(short)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(short)); eob = -1; - for (i = 0; i < b->eob_max_offset_16x16; i++) { - rc = vp9_default_zig_zag1d_16x16[i]; - z = coeff_ptr[rc]; + for (i = 0; i < max_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * mul; zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value); zbin_boost_ptr ++; @@ -354,22 +351,70 @@ void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) { if (x >= zbin) { x += (round_ptr[rc!=0]); - y = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x)) + y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) >> quant_shift_ptr[rc!=0]; // quantize (x) x = (y ^ sz) - sz; // get the sign back qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0]; // dequantized value + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value if (y) { eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost_16x16; + zbin_boost_ptr = zbin_boost_orig_ptr; } } } - d->eob = eob + 1; + *eob_ptr = eob + 1; +} + +void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) { + quantize(b->zrun_zbin_boost_16x16, + b->coeff, + 256, b->eob_max_offset_16x16, + b->zbin_16x16, b->round, b->quant, b->quant_shift, + d->qcoeff, + d->dqcoeff, + d->dequant, + b->zbin_extra, + &d->eob, vp9_default_zig_zag1d_16x16, 1); +} + +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS +void vp9_quantize_sby_32x32(MACROBLOCK *x) { + x->e_mbd.block[0].eob = 0; + quantize(x->block[0].zrun_zbin_boost_32x32, + x->sb_coeff_data.coeff, + 1024, x->block[0].eob_max_offset_32x32, + x->block[0].zbin_32x32, + x->block[0].round, x->block[0].quant, x->block[0].quant_shift, + x->e_mbd.sb_coeff_data.qcoeff, + x->e_mbd.sb_coeff_data.dqcoeff, + x->e_mbd.block[0].dequant, + x->block[0].zbin_extra, + &x->e_mbd.block[0].eob, + vp9_default_zig_zag1d_32x32, 2); } +void vp9_quantize_sbuv_16x16(MACROBLOCK *x) { + int i; + + x->e_mbd.block[16].eob = 0; + x->e_mbd.block[20].eob = 0; + for (i = 16; i < 24; i += 4) + quantize(x->block[i].zrun_zbin_boost_16x16, + x->sb_coeff_data.coeff + 1024 + (i - 16) * 64, + 256, x->block[i].eob_max_offset_16x16, + x->block[i].zbin_16x16, + 
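/* quantize() scales each input coefficient by 'mul' and divides the
 * dequantized value back down, so the 32x32 luma call above (mul == 2)
 * effectively quantizes at doubled precision to compensate for the extra
 * downscaling in vp9_short_fdct32x32; these 16x16 chroma calls use
 * mul == 1. Note that the quant argument on the next line reads
 * x->block[0].quant while every other per-block pointer uses index i; for
 * the chroma blocks (i = 16..23) that selects the luma quantizer, which
 * looks like an oversight. */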
x->block[i].round, x->block[0].quant, x->block[i].quant_shift, + x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, + x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64, + x->e_mbd.block[i].dequant, + x->block[i].zbin_extra, + &x->e_mbd.block[i].eob, + vp9_default_zig_zag1d_16x16, 1); +} +#endif + /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of * these two C functions if corresponding optimized routine is not available. * NEON optimized version implements currently the fast quantization for pair @@ -427,6 +472,74 @@ void vp9_init_quantizer(VP9_COMP *cpi) { 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, }; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + static const int zbin_boost_32x32[1024] = { + 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 
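/* zbin_boost_32x32 extends the zero-run boost to 1024 scan positions: in
 * quantize(), the effective zero bin grows with the current run of zeros
 * and is reset on every nonzero coefficient, biasing the quantizer toward
 * ending blocks early. Like the 8x8/16x16 tables, it saturates at 48 after
 * the first few positions, which is why the remainder of this initializer
 * is a flat run of 48s. */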
48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, + }; +#endif int qrounding_factor = 48; @@ -454,7 +567,13 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; cpi->zrun_zbin_boost_y1_8x8[Q][0] = ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; + cpi->zrun_zbin_boost_y1_16x16[Q][0] = + ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; + cpi->zrun_zbin_boost_y1_32x32[Q][0] = + ((quant_val * zbin_boost_32x32[0]) + 64) >> 7; +#endif quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q); @@ -468,7 +587,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; cpi->zrun_zbin_boost_y2_8x8[Q][0] = ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; + cpi->zrun_zbin_boost_y2_16x16[Q][0] = + ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q); invert_quant(cpi->UVquant[Q] + 0, @@ -481,7 +601,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; cpi->zrun_zbin_boost_uv_8x8[Q][0] = ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; + cpi->zrun_zbin_boost_uv_16x16[Q][0] = + ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; // all the 4x4 ac values =; for (i = 1; i < 16; i++) { @@ -543,16 +664,30 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quant_val = vp9_ac_yquant(Q); cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; + cpi->zrun_zbin_boost_y1_16x16[Q][i] = + ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - 
cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; + cpi->zrun_zbin_boost_y2_16x16[Q][i] = + ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; + cpi->zrun_zbin_boost_uv_16x16[Q][i] = + ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + // 32x32 structures. Same comment above applies. + for (i = 1; i < 1024; i++) { + int rc = vp9_default_zig_zag1d_32x32[i]; + + quant_val = vp9_ac_yquant(Q); + cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; + cpi->zrun_zbin_boost_y1_32x32[Q][i] = + ((quant_val * zbin_boost_32x32[i]) + 64) >> 7; + } +#endif } } @@ -592,11 +727,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->block[i].zbin = cpi->Y1zbin[QIndex]; x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex]; x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex]; +#endif x->block[i].round = cpi->Y1round[QIndex]; x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex]; x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex]; x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex]; x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex]; +#endif x->block[i].zbin_extra = (short)zbin_extra; // Segment max eob offset feature. @@ -607,10 +748,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); x->block[i].eob_max_offset_16x16 = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + x->block[i].eob_max_offset_32x32 = + vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); +#endif } else { x->block[i].eob_max_offset = 16; x->block[i].eob_max_offset_8x8 = 64; x->block[i].eob_max_offset_16x16 = 256; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + x->block[i].eob_max_offset_32x32 = 1024; +#endif } } @@ -640,9 +788,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); x->block[i].eob_max_offset_8x8 = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + x->block[i].eob_max_offset_16x16 = + vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); } else { x->block[i].eob_max_offset = 16; x->block[i].eob_max_offset_8x8 = 64; + x->block[i].eob_max_offset_16x16 = 256; } } diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index dd11e75bab5f0bd2309100a5733d28b6b6462329..832a486f505b4125a21ad026d12dec6fb975dcde 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -78,6 +78,11 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x); extern prototype_quantize_block(vp9_quantize_quantb_16x16); extern prototype_quantize_mb(vp9_quantize_mby_16x16); +#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32 +void vp9_quantize_sby_32x32(MACROBLOCK *x); +void vp9_quantize_sbuv_16x16(MACROBLOCK *x); +#endif + struct VP9_COMP; extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b589243e8207398f2bd03d60a999bd12bc638b28..c896e41b1eb9cfd14423ae0d27598bdf9d0278cf 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -175,6 +175,9 @@ void 
vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8); vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16); vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32); +#endif vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED cc->interintra_prob = cm->fc.interintra_prob; @@ -234,6 +237,9 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8); vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16); vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32); +#endif vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED cm->fc.interintra_prob = cc->interintra_prob; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9cea189698b47bab3ccbd653fdd947435962f968..60f14f8feaf60e7475cd09bbf5e82d5417a05a67 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -400,12 +400,18 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) { cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + fill_token_costs( + cpi->mb.token_costs[TX_32X32], + (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_32x32, + BLOCK_TYPES_32X32); +#endif + /*rough estimate for costing*/ cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4; vp9_init_mode_costs(cpi); - if (cpi->common.frame_type != KEY_FRAME) - { + if (cpi->common.frame_type != KEY_FRAME) { vp9_build_nmv_cost_table( cpi->mb.nmvjointcost, cpi->mb.e_mbd.allow_high_precision_mv ? @@ -556,7 +562,7 @@ static int cost_coeffs_2x2(MACROBLOCK *mb, static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - int tx_size) { + TX_SIZE tx_size) { const int eob = b->eob; int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */ int cost = 0, default_eob, seg_eob; @@ -613,9 +619,24 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type, default_eob = 256; if (type == PLANE_TYPE_Y_WITH_DC) { tx_type = get_tx_type_16x16(xd, b); +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + } else if (type == PLANE_TYPE_UV) { + int ib = (int)(b - xd->block) - 16; + + qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * ib; +#endif } break; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + case TX_32X32: + scan = vp9_default_zig_zag1d_32x32; + band = vp9_coef_bands_32x32; + default_eob = 1024; + qcoeff_ptr = xd->sb_coeff_data.qcoeff; + break; +#endif default: + abort(); break; } if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB)) @@ -813,23 +834,28 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion, } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int r[2][TX_SIZE_MAX], int *rate, - int d[TX_SIZE_MAX], int *distortion, - int s[TX_SIZE_MAX], int *skip, - int64_t txfm_cache[NB_TXFM_MODES]) { + int (*r)[2], int *rate, + int *d, int *distortion, + int *s, int *skip, + int64_t txfm_cache[NB_TXFM_MODES], + TX_SIZE max_txfm_size) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; vp9_prob skip_prob = cm->mb_no_coeff_skip ? 
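/* The r[n][1] loop below folds the transform-size signalling cost into
 * each candidate: size n pays a "one" bit against prob_tx[m] for every
 * m < n, plus a terminating "zero" bit against prob_tx[n] unless n is
 * max_txfm_size, whose prefix needs no terminator. E.g. with
 * max_txfm_size == TX_32X32, TX_16X16 costs vp9_cost_one(prob_tx[0]) +
 * vp9_cost_one(prob_tx[1]) + vp9_cost_zero(prob_tx[2]), while TX_32X32
 * costs three "one" bits. */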
vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128; - int64_t rd[2][TX_SIZE_MAX]; - int n; - - r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) + - vp9_cost_one(cm->prob_tx[1]); - r[1][TX_8X8] = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) + - vp9_cost_zero(cm->prob_tx[1]); - r[1][TX_4X4] = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]); + int64_t rd[TX_SIZE_MAX_SB][2]; + int n, m; + + for (n = TX_4X4; n <= max_txfm_size; n++) { + r[n][1] = r[n][0]; + for (m = 0; m <= n - (n == max_txfm_size); m++) { + if (m == n) + r[n][1] += vp9_cost_zero(cm->prob_tx[m]); + else + r[n][1] += vp9_cost_one(cm->prob_tx[m]); + } + } if (cm->mb_no_coeff_skip) { int s0, s1; @@ -838,64 +864,82 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= TX_16X16; n++) { + for (n = TX_4X4; n <= max_txfm_size; n++) { if (s[n]) { - rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); + rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { - rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]); - rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]); + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); } } } else { - for (n = TX_4X4; n <= TX_16X16; n++) { - rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]); - rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]); + for (n = TX_4X4; n <= max_txfm_size; n++) { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]); } } +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + if (max_txfm_size == TX_32X32 && + (cm->txfm_mode == ALLOW_32X32 || + (cm->txfm_mode == TX_MODE_SELECT && + rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && + rd[TX_32X32][1] < rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_32X32; + } else +#endif if ( cm->txfm_mode == ALLOW_16X16 || +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) || +#endif (cm->txfm_mode == TX_MODE_SELECT && - rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) { + rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])) { mbmi->txfm_size = TX_16X16; } else if (cm->txfm_mode == ALLOW_8X8 || - (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) { + (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { mbmi->txfm_size = TX_8X8; } else { - assert(cm->txfm_mode == ONLY_4X4 || - (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8])); + assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT); mbmi->txfm_size = TX_4X4; } *distortion = d[mbmi->txfm_size]; - *rate = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size]; + *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT]; *skip = s[mbmi->txfm_size]; - txfm_cache[ONLY_4X4] = rd[0][TX_4X4]; - txfm_cache[ALLOW_8X8] = rd[0][TX_8X8]; - txfm_cache[ALLOW_16X16] = rd[0][TX_16X16]; - if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4]) - txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16]; + txfm_cache[ONLY_4X4] = rd[TX_4X4][0]; + txfm_cache[ALLOW_8X8] = rd[TX_8X8][0]; + txfm_cache[ALLOW_16X16] = rd[TX_16X16][0]; +#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS + txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0]; + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && + rd[TX_32X32][1] < rd[TX_4X4][1]) + 
@@ -838,64 +864,82 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
     s0 = vp9_cost_bit(skip_prob, 0);
     s1 = vp9_cost_bit(skip_prob, 1);
 
-    for (n = TX_4X4; n <= TX_16X16; n++) {
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
       if (s[n]) {
-        rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
       } else {
-        rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
-        rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
+        rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+        rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
       }
     }
   } else {
-    for (n = TX_4X4; n <= TX_16X16; n++) {
-      rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
-      rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
     }
   }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  if (max_txfm_size == TX_32X32 &&
+      (cm->txfm_mode == ALLOW_32X32 ||
+       (cm->txfm_mode == TX_MODE_SELECT &&
+        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+        rd[TX_32X32][1] < rd[TX_4X4][1]))) {
+    mbmi->txfm_size = TX_32X32;
+  } else
+#endif
   if (cm->txfm_mode == ALLOW_16X16 ||
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+      (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
+#endif
      (cm->txfm_mode == TX_MODE_SELECT &&
-      rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
+      rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
-             (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
+             (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 ||
-           (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
+    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
   *distortion = d[mbmi->txfm_size];
-  *rate = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
+  *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
   *skip = s[mbmi->txfm_size];
 
-  txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
-  txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
-  txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
-  if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
+  txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
+  txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
+  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  if (max_txfm_size == TX_32X32 &&
+      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+      rd[TX_32X32][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
+  else
+#endif
+  if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
-                                 rd[1][TX_4X4] : rd[1][TX_8X8];
+    txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
+                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
+  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
-                        &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache);
+                           txfm_cache, TX_16X16);
 }
 
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
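With the signalling cost folded into rd[n][1], the cascaded strict-less-than tests above amount to an argmin over the candidate sizes that prefers the smaller transform on ties. A sketch of the equivalent selection (not the project's code), assuming the enum ordering TX_4X4 < TX_8X8 < TX_16X16 < TX_32X32:

/* Sketch only, not part of the patch: equivalent TX_MODE_SELECT decision.
 * Scanning upward with a strict < keeps the smaller size on equal cost,
 * matching the chained comparisons in choose_txfm_size_from_rd(). */
static int pick_tx_size(const long long rd1[4], int max_txfm_size) {
  int n, best = 0;  /* start at TX_4X4 */
  for (n = 1; n <= max_txfm_size; n++)
    if (rd1[n] < rd1[best])
      best = n;
  return best;
}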
@@ -908,25 +952,91 @@ static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
 }
 
 #if CONFIG_SUPERBLOCKS
+#if CONFIG_TX32X32
+static int rdcost_sby_32x32(MACROBLOCK *x) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above,
+                  *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+
+  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+}
+
+static int vp9_sb_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += (int64_t)this_diff * this_diff;
+  }
+
+  return error > INT_MAX ? INT_MAX : error;
+}
+
+#define DEBUG_ERROR 0
+static void super_block_yrd_32x32(MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
+#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+  short out[1024];
+#endif
+
+  vp9_transform_sby_32x32(x);
+  vp9_quantize_sby_32x32(x);
+#if DEBUG_ERROR || CONFIG_DWT32X32HYBRID
+  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
+#endif
+
+#if !CONFIG_DWT32X32HYBRID
+  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
+#else
+  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
+#endif
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
+         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
+#endif
+  *rate = rdcost_sby_32x32(x);
+  *skippable = vp9_sby_is_skippable_32x32(&x->e_mbd);
+}
+#endif
+
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion, int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[3][2], *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[3][2], *orig_left = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
+                         *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
+                         *orig_left = xd->left_context;
 
-  for (n = TX_4X4; n <= TX_16X16; n++) {
+  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
     vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
     vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[0][n] = 0;
+    r[n][0] = 0;
     d[n] = 0;
     s[n] = 1;
   }
 
+#if CONFIG_TX32X32
+  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+                       dst, dst_y_stride);
+  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+#endif
+
+#if DEBUG_ERROR
+  int err[3] = { 0, 0, 0 };
+#endif
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
     int r_tmp, d_tmp, s_tmp;
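vp9_sb_block_error_c() above accumulates in 64 bits and clamps on return because a 32x32 block of 16-bit residuals can overflow a 32-bit sum of squared differences. A minimal standalone check of that behaviour (stand-in function with the same arithmetic; not the patch's code):

/* Sketch only, not part of the patch: saturating SSE over a 32x32 block. */
#include <limits.h>
#include <stdio.h>

static int sb_block_error(const short *coeff, const short *dqcoeff, int n) {
  long long error = 0;
  int i;
  for (i = 0; i < n; i++) {
    int d = coeff[i] - dqcoeff[i];
    error += (long long)d * d;
  }
  return error > INT_MAX ? INT_MAX : (int)error;
}

int main(void) {
  short a[1024], b[1024];
  int i;
  for (i = 0; i < 1024; i++) { a[i] = 16000; b[i] = -16000; }
  /* 1024 * 32000^2 is ~1.05e12, far above INT_MAX, so this prints 2147483647 */
  printf("%d\n", sb_block_error(a, b, 1024));
  return 0;
}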
printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]); + printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]); +#endif + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, + TX_SIZE_MAX_SB - 1); xd->above_context = orig_above; xd->left_context = orig_left; @@ -1632,14 +1759,59 @@ static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } #if CONFIG_SUPERBLOCKS +#if CONFIG_TX32X32 +static int rd_cost_sbuv_16x16(MACROBLOCK *x) { + int b; + int cost = 0; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above, t_left; + ENTROPY_CONTEXT *ta, *tl; + + vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); + vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + + ta = (ENTROPY_CONTEXT *) &t_above; + tl = (ENTROPY_CONTEXT *) &t_left; + + for (b = 16; b < 24; b += 4) + cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV, + ta + vp9_block2above_8x8[b], + tl + vp9_block2left_8x8[b], TX_16X16); + + return cost; +} + +static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate, + int *distortion, int *skip) { + MACROBLOCKD *const xd = &x->e_mbd; + + vp9_transform_sbuv_16x16(x); + vp9_quantize_sbuv_16x16(x); + + *rate = rd_cost_sbuv_16x16(x); + *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024, + xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2; + *skip = vp9_sbuv_is_skippable_16x16(xd); +} +#endif + static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel, int *skip) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - int n, r = 0, d = 0; const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer; const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer; int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; + +#if CONFIG_TX32X32 + if (mbmi->txfm_size == TX_32X32) { + vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff, + usrc, vsrc, src_uv_stride, + udst, vdst, dst_uv_stride); + rd_inter32x32_uv_16x16(x, rate, distortion, skip); + } else { +#endif + int n, r = 0, d = 0; int skippable = 1; ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; ENTROPY_CONTEXT_PLANES *ta = xd->above_context; @@ -1680,8 +1852,11 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate, xd->above_context = ta; memcpy(xd->above_context, t_above, sizeof(t_above)); memcpy(xd->left_context, t_left, sizeof(t_left)); +#if CONFIG_TX32X32 + } +#endif - return RDCOST(x->rdmult, x->rddiv, r, d); + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); } #endif @@ -1818,15 +1993,26 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi, } #if CONFIG_SUPERBLOCKS -static void super_block_uvrd_8x8(MACROBLOCK *x, - int *rate, - int *distortion, - int *skippable) { +// TODO(rbultje) very similar to rd_inter32x32_uv(), merge? 
@@ -1818,15 +1993,26 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
 }
 
 #if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
-                                 int *rate,
-                                 int *distortion,
-                                 int *skippable) {
+// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
+static void super_block_uvrd(MACROBLOCK *x,
+                             int *rate,
+                             int *distortion,
+                             int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int d = 0, r = 0, n, s = 1;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+
+#if CONFIG_TX32X32
+  if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skippable);
+  } else {
+#endif
+  int d = 0, r = 0, n, s = 1;
   ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
   ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
@@ -1844,9 +2030,15 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                            dst_uv_stride);
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    s &= vp9_mbuv_is_skippable_8x8(xd);
+    if (mbmi->txfm_size == TX_4X4) {
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mbuv_4x4(x);
+      s &= vp9_mbuv_is_skippable_4x4(xd);
+    } else {
+      vp9_transform_mbuv_8x8(x);
+      vp9_quantize_mbuv_8x8(x);
+      s &= vp9_mbuv_is_skippable_8x8(xd);
+    }
 
     d += vp9_mbuverror(x) >> 2;
     xd->above_context = ta + x_idx;
@@ -1864,6 +2056,9 @@ static void super_block_uvrd_8x8(MACROBLOCK *x,
   xd->above_context = ta;
   memcpy(xd->above_context, t_above, sizeof(t_above));
   memcpy(xd->left_context, t_left, sizeof(t_left));
+#if CONFIG_TX32X32
+  }
+#endif
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
@@ -1882,8 +2077,8 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-    super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, &s);
+    super_block_uvrd(x, &this_rate_tokenonly,
+                     &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -4141,8 +4336,6 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int y_skip, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES];
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
   error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                    &dist_y, &y_skip, txfm_cache);
   error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
@@ -4362,6 +4555,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
   MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
   int switchable_filter_index = 0;
+#if CONFIG_TX32X32
+  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
+  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
+  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+#endif
 
   x->skip = 0;
   xd->mode_info_context->mbmi.segment_id = segment_id;
@@ -4397,6 +4595,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                             &dist_uv_8x8, &uv_skip_8x8);
     mode_uv_8x8 = mbmi->uv_mode;
   }
+#if CONFIG_TX32X32
+  if (cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
+    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
+                            &dist_uv_16x16, &uv_skip_16x16);
+    mode_uv_16x16 = mbmi->uv_mode;
+  }
+#endif
 
   for (mode_index = 0; mode_index < MAX_MODES;
        mode_index += (!switchable_filter_index)) {
@@ -4524,6 +4730,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         distortion_uv = dist_uv_4x4;
         skippable = skippable && uv_skip_4x4;
         mbmi->uv_mode = mode_uv_4x4;
+#if CONFIG_TX32X32
+      } else if (mbmi->txfm_size == TX_32X32) {
+        rate_uv = rate_uv_16x16;
+        distortion_uv = dist_uv_16x16;
+        skippable = skippable && uv_skip_16x16;
+        mbmi->uv_mode = mode_uv_16x16;
+#endif
       } else {
         rate_uv = rate_uv_8x8;
         distortion_uv = dist_uv_8x8;
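The mode-selection changes above pair each luma transform size with a chroma size: the new TX_32X32 superblock path costs chroma with the 16x16 transform, while the existing paths keep 8x8 chroma for 16x16/8x8 luma and 4x4 chroma for 4x4 luma. A sketch of that mapping (not the patch's code; plain ints stand in for the TX_SIZE enum):

/* Sketch only, not part of the patch: luma -> chroma transform-size pairing
 * for 4:2:0, using the ordering TX_4X4 = 0 .. TX_32X32 = 3. Each chroma
 * block is half the luma dimension, and there is no 16x16 chroma transform
 * inside a single macroblock, so 16x16 luma still pairs with 8x8 chroma. */
static int uv_tx_size_for(int y_tx_size) {
  switch (y_tx_size) {
    case 3:  return 2;   /* TX_32X32 luma -> TX_16X16 chroma (new path) */
    case 2:              /* TX_16X16 luma -> TX_8X8 chroma */
    case 1:  return 1;   /* TX_8X8 luma   -> TX_8X8 chroma */
    default: return 0;   /* TX_4X4 luma   -> TX_4X4 chroma */
  }
}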
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 44963b2237dfa050ec8351ce3f4c7a713fe195b2..a662e048edae44ebbae2a729000e848f7b6c7630 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -117,7 +117,7 @@ static void tokenize_b(VP9_COMP *cpi,
                        int dry_run) {
   int pt; /* near block/prev token context index */
   int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
-  const int eob = b->eob; /* one beyond last nonzero coeff */
+  int eob = b->eob; /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;  /* store tokens starting here */
   const short *qcoeff_ptr = b->qcoeff;
   int seg_eob;
@@ -177,7 +177,23 @@ static void tokenize_b(VP9_COMP *cpi,
         counts = cpi->coef_counts_16x16;
         probs = cpi->common.fc.coef_probs_16x16;
       }
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+      if (type == PLANE_TYPE_UV) {
+        int uv_idx = (((int) (b - xd->block)) - 16) >> 2;
+        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
+      }
+#endif
       break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      seg_eob = 1024;
+      bands = vp9_coef_bands_32x32;
+      scan = vp9_default_zig_zag1d_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      break;
+#endif
   }
 
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
@@ -283,6 +299,79 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
   return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
+  int skip = 1;
+  skip &= !xd->block[0].eob;
+  return skip;
+}
+
+int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
+  return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
+  return vp9_sby_is_skippable_32x32(xd) &&
+         vp9_sbuv_is_skippable_16x16(xd);
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi,
+                     MACROBLOCKD *xd,
+                     TOKENEXTRA **t,
+                     int dry_run) {
+  VP9_COMMON * const cm = &cpi->common;
+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
+  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
+  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
+  const int segment_id = mbmi->segment_id;
+  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+                       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);
+  int b;
+
+  mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
+
+  if (mbmi->mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cm->mb_no_coeff_skip) {
+      vp9_stuff_sb(cpi, xd, t, dry_run);
+    } else {
+      vp9_fix_contexts_sb(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+  tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+             A[0], L[0], TX_32X32, dry_run);
+  A[0][1] = A[0][2] = A[0][3] = A[0][0];
+  L[0][1] = L[0][2] = L[0][3] = L[0][0];
+
+  for (b = 16; b < 24; b += 4) {
+    tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+               A[0] + vp9_block2above_8x8[b], L[0] + vp9_block2left_8x8[b],
+               TX_16X16, dry_run);
+    A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
+    L[0][vp9_block2left_8x8[b] + 1] = L[0][vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
+  vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
+  vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
+
+  if (dry_run)
+    *t = t_backup;
+}
+#endif
+
 void vp9_tokenize_mb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
@@ -717,6 +806,13 @@ static __inline void stuff_b(VP9_COMP *cpi,
         probs = cpi->common.fc.coef_probs_16x16;
       }
       break;
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+    case TX_32X32:
+      bands = vp9_coef_bands_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      break;
+#endif
   }
   band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
   t->Token = DCT_EOB_TOKEN;
@@ -775,7 +871,8 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
   A[1] = A[2] = A[3] = A[0];
   L[1] = L[2] = L[3] = L[0];
   for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b],
             L + vp9_block2above_8x8[b], TX_8X8, dry_run);
     A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
     L[vp9_block2left_8x8[b] + 1] = L[vp9_block2left_8x8[b]];
@@ -869,6 +966,43 @@ void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
   }
 }
 
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,
+                           TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
+  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
+  int b;
+
+  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+          A[0], L[0], TX_32X32, dry_run);
+  A[0][1] = A[0][2] = A[0][3] = A[0][0];
+  L[0][1] = L[0][2] = L[0][3] = L[0][0];
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A[0] + vp9_block2above_8x8[b],
+            L[0] + vp9_block2left_8x8[b], TX_16X16, dry_run);
+    A[0][vp9_block2above_8x8[b] + 1] = A[0][vp9_block2above_8x8[b]];
+    L[0][vp9_block2left_8x8[b] + 1] = L[0][vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[0][8], 0, sizeof(A[0][8]));
+  vpx_memset(&L[0][8], 0, sizeof(L[0][8]));
+  vpx_memcpy(A[1], A[0], sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(L[1], L[0], sizeof(ENTROPY_CONTEXT_PLANES));
+}
+
+void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TOKENEXTRA * const t_backup = *t;
+
+  stuff_sb_32x32(cpi, xd, t, dry_run);
+
+  if (dry_run) {
+    *t = t_backup;
+  }
+}
+#endif
+
 void vp9_fix_contexts(MACROBLOCKD *xd) {
   /* Clear entropy contexts for blocks */
   if ((xd->mode_info_context->mbmi.mode != B_PRED
@@ -885,3 +1019,10 @@ void vp9_fix_contexts(MACROBLOCKD *xd) {
     xd->left_context->y2 = 1;
   }
 }
+
+#if CONFIG_TX32X32 && CONFIG_SUPERBLOCKS
+void vp9_fix_contexts_sb(MACROBLOCKD *xd) {
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+}
+#endif
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 868909be39fe8ff32a1f319df830c446c7b7d496..cfd5db694b420e2aa718561dde600ae9c7a65bd8 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -34,16 +34,29 @@ extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
 extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
+extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
+#endif
 
 struct VP9_COMP;
 
 extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                             TOKENEXTRA **t, int dry_run);
+extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                            TOKENEXTRA **t, int dry_run);
 
 extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run);
+#endif
 
 extern void vp9_fix_contexts(MACROBLOCKD *xd);
+#if CONFIG_SUPERBLOCKS && CONFIG_TX32X32
+extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);
+#endif
 
 #ifdef ENTROPY_STATS
 void init_context_counters();
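For context on the entry points declared above: when a superblock is fully skippable, vp9_tokenize_sb() either clears the entropy contexts (when the bitstream carries a skip flag) or emits explicit end-of-block tokens via vp9_stuff_sb(). A standalone sketch of that control flow, with hypothetical stand-ins for the VP9 structures:

/* Sketch only, not part of the patch: skip handling in vp9_tokenize_sb(),
 * with a toy context struct in place of MACROBLOCKD/ENTROPY_CONTEXT_PLANES. */
#include <string.h>

struct sb_ctx {
  int mb_no_coeff_skip;   /* does the bitstream carry a per-MB skip flag? */
  char above[2][4], left[2][4];
};

static void stuff_eob_tokens(struct sb_ctx *s) {
  (void)s;  /* stand-in for vp9_stuff_sb(): code explicit EOB tokens */
}

static void handle_skippable_sb(struct sb_ctx *s) {
  if (s->mb_no_coeff_skip) {
    /* skip flag available: nothing is coded, just reset both context
     * planes, as vp9_fix_contexts_sb() does with two memsets */
    memset(s->above, 0, sizeof(s->above));
    memset(s->left, 0, sizeof(s->left));
  } else {
    /* no skip flag: the decoder still expects tokens, so stuff EOBs */
    stuff_eob_tokens(s);
  }
}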