diff --git a/.gitignore b/.gitignore
index 4074b0bbf565da62b03fdb604b8cb34aa3628c05..5b47a86e525f4cae19de7bbccbc81d0926768725 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
 *.d
 *.o
 *~
+/*.ivf
+/*.ivf.md5
 /*-*.mk
 /*.asm
 /*.doxy
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 050ae57a7db7522a7b6fb0794116e867e22fe88c..4d0cad23e31e5c8d68468d73eca509a464482389 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -460,6 +460,7 @@ write_common_target_config_h() {
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT ${RESTRICT}
+#define INLINE ${INLINE}
 EOF
 print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
 print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
@@ -1005,12 +1006,6 @@ process_common_toolchain() {
 #error "not x32"
 #endif
 EOF
-      soft_enable runtime_cpu_detect
-      soft_enable mmx
-      soft_enable sse
-      soft_enable sse2
-      soft_enable sse3
-      soft_enable ssse3

      case ${tgt_os} in
        win*)
@@ -1064,9 +1059,15 @@
        ;;
      esac

+      soft_enable runtime_cpu_detect
+      soft_enable mmx
+      soft_enable sse
+      soft_enable sse2
+      soft_enable sse3
+      soft_enable ssse3
      # We can't use 'check_cflags' until the compiler is configured and CC is
      # populated.
-      if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4.1; then
+      if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then
        RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "
      else
        soft_enable sse4_1
@@ -1174,6 +1175,14 @@
  [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
    grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian

+  # Try to find which inline keywords are supported
+  check_cc <<EOF && INLINE="inline"
+static inline function() {}
+EOF
+  check_cc <<EOF && INLINE="__attribute__((always_inline))"
+static __attribute__((always_inline)) function() {}
+EOF
+
  # Almost every platform uses pthreads.
  if enabled multithread; then
    case ${toolchain} in
diff --git a/configure b/configure
index ad33c9c1c378ef01a27b8a236ac32f74a79fb5c4..5f2c39183c27f6979c33b44fd36fcd724aea0435 100755
--- a/configure
+++ b/configure
@@ -239,17 +239,18 @@ HAVE_LIST="
 "
 EXPERIMENT_LIST="
     csm
-    lossless
     new_mvref
     implicit_segmentation
     newbintramodes
     comp_interintra_pred
-    tx64x64
-    dwtdcthybrid
-    cnvcontext
-    newcoefcontext
     enable_6tap
     abovesprefmv
+    code_nonzerocount
+    useselectrefmv
+    modelcoefprob
+    loop_dering
+    implicit_compoundinter_weight
+    scatterscan
 "
 CONFIG_LIST="
     external_build
@@ -647,6 +648,7 @@ process_toolchain() {
         enable solution
         vs_version=${tgt_cc##vs}
         all_targets="${all_targets} solution"
+        INLINE="__forceinline"
         ;;
     esac
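
[Aside: a minimal sketch of how the INLINE macro written into vpx_config.h above is meant to be consumed. The helper function below is illustrative only, not part of this change; it assumes INLINE expands to whichever keyword the configure probe detected ("inline", "__attribute__((always_inline))", or "__forceinline" on Visual Studio).]

    /* Illustrative only -- not part of this patch. */
    #include "./vpx_config.h"

    static INLINE int clamp_pixel(int v) {
      /* Clamp a value to the 8-bit pixel range. */
      if (v < 0) return 0;
      if (v > 255) return 255;
      return v;
    }
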
diff --git a/test/altref_test.cc b/test/altref_test.cc
index ca055773df70ba7b416171df435dd238199934e1..14af26574293ba43da3fe79a4491179ff50a3282 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -8,19 +8,20 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-
+#include "test/util.h"
 namespace {

 // lookahead range: [kLookAheadMin, kLookAheadMax).
 const int kLookAheadMin = 5;
 const int kLookAheadMax = 26;

-class AltRefTest : public libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<int> {
+class AltRefTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
-  AltRefTest() : altref_count_(0) {}
+  AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
   virtual ~AltRefTest() {}

   virtual void SetUp() {
@@ -58,7 +59,7 @@ TEST_P(AltRefTest, MonotonicTimestamps) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 1000;
-  cfg_.g_lag_in_frames = GetParam();
+  cfg_.g_lag_in_frames = GET_PARAM(1);

   libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                      timebase.den, timebase.num, 0, 30);
@@ -66,6 +67,7 @@ TEST_P(AltRefTest, MonotonicTimestamps) {
   EXPECT_GE(altref_count(), 1);
 }

-INSTANTIATE_TEST_CASE_P(NonZeroLag, AltRefTest,
-                        ::testing::Range(kLookAheadMin, kLookAheadMax));
+
+VP8_INSTANTIATE_TEST_CASE(AltRefTest,
+                          ::testing::Range(kLookAheadMin, kLookAheadMax));
 }  // namespace
diff --git a/test/codec_factory.h b/test/codec_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdae5720f0175adea5e7a5dd6f7a21954692acf2
--- /dev/null
+++ b/test/codec_factory.h
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_CODEC_FACTORY_H_
+#define TEST_CODEC_FACTORY_H_
+
+extern "C" {
+#include "./vpx_config.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vpx_encoder.h"
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
+#include "vpx/vp8cx.h"
+#endif
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
+}
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libvpx_test {
+
+class CodecFactory {
+ public:
+  CodecFactory() {}
+
+  virtual ~CodecFactory() {}
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const = 0;
+
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const = 0;
+
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const = 0;
+};
+
+/* Provide CodecTestWith<n>Params classes for a variable number of parameters
+ * to avoid having to include a pointer to the CodecFactory in every test
+ * definition.
+ */
+template<class T1>
+class CodecTestWithParam : public ::testing::TestWithParam<
+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1 > > {
+};
+
+template<class T1, class T2>
+class CodecTestWith2Params : public ::testing::TestWithParam<
+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1, T2 > > {
+};
+
+template<class T1, class T2, class T3>
+class CodecTestWith3Params : public ::testing::TestWithParam<
+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1, T2, T3 > > {
+};
+
+/*
+ * VP8 Codec Definitions
+ */
+#if CONFIG_VP8
+class VP8Decoder : public Decoder {
+ public:
+  VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP8_DECODER
+    return &vpx_codec_vp8_dx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP8Encoder : public Encoder {
+ public:
+  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+             const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP8_ENCODER
+    return &vpx_codec_vp8_cx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP8CodecFactory : public CodecFactory {
+ public:
+  VP8CodecFactory() : CodecFactory() {}
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+#if CONFIG_VP8_DECODER
+    return new VP8Decoder(cfg, deadline);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP8_ENCODER
+    return new VP8Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP8_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP8CodecFactory kVP8;
+
+#define VP8_INSTANTIATE_TEST_CASE(test, params)\
+  INSTANTIATE_TEST_CASE_P(VP8, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+              &libvpx_test::kVP8)), \
+          params))
+#else
+#define VP8_INSTANTIATE_TEST_CASE(test, params)
+#endif  // CONFIG_VP8
+
+
+/*
+ * VP9 Codec Definitions
+ */
+#if CONFIG_VP9
+class VP9Decoder : public Decoder {
+ public:
+  VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP9_DECODER
+    return &vpx_codec_vp9_dx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP9Encoder : public Encoder {
+ public:
+  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+             const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const {
+#if CONFIG_VP9_ENCODER
+    return &vpx_codec_vp9_cx_algo;
+#else
+    return NULL;
+#endif
+  }
+};
+
+class VP9CodecFactory : public CodecFactory {
+ public:
+  VP9CodecFactory() : CodecFactory() {}
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+#if CONFIG_VP9_DECODER
+    return new VP9Decoder(cfg, deadline);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP9_ENCODER
+    return new VP9Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP9_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP9CodecFactory kVP9;
+
+#define VP9_INSTANTIATE_TEST_CASE(test, params)\
+  INSTANTIATE_TEST_CASE_P(VP9, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+              &libvpx_test::kVP9)), \
+          params))
+#else
+#define VP9_INSTANTIATE_TEST_CASE(test, params)
+#endif  // CONFIG_VP9
+
+
+}  // namespace libvpx_test
+
+#endif  // TEST_CODEC_FACTORY_H_
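
[Aside: a sketch of the intended usage pattern for the factory and parameter helpers above; the LagTest class is invented for illustration. GET_PARAM(0) is always the CodecFactory pointer and is forwarded to the EncoderTest constructor; the remaining tuple slots hold the test's own parameters.]

    // Illustrative only: a codec-agnostic test built on CodecTestWithParam.
    class LagTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
     protected:
      LagTest() : EncoderTest(GET_PARAM(0)) {}
      virtual void SetUp() {
        InitializeConfig();                   // cfg_ comes from the factory
        cfg_.g_lag_in_frames = GET_PARAM(1);  // the test's own parameter
      }
    };

    // Each macro expands to nothing when that codec is not in the build, so
    // the test silently drops out instead of failing to compile or link.
    VP8_INSTANTIATE_TEST_CASE(LagTest, ::testing::Range(0, 8));
    VP9_INSTANTIATE_TEST_CASE(LagTest, ::testing::Range(0, 8));
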
diff --git a/test/config_test.cc b/test/config_test.cc
index c4da46e2e96218146bb6bed9dbf7075b9de3e0f8..90087280b4c552ad67549173e60aa360754f39c1 100644
--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -8,20 +8,22 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
+#include "test/util.h"
 #include "test/video_source.h"

 namespace {

 class ConfigTest : public ::libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
- public:
-  ConfigTest() : frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}
-
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
+  ConfigTest() : EncoderTest(GET_PARAM(0)),
+                 frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}
+
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(GetParam());
+    SetMode(GET_PARAM(1));
   }

   virtual void BeginPassHook(unsigned int /*pass*/) {
@@ -57,5 +59,5 @@ TEST_P(ConfigTest, LagIsDisabled) {
   EXPECT_EQ(frame_count_in_, frame_count_out_);
 }

-INSTANTIATE_TEST_CASE_P(OnePassModes, ConfigTest, ONE_PASS_TEST_MODES);
+VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES);
 }  // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35065a41f755a30589e91bb1094db2b2e85dbb5d
--- /dev/null
+++ b/test/convolve_test.cc
@@ -0,0 +1,509 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+extern "C" {
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+}
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+                              uint8_t *dst, int dst_stride,
+                              const int16_t *filter_x, int filter_x_stride,
+                              const int16_t *filter_y, int filter_y_stride,
+                              int w, int h);
+
+struct ConvolveFunctions {
+  ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg,
+                    convolve_fn_t v8, convolve_fn_t v8_avg,
+                    convolve_fn_t hv8, convolve_fn_t hv8_avg)
+      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),
+        hv8_avg_(hv8_avg) {}
+
+  convolve_fn_t h8_;
+  convolve_fn_t v8_;
+  convolve_fn_t hv8_;
+  convolve_fn_t h8_avg_;
+  convolve_fn_t v8_avg_;
+  convolve_fn_t hv8_avg_;
+};
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT  7
+static uint8_t clip_pixel(int x) {
+  return x < 0 ? 0 :
+         x > 255 ? 255 :
+         x;
+}
+
+static void filter_block2d_8_c(const uint8_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
+                               uint8_t *dst_ptr,
+                               unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+   *                                 + kInterp_Extend
+   *                               = 3 + 16 + 4
+   *                               = 23
+   * and filter_max_width = 16
+   */
+  uint8_t intermediate_buffer[23 * 16];
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    uint8_t *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * HFilter[0]) +
+                   ((int)src_ptr[1] * HFilter[1]) +
+                   ((int)src_ptr[2] * HFilter[2]) +
+                   ((int)src_ptr[3] * HFilter[3]) +
+                   ((int)src_ptr[4] * HFilter[4]) +
+                   ((int)src_ptr[5] * HFilter[5]) +
+                   ((int)src_ptr[6] * HFilter[6]) +
+                   ((int)src_ptr[7] * HFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
+        ++src_ptr;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    uint8_t *src_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * VFilter[0]) +
+                   ((int)src_ptr[1] * VFilter[1]) +
+                   ((int)src_ptr[2] * VFilter[2]) +
+                   ((int)src_ptr[3] * VFilter[3]) +
+                   ((int)src_ptr[4] * VFilter[4]) +
+                   ((int)src_ptr[5] * VFilter[5]) +
+                   ((int)src_ptr[6] * VFilter[6]) +
+                   ((int)src_ptr[7] * VFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
+        src_ptr += intermediate_height;
+      }
+      src_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
+static void block2d_average_c(uint8_t *src,
+                              unsigned int src_stride,
+                              uint8_t *output_ptr,
+                              unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+static void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                       const unsigned int src_stride,
+                                       const int16_t *HFilter,
+                                       const int16_t *VFilter,
+                                       uint8_t *dst_ptr,
+                                       unsigned int dst_stride,
+                                       unsigned int output_width,
+                                       unsigned int output_height) {
+  uint8_t tmp[16*16];
+
+  assert(output_width <= 16);
+  assert(output_height <= 16);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,
+                     output_width, output_height);
+  block2d_average_c(tmp, 16, dst_ptr, dst_stride,
+                    output_width, output_height);
+}
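
[Aside: the intermediate-buffer comment above packs a small calculation; restated as compile-time constants (illustrative only, not part of the patch):]

    // The horizontal pass must emit (kInterp_Extend - 1) extra rows above and
    // kInterp_Extend extra rows below the block to feed the 8-tap vertical
    // pass:
    enum {
      kExtend = 4,     // kInterp_Extend
      kMaxWidth = 16,  // filter_max_width
      kMaxHeight = 16, // filter_max_height
      kIntermediateHeight = (kExtend - 1) + kMaxHeight + kExtend  // 3+16+4 = 23
    };
    // Compile-time check that 23 * 16 bytes really is the worst case.
    typedef char assert_buffer_size[
        (kIntermediateHeight * kMaxWidth == 23 * 16) ? 1 : -1];
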
+
+class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
+ public:
+  static void SetUpTestCase() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
+        + 1;
+    output_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(input_ - 1);
+    input_ = NULL;
+    vpx_free(output_);
+    output_ = NULL;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 32;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 16;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    memset(input_, 0, sizeof(input_));
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
+    }
+
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
+      input_[i] = prng.Rand8();
+  }
+
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
+    }
+  }
+
+  uint8_t* input() {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
+
+  uint8_t* output() {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
+
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
+};
+uint8_t* ConvolveTest::input_ = NULL;
+uint8_t* ConvolveTest::output_ = NULL;
+
+TEST_P(ConvolveTest, GuardBlocks) {
+  CheckGuardBlocks();
+}
+
+TEST_P(ConvolveTest, CopyHoriz) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  REGISTER_STATE_CHECK(
+      UUT_->h8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, CopyVert) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  REGISTER_STATE_CHECK(
+      UUT_->v8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, Copy2D) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  REGISTER_STATE_CHECK(
+      UUT_->hv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                 Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])
+          << "(" << x << "," << y << ")";
+}
+
+const int16_t (*kTestFilterList[])[8] = {
+  vp9_bilinear_filters,
+  vp9_sub_pel_filters_6,
+  vp9_sub_pel_filters_8,
+  vp9_sub_pel_filters_8s,
+  vp9_sub_pel_filters_8lp
+};
+
+const int16_t kInvalidFilter[8] = { 0 };
+
+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  uint8_t ref[kOutputStride * kMaxDimension];
+
+  const int kNumFilterBanks = sizeof(kTestFilterList) /
+      sizeof(kTestFilterList[0]);
+
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];
+    const int kNumFilters = 16;
+
+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+        filter_block2d_8_c(in, kInputStride,
+                           filters[filter_x], filters[filter_y],
+                           ref, kOutputStride,
+                           Width(), Height());
+
+        if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))
+          REGISTER_STATE_CHECK(
+              UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                         filters[filter_x], 16, filters[filter_y], 16,
+                         Width(), Height()));
+        else if (filter_y)
+          REGISTER_STATE_CHECK(
+              UUT_->v8_(in, kInputStride, out, kOutputStride,
+                        kInvalidFilter, 16, filters[filter_y], 16,
+                        Width(), Height()));
+        else
+          REGISTER_STATE_CHECK(
+              UUT_->h8_(in, kInputStride, out, kOutputStride,
+                        filters[filter_x], 16, kInvalidFilter, 16,
+                        Width(), Height()));
+
+        CheckGuardBlocks();
+
+        for (int y = 0; y < Height(); ++y)
+          for (int x = 0; x < Width(); ++x)
+            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
+                << "mismatch at (" << x << "," << y << "), "
+                << "filters (" << filter_bank << ","
+                << filter_x << "," << filter_y << ")";
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  uint8_t ref[kOutputStride * kMaxDimension];
+
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      const uint8_t r = prng.Rand8();
+
+      out[y * kOutputStride + x] = r;
+      ref[y * kOutputStride + x] = r;
+    }
+  }
+
+  const int kNumFilterBanks = sizeof(kTestFilterList) /
+      sizeof(kTestFilterList[0]);
+
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];
+    const int kNumFilters = 16;
+
+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+        filter_average_block2d_8_c(in, kInputStride,
+                                   filters[filter_x], filters[filter_y],
+                                   ref, kOutputStride,
+                                   Width(), Height());
+
+        if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))
+          REGISTER_STATE_CHECK(
+              UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,
+                             filters[filter_x], 16, filters[filter_y], 16,
+                             Width(), Height()));
+        else if (filter_y)
+          REGISTER_STATE_CHECK(
+              UUT_->v8_avg_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, filters[filter_y], 16,
+                            Width(), Height()));
+        else
+          REGISTER_STATE_CHECK(
+              UUT_->h8_avg_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, filters[filter_y], 16,
+                            Width(), Height()));
+
+        CheckGuardBlocks();
+
+        for (int y = 0; y < Height(); ++y)
+          for (int x = 0; x < Width(); ++x)
+            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])
+                << "mismatch at (" << x << "," << y << "), "
+                << "filters (" << filter_bank << ","
+                << filter_x << "," << filter_y << ")";
+      }
+    }
+  }
+}
+
+DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 128},
+  { 0, 0, 0, 128},
+  { 0, 0, 128},
+  { 0, 128},
+  { 128},
+  { 0, 0, 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 0, 128},
+  { 0, 0, 0, 0, 128},
+  { 0, 0, 0, 128},
+  { 0, 0, 128},
+  { 0, 128},
+  { 128}
+};
+
+TEST_P(ConvolveTest, ChangeFilterWorks) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+
+  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
+                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
+                                 Width(), Height()));
+
+  for (int x = 0; x < Width(); ++x) {
+    if (x < 8)
+      ASSERT_EQ(in[4], out[x]) << "x == " << x;
+    else
+      ASSERT_EQ(in[12], out[x]) << "x == " << x;
+  }
+
+  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
+                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
+                                 Width(), Height()));
+
+  for (int y = 0; y < Height(); ++y) {
+    if (y < 8)
+      ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;
+    else
+      ASSERT_EQ(in[12 * kInputStride], out[y * kOutputStride]) << "y == " << y;
+  }
+
+  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
+                                  Width(), Height()));
+
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      const int ref_x = x < 8 ? 4 : 12;
+      const int ref_y = y < 8 ? 4 : 12;
+
+      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
+          << "x == " << x << ", y == " << y;
+    }
+  }
+}
+
+
+using std::tr1::make_tuple;
+
+const ConvolveFunctions convolve8_c(
+    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
+    vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
+    vp9_convolve8_c, vp9_convolve8_avg_c);
+
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_c),
+    make_tuple(8, 4, &convolve8_c),
+    make_tuple(8, 8, &convolve8_c),
+    make_tuple(16, 8, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c)));
+}
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(
+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,
+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,
+    vp9_convolve8_ssse3, vp9_convolve8_avg_c);
+
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_ssse3),
+    make_tuple(8, 4, &convolve8_ssse3),
+    make_tuple(8, 8, &convolve8_ssse3),
+    make_tuple(16, 8, &convolve8_ssse3),
+    make_tuple(16, 16, &convolve8_ssse3)));
+#endif
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 42ee2a2f83a14672a6a2194158aaaf49a9ad240d..a6a4b8ebd49efdf65cad758a0b0468b9f761d4fb 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -9,8 +9,12 @@
  */
 #include <cmath>
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {

 // CQ level range: [kCQLevelMin, kCQLevelMax).
 const int kCQLevelMin = 4;
@@ -18,12 +22,13 @@ const int kCQLevelMax = 63;
 const int kCQLevelStep = 8;
 const int kCQTargetBitrate = 2000;

-namespace {
-
-class CQTest : public libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<int> {
+class CQTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
-  CQTest() : cq_level_(GetParam()) { init_flags_ = VPX_CODEC_USE_PSNR; }
+  CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+  }
+
   virtual ~CQTest() {}

   virtual void SetUp() {
@@ -100,7 +105,7 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   EXPECT_GE(cq_psnr_lin, vbr_psnr_lin);
 }

-INSTANTIATE_TEST_CASE_P(CQLevelRange, CQTest,
-                        ::testing::Range(kCQLevelMin, kCQLevelMax,
-                                         kCQLevelStep));
+VP8_INSTANTIATE_TEST_CASE(CQTest,
+                          ::testing::Range(kCQLevelMin, kCQLevelMax,
+                                           kCQLevelStep));
 }  // namespace
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 6fbcb643d025f1409c21cb423e04bc6a8149d18a..85eeafbcc7cae5f1e219ffab2a1204ddfd2c560b 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -7,17 +7,23 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/util.h"
+
 namespace {

 class DatarateTest : public ::libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ public:
+  DatarateTest() : EncoderTest(GET_PARAM(0)) {}
+
  protected:
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(GetParam());
+    SetMode(GET_PARAM(1));
     ResetModel();
   }

@@ -174,5 +180,6 @@ TEST_P(DatarateTest, ChangingDropFrameThresh) {
   }
 }

-INSTANTIATE_TEST_CASE_P(AllModes, DatarateTest, ALL_TEST_MODES);
+VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
+
 }  // namespace
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 64bf0bbf6ab9852b1ef47f45ceff21a31ccfd84b..f6d2d5994bd8f77ddebfd6f01fbe5e66c4fa344b 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -15,7 +15,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {
-#include "vp9/common/entropy.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
 }

@@ -26,6 +26,15 @@ using libvpx_test::ACMRandom;

 namespace {

+#ifdef _MSC_VER
+static int round(double x) {
+  if (x < 0)
+    return (int)ceil(x - 0.5);
+  else
+    return (int)floor(x + 0.5);
+}
+#endif
+
 const double PI = 3.1415926535898;
 void reference2_16x16_idct_2d(double *input, double *output) {
   double x;
@@ -278,18 +287,10 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
           << "Error: 16x16 IDCT has error " << error
           << " at index " << j;
     }
-
-    vp9_short_fdct16x16_c(in, out_c, 32);
-    for (int j = 0; j < 256; ++j) {
-      const double diff = coeff[j] - out_c[j];
-      const double error = diff * diff;
-      EXPECT_GE(1.0, error)
-          << "Error: 16x16 FDCT has error " << error
-          << " at index " << j;
-    }
   }
 }
-
+#if 1
+// We need to enable the fdct test once we re-do the 16 point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int max_error = 0;
@@ -318,10 +319,10 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) {
   }

   EXPECT_GE(1, max_error)
-      << "Error: 16x16 FDCT/IDCT has an individual roundtrip error > 1";
+      << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";

-  EXPECT_GE(count_test_block/10, total_error)
-      << "Error: 16x16 FDCT/IDCT has average roundtrip error > 1/10 per block";
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";
 }

 TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
@@ -353,4 +354,6 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
     }
   }
 }
+#endif
+
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 827b13316cfbcc711d894354d790a03753b6a9d7..a565270993d822146181561d64a7bce5f56831dc 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -36,7 +36,6 @@ static int round(double x) {
 }
 #endif

-#if !CONFIG_DWTDCTHYBRID
 static const double kPi = 3.141592653589793238462643383279502884;
 static void reference2_32x32_idct_2d(double *input, double *output) {
   double x;
@@ -116,20 +115,9 @@ TEST(VP9Idct32x32Test, AccuracyCheck) {
           << "Error: 32x32 IDCT has error " << error
           << " at index " << j;
     }
-
-    vp9_short_fdct32x32_c(in, out_c, 64);
-    for (int j = 0; j < 1024; ++j) {
-      const double diff = coeff[j] - out_c[j];
-      const double error = diff * diff;
-      EXPECT_GE(1.0, error)
-          << "Error: 32x32 FDCT has error " << error
-          << " at index " << j;
-    }
   }
 }
-#else  // CONFIG_DWTDCTHYBRID
-  // TODO(rbultje/debargha): add DWT-specific tests
-#endif  // CONFIG_DWTDCTHYBRID
+
 TEST(VP9Fdct32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   unsigned int max_error = 0;
@@ -160,8 +148,8 @@ TEST(VP9Fdct32x32Test, AccuracyCheck) {
   EXPECT_GE(1u, max_error)
       << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";

-  EXPECT_GE(count_test_block/10, total_error)
-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
+  EXPECT_GE(count_test_block, total_error)
+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
 }

 TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 0db48e4a79a2b4d17a9bc55e573f544bfa4bce31..1f6d5406482e8292579a348a0842451c32191954 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -7,16 +7,17 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/register_state_check.h"
 #include "test/video_source.h"

 namespace libvpx_test {
-#if CONFIG_VP8_DECODER
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) {
   vpx_codec_err_t res_dec;
+  InitOnce();
   REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,
                                                   cxdata, size, NULL, 0));
   return res_dec;
@@ -24,21 +25,23 @@ vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) {

 void DecoderTest::RunLoop(CompressedVideoSource *video) {
   vpx_codec_dec_cfg_t dec_cfg = {0};
-  Decoder decoder(dec_cfg, 0);
+  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
+  ASSERT_TRUE(decoder != NULL);

   // Decode frames.
   for (video->Begin(); video->cxdata(); video->Next()) {
-    vpx_codec_err_t res_dec = decoder.DecodeFrame(video->cxdata(),
-                                                  video->frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder.DecodeError();
+    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
+                                                   video->frame_size());
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();

-    DxDataIterator dec_iter = decoder.GetDxData();
+    DxDataIterator dec_iter = decoder->GetDxData();
     const vpx_image_t *img = NULL;

     // Get decompressed data
     while ((img = dec_iter.Next()))
       DecompressedFrameHook(*img, video->frame_number());
   }
+
+  delete decoder;
 }
-#endif
 }  // namespace libvpx_test
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 7d31a08954ad6c8070a2575bea90bea4fe4f5b69..49e7384f463c4affdbfbc645c572c89f40a3265c 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -14,10 +14,10 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx_config.h"
 #include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"

 namespace libvpx_test {

+class CodecFactory;
 class CompressedVideoSource;

 // Provides an object to handle decoding output
@@ -42,12 +42,11 @@ class DxDataIterator {
 class Decoder {
  public:
   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline) {
+      : cfg_(cfg), deadline_(deadline), init_done_(false) {
     memset(&decoder_, 0, sizeof(decoder_));
-    Init();
   }

-  ~Decoder() {
+  virtual ~Decoder() {
     vpx_codec_destroy(&decoder_);
   }

@@ -62,37 +61,45 @@ class Decoder {
   }

   void Control(int ctrl_id, int arg) {
+    InitOnce();
     const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
   }

   void Control(int ctrl_id, const void *arg) {
+    InitOnce();
     const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
   }

-  const char *DecodeError() {
+  const char* DecodeError() {
     const char *detail = vpx_codec_error_detail(&decoder_);
     return detail ? detail : vpx_codec_error(&decoder_);
   }

  protected:
-  void Init() {
-    const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
-                                                   &vpx_codec_vp8_dx_algo,
-                                                   &cfg_, 0);
-    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+
+  void InitOnce() {
+    if (!init_done_) {
+      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
+                                                     CodecInterface(),
+                                                     &cfg_, 0);
+      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+      init_done_ = true;
+    }
   }

   vpx_codec_ctx_t decoder_;
   vpx_codec_dec_cfg_t cfg_;
   unsigned int deadline_;
+  bool init_done_;
 };

 // Common test functionality for all Decoder tests.
 class DecoderTest {
  public:
-  // Main loop.
+  // Main decoding loop
   virtual void RunLoop(CompressedVideoSource *video);

   // Hook to be called on every decompressed frame.
@@ -100,9 +107,11 @@ class DecoderTest {
                                      const unsigned int frame_number) {}

  protected:
-  DecoderTest() {}
+  explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}

   virtual ~DecoderTest() {}
+
+  const CodecFactory *codec_;
 };

 }  // namespace libvpx_test
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 404aaa2907fae72c86121e13930800966c52e94b..eed3e33af4d78278999e5c6e992b054bf3979381 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -7,11 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include "vpx_config.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
-#if CONFIG_VP8_DECODER
 #include "test/decode_test_driver.h"
-#endif
 #include "test/register_state_check.h"
 #include "test/video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -45,7 +45,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
     cfg_.g_h = img->d_h;
     cfg_.g_timebase = video.timebase();
     cfg_.rc_twopass_stats_in = stats_->buf();
-    res = vpx_codec_enc_init(&encoder_, &vpx_codec_vp8_cx_algo, &cfg_,
+    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
                              init_flags_);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
@@ -72,6 +72,11 @@ void Encoder::Flush() {
   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
 }

+void EncoderTest::InitializeConfig() {
+  const vpx_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);
+  ASSERT_EQ(VPX_CODEC_OK, res);
+}
+
 void EncoderTest::SetMode(TestMode mode) {
   switch (mode) {
     case kRealTime:
@@ -125,13 +130,17 @@ static bool compare_img(const vpx_image_t *img1,
   return match;
 }

+void EncoderTest::MismatchHook(const vpx_image_t *img1,
+                               const vpx_image_t *img2) {
+  ASSERT_TRUE(0) << "Encode/Decode mismatch found";
+}
+
 void EncoderTest::RunLoop(VideoSource *video) {
-#if CONFIG_VP8_DECODER
   vpx_codec_dec_cfg_t dec_cfg = {0};
-#endif

   stats_.Reset();

+  ASSERT_TRUE(passes_ == 1 || passes_ == 2);
   for (unsigned int pass = 0; pass < passes_; pass++) {
     last_pts_ = 0;
@@ -143,34 +152,34 @@ void EncoderTest::RunLoop(VideoSource *video) {
       cfg_.g_pass = VPX_RC_LAST_PASS;

     BeginPassHook(pass);
-    Encoder encoder(cfg_, deadline_, init_flags_, &stats_);
-#if CONFIG_VP8_DECODER
-    Decoder decoder(dec_cfg, 0);
-    bool has_cxdata = false;
-#endif
+    Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,
+                                                   &stats_);
+    ASSERT_TRUE(encoder != NULL);
+    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
     bool again;
     for (again = true, video->Begin(); again; video->Next()) {
       again = video->img() != NULL;

       PreEncodeFrameHook(video);
-      PreEncodeFrameHook(video, &encoder);
-      encoder.EncodeFrame(video, frame_flags_);
+      PreEncodeFrameHook(video, encoder);
+      encoder->EncodeFrame(video, frame_flags_);

-      CxDataIterator iter = encoder.GetCxData();
+      CxDataIterator iter = encoder->GetCxData();
+
+      bool has_cxdata = false;
+      bool has_dxdata = false;
       while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+        pkt = MutateEncoderOutputHook(pkt);
         again = true;
-#if CONFIG_VP8_DECODER
-        vpx_codec_err_t res_dec;
-#endif
         switch (pkt->kind) {
           case VPX_CODEC_CX_FRAME_PKT:
-#if CONFIG_VP8_DECODER
             has_cxdata = true;
-            res_dec = decoder.DecodeFrame((const uint8_t*)pkt->data.frame.buf,
-                                          pkt->data.frame.sz);
-            ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder.DecodeError();
-#endif
+            if (decoder && DoDecode()) {
+              vpx_codec_err_t res_dec = decoder->DecodeFrame(
+                  (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz);
+              ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+              has_dxdata = true;
+            }
             ASSERT_GE(pkt->data.frame.pts, last_pts_);
             last_pts_ = pkt->data.frame.pts;
             FramePktHook(pkt);
@@ -185,25 +194,32 @@ void EncoderTest::RunLoop(VideoSource *video) {
         }
       }

-#if CONFIG_VP8_DECODER
-      if (has_cxdata) {
-        const vpx_image_t *img_enc = encoder.GetPreviewFrame();
-        DxDataIterator dec_iter = decoder.GetDxData();
+      if (has_dxdata && has_cxdata) {
+        const vpx_image_t *img_enc = encoder->GetPreviewFrame();
+        DxDataIterator dec_iter = decoder->GetDxData();
         const vpx_image_t *img_dec = dec_iter.Next();
-        if(img_enc && img_dec) {
+        if (img_enc && img_dec) {
           const bool res = compare_img(img_enc, img_dec);
-          ASSERT_TRUE(res)<< "Encoder/Decoder mismatch found.";
+          if (!res) {  // Mismatch
+            MismatchHook(img_enc, img_dec);
+          }
         }
+        if (img_dec)
+          DecompressedFrameHook(*img_dec, video->pts());
       }
-#endif
       if (!Continue())
         break;
     }

     EndPassHook();

+    if (decoder)
+      delete decoder;
+    delete encoder;
+
     if (!Continue())
       break;
   }
 }
+
 }  // namespace libvpx_test
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 0141fa9107c2c2a6604643cbe0ccb88819b0c702..5a37816ab941b5b8288d11da1e99f57fffda686d 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -9,14 +9,16 @@
  */
 #ifndef TEST_ENCODE_TEST_DRIVER_H_
 #define TEST_ENCODE_TEST_DRIVER_H_
+
+#include "./vpx_config.h"
 #include <string>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"

 namespace libvpx_test {

+class CodecFactory;
 class VideoSource;

 enum TestMode {
@@ -36,6 +38,9 @@ enum TestMode {
                                            ::libvpx_test::kOnePassGood, \
                                            ::libvpx_test::kOnePassBest)

+#define TWO_PASS_TEST_MODES ::testing::Values(::libvpx_test::kTwoPassGood, \
+                                              ::libvpx_test::kTwoPassBest)
+

 // Provides an object to handle the libvpx get_cx_data() iteration pattern
 class CxDataIterator {
@@ -83,7 +88,7 @@ class Encoder {
  public:
   Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
           const unsigned long init_flags, TwopassStatsStore *stats)
-    : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
+      : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
     memset(&encoder_, 0, sizeof(encoder_));
   }

@@ -112,11 +117,18 @@ class Encoder {
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }

+  void Control(int ctrl_id, struct vpx_scaling_mode *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
   void set_deadline(unsigned long deadline) {
     deadline_ = deadline;
   }

  protected:
+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;
+
   const char *EncoderError() {
     const char *detail = vpx_codec_error_detail(&encoder_);
     return detail ? detail : vpx_codec_error(&encoder_);
@@ -145,22 +157,19 @@ class Encoder {
 // classes directly, so that tests can be parameterized differently.
 class EncoderTest {
  protected:
-  EncoderTest() : abort_(false), init_flags_(0), frame_flags_(0),
-                  last_pts_(0) {}
+  explicit EncoderTest(const CodecFactory *codec)
+      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
+        last_pts_(0) {}

   virtual ~EncoderTest() {}

   // Initialize the cfg_ member with the default configuration.
-  void InitializeConfig() {
-    const vpx_codec_err_t res = vpx_codec_enc_config_default(
-        &vpx_codec_vp8_cx_algo, &cfg_, 0);
-    ASSERT_EQ(VPX_CODEC_OK, res);
-  }
+  void InitializeConfig();

   // Map the TestMode enum to the deadline_ and passes_ variables.
   void SetMode(TestMode mode);

-  // Main loop.
+  // Main loop
   virtual void RunLoop(VideoSource *video);

   // Hook to be called at the beginning of a pass.
@@ -182,6 +191,24 @@ class EncoderTest {
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const { return !abort_; }

+  const CodecFactory *codec_;
+  // Hook to determine whether to decode frame after encoding
+  virtual bool DoDecode() const { return 1; }
+
+  // Hook to handle encode/decode mismatch
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2);
+
+  // Hook to be called on every decompressed frame.
+  virtual void DecompressedFrameHook(const vpx_image_t& img,
+                                     vpx_codec_pts_t pts) {}
+
+  // Hook that can modify the encoder's output data
+  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    return pkt;
+  }
+
   bool abort_;
   vpx_codec_enc_cfg_t cfg_;
   unsigned int passes_;
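
[Aside: a hedged sketch, not part of this change, of how the new hooks compose in a subclass; the class name and the drop pattern are invented for illustration. The error-resilience test below is the real consumer of these hooks.]

    // Illustrative only: skip decoding selected packets via DoDecode() and
    // count encode/decode mismatches instead of failing outright.
    class LossySimulationTest : public ::libvpx_test::EncoderTest,
        public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
     protected:
      LossySimulationTest()
          : EncoderTest(GET_PARAM(0)), frames_(0), mismatches_(0) {}
      virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
        ++frames_;
      }
      // Pretend every third packet is lost before it reaches the decoder.
      virtual bool DoDecode() const { return frames_ % 3 != 0; }
      // Tally mismatches rather than asserting, as the default hook does.
      virtual void MismatchHook(const vpx_image_t * /*enc*/,
                                const vpx_image_t * /*dec*/) {
        ++mismatches_;
      }
      unsigned int frames_;
      unsigned int mismatches_;
    };
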
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 25c67310aa0c665aa599c3074345fde318b42928..1eee0f55abd63ba31fd630a05aadf07c9412b065 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -7,22 +7,37 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
+#include "test/util.h"

 namespace {

-class ErrorResilienceTest : public libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<int> {
+const int kMaxErrorFrames = 8;
+const int kMaxDroppableFrames = 8;
+
+class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
-  ErrorResilienceTest() {
-    psnr_ = 0.0;
-    nframes_ = 0;
-    encoding_mode_ = static_cast<libvpx_test::TestMode>(GetParam());
+  ErrorResilienceTest() : EncoderTest(GET_PARAM(0)),
+                          psnr_(0.0),
+                          nframes_(0),
+                          mismatch_psnr_(0.0),
+                          mismatch_nframes_(0),
+                          encoding_mode_(GET_PARAM(1)) {
+    Reset();
   }
+
   virtual ~ErrorResilienceTest() {}

+  void Reset() {
+    error_nframes_ = 0;
+    droppable_nframes_ = 0;
+  }
+
   virtual void SetUp() {
     InitializeConfig();
     SetMode(encoding_mode_);
@@ -31,6 +46,8 @@ class ErrorResilienceTest : public libvpx_test::EncoderTest,
   virtual void BeginPassHook(unsigned int /*pass*/) {
     psnr_ = 0.0;
     nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
   }

   virtual bool Continue() const {
@@ -42,15 +59,92 @@ class ErrorResilienceTest : public libvpx_test::EncoderTest,
     nframes_++;
   }

+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {
+    frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_GF |
+                      VP8_EFLAG_NO_UPD_ARF);
+    if (droppable_nframes_ > 0 &&
+        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+        if (droppable_frames_[i] == nframes_) {
+          std::cout << "             Encoding droppable frame: "
+                    << droppable_frames_[i] << "\n";
+          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
+                           VP8_EFLAG_NO_UPD_GF |
+                           VP8_EFLAG_NO_UPD_ARF);
+          return;
+        }
+      }
+    }
+  }
+
   double GetAveragePsnr() const {
     if (nframes_)
       return psnr_ / nframes_;
     return 0.0;
   }

+  double GetAverageMismatchPsnr() const {
+    if (mismatch_nframes_)
+      return mismatch_psnr_ / mismatch_nframes_;
+    return 0.0;
+  }
+
+  virtual bool DoDecode() const {
+    if (error_nframes_ > 0 &&
+        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < error_nframes_; ++i) {
+        if (error_frames_[i] == nframes_ - 1) {
+          std::cout << "             Skipping decoding frame: "
+                    << error_frames_[i] << "\n";
+          return 0;
+        }
+      }
+    }
+    return 1;
+  }
+
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+    // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+  }
+
+  void SetErrorFrames(int num, unsigned int *list) {
+    if (num > kMaxErrorFrames)
+      num = kMaxErrorFrames;
+    else if (num < 0)
+      num = 0;
+    error_nframes_ = num;
+    for (unsigned int i = 0; i < error_nframes_; ++i)
+      error_frames_[i] = list[i];
+  }
+
+  void SetDroppableFrames(int num, unsigned int *list) {
+    if (num > kMaxDroppableFrames)
+      num = kMaxDroppableFrames;
+    else if (num < 0)
+      num = 0;
+    droppable_nframes_ = num;
+    for (unsigned int i = 0; i < droppable_nframes_; ++i)
+      droppable_frames_[i] = list[i];
+  }
+
+  unsigned int GetMismatchFrames() {
+    return mismatch_nframes_;
+  }
+
  private:
   double psnr_;
   unsigned int nframes_;
+  unsigned int error_nframes_;
+  unsigned int droppable_nframes_;
+  double mismatch_psnr_;
+  unsigned int mismatch_nframes_;
+  unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int droppable_frames_[kMaxDroppableFrames];
   libvpx_test::TestMode encoding_mode_;
 };

@@ -85,6 +179,49 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
   }
 }

-INSTANTIATE_TEST_CASE_P(OnOffTest, ErrorResilienceTest,
-                        ONE_PASS_TEST_MODES);
+TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+
+  // Set an arbitrary set of error frames same as droppable frames
+  unsigned int num_droppable_frames = 2;
+  unsigned int droppable_frame_list[] = {5, 16};
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set error/droppable frames
+  Reset();
+
+  // Now set an arbitrary set of error frames that are non-droppable
+  unsigned int num_error_frames = 3;
+  unsigned int error_frame_list[] = {3, 10, 20};
+  SetErrorFrames(num_error_frames, error_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that dropping an arbitrary set of inter frames does not hurt too much
+  // Note the Average Mismatch PSNR is the average of the PSNR between
+  // decoded frame and encoder's version of the same frame for all frames
+  // with mismatch.
+  const double psnr_resilience_mismatch = GetAverageMismatchPsnr();
+  std::cout << "             Mismatch PSNR: "
+            << psnr_resilience_mismatch << "\n";
+  EXPECT_GT(psnr_resilience_mismatch, 20.0);
+}
+
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);
+
 }  // namespace
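
[Aside: for readers new to the flag set toggled by PreEncodeFrameHook above: a frame is "droppable" when it updates none of the three reference buffers, so losing it cannot desynchronize prediction. Illustrative restatement, not part of the patch:]

    // The mask PreEncodeFrameHook applies to make a frame droppable.
    const vpx_enc_frame_flags_t kDroppableFrameFlags =
        VP8_EFLAG_NO_UPD_LAST |  // do not update the last-frame buffer
        VP8_EFLAG_NO_UPD_GF |    // do not update the golden-frame buffer
        VP8_EFLAG_NO_UPD_ARF;    // do not update the altref-frame buffer
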
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index ebec890d6e3df99ec478e43de5620ee4172d5ae2..dfb64c3a2e84becce6b375ddbefe60f474648449 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -25,7 +25,7 @@ using libvpx_test::ACMRandom;

 namespace {

-TEST(Vp9FdctTest, SignBiasCheck) {
+TEST(Vp9Fdct4x4Test, SignBiasCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int16_t test_input_block[16];
   int16_t test_output_block[16];
@@ -88,7 +88,7 @@ TEST(Vp9FdctTest, SignBiasCheck) {
   }
 };

-TEST(Vp9FdctTest, RoundTripErrorCheck) {
+TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int max_error = 0;
   double total_error = 0;
@@ -120,7 +120,7 @@
     }

     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4llm_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);

     for (int j = 0; j < 16; ++j) {
       const int diff = test_input_block[j] - test_output_block[j];
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 5967d36c4a597c1bda5856785d3eec21cd64370e..e1b2a07b8193c30ca7dce04e5897336a87faa0f7 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -149,7 +149,7 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) {

     // Initialize a test block with input range {-255, 255}.
     for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;

     const int pitch = 16;
     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index ab71e85669fce8322b110983d3ffe601fe9c3c70..30a1ac3998006df794b38cd235e076b2081c8621 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -120,31 +120,6 @@ TEST(VP9Idct8x8Test, AccuracyCheck) {
       input[j] = rnd.Rand8() - rnd.Rand8();

     const int pitch = 16;
-    vp9_short_fdct8x8_c(input, output_c, pitch);
-    reference_dct_2d(input, output_r);
-
-    for (int j = 0; j < 64; ++j) {
-      const double diff = output_c[j] - output_r[j];
-      const double error = diff * diff;
-      // An error in a DCT coefficient isn't that bad.
-      // We care more about the reconstructed pixels.
-      EXPECT_GE(2.0, error)
-          << "Error: 8x8 FDCT/IDCT has error " << error
-          << " at index " << j;
-    }
-
-#if 0
-    // Tests that the reference iDCT and fDCT match.
-    reference_dct_2d(input, output_r);
-    reference_idct_2d(output_r, output_c);
-    for (int j = 0; j < 64; ++j) {
-      const int diff = output_c[j] -input[j];
-      const int error = diff * diff;
-      EXPECT_EQ(0, error)
-          << "Error: 8x8 FDCT/IDCT has error " << error
-          << " at index " << j;
-    }
-#endif
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
       coeff[j] = round(output_r[j]);
diff --git a/test/idctllm_test.cc b/test/idct_test.cc
similarity index 72%
rename from test/idctllm_test.cc
rename to test/idct_test.cc
index d6fdffea5fbf85fbeb556cd30e5eac42ab32b007..51fb65a4315df44c3f2afa9ed6a8122fe160ce90 100644
--- a/test/idctllm_test.cc
+++ b/test/idct_test.cc
@@ -10,8 +10,8 @@

 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 }
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -20,18 +20,16 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
                           int pred_stride, unsigned char *dst_ptr,
                           int dst_stride);
 namespace {
-class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
-{
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
   protected:
-    virtual void SetUp()
-    {
+    virtual void SetUp() {
       int i;

       UUT = GetParam();
       memset(input, 0, sizeof(input));
       /* Set up guard blocks */
-      for(i=0; i<256; i++)
-        output[i] = ((i&0xF)<4&&(i<64))?0:-1;
+      for (i = 0; i < 256; i++)
+        output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
     }

     idct_fn_t UUT;
@@ -40,78 +38,72 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
     unsigned char predict[256];
 };

-TEST_P(IDCTTest, TestGuardBlocks)
-{
+TEST_P(IDCTTest, TestGuardBlocks) {
   int i;

-  for(i=0; i<256; i++)
-    if((i&0xF) < 4 && i<64)
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
       EXPECT_EQ(0, output[i]) << i;
     else
       EXPECT_EQ(255, output[i]);
 }

-TEST_P(IDCTTest, TestAllZeros)
-{
+TEST_P(IDCTTest, TestAllZeros) {
   int i;

   REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for(i=0; i<256; i++)
-    if((i&0xF) < 4 && i<64)
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
       EXPECT_EQ(0, output[i]) << "i==" << i;
     else
       EXPECT_EQ(255, output[i]) << "i==" << i;
 }

-TEST_P(IDCTTest, TestAllOnes)
-{
+TEST_P(IDCTTest, TestAllOnes) {
   int i;

   input[0] = 4;
   REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for(i=0; i<256; i++)
-    if((i&0xF) < 4 && i<64)
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
       EXPECT_EQ(1, output[i]) << "i==" << i;
     else
       EXPECT_EQ(255, output[i]) << "i==" << i;
 }

-TEST_P(IDCTTest, TestAddOne)
-{
+TEST_P(IDCTTest, TestAddOne) {
   int i;

-  for(i=0; i<256; i++)
+  for (i = 0; i < 256; i++)
     predict[i] = i;
-
   input[0] = 4;
   REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

-  for(i=0; i<256; i++)
-    if((i&0xF) < 4 && i<64)
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) < 4 && i < 64)
       EXPECT_EQ(i+1, output[i]) << "i==" << i;
     else
       EXPECT_EQ(255, output[i]) << "i==" << i;
 }

-TEST_P(IDCTTest, TestWithData)
-{
+TEST_P(IDCTTest, TestWithData) {
   int i;

-  for(i=0; i<16; i++)
+  for (i = 0; i < 16; i++)
     input[i] = i;

   REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-  for(i=0; i<256; i++)
-    if((i&0xF) > 3 || i>63)
+  for (i = 0; i < 256; i++)
+    if ((i & 0xF) > 3 || i > 63)
       EXPECT_EQ(255, output[i]) << "i==" << i;
-    else if(i == 0)
+    else if (i == 0)
       EXPECT_EQ(11, output[i]) << "i==" << i;
-    else if(i == 34)
+    else if (i == 34)
       EXPECT_EQ(1, output[i]) << "i==" << i;
-    else if(i == 2 || i == 17 || i == 32)
+    else if (i == 2 || i == 17 || i == 32)
       EXPECT_EQ(3, output[i]) << "i==" << i;
     else
       EXPECT_EQ(0, output[i]) << "i==" << i;
diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
index d0c81df99825a8e1a5f50505ec477b39ed89a438..85ca0b97556e0f614760565e656b68bcf6a9e0f2 100644
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -9,18 +9,22 @@
  */
 #include <climits>
 #include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/util.h"

 namespace {

 class KeyframeTest : public ::libvpx_test::EncoderTest,
-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
+  KeyframeTest() : EncoderTest(GET_PARAM(0)) {}
+
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(GetParam());
+    SetMode(GET_PARAM(1));
     kf_count_ = 0;
     kf_count_max_ = INT_MAX;
     kf_do_force_kf_ = false;
@@ -64,7 +68,7 @@ TEST_P(KeyframeTest, TestRandomVideoSource) {
   // In realtime mode - auto placed keyframes are exceedingly rare, don't
   // bother with this check if(GetParam() > 0)
-  if(GetParam() > 0)
+  if (GET_PARAM(1) > 0)
     EXPECT_GT(kf_count_, 1);
 }

@@ -126,7 +130,7 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
   // In realtime mode - auto placed keyframes are exceedingly rare, don't
   // bother with this check
-  if(GetParam() > 0)
+  if (GET_PARAM(1) > 0)
     EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";

   // Verify that keyframes match the file keyframes in the file.
@@ -141,5 +145,5 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
   }
 }

-INSTANTIATE_TEST_CASE_P(AllModes, KeyframeTest, ALL_TEST_MODES);
+VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES);
 }  // namespace
(img->d_w + 1) >> 1 : img->d_w; + + for (int y = 0; y < h; ++y) { + MD5Update(&md5_, buf, w); + buf += img->stride[plane]; + } + } + } + + const char *Get(void) { + static const char hex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + }; + uint8_t tmp[16]; + MD5Context ctx_tmp = md5_; + + MD5Final(tmp, &ctx_tmp); + for (int i = 0; i < 16; i++) { + res_[i * 2 + 0] = hex[tmp[i] >> 4]; + res_[i * 2 + 1] = hex[tmp[i] & 0xf]; + } + res_[32] = 0; + + return res_; + } + + protected: + char res_[33]; + MD5Context md5_; +}; + +} // namespace libvpx_test + +#endif // LIBVPX_TEST_MD5_HELPER_H_ diff --git a/test/resize_test.cc b/test/resize_test.cc index c846157eadcd9bf09f7bb2e6b27f3f80b5a48723..0d591ad87dbf1ecfb69671769ffead1e60970a7e 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -9,9 +9,12 @@ */ #include <climits> #include <vector> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" #include "test/encode_test_driver.h" +#include "test/i420_video_source.h" #include "test/video_source.h" -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/util.h" namespace { @@ -49,8 +52,10 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { }; class ResizeTest : public ::libvpx_test::EncoderTest, - public ::testing::TestWithParam<enum libvpx_test::TestMode> { + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: + ResizeTest() : EncoderTest(GET_PARAM(0)) {} + struct FrameInfo { FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h) : pts(_pts), w(_w), h(_h) {} @@ -62,22 +67,16 @@ class ResizeTest : public ::libvpx_test::EncoderTest, virtual void SetUp() { InitializeConfig(); - SetMode(GetParam()); + SetMode(GET_PARAM(1)); } virtual bool Continue() const { return !HasFatalFailure() && !abort_; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { - const unsigned char *buf = - reinterpret_cast<const unsigned char *>(pkt->data.frame.buf); - const unsigned int w = (buf[6] | (buf[7] << 8)) & 0x3fff; - const unsigned int h = (buf[8] | (buf[9] << 8)) & 0x3fff; - - frame_info_list_.push_back(FrameInfo(pkt->data.frame.pts, w, h)); - } + virtual void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } std::vector< FrameInfo > frame_info_list_; @@ -100,5 +99,53 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { } } -INSTANTIATE_TEST_CASE_P(OnePass, ResizeTest, ONE_PASS_TEST_MODES); +class ResizeInternalTest : public ResizeTest { + protected: + ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 3) { + struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE}; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + } + if (video->frame() == 6) { + struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL}; + encoder->Control(VP8E_SET_SCALEMODE, &mode); + } + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + if (!frame0_psnr_) + frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0); + } + + double frame0_psnr_; +}; + +TEST_P(ResizeInternalTest, TestInternalResizeWorks) { + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + init_flags_ = VPX_CODEC_USE_PSNR; + // q picked such 
that initial keyframe on this clip is ~30dB PSNR + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const vpx_codec_pts_t pts = info->pts; + if (pts >= 3 && pts < 6) { + ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width"; + ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height"; + } else { + EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width"; + EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height"; + } + } +} + +VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest, + ::testing::Values(::libvpx_test::kOnePassBest)); } // namespace diff --git a/test/sad_test.cc b/test/sad_test.cc index 72741a901e02406ad804809383e3224d249983b1..165e2c8f0c58182513ba2a63105f70848c5ad63f 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -15,8 +15,13 @@ extern "C" { #include "./vpx_config.h" +#if CONFIG_VP8_ENCODER #include "./vp8_rtcd.h" -#include "vp8/common/blockd.h" +//#include "vp8/common/blockd.h" +#endif +#if CONFIG_VP9_ENCODER +#include "./vp9_rtcd.h" +#endif #include "vpx_mem/vpx_mem.h" } @@ -32,14 +37,22 @@ typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr, int reference_stride, unsigned int max_sad); +typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr, + int src_stride, + const unsigned char * const ref_ptr[], + int ref_stride, + unsigned int *sad_array); + using libvpx_test::ACMRandom; namespace { -class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) { +class SADTestBase : public ::testing::Test { public: + SADTestBase(int width, int height) : width_(width), height_(height) {} + static void SetUpTestCase() { source_data_ = reinterpret_cast<uint8_t*>( - vpx_memalign(kDataAlignment, kDataBufferSize)); + vpx_memalign(kDataAlignment, kDataBlockSize)); reference_data_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kDataBufferSize)); } @@ -52,36 +65,31 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) { } protected: + // Handle blocks up to 4 blocks 64x64 with stride up to 128 static const int kDataAlignment = 16; - static const int kDataBufferSize = 16 * 32; + static const int kDataBlockSize = 64 * 128; + static const int kDataBufferSize = 4 * kDataBlockSize; virtual void SetUp() { - sad_fn_ = GET_PARAM(2); - height_ = GET_PARAM(1); - width_ = GET_PARAM(0); - source_stride_ = width_ * 2; + source_stride_ = (width_ + 31) & ~31; reference_stride_ = width_ * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); } - sad_m_by_n_fn_t sad_fn_; - virtual unsigned int SAD(unsigned int max_sad) { - unsigned int ret; - REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_, - reference_data_, reference_stride_, - max_sad)); - return ret; + virtual uint8_t* GetReference(int block_idx) { + return reference_data_ + block_idx * kDataBlockSize; } // Sum of Absolute Differences. Given two blocks, calculate the absolute // difference between two pixels in the same relative location; accumulate. 
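The sad_n_by_n_by_4_fn_t typedef above is the contract exercised by the new SADx4Test: one source block is scored against four candidate reference blocks in a single call, with one SAD per candidate written to sad_array. A minimal scalar sketch of that contract, assuming 8-bit samples (the function name is illustrative, not part of the test driver):

    // Scalar model of an x4 SAD: results[i] is the plain SAD of the source
    // block against reference block i. abs() comes from <cstdlib>.
    static void reference_sad_x4(const uint8_t *src, int src_stride,
                                 const uint8_t *const ref[4], int ref_stride,
                                 int width, int height,
                                 unsigned int *results) {
      for (int i = 0; i < 4; ++i) {
        unsigned int sad = 0;
        for (int h = 0; h < height; ++h)
          for (int w = 0; w < width; ++w)
            sad += abs(src[h * src_stride + w] - ref[i][h * ref_stride + w]);
        results[i] = sad;
      }
    }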
- unsigned int ReferenceSAD(unsigned int max_sad) { + unsigned int ReferenceSAD(unsigned int max_sad, int block_idx = 0) { unsigned int sad = 0; + const uint8_t* const reference = GetReference(block_idx); for (int h = 0; h < height_; ++h) { for (int w = 0; w < width_; ++w) { sad += abs(source_data_[h * source_stride_ + w] - - reference_data_[h * reference_stride_ + w]); + - reference[h * reference_stride_ + w]); } if (sad > max_sad) { break; @@ -106,6 +114,32 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) { } } + int width_, height_; + static uint8_t* source_data_; + int source_stride_; + static uint8_t* reference_data_; + int reference_stride_; + + ACMRandom rnd_; +}; + +class SADTest : public SADTestBase, + public ::testing::WithParamInterface< + std::tr1::tuple<int, int, sad_m_by_n_fn_t> > { + public: + SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + unsigned int SAD(unsigned int max_sad, int block_idx = 0) { + unsigned int ret; + const uint8_t* const reference = GetReference(block_idx); + + REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + max_sad)); + return ret; + } + void CheckSad(unsigned int max_sad) { unsigned int reference_sad, exp_sad; @@ -119,19 +153,38 @@ class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) { ASSERT_GE(exp_sad, reference_sad); } } +}; - // Handle blocks up to 16x16 with stride up to 32 - int height_, width_; - static uint8_t* source_data_; - int source_stride_; - static uint8_t* reference_data_; - int reference_stride_; +class SADx4Test : public SADTestBase, + public ::testing::WithParamInterface< + std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> > { + public: + SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} - ACMRandom rnd_; + protected: + void SADs(unsigned int *results) { + const uint8_t* refs[] = {GetReference(0), GetReference(1), + GetReference(2), GetReference(3)}; + + REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_, + refs, reference_stride_, + results)); + } + + void CheckSADs() { + unsigned int reference_sad, exp_sad[4]; + + SADs(exp_sad); + for (int block = 0; block < 4; block++) { + reference_sad = ReferenceSAD(UINT_MAX, block); + + EXPECT_EQ(exp_sad[block], reference_sad) << "block " << block; + } + } }; -uint8_t* SADTest::source_data_ = NULL; -uint8_t* SADTest::reference_data_ = NULL; +uint8_t* SADTestBase::source_data_ = NULL; +uint8_t* SADTestBase::reference_data_ = NULL; TEST_P(SADTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); @@ -139,12 +192,30 @@ TEST_P(SADTest, MaxRef) { CheckSad(UINT_MAX); } +TEST_P(SADx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, 255); + FillConstant(GetReference(1), reference_stride_, 255); + FillConstant(GetReference(2), reference_stride_, 255); + FillConstant(GetReference(3), reference_stride_, 255); + CheckSADs(); +} + TEST_P(SADTest, MaxSrc) { FillConstant(source_data_, source_stride_, 255); FillConstant(reference_data_, reference_stride_, 0); CheckSad(UINT_MAX); } +TEST_P(SADx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, 255); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + TEST_P(SADTest, ShortRef) { int tmp_stride = reference_stride_; reference_stride_ >>= 1; @@ -154,6 +225,18 @@ TEST_P(SADTest, ShortRef) 
{ reference_stride_ = tmp_stride; } +TEST_P(SADx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + TEST_P(SADTest, UnalignedRef) { // The reference frame, but not the source frame, may be unaligned for // certain types of searches. @@ -165,6 +248,20 @@ TEST_P(SADTest, UnalignedRef) { reference_stride_ = tmp_stride; } +TEST_P(SADx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + TEST_P(SADTest, ShortSrc) { int tmp_stride = source_stride_; source_stride_ >>= 1; @@ -174,6 +271,18 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + TEST_P(SADTest, MaxSAD) { // Verify that, when max_sad is set, the implementation does not return a // value lower than the reference. 
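The max_sad cutoff matters in a motion search loop, where a candidate whose running SAD has already passed the best result so far can never win, so its exact total is irrelevant. A hedged sketch of that calling pattern, using the sad_m_by_n_fn_t typedef above (pick_best_ref and its arguments are illustrative, not libvpx API):

    static int pick_best_ref(sad_m_by_n_fn_t sad_fn,
                             const uint8_t *src, int src_stride,
                             const uint8_t *const refs[], int ref_stride,
                             int num_candidates) {
      unsigned int best_sad = UINT_MAX;  // UINT_MAX from <climits>
      int best_idx = -1;
      for (int i = 0; i < num_candidates; ++i) {
        // Pass the best SAD so far as max_sad: an implementation may stop
        // accumulating once it exceeds that bound, and any partial value it
        // then returns is above best_sad, so the comparison below still
        // rejects the candidate correctly.
        const unsigned int sad =
            sad_fn(src, src_stride, refs[i], ref_stride, best_sad);
        if (sad < best_sad) {
          best_sad = sad;
          best_idx = i;
        }
      }
      return best_idx;
    }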
@@ -184,17 +293,61 @@ TEST_P(SADTest, MaxSAD) { using std::tr1::make_tuple; +#if CONFIG_VP8_ENCODER && CONFIG_VP9_ENCODER +#define VP8_VP9_SEPARATOR , +#else +#define VP8_VP9_SEPARATOR +#endif + +#if CONFIG_VP8_ENCODER const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c; const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c; const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c; const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c; const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c; +#endif +#if CONFIG_VP9_ENCODER +const sad_m_by_n_fn_t sad_64x64_c_vp9 = vp9_sad64x64_c; +const sad_m_by_n_fn_t sad_32x32_c_vp9 = vp9_sad32x32_c; +const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c; +const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c; +const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c; +const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c; +const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c; +#endif INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values( +#if CONFIG_VP8_ENCODER make_tuple(16, 16, sad_16x16_c), make_tuple(8, 16, sad_8x16_c), make_tuple(16, 8, sad_16x8_c), make_tuple(8, 8, sad_8x8_c), - make_tuple(4, 4, sad_4x4_c))); + make_tuple(4, 4, sad_4x4_c) +#endif + VP8_VP9_SEPARATOR +#if CONFIG_VP9_ENCODER + make_tuple(64, 64, sad_64x64_c_vp9), + make_tuple(32, 32, sad_32x32_c_vp9), + make_tuple(16, 16, sad_16x16_c_vp9), + make_tuple(8, 16, sad_8x16_c_vp9), + make_tuple(16, 8, sad_16x8_c_vp9), + make_tuple(8, 8, sad_8x8_c_vp9), + make_tuple(4, 4, sad_4x4_c_vp9) +#endif + )); + +#if CONFIG_VP9_ENCODER +const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c; +const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c; +const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c; +const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c; +const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c; +INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( + make_tuple(64, 64, sad_64x64x4d_c), + make_tuple(32, 32, sad_32x32x4d_c), + make_tuple(16, 16, sad_16x16x4d_c), + make_tuple(8, 8, sad_8x8x4d_c), + make_tuple(4, 4, sad_4x4x4d_c))); +#endif // ARM tests #if HAVE_MEDIA @@ -219,31 +372,120 @@ INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( // X86 tests #if HAVE_MMX +#if CONFIG_VP8_ENCODER const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx; const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx; const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx; const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx; const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx; +#endif +#if CONFIG_VP9_ENCODER +const sad_m_by_n_fn_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; +const sad_m_by_n_fn_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; +const sad_m_by_n_fn_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; +const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; +const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; +#endif + INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values( +#if CONFIG_VP8_ENCODER make_tuple(16, 16, sad_16x16_mmx), make_tuple(8, 16, sad_8x16_mmx), make_tuple(16, 8, sad_16x8_mmx), make_tuple(8, 8, sad_8x8_mmx), - make_tuple(4, 4, sad_4x4_mmx))); + make_tuple(4, 4, sad_4x4_mmx) +#endif + VP8_VP9_SEPARATOR +#if CONFIG_VP9_ENCODER + make_tuple(16, 16, sad_16x16_mmx_vp9), + make_tuple(8, 16, sad_8x16_mmx_vp9), + make_tuple(16, 8, sad_16x8_mmx_vp9), + make_tuple(8, 8, sad_8x8_mmx_vp9), + make_tuple(4, 4, sad_4x4_mmx_vp9) +#endif + )); +#endif + +#if HAVE_SSE +#if CONFIG_VP9_ENCODER +const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; +INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values( + make_tuple(4, 
4, sad_4x4_sse_vp9))); + +const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse; +INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( + make_tuple(4, 4, sad_4x4x4d_sse))); #endif +#endif + #if HAVE_SSE2 +#if CONFIG_VP8_ENCODER const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt; const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt; const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt; const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; +#endif +#if CONFIG_VP9_ENCODER +const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; +const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; +const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; +const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; +const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; +const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; +#endif INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values( +#if CONFIG_VP8_ENCODER make_tuple(16, 16, sad_16x16_wmt), make_tuple(8, 16, sad_8x16_wmt), make_tuple(16, 8, sad_16x8_wmt), make_tuple(8, 8, sad_8x8_wmt), - make_tuple(4, 4, sad_4x4_wmt))); + make_tuple(4, 4, sad_4x4_wmt) +#endif + VP8_VP9_SEPARATOR +#if CONFIG_VP9_ENCODER + make_tuple(64, 64, sad_64x64_sse2_vp9), + make_tuple(32, 32, sad_32x32_sse2_vp9), + make_tuple(16, 16, sad_16x16_sse2_vp9), + make_tuple(8, 16, sad_8x16_sse2_vp9), + make_tuple(16, 8, sad_16x8_sse2_vp9), + make_tuple(8, 8, sad_8x8_sse2_vp9) +#endif + )); + +#if CONFIG_VP9_ENCODER +const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; +INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( + make_tuple(64, 64, sad_64x64x4d_sse2), + make_tuple(32, 32, sad_32x32x4d_sse2), + make_tuple(16, 16, sad_16x16x4d_sse2), + make_tuple(16, 8, sad_16x8x4d_sse2), + make_tuple(8, 16, sad_8x16x4d_sse2), + make_tuple(8, 8, sad_8x8x4d_sse2))); #endif +#endif + +#if HAVE_SSE3 +#if CONFIG_VP8_ENCODER +const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3; +const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3; +const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; +const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; +const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; +INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( + make_tuple(16, 16, sad_16x16x4d_sse3), + make_tuple(16, 8, sad_16x8x4d_sse3), + make_tuple(8, 16, sad_8x16x4d_sse3), + make_tuple(8, 8, sad_8x8x4d_sse3), + make_tuple(4, 4, sad_4x4x4d_sse3))); +#endif +#endif + #if HAVE_SSSE3 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( diff --git a/test/superframe_test.cc b/test/superframe_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..52faddb43e4890bbf6816dfaedbe94065d555803 --- /dev/null +++ b/test/superframe_test.cc @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <cstring>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class SuperframeTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(NULL),
+                     last_sf_pts_(0) {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    sf_count_ = 0;
+    sf_count_max_ = INT_MAX;
+  }
+
+  virtual void TearDown() {
+    delete[] modified_buf_;  // allocated with new[], so delete[] is required
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+      return pkt;
+
+    const uint8_t *buffer = reinterpret_cast<uint8_t*>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];
+    const int frames = (marker & 0x7) + 1;
+    const int mag = ((marker >> 3) & 3) + 1;
+    const unsigned int index_sz = 2 + mag * frames;
+    if ((marker & 0xe0) == 0xc0 &&
+        pkt->data.frame.sz >= index_sz &&
+        buffer[pkt->data.frame.sz - index_sz] == marker) {
+      // Frame is a superframe. Strip off the index.
+      delete[] modified_buf_;
+      modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
+      memcpy(modified_buf_, pkt->data.frame.buf,
+             pkt->data.frame.sz - index_sz);
+      modified_pkt_ = *pkt;
+      modified_pkt_.data.frame.buf = modified_buf_;
+      modified_pkt_.data.frame.sz -= index_sz;
+
+      sf_count_++;
+      last_sf_pts_ = pkt->data.frame.pts;
+      return &modified_pkt_;
+    }
+
+    // Make sure we do a few frames after the last SF.
+    abort_ |= sf_count_ > sf_count_max_ &&
+              pkt->data.frame.pts - last_sf_pts_ >= 5;
+    return pkt;
+  }
+
+  int sf_count_;
+  int sf_count_max_;
+  vpx_codec_cx_pkt_t modified_pkt_;
+  uint8_t *modified_buf_;
+  vpx_codec_pts_t last_sf_pts_;
+};
+
+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
+  sf_count_max_ = 0;  // Early exit on successful test.
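+  // The superframe index stripped by MutateEncoderOutputHook above sits at
+  // the end of the packet: a marker byte (top three bits 0b110, bits 3-4 =
+  // mag - 1, bits 0-2 = frames - 1), then 'mag' size bytes per frame, then
+  // the same marker byte repeated, i.e. index_sz = 2 + mag * frames.
+  // Removing it must leave a stream the decoder still accepts, which is
+  // what this run verifies.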
+ cfg_.g_lag_in_frames = 25; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(sf_count_, 1); +} + +VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values( + ::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/test/test.mk b/test/test.mk index a1345b876e5b71b64f57557e41d4f1bc6086f80a..0d069d026fe92295d15312369786f750733eae09 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,7 +1,8 @@ LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk LIBVPX_TEST_SRCS-yes += acm_random.h - +LIBVPX_TEST_SRCS-yes += md5_helper.h +LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += util.h LIBVPX_TEST_SRCS-yes += video_source.h @@ -15,17 +16,20 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += error_resilience_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h + +LIBVPX_TEST_SRCS-yes += encode_test_driver.cc +LIBVPX_TEST_SRCS-yes += encode_test_driver.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ../md5_utils.h ../md5_utils.c -LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.h -LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ivf_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c +LIBVPX_TEST_SRCS-yes += decode_test_driver.cc +LIBVPX_TEST_SRCS-yes += decode_test_driver.h +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h + + LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc ## @@ -44,10 +48,10 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes) LIBVPX_TEST_SRCS-yes += vp8_boolcoder_test.cc endif -LIBVPX_TEST_SRCS-yes += idctllm_test.cc +LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += intrapred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc -LIBVPX_TEST_SRCS-yes += sad_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc @@ -66,13 +70,18 @@ LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc # IDCT test currently depends on FDCT function LIBVPX_TEST_SRCS-yes += idct8x8_test.cc +LIBVPX_TEST_SRCS-yes += superframe_test.cc +LIBVPX_TEST_SRCS-yes += tile_independence_test.cc endif +LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc + LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc -#LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc + endif # VP9 @@ -82,7 +91,8 @@ endif ## ## TEST DATA ## -LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv 
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv + LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index 938457b2bc5293bca5b0a7513317609c347c4715..e0d99b5dd866588a6a0fe363301333477d9e7853 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -12,17 +12,15 @@ #include <cstdlib> #include <string> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/ivf_video_source.h" +#include "test/util.h" +#include "test/md5_helper.h" extern "C" { -#include "./md5_utils.h" #include "vpx_mem/vpx_mem.h" } -#if defined(_MSC_VER) -#define snprintf sprintf_s -#endif - namespace { // There are 61 test vectors in total. const char *kTestVectors[] = { @@ -59,10 +57,10 @@ const char *kTestVectors[] = { "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf" }; -class TestVectorTest : public libvpx_test::DecoderTest, - public ::testing::TestWithParam<const char*> { +class TestVectorTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam<const char*> { protected: - TestVectorTest() : md5_file_(NULL) {} + TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {} virtual ~TestVectorTest() { if (md5_file_) @@ -85,30 +83,9 @@ class TestVectorTest : public libvpx_test::DecoderTest, ASSERT_NE(res, EOF) << "Read md5 data failed"; expected_md5[32] = '\0'; - MD5Context md5; - MD5Init(&md5); - - // Compute and update md5 for each raw in decompressed data. - for (int plane = 0; plane < 3; ++plane) { - uint8_t *buf = img.planes[plane]; - - for (unsigned int y = 0; y < (plane ? (img.d_h + 1) >> 1 : img.d_h); - ++y) { - MD5Update(&md5, buf, (plane ? (img.d_w + 1) >> 1 : img.d_w)); - buf += img.stride[plane]; - } - } - - uint8_t md5_sum[16]; - MD5Final(md5_sum, &md5); - - char actual_md5[33]; - // Convert to get the actual md5. - for (int i = 0; i < 16; i++) { - snprintf(&actual_md5[i * 2], sizeof(actual_md5) - i * 2, "%02x", - md5_sum[i]); - } - actual_md5[32] = '\0'; + ::libvpx_test::MD5 md5_res; + md5_res.Add(&img); + const char *actual_md5 = md5_res.Get(); // Check md5 match. ASSERT_STREQ(expected_md5, actual_md5) @@ -124,7 +101,7 @@ class TestVectorTest : public libvpx_test::DecoderTest, // checksums match the correct md5 data, then the test is passed. Otherwise, // the test failed. TEST_P(TestVectorTest, MD5Match) { - const std::string filename = GetParam(); + const std::string filename = GET_PARAM(1); // Open compressed video file. libvpx_test::IVFVideoSource video(filename); @@ -138,7 +115,7 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -INSTANTIATE_TEST_CASE_P(TestVectorSequence, TestVectorTest, - ::testing::ValuesIn(kTestVectors)); +VP8_INSTANTIATE_TEST_CASE(TestVectorTest, + ::testing::ValuesIn(kTestVectors)); } // namespace diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..711d0bd45e82fe00a543736552147aad93f372f6 --- /dev/null +++ b/test/tile_independence_test.cc @@ -0,0 +1,102 @@ +/* + Copyright (c) 2012 The WebM project authors. All Rights Reserved. + + Use of this source code is governed by a BSD-style license + that can be found in the LICENSE file in the root of the source + tree. 
An additional intellectual property rights grant can be found + in the file PATENTS. All contributing project authors may + be found in the AUTHORS file in the root of the source tree. + */ + +#include <cstdio> +#include <cstdlib> +#include <string> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/md5_helper.h" +extern "C" { +#include "vpx_mem/vpx_mem.h" +} + +namespace { +class TileIndependenceTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)), + md5_fw_order_(), md5_inv_order_() { + init_flags_ = VPX_CODEC_USE_PSNR; + vpx_codec_dec_cfg_t cfg; + cfg.w = 704; + cfg.h = 144; + cfg.threads = 1; + fw_dec_ = codec_->CreateDecoder(cfg, 0); + inv_dec_ = codec_->CreateDecoder(cfg, 0); + inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); + } + + virtual ~TileIndependenceTest() { + delete fw_dec_; + delete inv_dec_; + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(libvpx_test::kTwoPassGood); + } + + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); + } + } + + void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt, + ::libvpx_test::MD5 *md5) { + dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz); + const vpx_image_t *img = dec->GetDxData().Next(); + md5->Add(img); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + UpdateMD5(fw_dec_, pkt, &md5_fw_order_); + UpdateMD5(inv_dec_, pkt, &md5_inv_order_); + } + + private: + int n_tiles_; + protected: + ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_; + ::libvpx_test::Decoder *fw_dec_, *inv_dec_; +}; + +// run an encode with 2 or 4 tiles, and do the decode both in normal and +// inverted tile ordering. Ensure that the MD5 of the output in both cases +// is identical. If so, tiles are considered independent and the test passes. +TEST_P(TileIndependenceTest, MD5Match) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = VPX_VBR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144, + timebase.den, timebase.num, 0, 30); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const char *md5_fw_str = md5_fw_order_.Get(); + const char *md5_inv_str = md5_inv_order_.Get(); + + // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer + // output if it fails. Not sure if it's helpful since it's really just + // a MD5... + ASSERT_STREQ(md5_fw_str, md5_inv_str); +} + +VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, + ::testing::Range(0, 2, 1)); + +} // namespace diff --git a/test/util.h b/test/util.h index 06a70cc8e412d32dd9e98cce39606a6a912d1c82..533a1db5cebb107087b6c3901e9f3a2d8ff6b918 100644 --- a/test/util.h +++ b/test/util.h @@ -11,8 +11,38 @@ #ifndef TEST_UTIL_H_ #define TEST_UTIL_H_ +#include <stdio.h> +#include <math.h> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx/vpx_image.h" + // Macros #define PARAMS(...) 
::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > > #define GET_PARAM(k) std::tr1::get< k >(GetParam()) +static double compute_psnr(const vpx_image_t *img1, + const vpx_image_t *img2) { + assert((img1->fmt == img2->fmt) && + (img1->d_w == img2->d_w) && + (img1->d_h == img2->d_h)); + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i, j; + + int64_t sqrerr = 0; + for (i = 0; i < height_y; ++i) + for (j = 0; j < width_y; ++j) { + int64_t d = img1->planes[VPX_PLANE_Y][i * img1->stride[VPX_PLANE_Y] + j] - + img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j]; + sqrerr += d * d; + } + double mse = sqrerr / (width_y * height_y); + double psnr = 100.0; + if (mse > 0.0) { + psnr = 10 * log10(255.0 * 255.0 / mse); + } + return psnr; +} + #endif // TEST_UTIL_H_ diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 019c255f698dbcd62a2db5a009194c79695179b7..2db309658c5b94f3badd6f02fb887032c5769066 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -302,7 +302,7 @@ int check_fragments_for_errors(VP8D_COMP *pbi) return 1; } - + int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, const uint8_t *source, int64_t time_stamp) diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 8f94171e692e9a5a3374d6f352b1f0b23424dd47..ca680f9a5f15dea36e41ced1542ce0e738784a8b 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -50,7 +50,7 @@ const int vp8cx_base_skip_false_prob[128] = unsigned __int64 Sectionbits[500]; #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS int intra_mode_stats[10][10][10]; static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2]; extern unsigned int active_section; @@ -531,7 +531,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) vp8_convert_rfct_to_prob(cpi); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 1; #endif @@ -580,7 +580,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) xd->mb_to_top_edge = -((mb_row * 16)) << 3; xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 9; #endif @@ -593,7 +593,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) if (rf == INTRA_FRAME) { vp8_write(w, 0, cpi->prob_intra_coded); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 6; #endif write_ymode(w, mode, pc->fc.ymode_prob); @@ -633,13 +633,13 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) vp8_mv_ref_probs(mv_ref_p, ct); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS accum_mv_refs(mode, ct); #endif } -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 3; #endif @@ -649,7 +649,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { case NEWMV: -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 5; #endif @@ -692,7 +692,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) if (blockmode == NEW4X4) { -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 11; #endif write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc); @@ -769,7 +769,7 @@ static void write_kfmodes(VP8_COMP *cpi) const B_PREDICTION_MODE L = left_block_mode(m, i); const int bm = m->bmi[i].as_mode; -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS ++intra_mode_stats [A] [L] [bm]; #endif @@ -1160,7 +1160,7 @@ void vp8_update_coef_probs(VP8_COMP *cpi) #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS ++ tree_update_hist [i][j][k][t] [u]; #endif @@ -1181,7 +1181,7 
@@ void vp8_update_coef_probs(VP8_COMP *cpi) while (++t < ENTROPY_NODES); /* Accum token counts for generation of default statistics */ -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS t = 0; do @@ -1527,7 +1527,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS if (pc->frame_type == INTER_FRAME) active_section = 0; @@ -1550,7 +1550,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_update_coef_probs(cpi); #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 2; #endif @@ -1561,7 +1561,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest { write_kfmodes(cpi); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 8; #endif } @@ -1569,7 +1569,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest { pack_inter_mode_mvs(cpi); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 1; #endif } @@ -1687,7 +1687,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest #endif } -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS void print_tree_update_probs() { int i, j, k, l; diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c index 74770a276a02e8dfbc2b4fcf8419d8331e33cc7c..3b0c03a142de2d6c1a35b42e4bd6f6e3605ae573 100644 --- a/vp8/encoder/boolhuff.c +++ b/vp8/encoder/boolhuff.c @@ -16,7 +16,7 @@ unsigned __int64 Sectionbits[500]; #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS unsigned int active_section = 0; #endif diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 83090630672a31f755989af592383904810f447c..39ab586b52b369aca0973de504880961c47e0612 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -67,7 +67,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) unsigned int lowvalue = br->lowvalue; register unsigned int shift; -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS #if defined(SECTIONBITS_OUTPUT) if (bit) diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c index 0c43d0692174b13ca7aa126efa003718dc2f4aab..2a74ff4ae3a1313a47b23733850627e4054cf08e 100644 --- a/vp8/encoder/encodemv.c +++ b/vp8/encoder/encodemv.c @@ -16,7 +16,7 @@ #include <math.h> -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS extern unsigned int active_section; #endif @@ -359,7 +359,7 @@ void vp8_write_mvprobs(VP8_COMP *cpi) vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = {0, 0}; -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 4; #endif write_component_probs( @@ -374,7 +374,7 @@ void vp8_write_mvprobs(VP8_COMP *cpi) if (flags[0] || flags[1]) vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS active_section = 5; #endif } diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index a34af64280702b800485f701860420b370e55d43..2c59872f76caa9648af6b00cab78d4e865f4264f 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -18,7 +18,7 @@ #include <math.h> #include "vp8/common/findnearmv.h" -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS static int mv_ref_ct [31] [4] [2]; static int mv_mode_cts [4] [2]; #endif @@ -1912,7 +1912,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#ifdef ENTROPY_STATS +#ifdef 
VP8_ENTROPY_STATS void print_mode_context(void) { FILE *f = fopen("modecont.c", "w"); @@ -1965,8 +1965,8 @@ void print_mode_context(void) fclose(f); } -/* MV ref count ENTROPY_STATS stats code */ -#ifdef ENTROPY_STATS +/* MV ref count VP8_ENTROPY_STATS stats code */ +#ifdef VP8_ENTROPY_STATS void init_mv_ref_counts() { vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); @@ -2020,6 +2020,6 @@ void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) } } -#endif/* END MV ref count ENTROPY_STATS stats code */ +#endif/* END MV ref count VP8_ENTROPY_STATS stats code */ #endif diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 890113f9a433b6a6e7decad114baf43b88129854..e36c51543cb0d3d5ebc16a6c36b5330c002efb09 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -15,7 +15,7 @@ #include "block.h" #include "vp8/common/variance.h" -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS extern void init_mv_ref_counts(); extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 10d3cc8d252746356a0ba7000a84894d18dfb501..916137b49a008a7749661e3311cd6193de521d9b 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -111,7 +111,7 @@ extern int skip_false_count; #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS extern int intra_mode_stats[10][10][10]; #endif @@ -1805,7 +1805,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) else cpi->cyclic_refresh_map = (signed char *) NULL; -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS init_context_counters(); #endif @@ -1923,7 +1923,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->mb.rd_thresh_mult[i] = 128; } -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS init_mv_ref_counts(); #endif @@ -2060,7 +2060,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS print_context_counters(); print_tree_update_probs(); print_mode_context(); @@ -2242,7 +2242,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) } #endif -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS { int i, j, k; FILE *fmode = fopen("modecontext.c", "w"); diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index 3b5268b6135934a125a14c72f8fd489eef9a2510..11559a72083c3ebf9b9715bb9e0b3c2ee0c401e6 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -20,7 +20,7 @@ /* Global event counters used for accumulating statistics across several compressions, then generating context.c = initial stats. 
*/ -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; #endif void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ; @@ -413,7 +413,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) } -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS void init_context_counters(void) { diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h index c2d1438f9c2e88369f542d654164ab2f056ba821..1e6cea1146573bbd5b702c7e6d587dd03900c1cb 100644 --- a/vp8/encoder/tokenize.h +++ b/vp8/encoder/tokenize.h @@ -33,7 +33,7 @@ typedef struct int rd_cost_mby(MACROBLOCKD *); -#ifdef ENTROPY_STATS +#ifdef VP8_ENTROPY_STATS void init_context_counters(); void print_context_counters(); diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2_intrinsics.c similarity index 100% rename from vp8/encoder/x86/quantize_sse2.c rename to vp8/encoder/x86/quantize_sse2_intrinsics.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index b985cb1b7fa88def67d7fdef3e1e658e809b6701..4531d5ad089ec5dc1d966628c63b9d140e5bb9a1 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -684,6 +684,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = (1 + yv12->y_width) / 2; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index f3834b063d7410eae3aaa27d6d73d9985a83997a..90a175436dbf517a36d42243de85303deb02f54b 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -790,6 +790,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = yv12->y_width / 2; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 7d1904aaf7d15fb8a89b3a6e94302c24d2e70ec4..ca9f6a62e8727acf9765bc1a410b3cefe81767a0 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -89,12 +89,12 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm -VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c # TODO(johann) make this generic ifeq ($(HAVE_SSE2),yes) -vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2 -vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2 +vp8/encoder/x86/quantize_sse2_intrinsics.c.o: CFLAGS += -msse2 +vp8/encoder/x86/quantize_sse2_intrinsics.c.d: CFLAGS += -msse2 endif ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c index b02f3f083430cb17f27f8139ee0294f00cc16e32..79092cd0eb6020010328ae39c02c45ce4dbfcc78 100644 --- a/vp9/common/generic/vp9_systemdependent.c +++ b/vp9/common/generic/vp9_systemdependent.c @@ -11,8 +11,6 @@ #include "./vpx_config.h" #include "vp9_rtcd.h" -#include "vp9/common/vp9_subpixel.h" -#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" void vp9_machine_specific_config(VP9_COMMON *ctx) { diff --git a/vp9/common/ppc/vp9_idctllm_altivec.asm b/vp9/common/ppc/vp9_idct_altivec.asm similarity index 98% 
rename from vp9/common/ppc/vp9_idctllm_altivec.asm rename to vp9/common/ppc/vp9_idct_altivec.asm index 117d9cfc8e868ca1305172db4f5ed539f9e18acf..b87aa42001893f92b5c4dc47626b089b04fe5eaf 100644 --- a/vp9/common/ppc/vp9_idctllm_altivec.asm +++ b/vp9/common/ppc/vp9_idct_altivec.asm @@ -9,7 +9,7 @@ ; - .globl short_idct4x4llm_ppc + .globl short_idct4x4_ppc .macro load_c V, LABEL, OFF, R0, R1 lis \R0, \LABEL@ha @@ -21,7 +21,7 @@ ;# r4 short *output ;# r5 int pitch .align 2 -short_idct4x4llm_ppc: +short_idct4x4_ppc: mfspr r11, 256 ;# get old VRSAVE oris r12, r11, 0xfff8 mtspr 256, r12 ;# set VRSAVE diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c index 106a2b763e31a35b46f588cd1bdd54f03b6d0518..a6be550a1336b228c9f99c28a57308101efdcc54 100644 --- a/vp9/common/ppc/vp9_systemdependent.c +++ b/vp9/common/ppc/vp9_systemdependent.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_subpixel.h" #include "vp9/common/vp9_loopfilter.h" #include "recon.h" #include "vp9/common/vp9_onyxc_int.h" @@ -17,32 +16,28 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch); void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); -extern void (*vp9_post_proc_down_and_across)( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -); - -extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); -extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); - -extern void vp9_post_proc_down_and_across_c -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -); -void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); +extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, int cols, int flimit); + +extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, + int rows, int cols, int flimit); +extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, + int rows, int cols, int flimit); +extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, + int rows, int cols, int flimit); +extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, + int rows, int cols, int flimit); +extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, int cols, int flimit); +void vp9_plane_add_noise_c(unsigned char *start, + unsigned int width, unsigned int height, + int pitch, int q, int a); extern copy_mem_block_function *vp9_copy_mem16x16; extern copy_mem_block_function *vp9_copy_mem8x8; @@ -60,11 +55,14 @@ extern subpixel_predict_function bilinear_predict16x16_ppc; extern copy_mem_block_function copy_mem16x16_ppc; -void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char 
*dst_ptr, int stride); -void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); +void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); +void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); -extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); +extern void short_idct4x4_ppc(short *input, short *output, int pitch); // Generic C extern subpixel_predict_function vp9_sixtap_predict_c; @@ -80,12 +78,15 @@ extern copy_mem_block_function vp9_copy_mem16x16_c; extern copy_mem_block_function vp9_copy_mem8x8_c; extern copy_mem_block_function vp9_copy_mem8x4_c; -void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); +void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); +void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, + unsigned char *dst_ptr, int stride); -extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch); -extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch); +extern void vp9_short_idct4x4_c(short *input, short *output, int pitch); extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); // PPC @@ -140,8 +141,8 @@ void vp9_machine_specific_config(void) { vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc; vp9_sixtap_predict = sixtap_predict_ppc; - vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c; - vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_short_idct4x4_1 = vp9_short_idct4x4_1_c; + vp8_short_idct4x4 = short_idct4x4_ppc; vp8_dc_only_idct = vp8_dc_only_idct_c; vp8_lf_mbvfull = loop_filter_mbv_ppc; diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index a2306f0d10c86b28e7825106ef9dda52b5752dad..15c8c0d64222bf0ece61575b501375ff0dd8ceb1 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -67,20 +67,16 @@ void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) { int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { int i; + int aligned_width, aligned_height; vp9_de_alloc_frame_buffers(oci); /* our internal buffers are always multiples of 16 */ - if ((width & 0xf) != 0) - width += 16 - (width & 0xf); - - if ((height & 0xf) != 0) - height += 16 - (height & 0xf); - + aligned_width = (width + 15) & ~15; + aligned_height = (height + 15) & ~15; for (i = 0; i < NUM_YV12_BUFFERS; i++) { oci->fb_idx_ref_cnt[i] = 0; - oci->yv12_fb[i].flags = 0; if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP9BORDERINPIXELS) < 0) { vp9_de_alloc_frame_buffers(oci); @@ -88,15 +84,16 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { } } - oci->new_fb_idx = 0; - oci->lst_fb_idx = 1; - oci->gld_fb_idx = 2; - oci->alt_fb_idx = 3; + oci->new_fb_idx = NUM_YV12_BUFFERS - 1; + oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1; + + for (i = 0; i < 3; i++) + oci->active_ref_idx[i] = i; - oci->fb_idx_ref_cnt[0] = 1; - oci->fb_idx_ref_cnt[1] = 1; - oci->fb_idx_ref_cnt[2] = 1; - 
oci->fb_idx_ref_cnt[3] = 1; + for (i = 0; i < NUM_REF_FRAMES; i++) { + oci->ref_frame_map[i] = i; + oci->fb_idx_ref_cnt[i] = 1; + } if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP9BORDERINPIXELS) < 0) { @@ -110,8 +107,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { return 1; } - oci->mb_rows = height >> 4; - oci->mb_cols = width >> 4; + oci->mb_rows = aligned_height >> 4; + oci->mb_cols = aligned_width >> 4; oci->MBs = oci->mb_rows * oci->mb_cols; oci->mode_info_stride = oci->mb_cols + 1; oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); @@ -134,7 +131,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1; - oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + oci->above_context = + vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1); if (!oci->above_context) { vp9_de_alloc_frame_buffers(oci); @@ -146,6 +144,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { return 0; } + void vp9_setup_version(VP9_COMMON *cm) { if (cm->version & 0x4) { if (!CONFIG_EXPERIMENTAL) @@ -204,9 +203,6 @@ void vp9_create_common(VP9_COMMON *oci) { /* Initialise reference frame sign bias structure to defaults */ vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); - /* Default disable buffer to buffer copying */ - oci->copy_buffer_to_gf = 0; - oci->copy_buffer_to_arf = 0; oci->kf_ymode_probs_update = 0; } @@ -220,8 +216,4 @@ void vp9_initialize_common() { vp9_entropy_mode_init(); vp9_entropy_mv_init(); - -#if CONFIG_NEWCOEFCONTEXT - vp9_init_neighbors(); -#endif } diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 4ae8132bbb308bca8cae92397f7a8637ab7bdd3a..9151622d3884408e541cd1c49d02b5c326d723b8 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -12,15 +12,431 @@ #include "vp9/common/vp9_blockd.h" #include "vpx_mem/vpx_mem.h" -const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25] = { - {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8} +const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24] = { + { 0, 0, 0, 0, + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 4, 4, + 5, 5, + 6, 6, + 7, 7 }, + { 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 2, 2, 2, + 2, 2, 2, 2, + 4, 4, + 4, 4, + 6, 6, + 6, 6 }, + { 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }, }; -const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25] = { - {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8}, - {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}, - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8} +const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24] = { + { 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 4, 5, + 4, 5, + 6, 7, + 6, 7 }, + { 0, 0, 0, 0, + 2, 2, 2, 2, + 0, 0, 0, 0, + 2, 2, 2, 2, + 4, 4, + 4, 4, + 6, 6, + 6, 6 }, + { 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }, }; + +#define S(x) x + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT) +const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96] = { + { 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 
2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), + 4, 4, 4, 4, + 5, 5, 5, 5, + S(4), S(4), S(4), S(4), + S(5), S(5), S(5), S(5), + 6, 6, 6, 6, + 7, 7, 7, 7, + S(6), S(6), S(6), S(6), + S(7), S(7), S(7), S(7) }, + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + 4, 4, 4, 4, + 4, 4, 4, 4, + S(4), S(4), S(4), S(4), + S(4), S(4), S(4), S(4), + 6, 6, 6, 6, + 6, 6, 6, 6, + S(6), S(6), S(6), S(6), + S(6), S(6), S(6), S(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 6, 6, 6, 6, + 6, 6, 6, 6, + 6, 6, 6, 6, + 6, 6, 6, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, +}; +const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96] = { + { 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), + 4, 5, S(4), S(5), + 4, 5, S(4), S(5), + 4, 5, S(4), S(5), + 4, 5, S(4), S(5), + 6, 7, S(6), S(7), + 6, 7, S(6), S(7), + 6, 7, S(6), S(7), + 6, 7, S(6), S(7) }, + { 0, 0, 0, 0, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + 0, 0, 0, 0, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + 0, 0, 0, 0, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + 0, 0, 0, 0, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + 4, 4, 4, 4, + S(4), S(4), S(4), S(4), + 4, 4, 4, 4, + S(4), S(4), S(4), S(4), + 6, 6, 6, 6, + S(6), S(6), S(6), S(6), + 6, 6, 6, 6, + S(6), S(6), S(6), S(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 4, 4, 4, 4, + 6, 6, 6, 6, + 6, 6, 6, 6, + 6, 6, 6, 6, + 6, 6, 6, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +#define T(x) x + 2 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)) +#define U(x) x + 3 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)) +const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), + T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), + T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), + U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), + U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4), + S(5), S(5), S(5), S(5), S(5), S(5), S(5), S(5), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(5), T(5), T(5), T(5), T(5), T(5), T(5), T(5), + U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4), + U(5), U(5), U(5), U(5), U(5), U(5), U(5), U(5), + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, + S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6), + S(7), S(7), S(7), S(7), S(7), S(7), S(7), S(7), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(7), T(7), T(7), T(7), T(7), T(7), T(7), T(7), + U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6), + U(7), U(7), U(7), U(7), U(7), U(7), U(7), U(7) }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), + T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), + U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4), + S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + U(4), U(4), U(4), U(4), 
U(4), U(4), U(4), U(4), + U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4), + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6), + S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6), + U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 
6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6 }, +}; +const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384] = { + { 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 4, 5, S(4), S(5), T(4), T(5), U(4), U(5), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7), + 6, 7, S(6), S(7), T(6), T(7), U(6), U(7) }, + { 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2), + T(0), T(0), T(0), T(0), T(2), T(2), 
T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2), + 4, 4, 4, 4, S(4), S(4), S(4), S(4), + T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4), + 4, 4, 4, 4, S(4), S(4), S(4), S(4), + T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4), + 4, 4, 4, 4, S(4), S(4), S(4), S(4), + T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4), + 4, 4, 4, 4, S(4), S(4), S(4), S(4), + T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4), + 6, 6, 6, 6, S(6), S(6), S(6), S(6), + T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6), + 6, 6, 6, 6, S(6), S(6), S(6), S(6), + T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6), + 6, 6, 6, 6, S(6), S(6), S(6), S(6), + T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6), + 6, 6, 6, 6, S(6), S(6), S(6), S(6), + T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), + T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), + U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4), + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6), + T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6 }, +}; +#undef U +#undef T +#undef S diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index e838da2215e76f8fecc5252d11b519fc34ef0e87..23d0bfd593374c8a9970157d52aaaff244dfb1c5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -16,9 +16,9 @@ void vpx_log(const char *format, ...); #include "./vpx_config.h" #include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_subpixel.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" @@ -47,27 +47,13 @@ void vpx_log(const char *format, ...); #define MAX_MV_REFS 9 #define MAX_MV_REF_CANDIDATES 4 -#if CONFIG_DWTDCTHYBRID -#define DWT_MAX_LENGTH 64 -#define DWT_TYPE 26 // 26/53/97 -#define DWT_PRECISION_BITS 2 -#define DWT_PRECISION_RND ((1 << DWT_PRECISION_BITS) / 2) - -#define DWTDCT16X16 0 -#define DWTDCT16X16_LEAN 1 -#define DWTDCT8X8 2 -#define DWTDCT_TYPE DWTDCT16X16_LEAN -#endif - typedef struct { int r, c; } POS; -typedef enum PlaneType { - PLANE_TYPE_Y_NO_DC = 0, - PLANE_TYPE_Y2, - PLANE_TYPE_UV, +typedef enum { PLANE_TYPE_Y_WITH_DC, + PLANE_TYPE_UV, } PLANE_TYPE; typedef char ENTROPY_CONTEXT; @@ -75,10 +61,9 @@ typedef struct { ENTROPY_CONTEXT y1[4]; ENTROPY_CONTEXT u[2]; ENTROPY_CONTEXT v[2]; - ENTROPY_CONTEXT y2; } ENTROPY_CONTEXT_PLANES; -#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \ +#define VP9_COMBINEENTROPYCONTEXTS(Dest, A, B) \ Dest = ((A)!=0) + ((B)!=0); typedef enum { @@ -86,8 +71,7 @@ typedef enum { INTER_FRAME = 1 } FRAME_TYPE; -typedef enum -{ +typedef enum { #if CONFIG_ENABLE_6TAP SIXTAP, #endif @@ -98,8 +82,7 @@ typedef enum SWITCHABLE /* should be the last one */ } INTERPOLATIONFILTERTYPE; -typedef enum -{ +typedef enum { DC_PRED, /* average of above and left pixels */ V_PRED, /* vertical prediction */ H_PRED, /* horizontal prediction */ @@ -125,10 +108,8 @@ typedef enum { SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame - SEG_LVL_MODE = 3, // Optional Segment mode - SEG_LVL_EOB = 4, // EOB end stop marker. - SEG_LVL_TRANSFORM = 5, // Block transform size. - SEG_LVL_MAX = 6 // Number of MB level features supported + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of MB level features supported } SEG_LVL_FEATURES; // Segment level features. 
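+// Note: SEG_LVL_SKIP subsumes the old SEG_LVL_MODE/SEG_LVL_EOB pair: a
+// segment with this feature active is coded as (0,0) motion with no
+// residual. A hypothetical consumer (sketch only, assuming the
+// vp9_segfeature_active() helper from vp9_seg_common.h):
+//
+//   if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+//     mbmi->mode = ZEROMV;       // implied (0,0) motion vector
+//     mbmi->mb_skip_coeff = 1;   // no residual coded
+//   }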
@@ -155,10 +136,7 @@ typedef enum { #define VP9_MVREFS (1 + SPLITMV - NEARESTMV) -#if CONFIG_LOSSLESS -#define WHT_UPSCALE_FACTOR 3 -#define Y2_WHT_UPSCALE_FACTOR 2 -#endif +#define WHT_UPSCALE_FACTOR 2 typedef enum { B_DC_PRED, /* average of above and left pixels */ @@ -219,10 +197,7 @@ union b_mode_info { B_PREDICTION_MODE context; #endif } as_mode; - struct { - int_mv first; - int_mv second; - } as_mv; + int_mv as_mv[2]; // first, second inter predictor motion vectors }; typedef enum { @@ -274,6 +249,9 @@ typedef struct { INTERPOLATIONFILTERTYPE interp_filter; BLOCK_SIZE_TYPE sb_type; +#if CONFIG_CODE_NONZEROCOUNT + uint16_t nzcs[256+64*2]; +#endif } MB_MODE_INFO; typedef struct { @@ -298,36 +276,44 @@ typedef struct blockd { int dst; int dst_stride; - int eob; - union b_mode_info bmi; } BLOCKD; -typedef struct superblockd { - /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */ - DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]); - DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]); -} SUPERBLOCKD; +struct scale_factors { + int x_num; + int x_den; + int x_offset_q4; + int x_step_q4; + int y_num; + int y_den; + int y_offset_q4; + int y_step_q4; +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT + convolve_fn_t predict[2][2][8]; // horiz, vert, weight (0 - 7) +#else + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +#endif +}; typedef struct macroblockd { - DECLARE_ALIGNED(16, int16_t, diff[400]); /* from idct diff */ - DECLARE_ALIGNED(16, uint8_t, predictor[384]); - DECLARE_ALIGNED(16, int16_t, qcoeff[400]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[400]); - DECLARE_ALIGNED(16, uint16_t, eobs[25]); - - SUPERBLOCKD sb_coeff_data; + DECLARE_ALIGNED(16, int16_t, diff[64*64+32*32*2]); /* from idct diff */ + DECLARE_ALIGNED(16, uint8_t, predictor[384]); // unused for superblocks + DECLARE_ALIGNED(16, int16_t, qcoeff[64*64+32*32*2]); + DECLARE_ALIGNED(16, int16_t, dqcoeff[64*64+32*32*2]); + DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]); +#if CONFIG_CODE_NONZEROCOUNT + DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]); +#endif - /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */ - BLOCKD block[25]; + /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */ + BLOCKD block[24]; int fullpixel_mask; YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ - struct { - uint8_t *y_buffer, *u_buffer, *v_buffer; - } second_pre; + YV12_BUFFER_CONFIG second_pre; YV12_BUFFER_CONFIG dst; + struct scale_factors scale_factor[2]; + struct scale_factors scale_factor_uv[2]; MODE_INFO *prev_mode_info_context; MODE_INFO *mode_info_context; @@ -337,8 +323,9 @@ typedef struct macroblockd { int up_available; int left_available; + int right_available; - /* Y,U,V,Y2 */ + /* Y,U,V */ ENTROPY_CONTEXT_PLANES *above_context; ENTROPY_CONTEXT_PLANES *left_context; @@ -359,6 +346,7 @@ typedef struct macroblockd { // Probability Tree used to code Segment number vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; + vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS]; #if CONFIG_NEW_MVREF vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1]; @@ -387,21 +375,20 @@ typedef struct macroblockd { unsigned int frames_since_golden; unsigned int frames_till_alt_ref_frame; + int lossless; /* Inverse transform function pointers. 
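 * (Presumably switched via the new lossless flag above; judging by their
 * dq/pred/eob arguments, the itxm_add hooks below also dequantize and add
 * the inverse-transformed residual into the prediction.)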
*/ - void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch); - void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch); - void (*inv_walsh4x4_1)(int16_t *in, int16_t *out); - void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out); - - - vp9_subpix_fn_t subpixel_predict4x4; - vp9_subpix_fn_t subpixel_predict8x4; - vp9_subpix_fn_t subpixel_predict8x8; - vp9_subpix_fn_t subpixel_predict16x16; - vp9_subpix_fn_t subpixel_predict_avg4x4; - vp9_subpix_fn_t subpixel_predict_avg8x4; - vp9_subpix_fn_t subpixel_predict_avg8x8; - vp9_subpix_fn_t subpixel_predict_avg16x16; + void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch); + void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*itxm_add)(int16_t *input, const int16_t *dq, + uint8_t *pred, uint8_t *output, int pitch, int stride, int eob); + void (*itxm_add_y_block)(int16_t *q, const int16_t *dq, + uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd); + void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq, + uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride, + struct macroblockd *xd); + + struct subpix_fn_table subpix; + int allow_high_precision_mv; int corrupted; @@ -412,74 +399,46 @@ typedef struct macroblockd { } MACROBLOCKD; -#define ACTIVE_HT 110 // quantization stepsize threshold +#define ACTIVE_HT 110 // quantization stepsize threshold -#define ACTIVE_HT8 300 +#define ACTIVE_HT8 300 #define ACTIVE_HT16 300 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { - B_PREDICTION_MODE b_mode; switch (mode) { - case DC_PRED: - b_mode = B_DC_PRED; - break; - case V_PRED: - b_mode = B_VE_PRED; - break; - case H_PRED: - b_mode = B_HE_PRED; - break; - case TM_PRED: - b_mode = B_TM_PRED; - break; - case D45_PRED: - b_mode = B_LD_PRED; - break; - case D135_PRED: - b_mode = B_RD_PRED; - break; - case D117_PRED: - b_mode = B_VR_PRED; - break; - case D153_PRED: - b_mode = B_HD_PRED; - break; - case D27_PRED: - b_mode = B_HU_PRED; - break; - case D63_PRED: - b_mode = B_VL_PRED; - break; - default : - // for debug purpose, to be removed after full testing - assert(0); - break; + case DC_PRED: return B_DC_PRED; + case V_PRED: return B_VE_PRED; + case H_PRED: return B_HE_PRED; + case TM_PRED: return B_TM_PRED; + case D45_PRED: return B_LD_PRED; + case D135_PRED: return B_RD_PRED; + case D117_PRED: return B_VR_PRED; + case D153_PRED: return B_HD_PRED; + case D27_PRED: return B_HU_PRED; + case D63_PRED: return B_VL_PRED; + default: + assert(0); + return B_MODE_COUNT; // Dummy value } - return b_mode; } // transform mapping static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { - // map transform type - TX_TYPE tx_type; switch (bmode) { case B_TM_PRED : case B_RD_PRED : - tx_type = ADST_ADST; - break; + return ADST_ADST; case B_VE_PRED : case B_VR_PRED : - tx_type = ADST_DCT; - break; + return ADST_DCT; case B_HE_PRED : case B_HD_PRED : case B_HU_PRED : - tx_type = DCT_ADST; - break; + return DCT_ADST; #if CONFIG_NEWBINTRAMODES case B_CONTEXT_PRED: @@ -487,33 +446,41 @@ static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { break; #endif - default : - tx_type = DCT_DCT; - break; + default: + return DCT_DCT; } - return tx_type; } -extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25]; -extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25]; +extern const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24]; +extern const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24]; +extern const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96]; +extern 
const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96]; +extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384]; +extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384]; -#define USE_ADST_FOR_I16X16_8X8 0 -#define USE_ADST_FOR_I16X16_4X4 0 +#define USE_ADST_FOR_I16X16_8X8 1 +#define USE_ADST_FOR_I16X16_4X4 1 #define USE_ADST_FOR_I8X8_4X4 1 #define USE_ADST_PERIPHERY_ONLY 1 +#define USE_ADST_FOR_SB 1 +#define USE_ADST_FOR_REMOTE_EDGE 0 -static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) { // TODO(debargha): explore different patterns for ADST usage when blocksize // is smaller than the prediction size TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) return tx_type; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) +#endif + if (ib >= (16 << (2 * sb_type))) // no chroma adst return tx_type; + if (xd->lossless) + return DCT_DCT; if (xd->mode_info_context->mbmi.mode == B_PRED && xd->q_index < ACTIVE_HT) { + const BLOCKD *b = &xd->block[ib]; tx_type = txfm_map( #if CONFIG_NEWBINTRAMODES b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context : @@ -521,16 +488,32 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { b->bmi.as_mode.first); } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED && xd->q_index < ACTIVE_HT) { + const BLOCKD *b = &xd->block[ib]; + const int ic = (ib & 10); #if USE_ADST_FOR_I8X8_4X4 #if USE_ADST_PERIPHERY_ONLY // Use ADST for periphery blocks only - int ic = (ib & 10); + const int inner = ib & 5; b += ic - ib; - tx_type = (ic != 10) ? - txfm_map(pred_mode_conv((MB_PREDICTION_MODE)b->bmi.as_mode.first)) : - DCT_DCT; + tx_type = txfm_map(pred_mode_conv( + (MB_PREDICTION_MODE)b->bmi.as_mode.first)); +#if USE_ADST_FOR_REMOTE_EDGE + if (inner == 5) + tx_type = DCT_DCT; +#else + if (inner == 1) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (inner == 4) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (inner == 5) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST + b += ic - ib; tx_type = txfm_map(pred_mode_conv( (MB_PREDICTION_MODE)b->bmi.as_mode.first)); #endif @@ -542,9 +525,22 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { xd->q_index < ACTIVE_HT) { #if USE_ADST_FOR_I16X16_4X4 #if USE_ADST_PERIPHERY_ONLY - // Use ADST for periphery blocks only - tx_type = (ib < 4 || ((ib & 3) == 0)) ? 
- txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT; + const int hmax = 4 << sb_type; + tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); @@ -557,29 +553,44 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { return tx_type; } -static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) { // TODO(debargha): explore different patterns for ADST usage when blocksize // is smaller than the prediction size TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) return tx_type; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) +#endif + if (ib >= (16 << (2 * sb_type))) // no chroma adst return tx_type; if (xd->mode_info_context->mbmi.mode == I8X8_PRED && xd->q_index < ACTIVE_HT8) { + const BLOCKD *b = &xd->block[ib]; // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged // or the relationship otherwise modified to address this type conversion. tx_type = txfm_map(pred_mode_conv( (MB_PREDICTION_MODE)b->bmi.as_mode.first)); } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED && xd->q_index < ACTIVE_HT8) { -#if USE_ADST_FOR_I8X8_4X4 +#if USE_ADST_FOR_I16X16_8X8 #if USE_ADST_PERIPHERY_ONLY - // Use ADST for periphery blocks only - tx_type = (ib != 10) ? 
- txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT; + const int hmax = 4 << sb_type; + tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); @@ -592,63 +603,73 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { return tx_type; } -static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) { TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) return tx_type; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) +#endif + if (ib >= (16 << (2 * sb_type))) return tx_type; if (xd->mode_info_context->mbmi.mode < I8X8_PRED && xd->q_index < ACTIVE_HT16) { tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); +#if USE_ADST_PERIPHERY_ONLY + if (sb_type) { + const int hmax = 4 << sb_type; +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif + } +#endif } return tx_type; } -static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) { - TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) - return tx_type; - if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) { - tx_type = get_tx_type_16x16(xd, b); - } - if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { - ib = (ib & 8) + ((ib & 4) >> 1); - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); - } - if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { - tx_type = get_tx_type_4x4(xd, b); - } - return tx_type; -} - -static int get_2nd_order_usage(const MACROBLOCKD *xd) { - int has_2nd_order = (xd->mode_info_context->mbmi.mode != SPLITMV && - xd->mode_info_context->mbmi.mode != I8X8_PRED && - xd->mode_info_context->mbmi.mode != B_PRED && - xd->mode_info_context->mbmi.txfm_size != TX_16X16); - if (has_2nd_order) - has_2nd_order = (get_tx_type(xd, xd->block) == DCT_DCT); - return has_2nd_order; -} - -extern void vp9_build_block_doffsets(MACROBLOCKD *xd); -extern void vp9_setup_block_dptrs(MACROBLOCKD *xd); +void vp9_build_block_doffsets(MACROBLOCKD *xd); +void vp9_setup_block_dptrs(MACROBLOCKD *xd); static void update_blockd_bmi(MACROBLOCKD *xd) { - int i; - int is_4x4; - is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) || - (xd->mode_info_context->mbmi.mode == I8X8_PRED) || - (xd->mode_info_context->mbmi.mode == B_PRED); - - if (is_4x4) { - for (i = 0; i < 16; i++) { + const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode; + + if (mode == SPLITMV || mode == I8X8_PRED || mode == 
B_PRED) { + int i; + for (i = 0; i < 16; i++) xd->block[i].bmi = xd->mode_info_context->bmi[i]; - } } } + +static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) { + TX_SIZE tx_size_uv; + if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + tx_size_uv = xd->mode_info_context->mbmi.txfm_size; + } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) { + if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) + tx_size_uv = TX_16X16; + else + tx_size_uv = xd->mode_info_context->mbmi.txfm_size; + } else { + if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) + tx_size_uv = TX_8X8; + else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 && + (xd->mode_info_context->mbmi.mode == I8X8_PRED || + xd->mode_info_context->mbmi.mode == SPLITMV)) + tx_size_uv = TX_4X4; + else + tx_size_uv = xd->mode_info_context->mbmi.txfm_size; + } + return tx_size_uv; +} #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h index ee250e0489fbea699f57fbb1c2b627f1667b4988..b4d892df9afd5b2fb63a5dbde657ac1a782d49b2 100644 --- a/vp9/common/vp9_coefupdateprobs.h +++ b/vp9/common/vp9_coefupdateprobs.h @@ -9,12 +9,25 @@ */ #ifndef VP9_COMMON_VP9_COEFUPDATEPROBS_H_ -#define VP9_COMMON_VP9_COEFUPDATEPROBS_H__ +#define VP9_COMMON_VP9_COEFUPDATEPROBS_H_ /* Update probabilities for the nodes in the token entropy tree. Generated file included by vp9_entropy.c */ -#define COEF_UPDATE_PROB 252 -#define COEF_UPDATE_PROB_8X8 252 -#define COEF_UPDATE_PROB_16X16 252 + +static const vp9_prob vp9_coef_update_prob[ENTROPY_NODES] = { + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252 +}; + +#if CONFIG_CODE_NONZEROCOUNT +#define NZC_UPDATE_PROB_4X4 252 +#define NZC_UPDATE_PROB_8X8 252 +#define NZC_UPDATE_PROB_16X16 252 +#define NZC_UPDATE_PROB_32X32 252 +#define NZC_UPDATE_PROB_PCAT 252 +#endif + +#if CONFIG_MODELCOEFPROB +#define COEF_MODEL_UPDATE_PROB 16 +#endif #endif // VP9_COMMON_VP9_COEFUPDATEPROBS_H__ diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 2e1ee4b1a92b06b3fb0ec24b80c60f2d89e45892..f72d25e7f4c86d6ec56312e75caa20b63eb2725f 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -11,10 +11,11 @@ #ifndef VP9_COMMON_VP9_COMMON_H_ #define VP9_COMMON_VP9_COMMON_H_ -#include <assert.h> -#include "vpx_config.h" /* Interface header for common constant data structures and lookup tables */ +#include <assert.h> + +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" @@ -24,26 +25,34 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) -/* Only need this for fixed-size arrays, for structs just assign. */ - -#define vp9_copy(Dest, Src) { \ - assert(sizeof(Dest) == sizeof(Src)); \ - vpx_memcpy(Dest, Src, sizeof(Src)); \ - } +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) -/* Use this for variably-sized arrays. */ +/* If we don't want to use ROUND_POWER_OF_TWO macro +static INLINE int16_t round_power_of_two(int16_t value, int n) { + return (value + (1 << (n - 1))) >> n; +}*/ -#define vp9_copy_array(Dest, Src, N) { \ - assert(sizeof(*Dest) == sizeof(*Src)); \ - vpx_memcpy(Dest, Src, N * sizeof(*Src)); \ +// Only need this for fixed-size arrays, for structs just assign. +#define vp9_copy(dest, src) { \ + assert(sizeof(dest) == sizeof(src)); \ + vpx_memcpy(dest, src, sizeof(src)); \ } -#define vp9_zero(Dest) vpx_memset(&Dest, 0, sizeof(Dest)); +// Use this for variably-sized arrays. 
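+// Illustrative use (not from the source), assuming int16_t buffers:
+//   int16_t dst[64], src[64];
+//   vp9_copy_array(dst, src, 64);  // n elements; element size asserted
+// For whole fixed-size arrays, vp9_copy(dst, src) copies sizeof(src) bytes.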
+#define vp9_copy_array(dest, src, n) { \ + assert(sizeof(*dest) == sizeof(*src)); \ + vpx_memcpy(dest, src, n * sizeof(*src)); \ + } -#define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest)); +#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)); +#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)); -static __inline uint8_t clip_pixel(int val) { +static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255u : (val < 0) ? 0u : val; } +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c new file mode 100644 index 0000000000000000000000000000000000000000..3ab8bec7a162d0b16c107dbf1b5dac9a67ed85bb --- /dev/null +++ b/vp9/common/vp9_convolve.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/common/vp9_convolve.h" + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define VP9_FILTER_WEIGHT 128 +#define VP9_FILTER_SHIFT 7 + +/* Assume a bank of 16 filters to choose from. There are two implementations + * for filter wrapping behavior, since we want to be able to pick which filter + * to start with. We could either: + * + * 1) make filter_ a pointer to the base of the filter array, and then add an + * additional offset parameter, to choose the starting filter. + * 2) use a pointer to 2 periods worth of filters, so that even if the original + * phase offset is at 15/16, we'll have valid data to read. The filter + * tables become [32][8], and the second half is duplicated. + * 3) fix the alignment of the filter tables, so that we know the 0/16 is + * always 256 byte aligned. + * + * Implementations 2 and 3 are likely preferable, as they avoid an extra 2 + * parameters, and switching between them is trivial, with the + * ALIGN_FILTERS_256 macro, below. 
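+ *
+ * With option 3 (used here, since ALIGN_FILTERS_256 is 1), masking a filter
+ * pointer with ~(intptr_t)0xff recovers the 256-byte-aligned base of the
+ * bank, so the starting phase falls out as (filter - base) / taps with no
+ * extra parameters; the functions below compute it as x0_q4 / y0_q4.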
+ */
+#define ALIGN_FILTERS_256 1
+
+static void convolve_horiz_c(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             const int16_t *filter_x0, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h, int taps) {
+  int x, y, k, sum;
+  const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source line */
+  src -= taps / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    /* Pointer to filter to use */
+    const int16_t *filter_x = filter_x0;
+
+    /* Initial phase offset */
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
+
+    for (x = 0; x < w; ++x) {
+      /* Per-pixel src offset */
+      int src_x = (x_q4 - x0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[src_x + k] * filter_x[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+
+      /* Adjust source and filter to use for the next pixel */
+      x_q4 += x_step_q4;
+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int16_t *filter_x0, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int taps) {
+  int x, y, k, sum;
+  const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source line */
+  src -= taps / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    /* Pointer to filter to use */
+    const int16_t *filter_x = filter_x0;
+
+    /* Initial phase offset */
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
+
+    for (x = 0; x < w; ++x) {
+      /* Per-pixel src offset */
+      int src_x = (x_q4 - x0_q4) >> 4;
+
+      for (sum = 0, k = 0; k < taps; ++k) {
+        sum += src[src_x + k] * filter_x[k];
+      }
+      sum += (VP9_FILTER_WEIGHT >> 1);
+      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
+
+      /* Adjust source and filter to use for the next pixel */
+      x_q4 += x_step_q4;
+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
+
+static inline uint8_t combine_qtr(uint8_t a, uint8_t b) {
+  return (((a) + (b) * 3 + 2) >> 2);
+}
+
+static inline uint8_t combine_3qtr(uint8_t a, uint8_t b) {
+  return (((a) * 3 + (b) + 2) >> 2);
+}
+
+static inline uint8_t combine_1by8(uint8_t a, uint8_t b) {
+  return (((a) * 1 + (b) * 7 + 4) >> 3);
+}
+
+static inline uint8_t combine_3by8(uint8_t a, uint8_t b) {
+  return (((a) * 3 + (b) * 5 + 4) >> 3);
+}
+
+static inline uint8_t combine_5by8(uint8_t a, uint8_t b) {
+  return (((a) * 5 + (b) * 3 + 4) >> 3);
+}
+
+static inline uint8_t combine_7by8(uint8_t a, uint8_t b) {
+  return (((a) * 7 + (b) * 1 + 4) >> 3);
+}
+
+// TODO(debargha): Implement with a separate weight parameter
+static void convolve_wtd_horiz_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int16_t *filter_x0, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int taps,
+                                 uint8_t (*combine)(uint8_t a, uint8_t b)) {
+  int x, y, k, sum;
+  const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+  /* Adjust base pointer address for this source line */
+  src -= taps / 2 - 1;
+
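+  /* The stepping below matches convolve_horiz_c, except that the filtered
+   * result is blended into dst via the caller-supplied combine(). A sketch
+   * of the q4 walk, assuming x0_q4 == 0 and x_step_q4 == 20 (i.e. 1.25
+   * source pixels per output pixel):
+   *   x_q4:   0, 20, 40, 60, ...
+   *   src_x:  0,  1,  2,  3, ...   ((x_q4 - x0_q4) >> 4)
+   *   phase:  0,  4,  8, 12, ...   (x_q4 & 0xf; times taps indexes the bank)
+   */
+ 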
for (y = 0; y < h; ++y) { + /* Pointer to filter to use */ + const int16_t *filter_x = filter_x0; + + /* Initial phase offset */ + int x0_q4 = (filter_x - filter_x_base) / taps; + int x_q4 = x0_q4; + + for (x = 0; x < w; ++x) { + /* Per-pixel src offset */ + int src_x = (x_q4 - x0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[src_x + k] * filter_x[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[x] = combine(dst[x], clip_pixel(sum >> VP9_FILTER_SHIFT)); + + /* Adjust source and filter to use for the next pixel */ + x_q4 += x_step_q4; + filter_x = filter_x_base + (x_q4 & 0xf) * taps; + } + src += src_stride; + dst += dst_stride; + } +} + +#endif + +static void convolve_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y0, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + + const int16_t *filter_y_base = filter_y0; + +#if ALIGN_FILTERS_256 + filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source column */ + src -= src_stride * (taps / 2 - 1); + for (x = 0; x < w; ++x) { + /* Pointer to filter to use */ + const int16_t *filter_y = filter_y0; + + /* Initial phase offset */ + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; + + for (y = 0; y < h; ++y) { + /* Per-pixel src offset */ + int src_y = (y_q4 - y0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[(src_y + k) * src_stride] * filter_y[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT); + + /* Adjust source and filter to use for the next pixel */ + y_q4 += y_step_q4; + filter_y = filter_y_base + (y_q4 & 0xf) * taps; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y0, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + + const int16_t *filter_y_base = filter_y0; + +#if ALIGN_FILTERS_256 + filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source column */ + src -= src_stride * (taps / 2 - 1); + for (x = 0; x < w; ++x) { + /* Pointer to filter to use */ + const int16_t *filter_y = filter_y0; + + /* Initial phase offset */ + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; + + for (y = 0; y < h; ++y) { + /* Per-pixel src offset */ + int src_y = (y_q4 - y0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[(src_y + k) * src_stride] * filter_y[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[y * dst_stride] = + (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; + + /* Adjust source and filter to use for the next pixel */ + y_q4 += y_step_q4; + filter_y = filter_y_base + (y_q4 & 0xf) * taps; + } + ++src; + ++dst; + } +} + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +static void convolve_wtd_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y0, int y_step_q4, + int w, int h, int taps, + uint8_t (*combine)(uint8_t a, uint8_t b)) { + int x, y, k, sum; + + const int16_t *filter_y_base = filter_y0; + +#if ALIGN_FILTERS_256 + filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source column */ + src -= src_stride * 
(taps / 2 - 1); + for (x = 0; x < w; ++x) { + /* Pointer to filter to use */ + const int16_t *filter_y = filter_y0; + + /* Initial phase offset */ + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; + + for (y = 0; y < h; ++y) { + /* Per-pixel src offset */ + int src_y = (y_q4 - y0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[(src_y + k) * src_stride] * filter_y[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[y * dst_stride] = combine(dst[y * dst_stride], + clip_pixel(sum >> VP9_FILTER_SHIFT)); + + /* Adjust source and filter to use for the next pixel */ + y_q4 += y_step_q4; + filter_y = filter_y_base + (y_q4 & 0xf) * taps; + } + ++src; + ++dst; + } +} +#endif + +static void convolve_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + /* Fixed size intermediate buffer places limits on parameters. + * Maximum intermediate_height is 39, for y_step_q4 == 32, + * h == 16, taps == 8. + */ + uint8_t temp[16 * 39]; + int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + + assert(w <= 16); + assert(h <= 16); + assert(taps <= 8); + assert(y_step_q4 <= 32); + + if (intermediate_height < h) + intermediate_height = h; + + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, + temp, 16, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height, taps); + convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, taps); +} + +static void convolve_avg_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + /* Fixed size intermediate buffer places limits on parameters. + * Maximum intermediate_height is 39, for y_step_q4 == 32, + * h == 16, taps == 8. 
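+ * (That is: intermediate_height = ((h * y_step_q4) >> 4) + taps - 1
+ *  = ((16 * 32) >> 4) + 8 - 1 = 39, hence the 16 * 39 byte temp[].)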
+ */ + uint8_t temp[16 * 39]; + int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + + assert(w <= 16); + assert(h <= 16); + assert(taps <= 8); + assert(y_step_q4 <= 32); + + if (intermediate_height < h) + intermediate_height = h; + + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, + temp, 16, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height, taps); + convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, taps); +} + +void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +void vp9_convolve8_1by8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_1by8); +} + +void vp9_convolve8_qtr_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_qtr); +} + +void vp9_convolve8_3by8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_3by8); +} + +void vp9_convolve8_5by8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_5by8); +} + +void vp9_convolve8_3qtr_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_3qtr); +} + +void vp9_convolve8_7by8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_7by8); +} +#endif + +void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void 
vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +void vp9_convolve8_1by8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_1by8); +} + +void vp9_convolve8_qtr_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_qtr); +} + +void vp9_convolve8_3by8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_3by8); +} + +void vp9_convolve8_5by8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_5by8); +} + +void vp9_convolve8_3qtr_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_3qtr); +} + +void vp9_convolve8_7by8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_wtd_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8, combine_7by8); +} +#endif + +void vp9_convolve8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_avg(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! 
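(vp9_convolve_avg ignores its filter arguments entirely) 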
*/ + w, h); +} + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +void vp9_convolve8_1by8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_1by8(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); +} + +void vp9_convolve8_qtr_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_qtr(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); +} + +void vp9_convolve8_3by8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_3by8(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); +} + +void vp9_convolve8_5by8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_5by8(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); +} + +void vp9_convolve8_3qtr_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_3qtr(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); +} + +void vp9_convolve8_7by8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. 
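(w, h <= 16 here, matching the 16x16 temp buffer) 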
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
+ assert(w <= 16);
+ assert(h <= 16);
+
+ vp9_convolve8(src, src_stride,
+ temp, 16,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ vp9_convolve_7by8(temp, 16,
+ dst, dst_stride,
+ NULL, 0, /* These unused parameters should be removed! */
+ NULL, 0, /* These unused parameters should be removed! */
+ w, h);
+}
+#endif
+
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ if (w == 16 && h == 16) {
+ vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
+ } else if (w == 8 && h == 8) {
+ vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
+ } else if (w == 8 && h == 4) {
+ vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
+ } else {
+ int r;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
+void vp9_convolve_1by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_1by8(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_qtr(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_qtr(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_3by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_3by8(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_5by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_5by8(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_3qtr(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_7by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = combine_7by8(dst[x], src[x]);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+#endif
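The combine_1by8 through combine_7by8 helpers used by the loops above are not shown in this hunk. A plausible sketch of their shape, assuming fixed-point round-to-nearest blends whose (dst, src) weights match the names in the header below; the exact rounding in the real helpers may differ, and INLINE is the keyword macro that configure writes into vpx_config.h:

/* Hedged sketch of the combine helpers (not part of this patch).
 * First argument is the dst pixel, second is the filtered src pixel. */
static INLINE uint8_t combine_1by8(uint8_t a, uint8_t b) {
  return (uint8_t)((1 * a + 7 * b + 4) >> 3);  /* (1/8, 7/8) */
}
static INLINE uint8_t combine_qtr(uint8_t a, uint8_t b) {
  return (uint8_t)((1 * a + 3 * b + 2) >> 2);  /* (1/4, 3/4) */
}
static INLINE uint8_t combine_3by8(uint8_t a, uint8_t b) {
  return (uint8_t)((3 * a + 5 * b + 4) >> 3);  /* (3/8, 5/8) */
}
static INLINE uint8_t combine_5by8(uint8_t a, uint8_t b) {
  return (uint8_t)((5 * a + 3 * b + 4) >> 3);  /* (5/8, 3/8) */
}
static INLINE uint8_t combine_3qtr(uint8_t a, uint8_t b) {
  return (uint8_t)((3 * a + 1 * b + 2) >> 2);  /* (3/4, 1/4) */
}
static INLINE uint8_t combine_7by8(uint8_t a, uint8_t b) {
  return (uint8_t)((7 * a + 1 * b + 4) >> 3);  /* (7/8, 1/8) */
}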
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
new file mode 100644
index 0000000000000000000000000000000000000000..bef2d85641783ef1a14c4cccd493bb23d8033a19
--- /dev/null
+++ b/vp9/common/vp9_convolve.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_CONVOLVE_H_
+#define VP9_COMMON_CONVOLVE_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a block copy conforming to the convolution prototype
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a block average conforming to the convolution prototype
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
+// Not a convolution, a weighted (1/8, 7/8) block average for (dst, src)
+void vp9_convolve_1by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a weighted (1/4, 3/4) block average for (dst, src)
+void vp9_convolve_qtr(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a weighted (3/8, 5/8) block average for (dst, src)
+void vp9_convolve_3by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a weighted (5/8, 3/8) block average for (dst, src)
+void vp9_convolve_5by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a weighted (3/4, 1/4) block average for (dst, src)
+void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a weighted (7/8, 1/8) block average for (dst, src)
+void vp9_convolve_7by8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+#endif
+
+struct subpix_fn_table {
+ const int16_t (*filter_x)[8];
+ const int16_t (*filter_y)[8];
+};
+
+#endif  // VP9_COMMON_CONVOLVE_H_
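Because vp9_convolve_copy, vp9_convolve_avg and the weighted averages all conform to the convolve_fn_t prototype, a caller can select a predictor once and invoke every variant uniformly. A short sketch under that assumption; predict_block is illustrative only, not part of the patch:

#include "vp9/common/vp9_convolve.h"

/* Hedged sketch: dispatch through convolve_fn_t so copy, average and the
 * weighted blends are interchangeable at the call site. */
static void predict_block(convolve_fn_t predict,
                          const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride,
                          const struct subpix_fn_table *subpix,
                          int w, int h) {
  /* Filter taps are ignored by vp9_convolve_copy/vp9_convolve_avg but are
   * harmless to pass, so every predictor takes the same argument list. */
  predict(src, src_stride, dst, dst_stride,
          subpix->filter_x[0], 16, subpix->filter_y[0], 16, w, h);
}

For example, passing vp9_convolve_avg averages the source block into dst, while passing a filtering function of the same signature performs the full 8-tap prediction.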
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 5ea7736b74047dd3127d48ba14cc903bcf73ef67..c3fffc63251216d0076b49e2ae1cc0f69ca72776 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -9,6 +9,7 @@
 */
 #include <stdio.h>
+
 #include "vp9/common/vp9_blockd.h"
 
 void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
@@ -18,8 +19,7 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
 int mb_index = 0;
 FILE *mvs = fopen("mvs.stt", "a");
 
- /* print out the macroblock Y modes */
- mb_index = 0;
+ // Print out the macroblock Y modes
 fprintf(mvs, "Mb Modes for Frame %d\n", frame);
 
 for (mb_row = 0; mb_row < rows; mb_row++) {
@@ -129,8 +129,8 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
 mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
 bindex = (b_row & 3) * 4 + (b_col & 3);
 fprintf(mvs, "%3d:%-3d ",
- mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
- mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+ mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,
+ mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);
 }
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
index 10d3c389f16c2cc03d32574d8653d156a37c4590..c9be8b2299adcd61d75cd777f030869a5dd81f1e 100644
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -11,1201 +11,987 @@
 /*Generated file, included by vp9_entropy.c*/
-
-static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES_4X4] = {
+// NOTE: When the CONFIG_MODELCOEFPROB experiment is on, only the first
+// 2 or 3 entries from each row are actually used, depending on whether
+// UNCONSTRAINED_NODES is 2 or 3. If this experiment is merged,
+// the tables below should be shortened accordingly.
+static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES] = {
 { /* block Type 0 */
- { /* Coeff Band 0 */
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
- }, { /* Coeff Band 1 */
- { 224, 180, 254, 255, 234, 224, 255, 227, 128, 128, 128 },
- { 187, 178, 250, 255, 226, 218, 255, 229, 255, 255, 128 },
- { 145, 171, 243, 253, 219, 211, 254, 226, 255, 224, 128 },
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
- }, { /* Coeff Band 2 */
- { 1, 187, 252, 255, 231, 220, 255, 229, 255, 255, 128 },
- { 129, 174, 244, 254, 225, 216, 253, 219, 255, 255, 128 },
- { 16, 131, 193, 251, 205, 205, 254, 222, 255, 255, 128 },
- { 2, 93, 136, 236, 159, 179, 255, 197, 128, 128, 128 }
- }, { /* Coeff Band 3 */
- { 1, 188, 254, 255, 241, 236, 254, 220, 255, 255, 128 },
- { 133, 165, 249, 255, 236, 220, 252, 220, 255, 255, 128 },
- { 20, 112, 203, 254, 217, 214, 255, 224, 255, 255, 128 },
- { 4, 61, 106, 240, 155, 189, 252, 202, 255, 255, 128 }
- }, { /* Coeff Band 4 */
- { 1, 168, 252, 255, 239, 228, 253, 217, 255, 255, 128 },
- { 158, 163, 247, 255, 231, 221, 255, 242, 128, 128, 128 },
- { 23, 127, 205, 253, 212, 224, 255, 234, 255, 255, 128 },
- { 2, 83, 141, 237, 176, 210, 245, 207, 255, 255, 128 }
- }, { /* Coeff Band 5 */
- { 1, 233, 254, 255, 243, 241, 255, 213, 128, 128, 128 },
- { 155, 213, 253, 255, 240, 221, 216, 112, 255, 255, 128 },
- { 41, 159, 237, 254, 229, 216, 255, 161, 128, 128, 128 },
- { 11, 95, 176, 244, 194, 191, 255, 167, 128, 128, 128 }
- }, { /* Coeff Band 6 */
- { 1, 160, 253, 255, 238, 231, 255, 230, 255, 255, 128 },
- { 174, 152, 248, 255, 230, 223, 255, 223, 255, 255, 128 },
- { 86, 125, 213, 253, 207, 207, 254, 224, 255, 171, 128 },
- { 39, 89, 156, 240, 168, 190, 251, 181,
255, 255, 128 } - }, { /* Coeff Band 7 */ - { 1, 101, 255, 255, 243, 244, 255, 255, 128, 128, 128 }, - { 230, 66, 255, 255, 238, 238, 128, 128, 128, 128, 128 }, - { 151, 92, 229, 255, 224, 197, 128, 128, 128, 128, 128 }, - { 109, 57, 171, 255, 73, 255, 128, 128, 128, 128, 128 } + { /* Intra */ + { /* Coeff Band 0 */ + { 208, 32, 178, 198, 161, 167, 196, 147, 244, 194, 210 }, + { 102, 43, 132, 185, 148, 162, 185, 141, 237, 181, 215 }, + { 15, 36, 68, 143, 119, 151, 169, 133, 230, 173, 214 } + }, { /* Coeff Band 1 */ + { 71, 91, 178, 226, 169, 176, 232, 170, 252, 219, 231 }, + { 72, 88, 174, 226, 168, 176, 232, 170, 252, 219, 234 }, + { 40, 79, 154, 222, 161, 174, 231, 169, 251, 219, 238 }, + { 21, 68, 126, 211, 144, 167, 230, 167, 252, 219, 236 }, + { 7, 49, 84, 175, 121, 152, 223, 151, 251, 218, 237 }, + { 1, 20, 32, 100, 97, 140, 163, 116, 237, 186, 222 } + }, { /* Coeff Band 2 */ + { 108, 110, 206, 237, 182, 183, 239, 181, 252, 221, 245 }, + { 72, 98, 191, 236, 180, 182, 240, 183, 252, 223, 239 }, + { 26, 77, 152, 230, 166, 179, 239, 181, 252, 222, 241 }, + { 7, 57, 106, 212, 141, 167, 236, 173, 252, 223, 243 }, + { 1, 35, 60, 171, 110, 149, 225, 155, 251, 218, 240 }, + { 1, 14, 22, 90, 86, 134, 163, 116, 238, 181, 233 } + }, { /* Coeff Band 3 */ + { 105, 139, 222, 245, 196, 192, 245, 195, 253, 229, 255 }, + { 76, 118, 205, 245, 192, 192, 247, 198, 254, 230, 255 }, + { 21, 88, 164, 240, 175, 186, 246, 197, 255, 232, 255 }, + { 5, 63, 118, 222, 149, 172, 242, 185, 255, 230, 254 }, + { 1, 42, 74, 186, 120, 157, 227, 161, 253, 220, 250 }, + { 1, 18, 30, 97, 92, 136, 163, 118, 244, 184, 244 } + }, { /* Coeff Band 4 */ + { 143, 117, 233, 251, 207, 201, 250, 210, 255, 239, 128 }, + { 99, 104, 214, 249, 200, 199, 251, 211, 255, 238, 255 }, + { 26, 81, 170, 245, 183, 192, 250, 206, 255, 242, 255 }, + { 6, 60, 116, 226, 151, 176, 242, 187, 255, 235, 255 }, + { 1, 38, 65, 178, 114, 153, 224, 157, 254, 224, 255 }, + { 1, 15, 26, 86, 88, 133, 163, 110, 251, 197, 252 } + }, { /* Coeff Band 5 */ + { 155, 74, 238, 252, 215, 206, 252, 223, 255, 255, 128 }, + { 152, 64, 223, 250, 205, 201, 254, 219, 255, 255, 128 }, + { 67, 55, 182, 246, 187, 192, 251, 210, 255, 240, 128 }, + { 27, 44, 127, 227, 155, 176, 244, 186, 255, 240, 255 }, + { 9, 27, 69, 176, 115, 152, 227, 154, 255, 229, 255 }, + { 2, 11, 28, 91, 84, 133, 177, 115, 254, 210, 255 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 207, 112, 234, 244, 192, 193, 246, 194, 255, 237, 255 }, + { 145, 120, 212, 233, 178, 183, 232, 177, 252, 216, 228 }, + { 77, 114, 177, 214, 164, 174, 210, 159, 245, 199, 230 } + }, { /* Coeff Band 1 */ + { 93, 174, 243, 248, 205, 200, 245, 195, 255, 232, 255 }, + { 100, 144, 231, 248, 204, 200, 244, 193, 255, 232, 255 }, + { 28, 101, 186, 247, 194, 199, 244, 194, 255, 232, 255 }, + { 9, 73, 132, 238, 155, 186, 245, 197, 255, 232, 250 }, + { 2, 44, 76, 187, 112, 151, 240, 172, 255, 235, 249 }, + { 1, 19, 33, 98, 92, 138, 176, 113, 252, 208, 249 } + }, { /* Coeff Band 2 */ + { 116, 175, 246, 250, 212, 202, 248, 198, 255, 238, 255 }, + { 78, 142, 231, 250, 208, 203, 249, 200, 255, 241, 255 }, + { 14, 93, 177, 245, 186, 196, 248, 198, 255, 241, 255 }, + { 4, 65, 122, 227, 148, 177, 244, 186, 255, 241, 243 }, + { 1, 38, 69, 180, 111, 152, 235, 162, 255, 237, 247 }, + { 1, 18, 30, 101, 89, 133, 190, 116, 255, 219, 246 } + }, { /* Coeff Band 3 */ + { 138, 183, 249, 253, 220, 209, 252, 210, 255, 251, 128 }, + { 93, 147, 237, 252, 213, 209, 253, 213, 255, 251, 128 }, + { 21, 104, 187, 247, 185, 196, 252, 210, 255, 249, 128 }, 
+ { 6, 73, 131, 225, 147, 174, 248, 190, 255, 248, 128 }, + { 1, 47, 83, 189, 119, 155, 239, 167, 255, 246, 128 }, + { 1, 26, 44, 130, 96, 139, 209, 129, 255, 235, 255 } + }, { /* Coeff Band 4 */ + { 188, 143, 252, 255, 228, 218, 253, 218, 255, 209, 128 }, + { 137, 124, 241, 253, 215, 211, 254, 221, 255, 255, 128 }, + { 32, 89, 188, 248, 186, 198, 254, 216, 255, 253, 128 }, + { 7, 61, 122, 231, 146, 176, 252, 201, 255, 250, 128 }, + { 1, 34, 66, 186, 103, 149, 246, 176, 255, 249, 128 }, + { 1, 18, 34, 115, 91, 134, 217, 124, 255, 233, 255 } + }, { /* Coeff Band 5 */ + { 198, 92, 253, 255, 231, 222, 255, 230, 128, 128, 128 }, + { 189, 79, 244, 254, 220, 217, 255, 237, 255, 255, 128 }, + { 78, 61, 200, 252, 196, 207, 255, 231, 255, 255, 128 }, + { 34, 50, 146, 242, 161, 187, 255, 222, 255, 255, 128 }, + { 11, 38, 93, 215, 122, 159, 253, 202, 255, 255, 128 }, + { 1, 31, 55, 143, 102, 143, 227, 148, 255, 238, 128 } + } } }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 148, 109, 219, 239, 203, 184, 222, 172, 238, 203, 192 }, - { 101, 110, 206, 229, 181, 178, 224, 171, 250, 206, 180 }, - { 67, 108, 186, 222, 172, 174, 216, 167, 246, 195, 221 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 184, 249, 254, 226, 220, 253, 241, 255, 255, 128 }, - { 84, 182, 244, 254, 222, 218, 254, 217, 255, 255, 128 }, - { 56, 147, 210, 252, 208, 210, 253, 218, 255, 255, 128 }, - { 32, 124, 170, 233, 165, 178, 249, 196, 255, 253, 128 } - }, { /* Coeff Band 2 */ - { 1, 182, 242, 245, 208, 194, 239, 179, 255, 238, 128 }, - { 28, 170, 230, 241, 202, 192, 243, 171, 255, 243, 128 }, - { 16, 109, 165, 231, 182, 184, 237, 168, 255, 249, 255 }, - { 2, 76, 113, 202, 141, 172, 221, 160, 252, 227, 255 } - }, { /* Coeff Band 3 */ - { 1, 195, 249, 254, 230, 239, 251, 211, 255, 255, 128 }, - { 39, 164, 242, 254, 224, 222, 255, 235, 255, 255, 128 }, - { 16, 111, 179, 251, 204, 197, 251, 234, 255, 209, 128 }, - { 3, 84, 130, 225, 155, 176, 226, 196, 255, 238, 128 } - }, { /* Coeff Band 4 */ - { 1, 180, 248, 254, 227, 219, 254, 211, 255, 255, 128 }, - { 38, 170, 242, 253, 222, 214, 254, 242, 255, 255, 128 }, - { 5, 111, 176, 250, 204, 197, 255, 208, 128, 128, 128 }, - { 1, 75, 120, 233, 146, 186, 250, 203, 255, 255, 128 } - }, { /* Coeff Band 5 */ - { 1, 183, 251, 255, 232, 223, 252, 229, 255, 255, 128 }, - { 51, 158, 245, 255, 230, 224, 255, 239, 128, 128, 128 }, - { 13, 80, 158, 253, 206, 216, 255, 233, 128, 128, 128 }, - { 4, 39, 76, 212, 107, 153, 252, 206, 255, 255, 128 } - }, { /* Coeff Band 6 */ - { 1, 181, 252, 254, 231, 214, 242, 225, 255, 236, 128 }, - { 81, 167, 247, 254, 229, 217, 252, 226, 255, 255, 128 }, - { 20, 122, 195, 253, 213, 212, 249, 211, 255, 238, 128 }, - { 18, 100, 153, 231, 158, 182, 244, 203, 255, 219, 128 } - }, { /* Coeff Band 7 */ - { 1, 100, 254, 255, 242, 246, 255, 230, 128, 128, 128 }, - { 177, 62, 250, 255, 246, 210, 255, 255, 128, 128, 128 }, - { 65, 58, 186, 255, 227, 241, 255, 219, 128, 128, 128 }, - { 45, 23, 118, 244, 162, 208, 255, 228, 128, 128, 128 } - } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 242, 73, 238, 244, 198, 192, 241, 189, 253, 226, 247 }, - { 171, 70, 204, 231, 180, 183, 228, 172, 247, 215, 221 }, - { 73, 62, 144, 202, 153, 169, 207, 153, 245, 199, 230 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 163, 241, 245, 201, 192, 243, 191, 255, 229, 255 }, - { 165, 147, 230, 245, 201, 193, 244, 193, 255, 231, 255 }, - { 76, 109, 191, 243, 190, 193, 243, 192, 255, 231, 255 }, - 
{ 22, 63, 111, 202, 138, 164, 225, 164, 252, 218, 248 } - }, { /* Coeff Band 2 */ - { 1, 113, 225, 245, 201, 195, 238, 185, 254, 225, 255 }, - { 122, 105, 195, 236, 183, 186, 235, 180, 254, 227, 252 }, - { 38, 79, 135, 217, 154, 172, 229, 171, 253, 220, 250 }, - { 9, 53, 78, 161, 121, 151, 202, 141, 251, 207, 244 } - }, { /* Coeff Band 3 */ - { 1, 150, 238, 250, 213, 202, 244, 194, 255, 236, 255 }, - { 140, 132, 223, 247, 204, 199, 243, 193, 255, 234, 255 }, - { 51, 101, 182, 240, 188, 189, 240, 186, 255, 232, 255 }, - { 6, 59, 100, 201, 137, 165, 225, 161, 252, 221, 249 } - }, { /* Coeff Band 4 */ - { 1, 151, 233, 248, 205, 199, 248, 196, 255, 243, 255 }, - { 133, 140, 214, 244, 193, 193, 245, 194, 255, 236, 255 }, - { 27, 104, 168, 235, 172, 183, 243, 187, 254, 235, 255 }, - { 2, 61, 101, 202, 135, 164, 229, 167, 254, 223, 255 } - }, { /* Coeff Band 5 */ - { 1, 227, 246, 254, 225, 215, 254, 217, 255, 255, 128 }, - { 132, 195, 239, 253, 219, 210, 252, 212, 255, 255, 128 }, - { 49, 143, 214, 251, 207, 204, 253, 212, 255, 238, 128 }, - { 11, 93, 151, 235, 169, 185, 247, 190, 255, 238, 128 } - }, { /* Coeff Band 6 */ - { 1, 143, 237, 251, 213, 203, 249, 203, 255, 243, 128 }, - { 137, 120, 216, 246, 198, 196, 248, 199, 255, 240, 255 }, - { 50, 94, 166, 233, 169, 181, 245, 189, 255, 240, 255 }, - { 9, 56, 97, 190, 129, 158, 228, 159, 255, 226, 255 } - }, { /* Coeff Band 7 */ - { 1, 96, 245, 254, 229, 216, 255, 212, 255, 255, 128 }, - { 179, 81, 234, 253, 217, 209, 255, 230, 255, 255, 128 }, - { 105, 56, 192, 248, 192, 197, 252, 212, 255, 205, 128 }, - { 53, 32, 133, 228, 151, 177, 250, 192, 255, 255, 128 } - } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 209, 89, 216, 242, 191, 190, 245, 191, 240, 235, 168 }, - { 142, 96, 196, 229, 173, 180, 233, 175, 247, 220, 174 }, - { 66, 89, 157, 205, 155, 171, 209, 156, 243, 200, 197 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 159, 235, 246, 202, 197, 237, 186, 248, 223, 223 }, - { 96, 137, 223, 247, 203, 198, 242, 188, 241, 202, 209 }, - { 22, 95, 167, 243, 184, 196, 237, 187, 247, 221, 221 }, - { 3, 51, 81, 192, 125, 158, 220, 164, 242, 211, 197 } - }, { /* Coeff Band 2 */ - { 1, 145, 226, 244, 196, 194, 240, 191, 247, 225, 233 }, - { 66, 127, 203, 240, 188, 189, 239, 188, 248, 225, 220 }, - { 9, 83, 136, 224, 159, 176, 235, 177, 247, 223, 207 }, - { 2, 46, 71, 169, 121, 152, 210, 149, 241, 212, 199 } - }, { /* Coeff Band 3 */ - { 1, 174, 238, 249, 209, 201, 245, 198, 241, 196, 241 }, - { 76, 151, 223, 247, 203, 197, 245, 194, 243, 202, 198 }, - { 12, 102, 170, 240, 183, 187, 242, 191, 247, 225, 209 }, - { 1, 52, 85, 202, 135, 162, 225, 168, 240, 209, 221 } - }, { /* Coeff Band 4 */ - { 1, 140, 230, 247, 204, 198, 242, 190, 249, 209, 248 }, - { 94, 126, 213, 244, 195, 194, 240, 190, 247, 210, 237 }, - { 13, 95, 159, 232, 171, 181, 237, 179, 245, 205, 237 }, - { 1, 51, 83, 186, 128, 158, 216, 154, 240, 193, 229 } - }, { /* Coeff Band 5 */ - { 1, 218, 244, 251, 214, 202, 243, 199, 253, 214, 255 }, - { 91, 194, 238, 249, 210, 200, 247, 203, 251, 223, 255 }, - { 18, 140, 207, 247, 198, 194, 246, 203, 252, 213, 255 }, - { 3, 76, 126, 223, 156, 172, 233, 185, 251, 206, 255 } - }, { /* Coeff Band 6 */ - { 1, 135, 235, 250, 210, 203, 246, 206, 251, 219, 241 }, - { 105, 120, 214, 246, 196, 196, 245, 195, 250, 216, 243 }, - { 24, 91, 154, 231, 166, 180, 241, 183, 250, 214, 242 }, - { 3, 53, 84, 183, 127, 157, 218, 153, 244, 195, 237 } - }, { /* Coeff Band 7 */ - { 1, 83, 246, 252, 215, 208, 246, 206, 
255, 237, 128 }, - { 184, 61, 233, 250, 208, 204, 245, 198, 254, 227, 255 }, - { 83, 58, 190, 246, 189, 195, 244, 198, 255, 229, 128 }, - { 41, 38, 125, 214, 144, 169, 229, 171, 251, 216, 255 } + { /* Intra */ + { /* Coeff Band 0 */ + { 207, 35, 219, 243, 195, 192, 243, 188, 251, 232, 238 }, + { 126, 46, 182, 230, 177, 182, 228, 171, 248, 214, 232 }, + { 51, 47, 125, 196, 147, 166, 206, 151, 245, 199, 229 } + }, { /* Coeff Band 1 */ + { 114, 124, 220, 244, 197, 192, 242, 189, 253, 226, 255 }, + { 142, 116, 213, 243, 194, 191, 241, 188, 252, 226, 255 }, + { 81, 101, 190, 242, 188, 190, 242, 190, 253, 229, 255 }, + { 42, 83, 155, 235, 166, 183, 241, 190, 253, 227, 246 }, + { 16, 62, 104, 205, 133, 161, 238, 176, 254, 227, 250 }, + { 6, 40, 60, 132, 109, 145, 190, 128, 248, 202, 239 } + }, { /* Coeff Band 2 */ + { 139, 149, 228, 248, 205, 198, 244, 196, 255, 223, 255 }, + { 115, 127, 221, 248, 202, 198, 245, 198, 255, 228, 255 }, + { 43, 100, 189, 246, 195, 195, 244, 196, 254, 234, 228 }, + { 13, 77, 141, 238, 168, 187, 243, 191, 255, 232, 255 }, + { 3, 49, 88, 203, 125, 160, 237, 178, 253, 227, 251 }, + { 1, 23, 41, 118, 97, 136, 191, 127, 250, 207, 247 } + }, { /* Coeff Band 3 */ + { 119, 185, 236, 251, 216, 205, 249, 202, 253, 237, 255 }, + { 89, 140, 224, 251, 211, 205, 250, 208, 255, 241, 255 }, + { 34, 105, 189, 248, 195, 197, 250, 208, 255, 245, 255 }, + { 14, 78, 142, 235, 166, 182, 246, 194, 255, 242, 255 }, + { 5, 49, 90, 196, 128, 160, 235, 165, 255, 237, 255 }, + { 1, 22, 41, 114, 97, 139, 180, 124, 252, 201, 249 } + }, { /* Coeff Band 4 */ + { 162, 142, 244, 254, 228, 215, 255, 230, 128, 128, 128 }, + { 129, 120, 231, 253, 216, 210, 255, 228, 255, 255, 128 }, + { 44, 90, 189, 249, 195, 199, 253, 217, 255, 240, 128 }, + { 14, 65, 132, 234, 158, 181, 249, 203, 255, 248, 128 }, + { 3, 38, 72, 188, 112, 154, 239, 171, 255, 243, 128 }, + { 1, 17, 39, 110, 86, 141, 201, 123, 255, 240, 128 } + }, { /* Coeff Band 5 */ + { 167, 96, 247, 255, 230, 218, 249, 231, 255, 255, 128 }, + { 163, 84, 234, 253, 214, 209, 255, 231, 255, 255, 128 }, + { 70, 63, 185, 249, 189, 197, 255, 230, 255, 255, 128 }, + { 30, 44, 132, 238, 157, 180, 251, 210, 255, 220, 128 }, + { 13, 30, 80, 195, 121, 153, 243, 179, 255, 224, 128 }, + { 5, 13, 38, 103, 109, 128, 196, 147, 255, 255, 128 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 242, 90, 246, 244, 200, 192, 242, 189, 255, 234, 255 }, + { 186, 102, 228, 233, 187, 182, 231, 172, 254, 225, 252 }, + { 102, 108, 203, 228, 181, 180, 218, 167, 243, 201, 223 } + }, { /* Coeff Band 1 */ + { 152, 169, 250, 253, 223, 209, 251, 208, 255, 250, 128 }, + { 164, 149, 242, 253, 222, 209, 249, 207, 253, 238, 255 }, + { 63, 108, 204, 252, 215, 211, 251, 211, 255, 242, 128 }, + { 39, 83, 153, 248, 175, 199, 250, 214, 255, 245, 128 }, + { 31, 66, 108, 214, 130, 161, 251, 196, 255, 237, 128 }, + { 27, 65, 71, 150, 112, 149, 213, 133, 255, 230, 255 } + }, { /* Coeff Band 2 */ + { 161, 174, 250, 254, 226, 215, 254, 226, 255, 230, 128 }, + { 133, 150, 239, 254, 222, 213, 254, 225, 255, 255, 128 }, + { 32, 105, 197, 252, 206, 207, 253, 220, 255, 255, 128 }, + { 10, 78, 147, 245, 173, 193, 253, 212, 255, 255, 128 }, + { 2, 49, 99, 221, 133, 164, 250, 198, 255, 252, 128 }, + { 1, 26, 53, 154, 96, 135, 234, 142, 255, 240, 128 } + }, { /* Coeff Band 3 */ + { 160, 187, 251, 255, 234, 223, 255, 233, 128, 128, 128 }, + { 131, 155, 241, 255, 228, 222, 255, 232, 255, 255, 128 }, + { 42, 108, 198, 253, 207, 212, 255, 234, 255, 255, 128 }, + { 18, 81, 151, 246, 176, 194, 254, 222, 255, 255, 
128 }, + { 9, 60, 112, 225, 144, 167, 252, 199, 255, 255, 128 }, + { 5, 35, 49, 163, 113, 150, 237, 118, 255, 255, 128 } + }, { /* Coeff Band 4 */ + { 195, 141, 253, 255, 242, 232, 255, 255, 128, 128, 128 }, + { 169, 128, 245, 255, 235, 227, 255, 248, 128, 128, 128 }, + { 62, 91, 204, 255, 216, 220, 255, 233, 128, 128, 128 }, + { 23, 70, 150, 248, 178, 202, 255, 223, 128, 128, 128 }, + { 2, 44, 78, 220, 110, 164, 255, 209, 128, 128, 128 }, + { 1, 1, 128, 255, 255, 128, 128, 128, 128, 128, 128 } + }, { /* Coeff Band 5 */ + { 195, 104, 253, 255, 246, 246, 255, 171, 128, 128, 128 }, + { 197, 92, 248, 255, 239, 228, 255, 239, 128, 128, 128 }, + { 88, 71, 214, 255, 219, 220, 255, 244, 128, 128, 128 }, + { 39, 56, 160, 250, 187, 204, 255, 255, 128, 128, 128 }, + { 18, 28, 90, 217, 81, 137, 255, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } } } }; -static const vp9_coeff_probs default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4] = { +static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES] = { { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + { /* Intra */ + { /* Coeff Band 0 */ + { 196, 40, 199, 180, 158, 161, 172, 135, 226, 183, 140 }, + { 83, 38, 128, 153, 142, 157, 155, 128, 222, 164, 202 }, + { 10, 29, 55, 116, 113, 146, 150, 122, 223, 169, 200 } + }, { /* Coeff Band 1 */ + { 33, 114, 160, 211, 155, 169, 223, 162, 248, 212, 215 }, + { 69, 107, 155, 210, 154, 169, 224, 
163, 248, 212, 216 }, + { 30, 91, 138, 207, 150, 168, 223, 162, 248, 212, 216 }, + { 12, 74, 115, 200, 140, 164, 222, 160, 249, 212, 219 }, + { 4, 52, 80, 172, 121, 153, 216, 149, 249, 212, 226 }, + { 1, 27, 40, 105, 101, 141, 157, 120, 231, 177, 210 } + }, { /* Coeff Band 2 */ + { 38, 159, 190, 227, 171, 177, 229, 172, 250, 214, 237 }, + { 34, 130, 182, 229, 173, 180, 231, 174, 249, 215, 234 }, + { 10, 97, 153, 226, 164, 178, 232, 175, 250, 215, 241 }, + { 3, 71, 115, 213, 145, 170, 230, 171, 251, 217, 235 }, + { 1, 41, 68, 172, 114, 152, 219, 154, 250, 212, 235 }, + { 1, 16, 27, 88, 90, 135, 155, 113, 235, 180, 216 } + }, { /* Coeff Band 3 */ + { 41, 184, 214, 238, 187, 186, 235, 180, 252, 217, 236 }, + { 24, 142, 199, 241, 188, 189, 237, 184, 252, 220, 235 }, + { 6, 97, 159, 235, 172, 184, 239, 185, 252, 221, 243 }, + { 1, 63, 110, 214, 144, 170, 234, 174, 253, 223, 243 }, + { 1, 32, 58, 166, 109, 149, 218, 152, 251, 215, 238 }, + { 1, 12, 21, 78, 85, 131, 152, 109, 236, 180, 224 } + }, { /* Coeff Band 4 */ + { 54, 207, 231, 245, 201, 193, 238, 186, 252, 221, 220 }, + { 32, 156, 213, 246, 198, 195, 242, 192, 252, 224, 245 }, + { 7, 98, 164, 240, 177, 187, 243, 193, 252, 227, 244 }, + { 2, 62, 108, 216, 143, 170, 237, 177, 254, 227, 248 }, + { 1, 32, 57, 165, 108, 148, 219, 152, 252, 217, 243 }, + { 1, 13, 22, 79, 87, 132, 153, 109, 240, 182, 232 } + }, { /* Coeff Band 5 */ + { 89, 208, 239, 250, 216, 200, 240, 190, 255, 222, 219 }, + { 53, 155, 223, 250, 209, 202, 245, 199, 253, 225, 246 }, + { 12, 102, 170, 243, 183, 192, 246, 198, 254, 230, 255 }, + { 3, 67, 111, 218, 144, 171, 239, 180, 254, 231, 248 }, + { 1, 38, 60, 164, 108, 148, 221, 152, 253, 220, 246 }, + { 1, 18, 26, 81, 88, 132, 157, 108, 245, 188, 241 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 205, 121, 244, 237, 187, 188, 229, 174, 248, 215, 228 }, + { 140, 120, 211, 219, 174, 177, 207, 158, 241, 195, 214 }, + { 51, 100, 152, 198, 155, 168, 199, 148, 240, 193, 207 } + }, { /* Coeff Band 1 */ + { 66, 196, 236, 247, 202, 197, 243, 193, 254, 228, 246 }, + { 99, 164, 223, 246, 199, 196, 243, 193, 254, 226, 255 }, + { 29, 122, 187, 244, 187, 194, 244, 193, 255, 227, 239 }, + { 14, 95, 145, 234, 156, 181, 244, 194, 254, 229, 246 }, + { 6, 68, 97, 190, 123, 155, 240, 168, 254, 232, 245 }, + { 3, 43, 50, 112, 105, 143, 170, 118, 245, 195, 230 } + }, { /* Coeff Band 2 */ + { 66, 202, 238, 248, 206, 199, 245, 196, 254, 233, 244 }, + { 45, 155, 218, 248, 200, 199, 245, 197, 254, 229, 208 }, + { 6, 96, 163, 242, 178, 191, 245, 196, 254, 233, 228 }, + { 2, 64, 110, 224, 142, 175, 242, 185, 254, 232, 247 }, + { 1, 34, 61, 172, 103, 147, 232, 164, 254, 226, 244 }, + { 1, 13, 24, 82, 85, 133, 165, 105, 248, 199, 242 } + }, { /* Coeff Band 3 */ + { 66, 204, 242, 251, 213, 204, 248, 204, 255, 236, 255 }, + { 38, 158, 222, 251, 206, 205, 249, 206, 255, 238, 255 }, + { 6, 95, 166, 244, 178, 194, 249, 205, 255, 236, 255 }, + { 2, 61, 111, 223, 141, 173, 244, 187, 255, 237, 255 }, + { 1, 31, 59, 171, 104, 149, 230, 158, 255, 230, 252 }, + { 1, 12, 22, 82, 79, 128, 171, 111, 251, 203, 249 } + }, { /* Coeff Band 4 */ + { 63, 214, 245, 252, 219, 208, 249, 206, 255, 241, 128 }, + { 38, 164, 228, 252, 210, 208, 251, 212, 255, 245, 255 }, + { 5, 101, 174, 246, 182, 196, 251, 207, 255, 244, 255 }, + { 1, 64, 116, 224, 142, 174, 246, 190, 255, 241, 228 }, + { 1, 34, 63, 172, 105, 148, 233, 160, 255, 235, 237 }, + { 1, 14, 26, 88, 85, 130, 177, 110, 252, 210, 250 } + }, { /* Coeff Band 5 */ + { 91, 214, 246, 254, 226, 213, 251, 210, 255, 239, 
255 }, + { 55, 162, 233, 253, 215, 210, 253, 216, 255, 244, 128 }, + { 10, 104, 179, 247, 184, 196, 252, 212, 255, 247, 255 }, + { 2, 67, 119, 226, 143, 173, 249, 195, 255, 245, 255 }, + { 1, 37, 66, 175, 106, 149, 237, 164, 255, 240, 255 }, + { 1, 16, 30, 96, 87, 132, 188, 113, 255, 222, 255 } + } } }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 
128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 191, 34, 178, 193, 160, 173, 196, 142, 247, 191, 244 }, - { 84, 45, 129, 187, 145, 170, 189, 145, 240, 186, 212 }, - { 14, 36, 69, 149, 120, 154, 177, 136, 231, 177, 196 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 76, 169, 226, 167, 180, 227, 171, 247, 218, 226 }, - { 72, 75, 162, 226, 166, 181, 231, 172, 242, 200, 219 }, - { 30, 63, 130, 218, 153, 175, 226, 170, 247, 216, 219 }, - { 5, 39, 67, 156, 119, 151, 194, 140, 239, 202, 216 } - }, { /* Coeff Band 2 */ - { 1, 79, 182, 228, 175, 183, 224, 170, 247, 215, 220 }, - { 69, 77, 168, 224, 170, 180, 223, 168, 246, 215, 223 }, - { 24, 63, 126, 209, 153, 171, 219, 160, 247, 215, 225 }, - { 3, 35, 58, 151, 115, 151, 191, 138, 240, 199, 220 } - }, { /* Coeff Band 3 */ - { 1, 139, 213, 238, 194, 192, 234, 180, 244, 193, 236 }, - { 82, 127, 204, 238, 190, 186, 234, 175, 244, 191, 235 }, - { 26, 93, 161, 230, 173, 179, 233, 178, 249, 217, 241 }, - { 3, 48, 78, 186, 132, 158, 212, 157, 244, 205, 233 } - }, { /* Coeff Band 4 */ - { 1, 100, 208, 233, 180, 182, 238, 175, 250, 206, 225 }, - { 84, 87, 184, 230, 175, 180, 236, 179, 250, 209, 243 }, - { 14, 61, 111, 217, 146, 171, 236, 174, 249, 207, 245 }, - { 1, 32, 49, 150, 106, 142, 212, 145, 242, 191, 237 } - }, { /* Coeff Band 5 */ - { 1, 130, 223, 241, 192, 189, 231, 176, 250, 209, 246 }, - { 101, 120, 207, 239, 188, 187, 240, 196, 250, 202, 255 }, - { 19, 90, 155, 232, 169, 181, 238, 190, 250, 207, 249 }, - { 1, 54, 86, 197, 130, 161, 220, 170, 248, 196, 248 } - }, { /* Coeff Band 6 */ - { 1, 103, 208, 236, 183, 185, 235, 190, 243, 202, 219 }, - { 95, 92, 185, 230, 175, 181, 233, 174, 242, 203, 225 }, - { 24, 72, 131, 213, 152, 171, 226, 164, 241, 202, 220 }, - { 3, 45, 74, 169, 123, 154, 204, 145, 238, 188, 222 } - }, { /* Coeff Band 7 */ - { 1, 63, 236, 247, 205, 194, 241, 189, 252, 222, 255 }, - { 151, 48, 224, 245, 200, 193, 240, 187, 255, 234, 255 }, - { 76, 45, 178, 240, 180, 189, 239, 182, 253, 231, 255 }, - { 38, 31, 111, 187, 125, 154, 217, 155, 253, 214, 255 } + { /* Intra */ + { /* Coeff Band 0 */ + { 211, 32, 212, 235, 185, 184, 223, 167, 239, 210, 182 }, + { 121, 47, 171, 224, 171, 180, 211, 162, 238, 195, 221 }, + { 40, 51, 118, 203, 145, 168, 211, 160, 246, 200, 236 } + }, { /* Coeff Band 1 */ + { 71, 129, 209, 244, 192, 194, 242, 188, 255, 230, 255 }, + { 118, 122, 206, 244, 192, 192, 241, 187, 
254, 227, 255 }, + { 53, 104, 184, 241, 186, 190, 241, 184, 254, 232, 255 }, + { 20, 81, 148, 234, 168, 183, 240, 183, 254, 231, 240 }, + { 3, 47, 82, 197, 127, 160, 234, 166, 254, 228, 251 }, + { 1, 18, 28, 96, 88, 134, 174, 116, 247, 194, 247 } + }, { /* Coeff Band 2 */ + { 86, 162, 220, 247, 203, 198, 245, 193, 255, 237, 255 }, + { 84, 134, 216, 247, 201, 197, 244, 192, 255, 233, 255 }, + { 26, 102, 186, 243, 190, 192, 244, 192, 255, 232, 255 }, + { 7, 75, 135, 231, 163, 181, 240, 183, 255, 234, 255 }, + { 1, 46, 79, 193, 121, 157, 233, 168, 255, 225, 242 }, + { 1, 20, 35, 113, 94, 136, 191, 123, 252, 209, 250 } + }, { /* Coeff Band 3 */ + { 89, 191, 232, 250, 211, 203, 248, 202, 255, 230, 128 }, + { 67, 148, 223, 250, 207, 201, 250, 207, 255, 247, 255 }, + { 19, 105, 183, 245, 189, 193, 249, 202, 255, 244, 255 }, + { 5, 72, 127, 228, 156, 177, 245, 186, 255, 238, 255 }, + { 1, 44, 76, 190, 119, 156, 234, 167, 255, 231, 255 }, + { 1, 21, 36, 116, 92, 138, 195, 128, 250, 208, 241 } + }, { /* Coeff Band 4 */ + { 94, 210, 236, 252, 215, 206, 253, 209, 255, 247, 128 }, + { 68, 153, 224, 251, 209, 204, 251, 213, 255, 240, 128 }, + { 14, 103, 178, 246, 188, 195, 251, 209, 255, 239, 128 }, + { 2, 70, 122, 230, 154, 177, 247, 194, 255, 239, 128 }, + { 1, 42, 72, 189, 115, 153, 234, 166, 255, 229, 255 }, + { 1, 19, 34, 104, 98, 143, 180, 124, 252, 200, 255 } + }, { /* Coeff Band 5 */ + { 87, 200, 238, 254, 226, 214, 250, 212, 255, 226, 128 }, + { 55, 151, 225, 253, 217, 212, 253, 217, 255, 233, 128 }, + { 11, 106, 179, 249, 193, 200, 252, 213, 255, 247, 128 }, + { 2, 72, 124, 232, 155, 180, 246, 195, 255, 230, 128 }, + { 1, 42, 70, 182, 114, 153, 232, 163, 255, 236, 255 }, + { 1, 17, 28, 95, 92, 137, 170, 115, 252, 208, 228 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 238, 66, 250, 245, 205, 193, 232, 180, 254, 228, 255 }, + { 178, 84, 226, 237, 192, 185, 230, 176, 253, 217, 251 }, + { 76, 83, 168, 218, 166, 173, 225, 162, 252, 220, 243 } + }, { /* Coeff Band 1 */ + { 137, 176, 246, 252, 218, 207, 251, 208, 255, 238, 128 }, + { 176, 160, 237, 252, 217, 206, 249, 209, 255, 247, 128 }, + { 68, 128, 205, 251, 209, 207, 251, 207, 255, 248, 128 }, + { 40, 105, 167, 246, 172, 192, 252, 215, 255, 247, 128 }, + { 22, 84, 131, 214, 144, 164, 249, 185, 255, 250, 255 }, + { 11, 60, 91, 161, 130, 155, 194, 133, 253, 214, 255 } + }, { /* Coeff Band 2 */ + { 124, 192, 247, 253, 223, 210, 254, 215, 255, 255, 128 }, + { 103, 161, 234, 253, 218, 209, 253, 214, 255, 255, 128 }, + { 19, 108, 190, 250, 202, 202, 251, 213, 255, 241, 128 }, + { 6, 74, 131, 242, 165, 191, 251, 207, 255, 244, 128 }, + { 1, 41, 72, 198, 111, 151, 249, 185, 255, 248, 128 }, + { 1, 14, 24, 82, 90, 140, 185, 96, 254, 224, 255 } + }, { /* Coeff Band 3 */ + { 118, 200, 248, 254, 228, 216, 254, 222, 255, 213, 128 }, + { 91, 166, 235, 254, 220, 212, 254, 223, 255, 233, 128 }, + { 16, 110, 186, 251, 197, 201, 255, 225, 255, 255, 128 }, + { 3, 72, 124, 239, 160, 186, 253, 209, 255, 239, 128 }, + { 1, 39, 66, 198, 106, 151, 248, 191, 255, 247, 128 }, + { 1, 14, 19, 94, 74, 124, 209, 109, 255, 245, 128 } + }, { /* Coeff Band 4 */ + { 112, 213, 248, 255, 231, 218, 255, 234, 255, 255, 128 }, + { 80, 172, 234, 254, 220, 216, 255, 233, 255, 255, 128 }, + { 11, 112, 182, 251, 195, 204, 255, 231, 255, 224, 128 }, + { 2, 73, 126, 241, 159, 186, 254, 219, 255, 255, 128 }, + { 1, 40, 69, 207, 111, 159, 249, 191, 255, 255, 128 }, + { 1, 16, 24, 83, 78, 138, 230, 134, 255, 239, 128 } + }, { /* Coeff Band 5 */ + { 100, 209, 245, 255, 236, 225, 248, 
231, 255, 192, 128 }, + { 65, 164, 232, 255, 226, 221, 255, 240, 255, 255, 128 }, + { 11, 117, 186, 253, 203, 209, 255, 240, 255, 255, 128 }, + { 2, 83, 136, 245, 167, 191, 253, 222, 255, 255, 128 }, + { 1, 55, 88, 213, 122, 157, 248, 182, 255, 255, 128 }, + { 1, 10, 38, 58, 85, 43, 198, 107, 255, 255, 128 } + } } } }; -static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES_8X8] = { +static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES] = { { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 179, 203, 246, 252, 217, 208, 249, 197, 238, 237, 255 }, - { 136, 193, 232, 247, 202, 199, 245, 194, 255, 235, 255 }, - { 66, 170, 209, 244, 190, 191, 250, 199, 255, 242, 192 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 1, 191, 232, 250, 204, 201, 248, 199, 254, 243, 213 }, - { 50, 161, 209, 247, 196, 197, 250, 206, 253, 240, 213 }, - { 6, 118, 160, 239, 173, 186, 249, 203, 254, 235, 255 }, - { 2, 90, 110, 211, 141, 166, 242, 181, 254, 235, 255 } - }, { /* Coeff Band 3 */ - { 1, 209, 242, 254, 223, 215, 253, 218, 255, 253, 128 }, - { 58, 168, 227, 253, 216, 211, 254, 226, 255, 251, 128 }, - { 7, 111, 178, 249, 195, 202, 253, 222, 254, 240, 255 }, - { 2, 63, 103, 226, 142, 175, 250, 202, 255, 246, 128 } - }, { /* Coeff Band 4 */ - { 1, 207, 241, 252, 213, 205, 252, 215, 255, 228, 255 }, - { 55, 171, 225, 251, 209, 205, 251, 212, 254, 234, 255 }, - { 5, 108, 173, 247, 187, 195, 251, 211, 255, 231, 128 }, - { 2, 56, 97, 220, 138, 169, 248, 191, 253, 237, 255 } - }, { /* Coeff Band 5 */ - { 1, 211, 245, 255, 227, 219, 255, 233, 255, 255, 128 }, - { 58, 175, 228, 254, 217, 215, 255, 231, 255, 255, 128 }, - { 6, 124, 181, 249, 191, 199, 255, 222, 255, 251, 128 }, - { 2, 85, 122, 227, 149, 172, 250, 195, 255, 245, 128 } - }, { /* Coeff Band 6 */ - { 1, 216, 246, 255, 231, 217, 254, 220, 255, 250, 128 }, - { 74, 177, 236, 254, 222, 214, 254, 221, 255, 255, 128 }, - { 13, 125, 192, 250, 200, 203, 254, 217, 255, 245, 128 }, - { 2, 70, 114, 227, 147, 175, 251, 198, 255, 240, 128 } - }, { /* Coeff Band 7 */ - { 1, 199, 246, 255, 238, 229, 255, 226, 255, 255, 128 }, - { 132, 162, 240, 255, 229, 222, 255, 239, 255, 255, 128 }, - { 79, 125, 207, 253, 213, 214, 255, 232, 255, 255, 128 }, - { 41, 89, 149, 240, 161, 187, 250, 216, 255, 255, 128 } + { /* Intra */ + { /* Coeff Band 0 */ + { 8, 26, 101, 170, 141, 159, 166, 138, 205, 164, 158 }, + { 2, 25, 67, 119, 124, 152, 121, 123, 189, 145, 175 }, + { 1, 15, 28, 67, 102, 139, 95, 107, 191, 136, 187 } + }, { /* Coeff Band 1 */ + { 22, 73, 118, 160, 137, 157, 175, 132, 242, 184, 229 }, + { 43, 73, 116, 160, 137, 157, 177, 132, 242, 185, 231 }, + { 24, 66, 105, 158, 134, 156, 175, 133, 242, 185, 232 }, + { 9, 54, 85, 150, 126, 153, 175, 132, 242, 185, 231 }, + { 2, 34, 54, 123, 109, 145, 168, 124, 242, 183, 231 }, + { 1, 14, 22, 63, 93, 134, 108, 103, 214, 149, 206 } + }, { /* Coeff Band 2 */ + { 34, 123, 149, 186, 148, 163, 195, 143, 245, 195, 233 }, + { 34, 106, 147, 189, 149, 164, 198, 146, 246, 197, 234 }, + { 10, 81, 123, 186, 143, 162, 200, 147, 246, 198, 235 }, + { 2, 56, 87, 170, 127, 156, 201, 143, 248, 202, 234 }, + { 1, 35, 56, 138, 109, 146, 187, 133, 246, 196, 233 }, + { 1, 17, 27, 80, 93, 135, 136, 109, 229, 168, 215 } + }, { /* 
Coeff Band 3 */ + { 27, 159, 171, 208, 161, 171, 211, 155, 249, 205, 239 }, + { 17, 119, 162, 213, 160, 172, 218, 160, 250, 210, 238 }, + { 3, 81, 128, 207, 149, 168, 220, 161, 250, 213, 238 }, + { 1, 53, 87, 183, 128, 158, 217, 153, 251, 214, 239 }, + { 1, 31, 52, 143, 106, 145, 199, 137, 249, 205, 235 }, + { 1, 14, 24, 77, 89, 133, 142, 109, 234, 174, 215 } + }, { /* Coeff Band 4 */ + { 24, 189, 200, 224, 177, 178, 221, 164, 250, 212, 234 }, + { 14, 136, 184, 230, 176, 181, 228, 172, 252, 215, 231 }, + { 2, 87, 140, 222, 159, 176, 230, 172, 252, 218, 238 }, + { 1, 54, 90, 193, 130, 161, 223, 160, 252, 217, 241 }, + { 1, 28, 49, 142, 103, 144, 202, 139, 250, 208, 233 }, + { 1, 12, 21, 73, 87, 132, 141, 106, 234, 176, 209 } + }, { /* Coeff Band 5 */ + { 32, 220, 227, 242, 199, 190, 234, 180, 251, 220, 232 }, + { 12, 155, 200, 242, 190, 191, 240, 187, 252, 225, 230 }, + { 1, 90, 144, 231, 164, 180, 240, 184, 253, 229, 239 }, + { 1, 53, 90, 198, 130, 162, 230, 165, 253, 226, 238 }, + { 1, 28, 50, 145, 103, 144, 207, 140, 251, 213, 236 }, + { 1, 13, 22, 74, 88, 132, 142, 107, 233, 176, 216 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 5, 61, 234, 230, 183, 183, 212, 164, 241, 199, 205 }, + { 3, 65, 184, 199, 164, 170, 182, 145, 232, 175, 223 }, + { 1, 56, 104, 154, 137, 158, 156, 131, 221, 165, 210 } + }, { /* Coeff Band 1 */ + { 46, 183, 210, 229, 181, 182, 222, 165, 252, 214, 251 }, + { 122, 166, 202, 228, 179, 181, 223, 164, 252, 217, 250 }, + { 49, 125, 177, 225, 172, 179, 223, 163, 252, 215, 253 }, + { 22, 99, 142, 216, 155, 173, 222, 164, 252, 215, 250 }, + { 8, 69, 95, 180, 127, 156, 220, 153, 252, 214, 250 }, + { 2, 38, 51, 112, 109, 144, 159, 118, 243, 184, 232 } + }, { /* Coeff Band 2 */ + { 56, 196, 218, 236, 187, 185, 231, 172, 254, 223, 239 }, + { 38, 141, 195, 235, 182, 185, 233, 174, 254, 225, 232 }, + { 7, 93, 147, 225, 164, 178, 233, 173, 255, 226, 248 }, + { 2, 63, 101, 201, 137, 165, 227, 162, 254, 225, 248 }, + { 1, 39, 61, 159, 110, 148, 213, 146, 254, 218, 247 }, + { 1, 20, 33, 98, 95, 136, 166, 115, 247, 192, 231 } + }, { /* Coeff Band 3 */ + { 44, 206, 223, 240, 193, 189, 235, 177, 255, 231, 224 }, + { 27, 147, 200, 240, 188, 189, 238, 181, 255, 229, 239 }, + { 4, 93, 147, 230, 165, 180, 238, 180, 255, 231, 237 }, + { 1, 58, 95, 201, 134, 164, 229, 164, 255, 228, 254 }, + { 1, 32, 52, 152, 105, 146, 212, 142, 254, 221, 255 }, + { 1, 14, 23, 81, 87, 133, 156, 109, 248, 191, 236 } + }, { /* Coeff Band 4 */ + { 39, 216, 227, 244, 200, 194, 237, 179, 255, 231, 255 }, + { 22, 152, 204, 243, 192, 193, 240, 186, 255, 231, 240 }, + { 2, 92, 148, 232, 167, 183, 239, 182, 255, 232, 255 }, + { 1, 55, 91, 200, 132, 164, 229, 164, 255, 230, 255 }, + { 1, 28, 47, 144, 99, 142, 211, 141, 255, 222, 251 }, + { 1, 13, 21, 75, 86, 131, 152, 103, 249, 193, 242 } + }, { /* Coeff Band 5 */ + { 34, 228, 234, 249, 213, 201, 246, 194, 255, 239, 255 }, + { 13, 161, 208, 247, 198, 197, 248, 197, 255, 243, 255 }, + { 1, 95, 148, 234, 166, 183, 246, 190, 255, 243, 236 }, + { 1, 55, 90, 199, 128, 161, 237, 168, 255, 239, 255 }, + { 1, 30, 51, 147, 102, 144, 218, 142, 255, 232, 254 }, + { 1, 16, 25, 86, 88, 131, 168, 109, 252, 207, 245 } + } } }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 138, 65, 189, 212, 172, 169, 200, 153, 233, 182, 214 }, - { 93, 60, 162, 203, 160, 169, 200, 153, 239, 190, 213 }, - { 66, 55, 141, 195, 152, 166, 199, 152, 238, 190, 212 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 102, 221, 247, 205, 198, 248, 201, 
255, 235, 128 }, - { 122, 95, 215, 247, 200, 197, 248, 200, 254, 227, 255 }, - { 60, 81, 166, 241, 177, 190, 245, 193, 255, 246, 255 }, - { 32, 61, 108, 195, 133, 159, 230, 163, 254, 230, 238 } - }, { /* Coeff Band 2 */ - { 1, 58, 203, 242, 194, 193, 229, 177, 253, 225, 249 }, - { 113, 62, 192, 237, 184, 187, 231, 181, 253, 220, 249 }, - { 50, 50, 135, 225, 159, 177, 229, 172, 254, 222, 241 }, - { 24, 34, 82, 185, 125, 152, 223, 158, 253, 212, 219 } - }, { /* Coeff Band 3 */ - { 1, 1, 220, 253, 218, 209, 251, 213, 255, 255, 128 }, - { 154, 1, 216, 252, 211, 206, 252, 212, 255, 252, 128 }, - { 102, 1, 157, 249, 184, 200, 253, 214, 255, 247, 128 }, - { 68, 1, 101, 213, 129, 161, 247, 186, 255, 237, 255 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 229, 64, 235, 236, 189, 190, 227, 179, 247, 203, 226 }, - { 148, 70, 194, 228, 175, 182, 216, 170, 238, 192, 224 }, - { 53, 63, 134, 207, 150, 169, 213, 161, 247, 204, 232 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 173, 234, 244, 201, 193, 239, 180, 252, 214, 255 }, - { 160, 156, 222, 243, 200, 193, 237, 179, 253, 216, 255 }, - { 55, 119, 187, 240, 189, 192, 236, 180, 253, 226, 255 }, - { 14, 65, 105, 193, 142, 165, 205, 151, 249, 200, 250 } - }, { /* Coeff Band 2 */ - { 1, 124, 218, 246, 195, 196, 242, 198, 254, 229, 255 }, - { 85, 114, 180, 240, 179, 187, 239, 191, 253, 223, 239 }, - { 18, 81, 128, 220, 152, 173, 232, 176, 252, 221, 254 }, - { 2, 42, 64, 150, 115, 149, 192, 137, 247, 197, 247 } - }, { /* Coeff Band 3 */ - { 1, 164, 230, 251, 210, 204, 245, 201, 255, 238, 255 }, - { 96, 137, 210, 248, 199, 199, 244, 198, 254, 218, 255 }, - { 20, 97, 169, 240, 179, 188, 242, 190, 254, 228, 255 }, - { 2, 58, 95, 197, 137, 164, 220, 158, 252, 217, 248 } - }, { /* Coeff Band 4 */ - { 1, 193, 236, 245, 203, 194, 243, 191, 254, 223, 255 }, - { 86, 163, 217, 241, 190, 188, 242, 189, 253, 220, 255 }, - { 14, 108, 161, 228, 167, 178, 238, 180, 253, 224, 255 }, - { 1, 51, 84, 186, 127, 159, 216, 155, 251, 208, 243 } - }, { /* Coeff Band 5 */ - { 1, 183, 235, 248, 209, 197, 244, 195, 253, 236, 239 }, - { 79, 144, 208, 243, 193, 190, 244, 191, 254, 231, 255 }, - { 13, 100, 151, 227, 163, 176, 240, 180, 255, 233, 244 }, - { 1, 48, 77, 171, 121, 153, 214, 150, 252, 214, 245 } - }, { /* Coeff Band 6 */ - { 1, 202, 234, 252, 215, 207, 248, 207, 254, 242, 255 }, - { 75, 153, 216, 249, 203, 201, 248, 203, 255, 239, 255 }, - { 11, 104, 168, 241, 
179, 189, 245, 194, 255, 237, 128 }, - { 1, 57, 95, 201, 134, 163, 229, 165, 254, 223, 246 } - }, { /* Coeff Band 7 */ - { 1, 184, 236, 254, 222, 212, 254, 225, 255, 255, 128 }, - { 74, 149, 220, 252, 210, 208, 253, 223, 255, 249, 128 }, - { 18, 109, 175, 247, 184, 195, 253, 211, 255, 250, 128 }, - { 3, 64, 113, 219, 144, 171, 246, 187, 255, 250, 128 } - } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 140, 101, 214, 227, 176, 182, 218, 167, 233, 205, 164 }, - { 96, 101, 176, 204, 161, 173, 193, 152, 223, 182, 182 }, - { 27, 84, 123, 176, 140, 162, 190, 142, 238, 189, 210 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 178, 218, 240, 189, 189, 238, 184, 250, 232, 189 }, - { 69, 146, 204, 239, 187, 189, 238, 183, 251, 226, 221 }, - { 16, 98, 157, 234, 170, 185, 237, 183, 252, 220, 218 }, - { 3, 49, 78, 172, 122, 154, 204, 150, 242, 198, 207 } - }, { /* Coeff Band 2 */ - { 1, 165, 207, 230, 179, 181, 234, 172, 252, 228, 218 }, - { 25, 130, 175, 224, 169, 177, 232, 169, 252, 230, 207 }, - { 4, 81, 118, 205, 144, 167, 227, 162, 252, 225, 219 }, - { 2, 51, 63, 150, 114, 148, 197, 138, 244, 202, 204 } - }, { /* Coeff Band 3 */ - { 1, 181, 222, 247, 200, 197, 246, 199, 252, 232, 228 }, - { 25, 142, 200, 244, 190, 193, 245, 195, 253, 233, 204 }, - { 3, 90, 146, 233, 166, 181, 242, 188, 252, 229, 216 }, - { 1, 47, 79, 188, 124, 157, 222, 162, 245, 213, 203 } - }, { /* Coeff Band 4 */ - { 1, 179, 220, 242, 195, 191, 237, 182, 251, 217, 231 }, - { 27, 144, 200, 241, 188, 190, 238, 185, 250, 224, 235 }, - { 3, 93, 149, 230, 166, 180, 235, 180, 249, 222, 221 }, - { 1, 47, 79, 181, 125, 157, 211, 154, 241, 205, 198 } - }, { /* Coeff Band 5 */ - { 1, 176, 222, 247, 202, 198, 247, 199, 252, 234, 219 }, - { 24, 139, 197, 244, 190, 192, 246, 196, 253, 232, 220 }, - { 2, 89, 140, 229, 161, 178, 243, 185, 253, 233, 234 }, - { 1, 49, 76, 176, 121, 154, 214, 153, 243, 209, 208 } - }, { /* Coeff Band 6 */ - { 1, 197, 233, 251, 213, 205, 247, 206, 249, 222, 247 }, - { 35, 159, 216, 249, 203, 201, 246, 203, 250, 222, 223 }, - { 4, 108, 167, 240, 178, 188, 244, 195, 248, 220, 235 }, - { 1, 58, 93, 198, 133, 161, 220, 167, 233, 195, 221 } - }, { /* Coeff Band 7 */ - { 1, 188, 240, 253, 221, 209, 248, 207, 252, 223, 255 }, - { 84, 153, 227, 251, 212, 205, 247, 205, 254, 215, 255 }, - { 25, 117, 182, 244, 186, 192, 243, 198, 250, 209, 255 }, - { 7, 72, 108, 197, 138, 162, 203, 161, 240, 178, 247 } + { /* Intra */ + { /* Coeff Band 0 */ + { 204, 33, 217, 233, 185, 184, 199, 165, 204, 163, 162 }, + { 93, 48, 151, 209, 157, 171, 193, 161, 203, 167, 189 }, + { 18, 43, 86, 173, 126, 156, 203, 149, 231, 193, 200 } + }, { /* Coeff Band 1 */ + { 43, 121, 184, 233, 173, 182, 235, 187, 248, 211, 237 }, + { 93, 117, 177, 232, 170, 180, 235, 182, 246, 204, 224 }, + { 33, 101, 158, 229, 165, 179, 235, 182, 245, 207, 236 }, + { 11, 81, 129, 221, 153, 173, 233, 179, 246, 203, 229 }, + { 2, 51, 82, 188, 124, 158, 224, 162, 248, 206, 228 }, + { 1, 18, 29, 88, 93, 137, 141, 116, 222, 161, 217 } + }, { /* Coeff Band 2 */ + { 63, 154, 199, 239, 184, 187, 236, 187, 248, 209, 221 }, + { 53, 128, 191, 239, 182, 188, 236, 188, 251, 209, 255 }, + { 14, 99, 160, 235, 172, 184, 235, 187, 249, 207, 240 }, + { 4, 75, 122, 219, 150, 173, 226, 177, 250, 204, 240 }, + { 1, 47, 77, 176, 121, 154, 207, 153, 245, 197, 237 }, + { 1, 18, 30, 84, 95, 136, 138, 112, 229, 167, 228 } + }, { /* Coeff Band 3 */ + { 48, 193, 210, 245, 194, 194, 241, 196, 252, 213, 255 }, + { 26, 145, 201, 245, 194, 196, 240, 
195, 251, 215, 240 }, + { 6, 104, 165, 241, 179, 190, 239, 191, 253, 222, 255 }, + { 1, 73, 120, 218, 151, 174, 227, 172, 251, 219, 248 }, + { 1, 42, 69, 167, 118, 153, 205, 146, 251, 206, 245 }, + { 1, 16, 27, 84, 89, 133, 148, 112, 240, 179, 238 } + }, { /* Coeff Band 4 */ + { 47, 213, 225, 248, 203, 199, 240, 194, 254, 211, 255 }, + { 32, 153, 212, 248, 201, 199, 241, 196, 251, 226, 255 }, + { 6, 102, 168, 240, 181, 190, 240, 187, 251, 225, 238 }, + { 1, 66, 111, 211, 146, 169, 229, 167, 255, 224, 244 }, + { 1, 36, 60, 157, 110, 148, 209, 143, 252, 215, 255 }, + { 1, 16, 27, 83, 90, 133, 152, 111, 244, 184, 250 } + }, { /* Coeff Band 5 */ + { 46, 225, 232, 252, 219, 208, 247, 204, 254, 233, 255 }, + { 24, 162, 214, 250, 208, 204, 247, 201, 254, 236, 255 }, + { 3, 106, 165, 242, 182, 191, 245, 196, 255, 231, 255 }, + { 1, 66, 108, 213, 142, 169, 235, 175, 255, 226, 247 }, + { 1, 35, 59, 158, 108, 147, 216, 146, 254, 220, 255 }, + { 1, 16, 27, 85, 90, 131, 159, 110, 248, 191, 252 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 229, 28, 245, 227, 195, 182, 200, 145, 253, 186, 255 }, + { 151, 44, 210, 214, 180, 175, 193, 146, 247, 185, 254 }, + { 55, 48, 131, 183, 148, 163, 194, 138, 249, 201, 246 } + }, { /* Coeff Band 1 */ + { 126, 165, 239, 250, 206, 204, 248, 193, 255, 255, 128 }, + { 199, 158, 231, 248, 206, 198, 247, 200, 243, 255, 255 }, + { 102, 136, 209, 248, 203, 197, 247, 201, 255, 244, 128 }, + { 64, 116, 181, 245, 185, 196, 248, 201, 255, 233, 128 }, + { 44, 98, 151, 233, 162, 179, 248, 195, 255, 242, 128 }, + { 44, 81, 119, 204, 140, 165, 222, 163, 252, 217, 255 } + }, { /* Coeff Band 2 */ + { 108, 185, 239, 252, 216, 209, 248, 205, 255, 230, 128 }, + { 91, 155, 224, 252, 211, 205, 251, 211, 255, 230, 128 }, + { 20, 116, 185, 248, 194, 196, 252, 206, 255, 255, 128 }, + { 8, 86, 141, 239, 168, 185, 248, 196, 255, 247, 128 }, + { 3, 50, 92, 206, 125, 164, 242, 176, 255, 246, 128 }, + { 1, 21, 40, 131, 85, 141, 200, 131, 247, 236, 255 } + }, { /* Coeff Band 3 */ + { 94, 198, 243, 254, 226, 215, 254, 220, 255, 255, 128 }, + { 67, 164, 228, 253, 217, 208, 250, 216, 255, 213, 128 }, + { 14, 120, 185, 250, 196, 205, 248, 205, 255, 255, 128 }, + { 4, 83, 134, 238, 161, 181, 250, 202, 255, 233, 128 }, + { 1, 48, 82, 196, 119, 157, 248, 178, 255, 255, 128 }, + { 1, 26, 38, 96, 84, 132, 221, 110, 255, 209, 128 } + }, { /* Coeff Band 4 */ + { 82, 210, 245, 255, 230, 215, 246, 221, 255, 255, 128 }, + { 55, 170, 231, 254, 222, 213, 255, 220, 255, 255, 128 }, + { 8, 118, 184, 251, 200, 207, 255, 219, 255, 255, 128 }, + { 2, 78, 126, 239, 156, 185, 251, 216, 255, 255, 128 }, + { 1, 43, 68, 189, 108, 151, 247, 187, 255, 228, 128 }, + { 1, 34, 40, 121, 114, 102, 205, 96, 255, 255, 128 } + }, { /* Coeff Band 5 */ + { 65, 228, 241, 255, 231, 214, 253, 222, 255, 255, 128 }, + { 33, 173, 226, 254, 222, 216, 255, 231, 255, 255, 128 }, + { 5, 120, 180, 251, 197, 205, 251, 226, 255, 233, 128 }, + { 1, 81, 130, 240, 159, 187, 251, 206, 255, 205, 128 }, + { 1, 51, 78, 198, 119, 168, 238, 181, 255, 171, 128 }, + { 1, 18, 49, 183, 119, 160, 255, 171, 128, 128, 128 } + } } } }; -static const vp9_coeff_probs default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] = { +static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = { { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 
128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + { /* Intra */ + { /* Coeff Band 0 */ + { 37, 34, 137, 205, 154, 170, 151, 159, 109, 172, 44 }, + { 3, 26, 60, 113, 123, 154, 100, 124, 152, 131, 144 }, + { 1, 13, 23, 54, 102, 139, 71, 106, 146, 123, 148 } + }, { /* Coeff Band 1 */ + { 26, 77, 122, 152, 144, 160, 143, 129, 216, 158, 201 }, + { 43, 76, 123, 152, 142, 159, 145, 129, 218, 160, 204 }, + { 25, 67, 112, 150, 141, 159, 144, 128, 218, 159, 204 }, + { 9, 54, 90, 143, 134, 156, 144, 127, 218, 159, 204 }, + { 2, 32, 52, 116, 114, 148, 138, 123, 217, 158, 207 }, + { 1, 10, 15, 44, 91, 133, 75, 99, 172, 128, 169 } + }, { /* Coeff Band 2 */ + { 32, 122, 143, 163, 145, 161, 162, 131, 226, 171, 206 }, + { 46, 105, 143, 168, 148, 161, 165, 133, 228, 174, 204 }, + { 17, 79, 116, 164, 142, 161, 166, 134, 229, 174, 206 }, + { 4, 53, 78, 143, 125, 153, 163, 129, 232, 175, 213 }, + { 1, 29, 44, 105, 105, 142, 147, 120, 228, 168, 211 }, + { 1, 12, 18, 52, 91, 133, 92, 100, 193, 140, 183 } + }, { /* Coeff Band 3 */ + { 33, 157, 160, 182, 149, 163, 185, 141, 236, 185, 218 }, + { 20, 116, 152, 188, 152, 165, 191, 144, 238, 188, 217 }, + { 4, 74, 114, 180, 141, 162, 192, 143, 240, 191, 219 }, + { 1, 44, 69, 148, 119, 151, 183, 134, 243, 192, 227 }, + { 1, 25, 40, 110, 101, 141, 162, 121, 238, 181, 223 }, + { 1, 12, 18, 56, 89, 132, 103, 101, 206, 148, 196 } + }, { /* Coeff Band 4 */ + { 25, 183, 174, 207, 159, 171, 205, 156, 243, 194, 228 }, + { 13, 124, 159, 209, 157, 171, 213, 160, 243, 200, 228 }, + { 2, 75, 117, 199, 143, 166, 215, 158, 246, 205, 230 }, + { 1, 45, 73, 165, 119, 153, 204, 144, 248, 205, 231 }, + { 1, 26, 43, 120, 101, 141, 178, 127, 242, 192, 226 }, + { 1, 12, 19, 59, 
89, 132, 112, 102, 215, 154, 201 } + }, { /* Coeff Band 5 */ + { 13, 232, 223, 239, 196, 188, 225, 172, 248, 209, 226 }, + { 4, 155, 187, 237, 184, 187, 233, 180, 250, 216, 232 }, + { 1, 86, 131, 222, 156, 175, 233, 176, 251, 218, 237 }, + { 1, 49, 79, 181, 123, 157, 218, 155, 251, 214, 237 }, + { 1, 26, 43, 125, 100, 141, 188, 130, 246, 199, 231 }, + { 1, 12, 20, 62, 88, 131, 119, 102, 222, 161, 209 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 51, 37, 227, 237, 205, 184, 200, 162, 231, 187, 207 }, + { 9, 36, 172, 204, 176, 173, 171, 145, 217, 167, 197 }, + { 21, 26, 112, 162, 145, 162, 155, 133, 215, 165, 191 } + }, { /* Coeff Band 1 */ + { 79, 169, 219, 223, 176, 177, 222, 161, 248, 213, 244 }, + { 177, 166, 216, 222, 175, 178, 222, 161, 246, 212, 226 }, + { 119, 141, 196, 222, 174, 176, 220, 163, 250, 212, 236 }, + { 63, 117, 165, 217, 163, 175, 218, 161, 248, 209, 231 }, + { 30, 87, 117, 192, 138, 162, 216, 157, 247, 211, 224 }, + { 14, 56, 60, 119, 111, 146, 156, 123, 227, 171, 220 } + }, { /* Coeff Band 2 */ + { 88, 195, 225, 229, 181, 181, 229, 171, 252, 212, 221 }, + { 66, 145, 202, 229, 177, 180, 230, 172, 253, 220, 255 }, + { 12, 97, 152, 221, 162, 174, 230, 169, 253, 218, 249 }, + { 3, 66, 103, 198, 138, 165, 223, 159, 253, 219, 251 }, + { 1, 38, 61, 158, 110, 148, 209, 146, 252, 212, 238 }, + { 1, 19, 30, 94, 94, 136, 160, 114, 244, 185, 236 } + }, { /* Coeff Band 3 */ + { 79, 211, 228, 235, 186, 184, 233, 176, 255, 225, 255 }, + { 50, 151, 205, 235, 182, 185, 237, 177, 254, 223, 255 }, + { 7, 95, 149, 225, 162, 176, 236, 177, 254, 229, 219 }, + { 1, 62, 98, 198, 134, 164, 228, 162, 254, 224, 238 }, + { 1, 35, 57, 156, 108, 148, 211, 143, 253, 215, 238 }, + { 1, 17, 26, 87, 89, 135, 161, 113, 246, 189, 237 } + }, { /* Coeff Band 4 */ + { 68, 225, 230, 239, 190, 187, 238, 180, 252, 234, 255 }, + { 39, 156, 206, 239, 185, 187, 241, 187, 254, 231, 255 }, + { 4, 94, 147, 229, 163, 178, 242, 183, 255, 236, 224 }, + { 1, 58, 94, 200, 132, 163, 232, 166, 254, 230, 255 }, + { 1, 32, 52, 153, 104, 146, 214, 144, 253, 222, 236 }, + { 1, 15, 24, 84, 89, 131, 159, 109, 247, 192, 240 } + }, { /* Coeff Band 5 */ + { 45, 248, 234, 248, 208, 198, 244, 193, 255, 233, 255 }, + { 19, 169, 204, 246, 195, 195, 246, 199, 255, 233, 255 }, + { 2, 98, 145, 235, 166, 183, 245, 192, 255, 235, 255 }, + { 1, 59, 92, 205, 131, 164, 236, 172, 254, 231, 250 }, + { 1, 33, 52, 152, 103, 145, 216, 144, 253, 221, 240 }, + { 1, 15, 24, 83, 87, 133, 156, 110, 246, 191, 242 } + } } }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 
128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + { /* Intra */ + { /* Coeff Band 0 */ + { 179, 23, 200, 222, 180, 182, 150, 152, 148, 135, 125 }, + { 60, 33, 113, 185, 143, 166, 168, 144, 189, 168, 152 }, + { 8, 31, 59, 137, 114, 150, 163, 132, 206, 171, 169 } + }, { /* Coeff Band 1 */ + { 27, 103, 158, 215, 157, 174, 209, 165, 239, 191, 233 }, + { 90, 101, 159, 213, 156, 173, 212, 164, 230, 185, 237 }, + { 39, 91, 146, 212, 155, 169, 212, 165, 232, 186, 207 }, + { 16, 75, 120, 203, 144, 169, 210, 161, 233, 189, 227 }, + { 3, 48, 76, 167, 120, 154, 199, 146, 236, 190, 218 }, + { 1, 18, 26, 72, 95, 137, 113, 109, 197, 146, 186 } + }, { /* Coeff Band 2 */ + { 45, 137, 177, 218, 166, 174, 206, 163, 234, 184, 214 }, + { 47, 117, 167, 218, 166, 176, 206, 164, 234, 182, 229 }, + { 16, 90, 136, 211, 153, 172, 205, 162, 236, 192, 231 }, + { 6, 65, 100, 188, 136, 162, 193, 155, 237, 177, 228 }, + { 1, 37, 58, 137, 113, 150, 166, 134, 229, 167, 234 }, + { 1, 13, 19, 55, 90, 132, 93, 103, 196, 137, 202 } + }, { /* Coeff Band 3 */ + { 36, 171, 194, 227, 177, 179, 208, 165, 244, 196, 245 }, + { 19, 129, 178, 227, 175, 184, 214, 165, 246, 188, 255 }, + { 5, 90, 139, 217, 158, 174, 213, 166, 246, 198, 255 }, + { 1, 59, 93, 182, 134, 162, 193, 150, 242, 188, 241 }, + { 1, 31, 49, 122, 108, 145, 160, 127, 235, 172, 229 }, + { 1, 10, 18, 54, 89, 132, 101, 99, 213, 144, 217 } + }, { /* Coeff Band 4 */ + { 37, 197, 210, 233, 187, 186, 216, 172, 250, 202, 255 }, + { 20, 142, 191, 234, 183, 186, 219, 170, 249, 207, 246 }, + { 3, 93, 144, 222, 163, 176, 219, 170, 249, 204, 224 }, + { 1, 56, 88, 179, 130, 159, 199, 148, 246, 197, 243 }, + { 1, 29, 47, 123, 104, 144, 172, 127, 244, 185, 234 }, + { 1, 14, 22, 66, 91, 130, 120, 103, 225, 158, 221 } + }, { /* Coeff Band 5 */ + { 19, 227, 223, 245, 203, 194, 238, 187, 251, 225, 217 }, + { 6, 152, 192, 242, 189, 190, 241, 190, 253, 225, 255 }, + { 1, 89, 138, 228, 161, 177, 239, 181, 254, 224, 248 }, + { 1, 52, 84, 188, 127, 157, 224, 159, 253, 222, 247 }, + { 1, 29, 47, 132, 102, 140, 196, 132, 251, 208, 244 }, + { 1, 14, 23, 71, 90, 133, 134, 103, 239, 174, 233 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 205, 14, 245, 235, 216, 189, 190, 146, 249, 201, 255 }, + { 97, 19, 213, 210, 194, 174, 176, 139, 241, 183, 250 }, + { 31, 20, 144, 183, 160, 167, 171, 132, 240, 184, 253 } + }, { /* Coeff Band 1 */ + { 137, 182, 245, 254, 221, 216, 255, 160, 128, 128, 128 }, + { 231, 185, 242, 251, 218, 205, 255, 
233, 128, 128, 128 }, + { 170, 175, 229, 252, 205, 209, 255, 211, 128, 128, 128 }, + { 107, 157, 213, 250, 199, 205, 251, 207, 255, 255, 128 }, + { 77, 126, 183, 243, 182, 183, 252, 206, 255, 255, 128 }, + { 69, 96, 149, 229, 157, 170, 247, 169, 255, 255, 128 } + }, { /* Coeff Band 2 */ + { 107, 196, 241, 252, 211, 208, 255, 210, 128, 128, 128 }, + { 92, 162, 221, 249, 203, 195, 255, 199, 128, 128, 128 }, + { 20, 108, 181, 244, 190, 191, 250, 200, 255, 255, 128 }, + { 7, 80, 132, 241, 172, 197, 253, 191, 255, 255, 128 }, + { 2, 43, 75, 219, 122, 150, 255, 203, 128, 128, 128 }, + { 1, 15, 48, 98, 51, 192, 255, 160, 128, 128, 128 } + }, { /* Coeff Band 3 */ + { 107, 202, 244, 254, 226, 215, 255, 192, 128, 128, 128 }, + { 77, 167, 224, 252, 215, 212, 255, 235, 128, 128, 128 }, + { 14, 117, 179, 249, 191, 196, 255, 212, 128, 128, 128 }, + { 3, 84, 134, 237, 160, 194, 248, 216, 255, 255, 128 }, + { 1, 57, 84, 216, 145, 136, 255, 161, 128, 128, 128 }, + { 1, 1, 1, 255, 128, 255, 128, 128, 128, 128, 128 } + }, { /* Coeff Band 4 */ + { 88, 219, 248, 255, 239, 225, 255, 255, 128, 128, 128 }, + { 61, 178, 234, 255, 227, 227, 255, 217, 128, 128, 128 }, + { 6, 127, 188, 252, 201, 211, 255, 244, 128, 128, 128 }, + { 1, 83, 130, 248, 173, 197, 255, 175, 128, 128, 128 }, + { 1, 61, 66, 211, 121, 188, 255, 213, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, { /* Coeff Band 5 */ + { 73, 243, 250, 255, 244, 220, 255, 205, 128, 128, 128 }, + { 42, 197, 242, 255, 237, 227, 242, 166, 255, 255, 128 }, + { 10, 137, 197, 252, 214, 199, 255, 238, 128, 128, 128 }, + { 2, 85, 134, 242, 163, 185, 224, 238, 255, 255, 128 }, + { 1, 70, 69, 199, 110, 64, 255, 213, 128, 128, 128 }, + { 1, 1, 1, 1, 128, 128, 255, 1, 128, 128, 128 } + } } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, 
- { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } +}; + +#if CONFIG_CODE_NONZEROCOUNT + +// TODO(debargha): Remove the macro and count tables after experimentation +#define NZC_DEFAULT_COUNTS /* Uncomment to use counts as defaults */ + +#ifdef NZC_DEFAULT_COUNTS +static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS] + [REF_TYPES] + [BLOCK_TYPES] + [NZC4X4_TOKENS] = { + { + { + { 967652, 29023, 15039, 6952, 1568, 116 }, + { 289116, 22938, 4522, 1935, 520, 47 } + }, { + { 967652, 29023, 15039, 6952, 1568, 116 }, + { 689116, 22938, 4522, 1935, 520, 47 } + }, + }, { + { + { 124684, 37167, 15270, 8483, 1777, 102 }, + { 10405, 12395, 3401, 3574, 2461, 771 } + }, { + { 124684, 37167, 15270, 8483, 1777, 102 }, + { 20405, 12395, 3401, 3574, 2461, 771 } } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 118, 27, 105, 170, 137, 166, 183, 137, 243, 189, 241 }, - { 44, 34, 85, 142, 127, 158, 161, 128, 232, 174, 213 }, - { 8, 26, 47, 104, 108, 145, 143, 117, 226, 168, 207 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 134, 172, 217, 163, 175, 226, 167, 251, 220, 204 }, - { 56, 129, 168, 217, 161, 174, 223, 164, 249, 218, 223 }, - { 20, 110, 151, 215, 158, 174, 221, 165, 249, 209, 221 }, - { 2, 59, 88, 169, 128, 157, 192, 143, 239, 189, 214 } - }, { /* Coeff Band 2 */ - { 1, 65, 126, 191, 140, 163, 218, 153, 252, 218, 229 }, - { 21, 57, 92, 175, 126, 156, 214, 148, 252, 218, 229 }, - { 4, 44, 66, 148, 114, 148, 200, 136, 251, 211, 228 }, - { 1, 28, 42, 108, 104, 141, 158, 119, 235, 180, 210 } - }, { /* Coeff Band 3 */ - { 1, 114, 172, 227, 166, 177, 236, 178, 252, 226, 233 }, - { 41, 94, 152, 218, 156, 172, 233, 172, 251, 223, 231 }, - { 9, 69, 116, 202, 142, 165, 226, 162, 251, 221, 227 }, - { 1, 36, 60, 151, 113, 148, 195, 140, 241, 198, 211 } - }, { /* Coeff Band 4 */ - { 1, 186, 200, 227, 174, 178, 230, 169, 248, 210, 238 }, - { 27, 148, 181, 221, 167, 176, 226, 166, 250, 218, 228 }, - { 3, 96, 139, 208, 154, 170, 219, 161, 249, 214, 229 }, - { 1, 44, 70, 156, 120, 152, 188, 139, 239, 193, 200 } - }, { /* Coeff Band 5 */ - { 1, 169, 203, 238, 186, 186, 238, 184, 252, 224, 230 }, - { 32, 119, 173, 232, 172, 181, 236, 182, 252, 222, 237 }, - { 6, 84, 128, 215, 150, 170, 232, 172, 251, 221, 235 }, - { 1, 49, 78, 167, 124, 154, 200, 145, 243, 198, 217 } - }, { /* Coeff Band 6 */ - { 1, 193, 215, 244, 197, 195, 239, 192, 249, 213, 240 }, - { 52, 136, 193, 239, 184, 189, 237, 189, 248, 211, 226 }, - { 13, 90, 146, 227, 162, 178, 233, 182, 248, 211, 231 }, - { 1, 49, 79, 177, 124, 156, 201, 154, 234, 188, 212 } - }, { /* Coeff Band 7 */ - { 1, 189, 238, 248, 219, 196, 232, 180, 253, 211, 255 }, - { 104, 148, 224, 245, 211, 194, 225, 171, 251, 206, 255 }, - { 43, 116, 190, 231, 179, 183, 217, 168, 249, 199, 255 }, - { 13, 65, 92, 154, 131, 152, 167, 132, 238, 174, 243 } + }, { + { + { 4100, 22976, 15627, 16137, 7982, 1793 }, + { 4249, 3084, 2131, 4081, 6439, 1653 } + }, { + { 21100, 22976, 15627, 16137, 7982, 1793 }, + { 4249, 3084, 2131, 4081, 2439, 1653 } } } }; -static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES_16X16] = { - { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 
128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 
128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + +static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS] + [REF_TYPES] + [BLOCK_TYPES] + [NZC8X8_TOKENS] = { + { + { + { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 }, + { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 }, + }, { + { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 }, + { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 }, } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 223, 34, 236, 234, 193, 185, 216, 169, 239, 189, 229 }, - { 125, 40, 195, 221, 173, 175, 209, 165, 220, 181, 196 }, - { 41, 37, 127, 185, 145, 162, 191, 150, 227, 180, 219 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 160, 224, 239, 193, 190, 213, 178, 244, 174, 255 }, - { 199, 154, 212, 238, 190, 190, 210, 173, 246, 183, 249 }, - { 88, 122, 178, 234, 180, 187, 213, 174, 244, 182, 247 }, - { 27, 69, 100, 174, 139, 165, 159, 142, 225, 157, 240 } - }, { /* Coeff Band 2 */ - { 1, 118, 207, 237, 179, 185, 234, 189, 241, 194, 237 }, - { 86, 103, 161, 227, 163, 176, 231, 183, 241, 196, 234 }, - { 19, 69, 113, 205, 140, 166, 220, 169, 240, 188, 242 }, - { 3, 32, 49, 106, 111, 144, 132, 121, 225, 151, 237 } - }, { /* Coeff Band 3 */ - { 1, 160, 218, 245, 197, 195, 235, 189, 254, 218, 255 }, - { 90, 127, 193, 240, 186, 189, 235, 187, 251, 217, 230 }, - { 18, 92, 148, 229, 164, 179, 228, 180, 254, 212, 229 }, - { 2, 50, 79, 163, 126, 156, 186, 140, 247, 191, 236 } - }, { /* Coeff Band 4 */ - { 1, 196, 231, 240, 203, 191, 225, 171, 253, 214, 255 }, - { 71, 167, 210, 234, 194, 188, 218, 165, 253, 215, 236 }, - { 11, 119, 165, 217, 171, 177, 213, 155, 252, 209, 255 }, - { 1, 46, 70, 145, 121, 153, 180, 131, 249, 192, 246 } - }, { /* Coeff Band 5 */ - { 1, 176, 223, 242, 202, 194, 222, 169, 253, 211, 244 }, - { 62, 131, 191, 233, 185, 186, 219, 164, 251, 211, 252 }, - { 7, 89, 133, 207, 156, 173, 211, 157, 251, 206, 247 }, - { 1, 36, 56, 127, 113, 147, 166, 125, 243, 183, 242 } - }, { /* Coeff Band 6 */ - { 1, 203, 232, 249, 213, 202, 245, 193, 254, 237, 255 }, - { 51, 155, 212, 245, 199, 195, 244, 192, 254, 234, 255 }, - { 7, 101, 158, 233, 170, 181, 244, 185, 253, 242, 255 }, - { 1, 49, 82, 185, 123, 157, 226, 156, 252, 225, 240 } - }, { /* Coeff Band 7 */ - { 1, 222, 233, 252, 220, 207, 247, 206, 255, 240, 128 }, - { 40, 159, 216, 250, 205, 201, 248, 207, 249, 219, 255 }, - { 6, 106, 163, 240, 176, 188, 247, 198, 251, 222, 255 }, - { 1, 51, 88, 196, 127, 159, 232, 169, 252, 214, 255 } + }, { + { + { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 }, + { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 }, + }, { + { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 }, + { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 }, } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 14, 78, 225, 217, 173, 181, 198, 153, 228, 185, 176 }, - { 9, 74, 
179, 191, 157, 171, 178, 143, 229, 175, 209 }, - { 3, 48, 92, 128, 130, 155, 135, 123, 220, 155, 219 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 178, 209, 214, 173, 175, 208, 152, 252, 210, 237 }, - { 142, 151, 193, 212, 170, 175, 209, 151, 251, 208, 237 }, - { 38, 105, 150, 206, 159, 173, 208, 151, 250, 209, 238 }, - { 5, 44, 61, 128, 114, 147, 167, 125, 239, 184, 217 } - }, { /* Coeff Band 2 */ - { 1, 154, 195, 202, 166, 173, 184, 144, 245, 184, 236 }, - { 49, 110, 150, 188, 155, 168, 180, 141, 244, 183, 239 }, - { 4, 63, 90, 158, 132, 157, 171, 134, 243, 179, 239 }, - { 1, 25, 37, 93, 104, 141, 133, 114, 231, 161, 226 } - }, { /* Coeff Band 3 */ - { 1, 184, 201, 223, 173, 177, 224, 164, 253, 220, 238 }, - { 42, 127, 170, 215, 164, 173, 223, 162, 253, 219, 233 }, - { 4, 75, 114, 195, 142, 164, 218, 155, 253, 217, 235 }, - { 1, 32, 50, 128, 108, 144, 180, 127, 247, 197, 219 } - }, { /* Coeff Band 4 */ - { 1, 190, 207, 232, 181, 184, 228, 172, 251, 216, 212 }, - { 35, 136, 180, 227, 173, 180, 227, 171, 251, 216, 218 }, - { 2, 85, 131, 214, 154, 173, 224, 166, 250, 214, 225 }, - { 1, 44, 71, 162, 120, 153, 195, 143, 240, 195, 197 } - }, { /* Coeff Band 5 */ - { 1, 185, 201, 230, 177, 180, 232, 172, 253, 225, 235 }, - { 27, 122, 165, 221, 164, 175, 230, 169, 253, 224, 220 }, - { 1, 72, 108, 197, 139, 163, 224, 159, 253, 224, 226 }, - { 1, 33, 51, 132, 107, 144, 186, 130, 245, 201, 206 } - }, { /* Coeff Band 6 */ - { 1, 203, 214, 240, 193, 191, 235, 178, 252, 225, 224 }, - { 20, 140, 188, 235, 182, 186, 234, 177, 252, 226, 226 }, - { 1, 85, 132, 218, 155, 174, 230, 170, 251, 224, 227 }, - { 1, 39, 62, 154, 114, 150, 199, 141, 241, 203, 214 } - }, { /* Coeff Band 7 */ - { 1, 217, 224, 244, 202, 193, 241, 187, 252, 227, 239 }, - { 22, 151, 200, 239, 187, 188, 240, 184, 252, 226, 237 }, - { 2, 90, 138, 222, 158, 174, 237, 176, 252, 226, 239 }, - { 1, 41, 66, 163, 116, 151, 206, 146, 243, 201, 230 } + }, { + { + { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 }, + { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 }, + }, { + { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 }, + { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 }, } } }; -static const vp9_coeff_probs default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] = { - { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + +static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS] + [REF_TYPES] + [BLOCK_TYPES] + [NZC16X16_TOKENS] = { + { + { + { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 }, + { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 }, + }, { + { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 }, + { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 }, } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 
128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, { + { + { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 }, + { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 }, + }, { + { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 }, + { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 }, } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 3, 29, 86, 140, 130, 163, 135, 131, 190, 148, 186 }, - { 1, 26, 61, 105, 124, 156, 105, 119, 178, 138, 173 }, - { 1, 15, 28, 60, 105, 142, 80, 105, 173, 128, 178 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 130, 142, 172, 141, 161, 191, 140, 244, 193, 216 }, - { 61, 124, 141, 173, 141, 161, 190, 139, 244, 194, 215 }, - { 28, 103, 124, 171, 138, 160, 190, 140, 243, 194, 225 }, - { 1, 36, 51, 111, 109, 144, 152, 120, 227, 173, 205 } - }, { /* Coeff Band 2 */ - { 1, 60, 125, 153, 143, 159, 156, 127, 234, 170, 233 }, - { 22, 48, 78, 129, 124, 152, 151, 123, 234, 170, 233 }, - { 3, 32, 46, 98, 107, 142, 138, 114, 232, 165, 232 }, - { 1, 15, 23, 61, 96, 135, 101, 103, 210, 144, 213 } - }, { /* Coeff Band 3 */ - { 1, 102, 144, 182, 146, 162, 194, 143, 246, 196, 239 }, - { 34, 76, 116, 171, 136, 159, 192, 140, 246, 195, 239 }, - { 4, 51, 81, 153, 124, 153, 184, 135, 246, 192, 239 }, - { 1, 23, 37, 98, 102, 140, 142, 116, 230, 167, 227 } - }, { /* Coeff Band 4 */ - { 1, 165, 171, 214, 163, 174, 214, 160, 245, 203, 219 }, - { 16, 120, 154, 210, 158, 172, 212, 159, 245, 201, 219 
}, - { 1, 80, 122, 199, 147, 167, 208, 154, 244, 200, 223 }, - { 1, 40, 65, 145, 118, 151, 171, 135, 226, 175, 202 } - }, { /* Coeff Band 5 */ - { 1, 146, 162, 215, 159, 172, 226, 165, 251, 218, 231 }, - { 16, 92, 131, 205, 147, 167, 224, 162, 252, 217, 228 }, - { 2, 60, 92, 182, 129, 158, 216, 152, 251, 214, 234 }, - { 1, 32, 50, 126, 107, 144, 176, 128, 240, 189, 216 } - }, { /* Coeff Band 6 */ - { 1, 178, 186, 224, 172, 178, 224, 167, 251, 214, 232 }, - { 14, 118, 158, 215, 160, 173, 223, 164, 250, 214, 228 }, - { 2, 70, 109, 194, 139, 164, 217, 156, 250, 213, 227 }, - { 1, 32, 51, 129, 108, 146, 175, 128, 240, 187, 218 } - }, { /* Coeff Band 7 */ - { 1, 210, 214, 240, 192, 188, 235, 182, 251, 221, 228 }, - { 22, 140, 187, 233, 177, 183, 234, 178, 251, 219, 233 }, - { 3, 82, 130, 215, 152, 171, 229, 171, 250, 217, 232 }, - { 1, 38, 63, 154, 115, 149, 195, 141, 240, 196, 219 } + }, { + { + { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 }, + { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 }, + }, { + { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 }, + { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 }, } } }; -static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES_32X32] = { - { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - } - }, { /* block Type 1 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 
128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + +static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS] + [REF_TYPES] + [BLOCK_TYPES] + [NZC32X32_TOKENS] = { + { + { + { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 }, + { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 }, + }, { + { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 }, + { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 }, } - }, { /* block Type 2 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 2 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 3 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* 
Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, { + { + { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 }, + { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 }, + }, { + { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 }, + { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 }, } - }, { /* block Type 3 */ - { /* Coeff Band 0 */ - { 8, 40, 224, 217, 183, 181, 180, 148, 200, 180, 123 }, - { 6, 37, 178, 193, 173, 171, 160, 139, 205, 166, 173 }, - { 3, 27, 93, 133, 143, 159, 115, 125, 183, 141, 178 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } - }, { /* Coeff Band 1 */ - { 1, 170, 209, 202, 172, 175, 179, 143, 238, 181, 214 }, - { 184, 164, 199, 199, 169, 173, 180, 143, 238, 184, 217 }, - { 99, 128, 165, 194, 161, 171, 180, 142, 239, 182, 219 }, - { 17, 49, 59, 102, 117, 148, 122, 116, 208, 152, 191 } - }, { /* Coeff Band 2 */ - { 1, 136, 200, 197, 172, 172, 168, 142, 226, 170, 216 }, - { 66, 104, 146, 175, 152, 165, 163, 139, 225, 170, 219 }, - { 11, 52, 83, 144, 130, 156, 151, 130, 222, 165, 216 }, - { 1, 16, 25, 65, 99, 137, 96, 106, 190, 138, 184 } - }, { /* Coeff Band 3 */ - { 1, 180, 203, 198, 166, 170, 190, 143, 241, 190, 227 }, - { 74, 125, 161, 187, 154, 165, 187, 142, 241, 189, 224 }, - { 15, 70, 98, 163, 133, 157, 182, 137, 241, 187, 226 }, - { 1, 25, 37, 89, 104, 140, 128, 113, 218, 158, 206 } - }, { /* Coeff Band 4 */ - { 1, 191, 208, 213, 169, 173, 212, 156, 246, 206, 217 }, - { 53, 136, 170, 205, 159, 170, 211, 156, 246, 205, 208 }, - { 3, 75, 112, 189, 140, 163, 209, 151, 246, 205, 215 }, - { 1, 32, 51, 127, 108, 145, 171, 128, 231, 183, 197 } - }, { /* Coeff Band 5 */ - { 1, 183, 195, 202, 161, 168, 206, 150, 247, 202, 229 }, - { 42, 113, 144, 190, 147, 163, 203, 148, 247, 202, 229 }, - { 2, 56, 82, 160, 124, 153, 195, 140, 246, 200, 229 }, - { 1, 22, 34, 93, 99, 138, 143, 115, 227, 170, 206 } - }, { /* Coeff Band 6 */ - { 1, 202, 193, 221, 168, 175, 227, 167, 251, 217, 236 }, - { 26, 122, 158, 213, 157, 171, 225, 165, 251, 216, 242 }, - { 1, 68, 105, 194, 136, 162, 221, 158, 251, 215, 239 }, - { 1, 32, 51, 131, 107, 145, 179, 130, 240, 188, 231 } - }, { /* Coeff Band 7 */ - { 1, 234, 212, 243, 195, 192, 240, 187, 253, 226, 227 }, - { 14, 141, 186, 237, 181, 186, 239, 184, 253, 226, 233 }, - { 1, 85, 132, 221, 155, 174, 235, 176, 253, 224, 226 }, - { 1, 39, 65, 159, 115, 150, 202, 144, 245, 202, 214 } + }, { + { + { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 }, + { 9612, 13874, 13329, 13022, 6500, 
3250, 300, 12, 6, 3, 2, 1 },
+    }, {
+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
+    }
+  }
+};
+
+#else
+
+static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]
+                                           [REF_TYPES]
+                                           [BLOCK_TYPES]
+                                           [NZC4X4_TOKENS] = {
+  {
+    {
+      { 219, 162, 179, 142, 242, },
+      { 214, 253, 228, 246, 255, },
+    }, {
+      { 225, 236, 190, 229, 253, },
+      { 251, 253, 240, 248, 255, },
+    },
+  }, {
+    {
+      { 106, 126, 158, 126, 244, },
+      { 118, 241, 201, 240, 255, },
+    }, {
+      { 165, 179, 143, 189, 242, },
+      { 173, 239, 192, 255, 128, },
+    },
+  }, {
+    {
+      { 42 , 78 , 153, 92 , 223, },
+      { 128, 128, 128, 128, 128, },
+    }, {
+      { 76 , 68 , 126, 110, 216, },
+      { 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]
+                                           [REF_TYPES]
+                                           [BLOCK_TYPES]
+                                           [NZC8X8_TOKENS] = {
+  {
+    {
+      { 134, 139, 170, 178, 142, 197, 255, },
+      { 167, 224, 199, 252, 205, 255, 128, },
+    }, {
+      { 181, 210, 180, 241, 190, 235, 255, },
+      { 234, 251, 235, 252, 219, 255, 128, },
+    },
+  }, {
+    {
+      { 33 , 64 , 155, 143, 86 , 216, 255, },
+      { 73 , 160, 167, 251, 153, 255, 128, },
+    }, {
+      { 79 , 104, 153, 195, 119, 246, 255, },
+      { 149, 183, 186, 249, 203, 255, 128, },
+    },
+  }, {
+    {
+      { 10 , 25 , 156, 61 , 69 , 156, 254, },
+      { 32 , 1 , 128, 146, 64 , 255, 128, },
+    }, {
+      { 37 , 48 , 143, 113, 81 , 202, 255, },
+      { 1 , 255, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]
+                                             [REF_TYPES]
+                                             [BLOCK_TYPES]
+                                             [NZC16X16_TOKENS] = {
+  {
+    {
+      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },
+      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },
+    }, {
+      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },
+      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 9 , 45 , 168, 85 , 66 , 221, 139, 246, 255, },
+      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },
+    }, {
+      { 4 , 149, 175, 240, 149, 255, 205, 128, 128, },
+      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 1 , 12 , 173, 6 , 68 , 145, 41 , 204, 255, },
+      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },
+    }, {
+      { 1 , 121, 171, 149, 115, 242, 159, 255, 128, },
+      { 1 , 255, 255, 128, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+
+static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]
+                                             [REF_TYPES]
+                                             [BLOCK_TYPES]
+                                             [NZC32X32_TOKENS] = {
+  {
+    {
+      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },
+      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },
+    }, {
+      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },
+      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 6 , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },
+      { 17 , 53 , 43 , 189, 1 , 255, 171, 128, 128, 128, 128, },
+    }, {
+      { 5 , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },
+      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  }, {
+    {
+      { 1 , 7 , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },
+      { 1 , 1 , 128, 26 , 1 , 218, 78 , 255, 255, 128, 128, },
+    }, {
+      { 4 , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },
+    },
+  },
+};
+#endif
+
+static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]
+                                            [NZC_TOKENS_EXTRA]
+                                            [NZC_BITS_EXTRA] = {
+  // Bit probabilities are in least- to most-significant order
+  {
+    {176, 128, 128, 128, 128, 128, 128, 128, 128}, // 3 - 4
+    {164, 192, 128, 128, 128, 128, 128, 128, 128}, // 5 - 8
+    {154, 184, 208, 128, 128, 128, 128, 128, 128}, // 9
- 16 + {144, 176, 200, 216, 128, 128, 128, 128, 128}, // 17 - 32 + {140, 172, 192, 208, 224, 128, 128, 128, 128}, // 33 - 64 + {136, 168, 188, 200, 220, 232, 128, 128, 128}, // 65 - 128 + {132, 164, 184, 196, 216, 228, 240, 128, 128}, // 129 - 256 + {130, 162, 178, 194, 212, 226, 240, 248, 128}, // 257 - 512 + {128, 160, 176, 192, 208, 224, 240, 248, 254}, // 513 - 1024 + }, { + {168, 128, 128, 128, 128, 128, 128, 128, 128}, // 3 - 4 + {152, 184, 128, 128, 128, 128, 128, 128, 128}, // 5 - 8 + {152, 184, 208, 128, 128, 128, 128, 128, 128}, // 9 - 16 + {144, 176, 200, 216, 128, 128, 128, 128, 128}, // 17 - 32 + {140, 172, 192, 208, 224, 128, 128, 128, 128}, // 33 - 64 + {136, 168, 188, 200, 220, 232, 128, 128, 128}, // 65 - 128 + {132, 164, 184, 196, 216, 228, 240, 128, 128}, // 129 - 256 + {130, 162, 178, 194, 212, 226, 240, 248, 128}, // 257 - 512 + {128, 160, 176, 192, 208, 224, 240, 248, 254}, // 513 - 1024 + }, { + {160, 128, 128, 128, 128, 128, 128, 128, 128}, // 3 - 4 + {152, 176, 128, 128, 128, 128, 128, 128, 128}, // 5 - 8 + {150, 184, 208, 128, 128, 128, 128, 128, 128}, // 9 - 16 + {144, 176, 200, 216, 128, 128, 128, 128, 128}, // 17 - 32 + {140, 172, 192, 208, 224, 128, 128, 128, 128}, // 33 - 64 + {136, 168, 188, 200, 220, 232, 128, 128, 128}, // 65 - 128 + {132, 164, 184, 196, 216, 228, 240, 128, 128}, // 129 - 256 + {130, 162, 178, 194, 212, 226, 240, 248, 128}, // 257 - 512 + {128, 160, 176, 192, 208, 224, 240, 248, 254}, // 513 - 1024 + }, +}; + +#endif // CONFIG_CODE_NONZEROCOUNT diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 352e17c0ccca32b58ae7854abee0178fca6c310c..d05be990d72c2439a9be06bbe0007980b7990fdd 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -41,14 +41,175 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]) = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7 +// Unified coefficient band structure used by all block sizes +DECLARE_ALIGNED(16, const int, vp9_coef_bands8x8[64]) = { + 0, 1, 2, 3, 4, 4, 5, 5, + 1, 2, 3, 4, 4, 5, 5, 5, + 2, 3, 4, 4, 5, 5, 5, 5, + 3, 4, 4, 5, 5, 5, 5, 5, + 4, 4, 5, 5, 5, 5, 5, 5, + 4, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5 +}; +DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = { + 0, 1, 2, 3, + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 5 }; -DECLARE_ALIGNED(16, const uint8_t, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = { - 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0 +DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { + 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; +#if CONFIG_SCATTERSCAN +DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = { + 0, 4, 1, 5, + 8, 2, 12, 9, + 3, 6, 13, 10, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = { + 0, 4, 8, 1, + 12, 5, 9, 2, + 13, 6, 10, 3, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = { + 0, 1, 4, 2, + 5, 3, 6, 8, + 9, 7, 12, 10, + 13, 11, 14, 15, +}; + +DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = { + 0, 8, 1, 16, 9, 2, 17, 24, + 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, + 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 36, 43, 29, 7, 14, 50, + 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { + 0, 8, 16, 1, 24, 9, 32, 17, + 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 
34, 4, 49, + 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, + 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, + 31, 61, 39, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { + 0, 1, 2, 8, 9, 3, 16, 10, + 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, + 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, + 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, + 60, 39, 61, 47, 54, 55, 62, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, + 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, + 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, + 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, + 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, + 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, + 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, + 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, + 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, + 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, + 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, + 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, + 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, + 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, + 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, + 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, + 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, + 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, + 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, + 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, + 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, + 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, + 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, + 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, + 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, + 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, + 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, + 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, + 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, + 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, + 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, + 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, + 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, + 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, + 89, 133, 104, 29, 59, 147, 
119, 44, 161, 148, 90, 105, 134, 162, 120, 176, + 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, + 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, + 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, + 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, + 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, + 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, + 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, + 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, + 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, + 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, + 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, + 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234, + 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 392, 203, 108, 546, 485, 576, 298, 235, 140, 361, 516, 330, 172, 547, 45, 424, 455, 267, 393, 577, + 486, 77, 204, 517, 362, 548, 608, 14, 456, 299, 578, 109, 236, 425, 394, 487, 609, 331, 141, 579, 518, 46, 268, 15, 173, 549, 610, 640, 363, 78, 519, 488, + 300, 205, 16, 457, 580, 426, 550, 395, 110, 237, 611, 641, 332, 672, 142, 642, 269, 458, 47, 581, 427, 489, 174, 364, 520, 612, 551, 673, 79, 206, 301, 643, + 704, 17, 111, 490, 674, 238, 582, 48, 521, 613, 333, 396, 459, 143, 270, 552, 644, 705, 736, 365, 80, 675, 583, 175, 428, 706, 112, 302, 207, 614, 553, 49, + 645, 522, 737, 397, 768, 144, 334, 18, 676, 491, 239, 615, 707, 584, 81, 460, 176, 271, 738, 429, 113, 800, 366, 208, 523, 708, 646, 554, 677, 769, 19, 145, + 585, 739, 240, 303, 50, 461, 616, 398, 647, 335, 492, 177, 82, 770, 832, 555, 272, 430, 678, 209, 709, 114, 740, 801, 617, 51, 304, 679, 524, 367, 586, 241, + 20, 146, 771, 864, 83, 802, 648, 493, 399, 273, 336, 710, 178, 462, 833, 587, 741, 115, 305, 711, 368, 525, 618, 803, 210, 896, 680, 834, 772, 52, 649, 147, + 431, 494, 556, 242, 400, 865, 337, 21, 928, 179, 742, 84, 463, 274, 369, 804, 650, 557, 743, 960, 835, 619, 773, 306, 211, 526, 432, 992, 588, 712, 116, 243, + 866, 495, 681, 558, 805, 589, 401, 897, 53, 338, 148, 682, 867, 464, 275, 22, 370, 433, 307, 620, 527, 836, 774, 651, 713, 744, 85, 180, 621, 465, 929, 775, + 496, 898, 212, 339, 244, 402, 590, 117, 559, 714, 434, 23, 868, 930, 806, 683, 528, 652, 371, 961, 149, 837, 54, 899, 745, 276, 993, 497, 403, 622, 181, 776, + 746, 529, 560, 435, 86, 684, 466, 308, 591, 653, 715, 807, 340, 869, 213, 962, 245, 838, 561, 931, 808, 592, 118, 498, 372, 623, 685, 994, 467, 654, 747, 900, + 716, 277, 150, 55, 24, 404, 530, 839, 777, 655, 182, 963, 840, 686, 778, 309, 870, 341, 87, 499, 809, 624, 593, 436, 717, 932, 214, 246, 995, 718, 625, 373, + 562, 25, 119, 901, 531, 468, 964, 748, 
810, 278, 779, 500, 563, 656, 405, 687, 871, 872, 594, 151, 933, 749, 841, 310, 657, 626, 595, 437, 688, 183, 996, 965, + 902, 811, 342, 750, 689, 719, 532, 56, 215, 469, 934, 374, 247, 720, 780, 564, 781, 842, 406, 26, 751, 903, 873, 57, 279, 627, 501, 658, 843, 997, 812, 904, + 88, 813, 438, 752, 935, 936, 311, 596, 533, 690, 343, 966, 874, 89, 120, 470, 721, 875, 659, 782, 565, 998, 375, 844, 845, 27, 628, 967, 121, 905, 968, 152, + 937, 814, 753, 502, 691, 783, 184, 153, 722, 407, 58, 815, 999, 660, 597, 723, 534, 906, 216, 439, 907, 248, 185, 876, 846, 692, 784, 629, 90, 969, 280, 754, + 938, 939, 217, 847, 566, 471, 785, 816, 877, 1000, 249, 878, 661, 503, 312, 970, 755, 122, 817, 281, 344, 786, 598, 724, 28, 59, 29, 154, 535, 630, 376, 1001, + 313, 908, 186, 91, 848, 849, 345, 909, 940, 879, 408, 818, 693, 1002, 971, 941, 567, 377, 218, 756, 910, 787, 440, 123, 880, 725, 662, 250, 819, 1003, 282, 972, + 850, 599, 472, 409, 155, 441, 942, 757, 788, 694, 911, 881, 314, 631, 973, 504, 187, 1004, 346, 473, 851, 943, 820, 726, 60, 505, 219, 378, 912, 974, 30, 31, + 536, 882, 1005, 92, 251, 663, 944, 913, 283, 695, 883, 568, 1006, 975, 410, 442, 945, 789, 852, 537, 1007, 124, 315, 61, 758, 821, 600, 914, 976, 569, 474, 347, + 156, 1008, 915, 93, 977, 506, 946, 727, 379, 884, 188, 632, 601, 1009, 790, 853, 978, 947, 220, 411, 125, 633, 664, 759, 252, 443, 916, 538, 157, 822, 62, 570, + 979, 284, 1010, 885, 948, 189, 475, 94, 316, 665, 696, 1011, 854, 791, 980, 221, 348, 63, 917, 602, 380, 507, 253, 126, 697, 823, 634, 285, 728, 949, 886, 95, + 158, 539, 1012, 317, 412, 444, 760, 571, 190, 981, 729, 918, 127, 666, 349, 381, 476, 855, 761, 1013, 603, 222, 159, 698, 950, 508, 254, 792, 286, 635, 887, 793, + 413, 191, 982, 445, 540, 318, 730, 667, 223, 824, 919, 1014, 350, 477, 572, 255, 825, 951, 762, 509, 604, 856, 382, 699, 287, 319, 636, 983, 794, 414, 541, 731, + 857, 888, 351, 446, 573, 1015, 668, 889, 478, 826, 383, 763, 605, 920, 510, 637, 415, 700, 921, 858, 447, 952, 542, 795, 479, 953, 732, 890, 669, 574, 511, 984, + 827, 985, 922, 1016, 764, 606, 543, 701, 859, 638, 1017, 575, 796, 954, 733, 891, 670, 607, 828, 986, 765, 923, 639, 1018, 702, 860, 955, 671, 892, 734, 797, 703, + 987, 829, 1019, 766, 924, 735, 861, 956, 988, 893, 767, 798, 830, 1020, 925, 957, 799, 862, 831, 989, 894, 1021, 863, 926, 895, 958, 990, 1022, 927, 959, 991, 1023, +}; +#else // CONFIG_SCATTERSCAN DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, @@ -70,17 +231,6 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = { 12, 13, 14, 15 }; -DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = { - 0, 1, 2, 3, 5, 4, 4, 5, - 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7 -}; - DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, @@ -88,24 +238,26 @@ DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = { 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, }; -// Table can be optimized. 
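[Editor's note — illustrative sketch, not part of the patch.] The scan tables in this file map coding order to raster position within a transform block: entry i of a scan is the raster index of the i-th coefficient coded. A minimal sketch of how such a table is consumed, with a sanity check that any scan is a permutation of 0..n-1; the helper names here are assumptions, not identifiers from this patch:

#include <assert.h>

/* Scatter coefficients received in coding order back into raster order. */
static void inverse_scan(int *block,        /* n entries, raster order */
                         const int *coeff,  /* n entries, coding order */
                         const int *scan, int n) {
  int i;
  for (i = 0; i < n; ++i)
    block[scan[i]] = coeff[i];
}

/* Every valid scan must visit each of the n positions exactly once. */
static void check_scan(const int *scan, int n) {
  int i, j;
  for (i = 0; i < n; ++i) {
    assert(scan[i] >= 0 && scan[i] < n);
    for (j = 0; j < i; ++j)
      assert(scan[i] != scan[j]);
  }
}

The CONFIG_CODE_NONZEROCOUNT tables added further down follow a base-plus-extra-bits pattern: vp9_basenzcvalue[] gives the smallest nonzero count a token represents, and vp9_extranzcbits[] the number of raw bits that select within the token's range (e.g. NZC_9TO16: base 9 plus 3 extra bits covers counts 9..16). A hedged sketch of the reconstruction, where read_bits() is a hypothetical stand-in for the bitstream reader:

extern const int vp9_extranzcbits[12];  /* defined later in this patch */
extern const int vp9_basenzcvalue[12];  /* defined later in this patch */
extern int read_bits(int n);            /* hypothetical bit reader */

static int decode_nzc(int token) {
  const int bits = vp9_extranzcbits[token];
  return vp9_basenzcvalue[token] + (bits ? read_bits(bits) : 0);
}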
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, + 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, + 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, + 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, + 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { @@ -143,692 +295,42 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { 237, 252, 253, 238, 223, 239, 254, 255, }; -#if CONFIG_DWTDCTHYBRID - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, }; -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 256, 225, 194, 163, - 132, 101, 70, 39, 8, 9, 40, 71, - 102, 133, 164, 195, 226, 257, 288, 320, - 289, 258, 227, 196, 165, 134, 103, 72, - 41, 10, 11, 42, 73, 104, 135, 166, - 197, 228, 259, 290, 321, 352, 384, 353, - 322, 291, 260, 229, 198, 167, 136, 105, - 74, 43, 12, 13, 44, 75, 106, 137, - 168, 199, 230, 261, 292, 323, 354, 385, - 416, 448, 417, 386, 355, 324, 293, 262, - 231, 200, 169, 138, 107, 76, 45, 14, - 15, 46, 77, 108, 139, 170, 201, 232, - 263, 294, 325, 356, 387, 418, 449, 480, - 481, 450, 419, 388, 357, 326, 295, 264, - 233, 202, 171, 140, 
109, 78, 47, 79, - 110, 141, 172, 203, 234, 265, 296, 327, - 358, 389, 420, 451, 482, 483, 452, 421, - 390, 359, 328, 297, 266, 235, 204, 173, - 142, 111, 143, 174, 205, 236, 267, 298, - 329, 360, 391, 422, 453, 484, 485, 454, - 423, 392, 361, 330, 299, 268, 237, 206, - 175, 207, 238, 269, 300, 331, 362, 393, - 424, 455, 486, 487, 456, 425, 394, 363, - 332, 301, 270, 239, 271, 302, 333, 364, - 395, 426, 457, 488, 489, 458, 427, 396, - 365, 334, 303, 335, 366, 397, 428, 459, - 490, 491, 460, 429, 398, 367, 399, 430, - 461, 492, 493, 462, 431, 463, 494, 495, - - 16, 512, 528, 17, 513, 529, 48, 544, - 560, 80, 576, 592, 49, 545, 561, 18, - 514, 530, 19, 515, 531, 50, 546, 562, - 81, 577, 593, 112, 608, 624, 144, 640, - 656, 113, 609, 625, 82, 578, 594, 51, - 547, 563, 20, 516, 532, 21, 517, 533, - 52, 548, 564, 83, 579, 595, 114, 610, - 626, 145, 641, 657, 176, 672, 688, 208, - 704, 720, 177, 673, 689, 146, 642, 658, - 115, 611, 627, 84, 580, 596, 53, 549, - 565, 22, 518, 534, 23, 519, 535, 54, - 550, 566, 85, 581, 597, 116, 612, 628, - 147, 643, 659, 178, 674, 690, 209, 705, - 721, 240, 736, 752, 272, 768, 784, 241, - 737, 753, 210, 706, 722, 179, 675, 691, - 148, 644, 660, 117, 613, 629, 86, 582, - 598, 55, 551, 567, 24, 520, 536, 25, - 521, 537, 56, 552, 568, 87, 583, 599, - 118, 614, 630, 149, 645, 661, 180, 676, - 692, 211, 707, 723, 242, 738, 754, 273, - 769, 785, 304, 800, 816, 336, 832, 848, - 305, 801, 817, 274, 770, 786, 243, 739, - 755, 212, 708, 724, 181, 677, 693, 150, - 646, 662, 119, 615, 631, 88, 584, 600, - 57, 553, 569, 26, 522, 538, 27, 523, - 539, 58, 554, 570, 89, 585, 601, 120, - 616, 632, 151, 647, 663, 182, 678, 694, - 213, 709, 725, 244, 740, 756, 275, 771, - 787, 306, 802, 818, 337, 833, 849, 368, - 864, 880, 400, 896, 912, 369, 865, 881, - 338, 834, 850, 307, 803, 819, 276, 772, - 788, 245, 741, 757, 214, 710, 726, 183, - - 679, 695, 152, 648, 664, 121, 617, 633, - 90, 586, 602, 59, 555, 571, 28, 524, - 540, 29, 525, 541, 60, 556, 572, 91, - 587, 603, 122, 618, 634, 153, 649, 665, - 184, 680, 696, 215, 711, 727, 246, 742, - 758, 277, 773, 789, 308, 804, 820, 339, - 835, 851, 370, 866, 882, 401, 897, 913, - 432, 928, 944, 464, 960, 976, 433, 929, - 945, 402, 898, 914, 371, 867, 883, 340, - 836, 852, 309, 805, 821, 278, 774, 790, - 247, 743, 759, 216, 712, 728, 185, 681, - 697, 154, 650, 666, 123, 619, 635, 92, - 588, 604, 61, 557, 573, 30, 526, 542, - 31, 527, 543, 62, 558, 574, 93, 589, - 605, 124, 620, 636, 155, 651, 667, 186, - 682, 698, 217, 713, 729, 248, 744, 760, - 279, 775, 791, 310, 806, 822, 341, 837, - 853, 372, 868, 884, 403, 899, 915, 434, - 930, 946, 465, 961, 977, 496, 992, 1008, - 497, 993, 1009, 466, 962, 978, 435, 931, - 947, 404, 900, 916, 373, 869, 885, 342, - 838, 854, 311, 807, 823, 280, 776, 792, - 249, 745, 761, 218, 714, 730, 187, 683, - 699, 156, 652, 668, 125, 621, 637, 94, - 590, 606, 63, 559, 575, 95, 591, 607, - 126, 622, 638, 157, 653, 669, 188, 684, - 700, 219, 715, 731, 250, 746, 762, 281, - 777, 793, 312, 808, 824, 343, 839, 855, - 374, 870, 886, 405, 901, 917, 436, 932, - 948, 467, 963, 979, 498, 994, 1010, 499, - 995, 1011, 468, 964, 980, 437, 933, 949, - 406, 902, 918, 375, 871, 887, 344, 840, - - 856, 313, 809, 825, 282, 778, 794, 251, - 747, 763, 220, 716, 732, 189, 685, 701, - 158, 654, 670, 127, 623, 639, 159, 655, - 671, 190, 686, 702, 221, 717, 733, 252, - 748, 764, 283, 779, 795, 314, 810, 826, - 345, 841, 857, 376, 872, 888, 407, 903, - 919, 438, 934, 950, 469, 965, 981, 500, - 996, 1012, 501, 997, 1013, 470, 966, 982, - 
439, 935, 951, 408, 904, 920, 377, 873, - 889, 346, 842, 858, 315, 811, 827, 284, - 780, 796, 253, 749, 765, 222, 718, 734, - 191, 687, 703, 223, 719, 735, 254, 750, - 766, 285, 781, 797, 316, 812, 828, 347, - 843, 859, 378, 874, 890, 409, 905, 921, - 440, 936, 952, 471, 967, 983, 502, 998, - 1014, 503, 999, 1015, 472, 968, 984, 441, - 937, 953, 410, 906, 922, 379, 875, 891, - 348, 844, 860, 317, 813, 829, 286, 782, - 798, 255, 751, 767, 287, 783, 799, 318, - 814, 830, 349, 845, 861, 380, 876, 892, - 411, 907, 923, 442, 938, 954, 473, 969, - 985, 504, 1000, 1016, 505, 1001, 1017, 474, - 970, 986, 443, 939, 955, 412, 908, 924, - 381, 877, 893, 350, 846, 862, 319, 815, - 831, 351, 847, 863, 382, 878, 894, 413, - 909, 925, 444, 940, 956, 475, 971, 987, - 506, 1002, 1018, 507, 1003, 1019, 476, 972, - 988, 445, 941, 957, 414, 910, 926, 383, - 879, 895, 415, 911, 927, 446, 942, 958, - 477, 973, 989, 508, 1004, 1020, 509, 1005, - 1021, 478, 974, 990, 447, 943, 959, 479, - 975, 991, 510, 1006, 1022, 511, 1007, 1023, -}; - -#elif DWTDCT_TYPE == DWTDCT16X16 - -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, - 6, 6, 6, - 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, - 16, 512, 528, - 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 256, 225, 194, 163, - 132, 101, 70, 39, 8, 9, 40, 71, - 102, 133, 164, 195, 226, 257, 288, 320, - 289, 258, 227, 196, 165, 134, 103, 72, - 41, 10, 11, 42, 73, 104, 135, 166, - 197, 228, 259, 290, 321, 352, 384, 353, - 322, 291, 260, 229, 198, 167, 136, 105, - 74, 43, 12, 13, 44, 75, 106, 137, - 168, 199, 230, 261, 292, 323, 354, 385, - 416, 448, 417, 386, 355, 324, 293, 262, - 231, 200, 169, 138, 107, 76, 45, 14, - 15, 46, 77, 108, 139, 170, 201, 232, - 263, 294, 325, 356, 387, 418, 449, 480, - 481, 450, 419, 388, 357, 326, 295, 264, - 233, 202, 171, 140, 109, 78, 47, 79, - 110, 141, 172, 203, 234, 265, 296, 327, - 358, 389, 420, 451, 482, 483, 452, 421, - 390, 359, 328, 297, 266, 235, 204, 173, - 142, 111, 143, 174, 205, 236, 267, 298, - 329, 360, 391, 422, 453, 484, 485, 454, - 423, 392, 361, 330, 299, 268, 237, 206, - 175, 207, 238, 269, 300, 331, 362, 393, - 424, 455, 486, 487, 456, 425, 394, 363, - 332, 301, 270, 239, 271, 302, 333, 364, - 395, 426, 457, 488, 489, 458, 427, 396, - 365, 334, 303, 335, 366, 397, 428, 459, - 490, 491, 460, 429, 398, 367, 399, 430, - 461, 492, 493, 462, 431, 463, 494, 495, - - 17, 513, 529, 48, 544, - 560, 80, 576, 592, 49, 545, 561, 18, - 514, 530, 19, 515, 531, 50, 546, 562, - 81, 577, 593, 112, 608, 624, 144, 640, - 656, 113, 609, 625, 82, 578, 594, 51, - 547, 563, 20, 516, 532, 21, 517, 533, - 52, 548, 564, 83, 579, 595, 114, 610, - 626, 145, 641, 657, 176, 672, 688, 208, - 704, 720, 177, 673, 689, 146, 642, 658, - 115, 611, 627, 84, 580, 596, 53, 549, - 565, 22, 518, 534, 23, 519, 535, 54, - 550, 566, 85, 581, 597, 116, 612, 628, - 147, 643, 659, 178, 674, 690, 209, 705, - 721, 240, 736, 752, 272, 768, 784, 241, - 737, 753, 210, 706, 722, 179, 675, 691, - 148, 644, 660, 117, 613, 629, 86, 582, - 598, 55, 551, 567, 24, 520, 536, 25, - 521, 537, 56, 552, 568, 87, 583, 599, - 118, 614, 630, 149, 645, 661, 180, 676, - 692, 211, 707, 723, 242, 738, 754, 273, - 769, 785, 304, 800, 816, 336, 832, 848, - 305, 801, 817, 274, 770, 786, 243, 739, - 755, 212, 708, 724, 181, 677, 693, 150, - 646, 662, 119, 615, 631, 88, 584, 600, - 57, 553, 569, 26, 522, 538, 27, 523, - 539, 58, 554, 570, 89, 585, 601, 120, - 616, 632, 151, 647, 663, 182, 678, 694, - 213, 709, 725, 244, 740, 756, 275, 771, - 787, 306, 802, 818, 337, 833, 849, 368, - 864, 880, 400, 896, 912, 369, 865, 881, - 338, 834, 850, 307, 803, 819, 276, 772, - 788, 245, 741, 757, 214, 710, 726, 183, - - 679, 695, 152, 648, 664, 121, 617, 633, - 90, 586, 602, 59, 555, 571, 28, 524, - 540, 
29, 525, 541, 60, 556, 572, 91, - 587, 603, 122, 618, 634, 153, 649, 665, - 184, 680, 696, 215, 711, 727, 246, 742, - 758, 277, 773, 789, 308, 804, 820, 339, - 835, 851, 370, 866, 882, 401, 897, 913, - 432, 928, 944, 464, 960, 976, 433, 929, - 945, 402, 898, 914, 371, 867, 883, 340, - 836, 852, 309, 805, 821, 278, 774, 790, - 247, 743, 759, 216, 712, 728, 185, 681, - 697, 154, 650, 666, 123, 619, 635, 92, - 588, 604, 61, 557, 573, 30, 526, 542, - 31, 527, 543, 62, 558, 574, 93, 589, - 605, 124, 620, 636, 155, 651, 667, 186, - 682, 698, 217, 713, 729, 248, 744, 760, - 279, 775, 791, 310, 806, 822, 341, 837, - 853, 372, 868, 884, 403, 899, 915, 434, - 930, 946, 465, 961, 977, 496, 992, 1008, - 497, 993, 1009, 466, 962, 978, 435, 931, - 947, 404, 900, 916, 373, 869, 885, 342, - 838, 854, 311, 807, 823, 280, 776, 792, - 249, 745, 761, 218, 714, 730, 187, 683, - 699, 156, 652, 668, 125, 621, 637, 94, - 590, 606, 63, 559, 575, 95, 591, 607, - 126, 622, 638, 157, 653, 669, 188, 684, - 700, 219, 715, 731, 250, 746, 762, 281, - 777, 793, 312, 808, 824, 343, 839, 855, - 374, 870, 886, 405, 901, 917, 436, 932, - 948, 467, 963, 979, 498, 994, 1010, 499, - 995, 1011, 468, 964, 980, 437, 933, 949, - 406, 902, 918, 375, 871, 887, 344, 840, - - 856, 313, 809, 825, 282, 778, 794, 251, - 747, 763, 220, 716, 732, 189, 685, 701, - 158, 654, 670, 127, 623, 639, 159, 655, - 671, 190, 686, 702, 221, 717, 733, 252, - 748, 764, 283, 779, 795, 314, 810, 826, - 345, 841, 857, 376, 872, 888, 407, 903, - 919, 438, 934, 950, 469, 965, 981, 500, - 996, 1012, 501, 997, 1013, 470, 966, 982, - 439, 935, 951, 408, 904, 920, 377, 873, - 889, 346, 842, 858, 315, 811, 827, 284, - 780, 796, 253, 749, 765, 222, 718, 734, - 191, 687, 703, 223, 719, 735, 254, 750, - 766, 285, 781, 797, 316, 812, 828, 347, - 843, 859, 378, 874, 890, 409, 905, 921, - 440, 936, 952, 471, 967, 983, 502, 998, - 1014, 503, 999, 1015, 472, 968, 984, 441, - 937, 953, 410, 906, 922, 379, 875, 891, - 348, 844, 860, 317, 813, 829, 286, 782, - 798, 255, 751, 767, 287, 783, 799, 318, - 814, 830, 349, 845, 861, 380, 876, 892, - 411, 907, 923, 442, 938, 954, 473, 969, - 985, 504, 1000, 1016, 505, 1001, 1017, 474, - 970, 986, 443, 939, 955, 412, 908, 924, - 381, 877, 893, 350, 846, 862, 319, 815, - 831, 351, 847, 863, 382, 878, 894, 413, - 909, 925, 444, 940, 956, 475, 971, 987, - 506, 1002, 1018, 507, 1003, 1019, 476, 972, - 988, 445, 941, 957, 414, 910, 926, 383, - 879, 895, 415, 911, 927, 446, 942, 958, - 477, 973, 989, 508, 1004, 1020, 509, 1005, - 1021, 478, 974, 990, 447, 943, 959, 479, - 975, 991, 510, 1006, 1022, 511, 1007, 1023, -}; - -#elif DWTDCT_TYPE == DWTDCT8X8 - -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, - 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 
7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 225, 194, 163, 132, - 101, 70, 39, 71, 102, 133, 164, 195, - 226, 227, 196, 165, 134, 103, 135, 166, - 197, 228, 229, 198, 167, 199, 230, 231, - - 8, 256, 264, 9, 257, 265, 40, 288, 296, 72, 320, 328, - 41, 289, 297, 10, 258, 266, 11, 259, 267, 42, 290, 298, - 73, 321, 329, 104, 352, 360, 136, 384, 392, 105, 353, 361, - 74, 322, 330, 43, 291, 299, 12, 260, 268, 13, 261, 269, - 44, 292, 300, 75, 323, 331, 106, 354, 362, 137, 385, 393, - 168, 416, 424, 200, 448, 456, 169, 417, 425, 138, 386, 394, - 107, 355, 363, 76, 324, 332, 45, 293, 301, 14, 262, 270, - 15, 263, 271, 46, 294, 302, 77, 325, 333, 108, 356, 364, - 139, 387, 395, 170, 418, 426, 201, 449, 457, 232, 480, 488, - 233, 481, 489, 202, 450, 458, 171, 419, 427, 140, 388, 396, - 109, 357, 365, 78, 326, 334, 47, 295, 303, 79, 327, 335, - 110, 358, 366, 141, 389, 397, 172, 420, 428, 203, 451, 459, - 234, 482, 490, 235, 483, 491, 204, 452, 460, 173, 421, 429, - 142, 390, 398, 111, 359, 
367, 143, 391, 399, 174, 422, 430, - 205, 453, 461, 236, 484, 492, 237, 485, 493, 206, 454, 462, - 175, 423, 431, 207, 455, 463, 238, 486, 494, 239, 487, 495, - - 16, 512, 528, 17, 513, 529, 18, 514, - 530, 19, 515, 531, 20, 516, 532, 21, - 517, 533, 22, 518, 534, 23, 519, 535, - 24, 520, 536, 25, 521, 537, 26, 522, - 538, 27, 523, 539, 28, 524, 540, 29, - 525, 541, 30, 526, 542, 31, 527, 543, - 48, 544, 560, 49, 545, 561, 50, 546, - 562, 51, 547, 563, 52, 548, 564, 53, - 549, 565, 54, 550, 566, 55, 551, 567, - 56, 552, 568, 57, 553, 569, 58, 554, - 570, 59, 555, 571, 60, 556, 572, 61, - 557, 573, 62, 558, 574, 63, 559, 575, - 80, 576, 592, 81, 577, 593, 82, 578, - 594, 83, 579, 595, 84, 580, 596, 85, - 581, 597, 86, 582, 598, 87, 583, 599, - 88, 584, 600, 89, 585, 601, 90, 586, - 602, 91, 587, 603, 92, 588, 604, 93, - 589, 605, 94, 590, 606, 95, 591, 607, - 112, 608, 624, 113, 609, 625, 114, 610, - 626, 115, 611, 627, 116, 612, 628, 117, - 613, 629, 118, 614, 630, 119, 615, 631, - 120, 616, 632, 121, 617, 633, 122, 618, - 634, 123, 619, 635, 124, 620, 636, 125, - 621, 637, 126, 622, 638, 127, 623, 639, - 144, 640, 656, 145, 641, 657, 146, 642, - 658, 147, 643, 659, 148, 644, 660, 149, - 645, 661, 150, 646, 662, 151, 647, 663, - 152, 648, 664, 153, 649, 665, 154, 650, - 666, 155, 651, 667, 156, 652, 668, 157, - 653, 669, 158, 654, 670, 159, 655, 671, - 176, 672, 688, 177, 673, 689, 178, 674, - 690, 179, 675, 691, 180, 676, 692, 181, - 677, 693, 182, 678, 694, 183, 679, 695, - 184, 680, 696, 185, 681, 697, 186, 682, - 698, 187, 683, 699, 188, 684, 700, 189, - 685, 701, 190, 686, 702, 191, 687, 703, - 208, 704, 720, 209, 705, 721, 210, 706, - 722, 211, 707, 723, 212, 708, 724, 213, - 709, 725, 214, 710, 726, 215, 711, 727, - 216, 712, 728, 217, 713, 729, 218, 714, - 730, 219, 715, 731, 220, 716, 732, 221, - 717, 733, 222, 718, 734, 223, 719, 735, - 240, 736, 752, 241, 737, 753, 242, 738, - 754, 243, 739, 755, 244, 740, 756, 245, - 741, 757, 246, 742, 758, 247, 743, 759, - 248, 744, 760, 249, 745, 761, 250, 746, - 762, 251, 747, 763, 252, 748, 764, 253, - 749, 765, 254, 750, 766, 255, 751, 767, - 272, 768, 784, 273, 769, 785, 274, 770, - 786, 275, 771, 787, 276, 772, 788, 277, - 773, 789, 278, 774, 790, 279, 775, 791, - 280, 776, 792, 281, 777, 793, 282, 778, - 794, 283, 779, 795, 284, 780, 796, 285, - 781, 797, 286, 782, 798, 287, 783, 799, - 304, 800, 816, 305, 801, 817, 306, 802, - 818, 307, 803, 819, 308, 804, 820, 309, - 805, 821, 310, 806, 822, 311, 807, 823, - 312, 808, 824, 313, 809, 825, 314, 810, - 826, 315, 811, 827, 316, 812, 828, 317, - 813, 829, 318, 814, 830, 319, 815, 831, - 336, 832, 848, 337, 833, 849, 338, 834, - 850, 339, 835, 851, 340, 836, 852, 341, - 837, 853, 342, 838, 854, 343, 839, 855, - 344, 840, 856, 345, 841, 857, 346, 842, - 858, 347, 843, 859, 348, 844, 860, 349, - 845, 861, 350, 846, 862, 351, 847, 863, - 368, 864, 880, 369, 865, 881, 370, 866, - 882, 371, 867, 883, 372, 868, 884, 373, - 869, 885, 374, 870, 886, 375, 871, 887, - 376, 872, 888, 377, 873, 889, 378, 874, - 890, 379, 875, 891, 380, 876, 892, 381, - 877, 893, 382, 878, 894, 383, 879, 895, - 400, 896, 912, 401, 897, 913, 402, 898, - 914, 403, 899, 915, 404, 900, 916, 405, - 901, 917, 406, 902, 918, 407, 903, 919, - 408, 904, 920, 409, 905, 921, 410, 906, - 922, 411, 907, 923, 412, 908, 924, 413, - 909, 925, 414, 910, 926, 415, 911, 927, - 432, 928, 944, 433, 929, 945, 434, 930, - 946, 435, 931, 947, 436, 932, 948, 437, - 933, 949, 438, 934, 950, 439, 935, 951, - 440, 936, 952, 441, 937, 953, 442, 
938, - 954, 443, 939, 955, 444, 940, 956, 445, - 941, 957, 446, 942, 958, 447, 943, 959, - 464, 960, 976, 465, 961, 977, 466, 962, - 978, 467, 963, 979, 468, 964, 980, 469, - 965, 981, 470, 966, 982, 471, 967, 983, - 472, 968, 984, 473, 969, 985, 474, 970, - 986, 475, 971, 987, 476, 972, 988, 477, - 973, 989, 478, 974, 990, 479, 975, 991, - 496, 992, 1008, 497, 993, 1009, 498, 994, - 1010, 499, 995, 1011, 500, 996, 1012, 501, - 997, 1013, 502, 998, 1014, 503, 999, 1015, - 504, 1000, 1016, 505, 1001, 1017, 506, 1002, - 1018, 507, 1003, 1019, 508, 1004, 1020, 509, - 1005, 1021, 510, 1006, 1022, 511, 1007, 1023, -}; -#endif - -#else - -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { @@ -865,7 +367,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { 951, 920, 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023, }; -#endif // CONFIG_DWTDCTHYBRID +#endif // CONFIG_SCATTERSCAN /* Array indices are identical to previously-existing CONTEXT_NODE indices */ @@ -898,6 +400,1661 @@ static const vp9_prob Pcat6[] = { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; +#if CONFIG_CODE_NONZEROCOUNT +const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = { + -NZC_0, 2, + 4, 6, + -NZC_1, -NZC_2, + -NZC_3TO4, 8, + -NZC_5TO8, -NZC_9TO16, +}; +struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS]; + +const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = { + -NZC_0, 2, + 4, 6, + -NZC_1, -NZC_2, + 8, 10, + -NZC_3TO4, -NZC_5TO8, + -NZC_9TO16, 12, + -NZC_17TO32, -NZC_33TO64, +}; +struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS]; + +const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = { + -NZC_0, 2, + 4, 6, + -NZC_1, -NZC_2, + 8, 10, + -NZC_3TO4, -NZC_5TO8, + 12, 14, + -NZC_9TO16, -NZC_17TO32, + -NZC_33TO64, 16, + -NZC_65TO128, -NZC_129TO256, +}; +struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS]; + +const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = { + -NZC_0, 2, + 4, 6, + -NZC_1, -NZC_2, + 8, 10, + -NZC_3TO4, -NZC_5TO8, + 12, 14, + -NZC_9TO16, -NZC_17TO32, + 16, 18, + -NZC_33TO64, -NZC_65TO128, + -NZC_129TO256, 20, + -NZC_257TO512, -NZC_513TO1024, +}; +struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS]; + +const int vp9_extranzcbits[NZC32X32_TOKENS] = { + 0, 
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +}; + +const int vp9_basenzcvalue[NZC32X32_TOKENS] = { + 0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513 +}; + +#endif // CONFIG_CODE_NONZEROCOUNT + +#if CONFIG_MODELCOEFPROB + +const vp9_prob vp9_modelcoefprobs_gg875[COEFPROB_MODELS][ENTROPY_NODES - 1] = { + // Probs generated with a Generalized Gaussian (with shape parameter 0.875) + // source model with varying quantizer step size for a uniform quantizer + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, // do not use + {1, 2, 6, 86, 129, 11, 87, 42, 92, 52,}, + {2, 4, 12, 87, 129, 22, 89, 75, 97, 91,}, + {3, 6, 17, 88, 130, 32, 90, 102, 102, 121,}, + {4, 8, 22, 89, 131, 41, 91, 125, 107, 145,}, + {5, 10, 28, 90, 131, 50, 93, 144, 112, 164,}, + {6, 12, 33, 90, 132, 59, 94, 160, 117, 180,}, + {7, 14, 38, 91, 132, 67, 95, 173, 122, 193,}, + {8, 15, 42, 92, 133, 75, 97, 185, 126, 204,}, + {9, 17, 47, 92, 133, 82, 98, 195, 131, 212,}, + {10, 19, 52, 93, 134, 89, 99, 203, 135, 220,}, + {11, 21, 56, 94, 134, 96, 101, 211, 140, 226,}, + {12, 23, 60, 95, 135, 102, 102, 217, 144, 231,}, + {13, 25, 65, 95, 135, 109, 103, 222, 148, 235,}, + {14, 26, 69, 96, 136, 115, 105, 227, 153, 238,}, + {15, 28, 73, 97, 136, 120, 106, 231, 157, 241,}, + {16, 30, 77, 97, 137, 126, 107, 234, 161, 244,}, + {17, 32, 81, 98, 138, 131, 108, 237, 164, 246,}, + {18, 34, 85, 99, 138, 136, 110, 240, 168, 247,}, + {19, 35, 89, 100, 139, 141, 111, 242, 172, 249,}, + {20, 37, 92, 100, 139, 145, 112, 244, 175, 250,}, + {21, 39, 96, 101, 140, 150, 113, 246, 179, 251,}, + {22, 41, 99, 102, 140, 154, 115, 247, 182, 252,}, + {23, 42, 103, 102, 141, 158, 116, 248, 185, 252,}, + {24, 44, 106, 103, 141, 162, 117, 249, 188, 253,}, + {25, 46, 110, 104, 142, 166, 118, 250, 191, 253,}, + {26, 48, 113, 104, 142, 170, 120, 251, 194, 254,}, + {27, 49, 116, 105, 143, 173, 121, 252, 197, 254,}, + {28, 51, 119, 106, 143, 176, 122, 252, 200, 254,}, + {29, 53, 122, 107, 144, 180, 123, 253, 202, 255,}, + {30, 54, 125, 107, 144, 183, 125, 253, 205, 255,}, + {31, 56, 128, 108, 145, 186, 126, 254, 207, 255,}, + {32, 58, 131, 109, 145, 189, 127, 254, 209, 255,}, + {33, 59, 134, 109, 146, 191, 128, 254, 212, 255,}, + {34, 61, 137, 110, 146, 194, 130, 254, 214, 255,}, + {35, 62, 139, 111, 147, 196, 131, 255, 216, 255,}, + {36, 64, 142, 112, 147, 199, 132, 255, 218, 255,}, + {37, 66, 145, 112, 148, 201, 134, 255, 220, 255,}, + {38, 67, 147, 113, 148, 203, 135, 255, 221, 255,}, + {39, 69, 150, 114, 149, 206, 136, 255, 223, 255,}, + {40, 70, 152, 114, 149, 208, 137, 255, 225, 255,}, + {41, 72, 155, 115, 150, 210, 138, 255, 226, 255,}, + {42, 74, 157, 116, 150, 212, 140, 255, 228, 255,}, + {43, 75, 159, 117, 151, 213, 141, 255, 229, 255,}, + {44, 77, 161, 117, 151, 215, 142, 255, 230, 255,}, + {45, 78, 164, 118, 152, 217, 143, 255, 232, 255,}, + {46, 80, 166, 119, 152, 219, 145, 255, 233, 255,}, + {47, 81, 168, 120, 153, 220, 146, 255, 234, 255,}, + {48, 83, 170, 120, 153, 222, 147, 255, 235, 255,}, + {49, 84, 172, 121, 154, 223, 148, 255, 236, 255,}, + {50, 86, 174, 122, 154, 225, 150, 255, 237, 255,}, + {51, 87, 176, 123, 155, 226, 151, 255, 238, 255,}, + {52, 89, 178, 123, 155, 227, 152, 255, 239, 255,}, + {53, 90, 180, 124, 156, 228, 153, 255, 240, 255,}, + {54, 92, 182, 125, 156, 230, 154, 255, 241, 255,}, + {55, 93, 183, 126, 157, 231, 156, 255, 242, 255,}, + {56, 95, 185, 126, 157, 232, 157, 255, 242, 255,}, + {57, 96, 187, 127, 158, 233, 158, 255, 243, 255,}, + {58, 98, 189, 128, 158, 234, 159, 255, 244, 255,}, + {59, 99, 190, 129, 159, 235, 160, 255, 244, 255,}, + {60, 101, 192, 
129, 159, 236, 162, 255, 245, 255,}, + {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,}, + {62, 104, 195, 131, 160, 238, 164, 255, 246, 255,}, + {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,}, + {64, 106, 198, 132, 162, 239, 166, 255, 247, 255,}, + {65, 108, 199, 133, 162, 240, 167, 255, 248, 255,}, + {66, 109, 201, 134, 163, 241, 169, 255, 248, 255,}, + {67, 111, 202, 135, 163, 241, 170, 255, 249, 255,}, + {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,}, + {69, 113, 205, 136, 164, 243, 172, 255, 249, 255,}, + {70, 115, 206, 137, 165, 243, 173, 255, 250, 255,}, + {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,}, + {72, 117, 209, 138, 166, 244, 175, 255, 250, 255,}, + {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,}, + {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,}, + {75, 121, 212, 141, 167, 246, 179, 255, 251, 255,}, + {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,}, + {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,}, + {78, 125, 216, 143, 169, 247, 182, 255, 252, 255,}, + {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,}, + {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,}, + {81, 129, 219, 146, 171, 248, 185, 255, 253, 255,}, + {82, 131, 220, 146, 171, 249, 186, 255, 253, 255,}, + {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,}, + {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,}, + {85, 134, 223, 149, 173, 250, 189, 255, 253, 255,}, + {86, 136, 224, 149, 173, 250, 190, 255, 254, 255,}, + {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,}, + {88, 138, 226, 151, 174, 251, 192, 255, 254, 255,}, + {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,}, + {90, 141, 227, 153, 175, 251, 194, 255, 254, 255,}, + {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,}, + {92, 143, 229, 154, 177, 252, 196, 255, 254, 255,}, + {93, 144, 230, 155, 177, 252, 197, 255, 254, 255,}, + {94, 146, 230, 156, 178, 252, 198, 255, 255, 255,}, + {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,}, + {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,}, + {97, 149, 233, 158, 179, 253, 201, 255, 255, 255,}, + {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,}, + {99, 152, 234, 160, 180, 253, 203, 255, 255, 255,}, + {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,}, + {101, 154, 235, 161, 182, 253, 205, 255, 255, 255,}, + {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,}, + {103, 156, 236, 163, 183, 254, 207, 255, 255, 255,}, + {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,}, + {105, 159, 238, 165, 184, 254, 208, 255, 255, 255,}, + {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,}, + {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,}, + {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,}, + {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,}, + {110, 164, 240, 169, 187, 254, 212, 255, 255, 255,}, + {111, 165, 241, 170, 187, 254, 213, 255, 255, 255,}, + {112, 166, 241, 170, 188, 255, 214, 255, 255, 255,}, + {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,}, + {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,}, + {115, 170, 243, 173, 189, 255, 216, 255, 255, 255,}, + {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,}, + {117, 172, 244, 174, 190, 255, 218, 255, 255, 255,}, + {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,}, + {119, 174, 244, 176, 192, 255, 219, 255, 255, 255,}, + {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,}, + {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,}, + {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,}, + {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,}, + {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,}, + {125, 180, 247, 181, 195, 255, 
224, 255, 255, 255,}, + {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,}, + {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,}, + {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,}, + {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,}, + {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,}, + {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,}, + {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,}, + {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,}, + {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,}, + {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,}, + {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,}, + {137, 192, 250, 190, 202, 255, 231, 255, 255, 255,}, + {138, 193, 250, 191, 202, 255, 232, 255, 255, 255,}, + {139, 194, 250, 192, 203, 255, 232, 255, 255, 255,}, + {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,}, + {141, 195, 251, 194, 204, 255, 234, 255, 255, 255,}, + {142, 196, 251, 194, 205, 255, 234, 255, 255, 255,}, + {143, 197, 251, 195, 205, 255, 235, 255, 255, 255,}, + {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,}, + {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,}, + {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,}, + {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,}, + {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,}, + {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,}, + {150, 203, 252, 201, 209, 255, 238, 255, 255, 255,}, + {151, 204, 253, 201, 210, 255, 239, 255, 255, 255,}, + {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,}, + {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,}, + {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,}, + {155, 208, 253, 204, 212, 255, 240, 255, 255, 255,}, + {156, 209, 253, 205, 213, 255, 241, 255, 255, 255,}, + {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,}, + {158, 210, 254, 207, 214, 255, 242, 255, 255, 255,}, + {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,}, + {160, 212, 254, 208, 215, 255, 242, 255, 255, 255,}, + {161, 213, 254, 209, 215, 255, 243, 255, 255, 255,}, + {162, 213, 254, 210, 216, 255, 243, 255, 255, 255,}, + {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,}, + {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,}, + {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,}, + {166, 216, 254, 212, 218, 255, 245, 255, 255, 255,}, + {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,}, + {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,}, + {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,}, + {170, 219, 255, 215, 221, 255, 246, 255, 255, 255,}, + {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,}, + {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,}, + {173, 222, 255, 217, 222, 255, 247, 255, 255, 255,}, + {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,}, + {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,}, + {176, 224, 255, 220, 224, 255, 248, 255, 255, 255,}, + {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,}, + {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,}, + {179, 226, 255, 222, 225, 255, 249, 255, 255, 255,}, + {180, 226, 255, 222, 226, 255, 249, 255, 255, 255,}, + {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,}, + {182, 228, 255, 224, 227, 255, 249, 255, 255, 255,}, + {183, 228, 255, 224, 228, 255, 250, 255, 255, 255,}, + {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,}, + {185, 230, 255, 226, 229, 255, 250, 255, 255, 255,}, + {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,}, + {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,}, + {188, 232, 255, 228, 230, 255, 251, 255, 255, 255,}, + {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,}, + {190, 233, 
255, 229, 231, 255, 251, 255, 255, 255,}, + {191, 233, 255, 229, 232, 255, 251, 255, 255, 255,}, + {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,}, + {193, 234, 255, 231, 233, 255, 252, 255, 255, 255,}, + {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,}, + {195, 236, 255, 232, 234, 255, 252, 255, 255, 255,}, + {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,}, + {197, 237, 255, 233, 235, 255, 252, 255, 255, 255,}, + {198, 237, 255, 234, 235, 255, 253, 255, 255, 255,}, + {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,}, + {200, 238, 255, 235, 236, 255, 253, 255, 255, 255,}, + {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,}, + {202, 239, 255, 236, 237, 255, 253, 255, 255, 255,}, + {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,}, + {204, 240, 255, 237, 238, 255, 254, 255, 255, 255,}, + {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,}, + {206, 241, 255, 238, 239, 255, 254, 255, 255, 255,}, + {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,}, + {208, 242, 255, 239, 240, 255, 254, 255, 255, 255,}, + {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,}, + {210, 243, 255, 240, 241, 255, 254, 255, 255, 255,}, + {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,}, + {212, 244, 255, 241, 242, 255, 254, 255, 255, 255,}, + {213, 245, 255, 242, 243, 255, 255, 255, 255, 255,}, + {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,}, + {215, 246, 255, 243, 244, 255, 255, 255, 255, 255,}, + {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,}, + {217, 246, 255, 244, 244, 255, 255, 255, 255, 255,}, + {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,}, + {219, 247, 255, 245, 245, 255, 255, 255, 255, 255,}, + {220, 248, 255, 245, 246, 255, 255, 255, 255, 255,}, + {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,}, + {222, 248, 255, 246, 247, 255, 255, 255, 255, 255,}, + {223, 249, 255, 247, 247, 255, 255, 255, 255, 255,}, + {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,}, + {225, 250, 255, 247, 248, 255, 255, 255, 255, 255,}, + {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,}, + {227, 250, 255, 248, 249, 255, 255, 255, 255, 255,}, + {228, 251, 255, 249, 249, 255, 255, 255, 255, 255,}, + {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,}, + {230, 251, 255, 249, 250, 255, 255, 255, 255, 255,}, + {231, 251, 255, 250, 250, 255, 255, 255, 255, 255,}, + {232, 252, 255, 250, 250, 255, 255, 255, 255, 255,}, + {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,}, + {234, 252, 255, 251, 251, 255, 255, 255, 255, 255,}, + {235, 253, 255, 251, 251, 255, 255, 255, 255, 255,}, + {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,}, + {237, 253, 255, 252, 252, 255, 255, 255, 255, 255,}, + {238, 253, 255, 252, 252, 255, 255, 255, 255, 255,}, + {239, 254, 255, 253, 253, 255, 255, 255, 255, 255,}, + {240, 254, 255, 253, 253, 255, 255, 255, 255, 255,}, + {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,}, + {242, 254, 255, 253, 254, 255, 255, 255, 255, 255,}, + {243, 254, 255, 254, 254, 255, 255, 255, 255, 255,}, + {244, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, + {245, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, + {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, + {247, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {249, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {251, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 
255,}, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, +}; + +const vp9_prob vp9_modelcoefprobs_gg75[COEFPROB_MODELS][ENTROPY_NODES - 1] = { + // Probs generated with a Generalized Gaussian (with shape parameter 0.75) + // source model with varying quantizer step size for a uniform quantizer + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, // do not use + {1, 2, 6, 87, 129, 11, 88, 39, 93, 47,}, + {2, 4, 11, 88, 130, 21, 89, 68, 98, 79,}, + {3, 6, 16, 89, 131, 30, 91, 92, 103, 105,}, + {4, 8, 21, 90, 131, 38, 92, 112, 107, 126,}, + {5, 10, 26, 90, 132, 46, 94, 129, 111, 143,}, + {6, 11, 31, 91, 133, 54, 95, 143, 115, 157,}, + {7, 13, 35, 92, 133, 61, 96, 156, 119, 170,}, + {8, 15, 40, 93, 134, 68, 97, 167, 123, 180,}, + {9, 17, 44, 94, 134, 74, 98, 177, 126, 189,}, + {10, 19, 48, 94, 135, 80, 100, 185, 130, 197,}, + {11, 20, 52, 95, 135, 86, 101, 192, 133, 204,}, + {12, 22, 56, 96, 136, 92, 102, 199, 137, 210,}, + {13, 24, 60, 96, 136, 97, 103, 205, 140, 215,}, + {14, 26, 64, 97, 137, 103, 104, 210, 143, 219,}, + {15, 27, 68, 98, 137, 108, 105, 215, 146, 223,}, + {16, 29, 71, 98, 138, 112, 106, 219, 149, 227,}, + {17, 31, 75, 99, 138, 117, 107, 223, 152, 230,}, + {18, 32, 78, 100, 139, 121, 108, 226, 155, 233,}, + {19, 34, 82, 100, 139, 126, 109, 229, 158, 235,}, + {20, 36, 85, 101, 140, 130, 110, 231, 161, 238,}, + {21, 37, 88, 102, 140, 134, 111, 234, 164, 239,}, + {22, 39, 91, 102, 141, 138, 112, 236, 167, 241,}, + {23, 40, 94, 103, 141, 141, 113, 238, 169, 243,}, + {24, 42, 97, 104, 142, 145, 114, 240, 172, 244,}, + {25, 44, 100, 104, 142, 149, 115, 241, 174, 245,}, + {26, 45, 103, 105, 143, 152, 116, 243, 177, 246,}, + {27, 47, 106, 105, 143, 155, 117, 244, 179, 247,}, + {28, 48, 109, 106, 143, 158, 118, 245, 182, 248,}, + {29, 50, 112, 107, 144, 161, 119, 246, 184, 249,}, + {30, 52, 115, 107, 144, 164, 120, 247, 186, 250,}, + {31, 53, 117, 108, 145, 167, 121, 248, 188, 250,}, + {32, 55, 120, 109, 145, 170, 122, 249, 190, 251,}, + {33, 56, 122, 109, 146, 173, 123, 249, 192, 252,}, + {34, 58, 125, 110, 146, 175, 124, 250, 194, 252,}, + {35, 59, 127, 110, 147, 178, 125, 251, 196, 252,}, + {36, 61, 130, 111, 147, 180, 126, 251, 198, 253,}, + {37, 62, 132, 112, 147, 183, 127, 251, 200, 253,}, + {38, 64, 135, 112, 148, 185, 128, 252, 202, 253,}, + {39, 65, 137, 113, 148, 187, 129, 252, 204, 254,}, + {40, 67, 139, 114, 149, 189, 130, 253, 205, 254,}, + {41, 68, 141, 114, 149, 191, 131, 253, 207, 254,}, + {42, 70, 144, 115, 150, 193, 132, 253, 209, 254,}, + {43, 71, 146, 115, 150, 195, 133, 254, 210, 254,}, + {44, 72, 148, 116, 151, 197, 134, 254, 212, 255,}, + {45, 74, 150, 117, 151, 199, 135, 254, 213, 255,}, + {46, 75, 152, 117, 151, 201, 136, 254, 215, 255,}, + {47, 77, 154, 118, 152, 202, 137, 254, 216, 255,}, + {48, 78, 156, 119, 152, 204, 138, 254, 217, 255,}, + {49, 80, 158, 119, 153, 206, 139, 255, 219, 255,}, + {50, 81, 160, 120, 153, 207, 140, 255, 220, 255,}, + {51, 82, 162, 120, 154, 209, 141, 255, 221, 255,}, + {52, 84, 164, 121, 154, 210, 142, 255, 222, 255,}, + {53, 85, 165, 122, 155, 212, 143, 255, 224, 255,}, + {54, 87, 167, 122, 155, 213, 144, 255, 225, 255,}, + {55, 88, 169, 123, 155, 215, 145, 255, 226, 255,}, + {56, 89, 171, 124, 156, 216, 146, 255, 227, 255,}, + {57, 91, 172, 124, 156, 217, 146, 255, 228, 255,}, + {58, 92, 174, 125, 157, 218, 147, 255, 229, 255,}, + {59, 93, 176, 126, 157, 220, 148, 255, 230, 255,}, + {60, 95, 177, 126, 158, 221, 149, 255, 231, 255,}, + {61, 96, 179, 127, 158, 222, 150, 255, 232, 255,}, + {62, 97, 180, 127, 159, 223, 151, 255, 232, 255,}, + 
{63, 99, 182, 128, 159, 224, 152, 255, 233, 255,}, + {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,}, + {65, 101, 185, 129, 160, 226, 154, 255, 235, 255,}, + {66, 103, 186, 130, 160, 227, 155, 255, 236, 255,}, + {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,}, + {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,}, + {69, 106, 190, 132, 162, 230, 158, 255, 238, 255,}, + {70, 108, 192, 133, 162, 231, 159, 255, 238, 255,}, + {71, 109, 193, 133, 162, 231, 159, 255, 239, 255,}, + {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,}, + {73, 111, 196, 134, 163, 233, 161, 255, 240, 255,}, + {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,}, + {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,}, + {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,}, + {77, 116, 200, 137, 165, 236, 165, 255, 243, 255,}, + {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,}, + {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,}, + {80, 120, 204, 139, 167, 238, 168, 255, 244, 255,}, + {81, 121, 205, 140, 167, 239, 168, 255, 244, 255,}, + {82, 123, 206, 140, 167, 239, 169, 255, 245, 255,}, + {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,}, + {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,}, + {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,}, + {86, 127, 210, 143, 169, 241, 173, 255, 247, 255,}, + {87, 129, 211, 144, 170, 242, 174, 255, 247, 255,}, + {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,}, + {89, 131, 213, 145, 171, 243, 175, 255, 248, 255,}, + {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,}, + {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,}, + {92, 134, 216, 147, 172, 244, 178, 255, 249, 255,}, + {93, 136, 217, 148, 172, 245, 179, 255, 249, 255,}, + {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,}, + {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,}, + {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,}, + {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,}, + {98, 141, 221, 151, 175, 247, 183, 255, 250, 255,}, + {99, 142, 222, 152, 175, 247, 184, 255, 250, 255,}, + {100, 144, 223, 152, 176, 247, 185, 255, 251, 255,}, + {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,}, + {102, 146, 224, 154, 177, 248, 186, 255, 251, 255,}, + {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,}, + {104, 148, 226, 155, 177, 248, 188, 255, 252, 255,}, + {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,}, + {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,}, + {107, 151, 228, 157, 179, 249, 191, 255, 252, 255,}, + {108, 152, 229, 158, 179, 250, 191, 255, 252, 255,}, + {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,}, + {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,}, + {111, 155, 231, 160, 181, 250, 194, 255, 253, 255,}, + {112, 157, 231, 160, 181, 251, 195, 255, 253, 255,}, + {113, 158, 232, 161, 182, 251, 195, 255, 253, 255,}, + {114, 159, 232, 162, 182, 251, 196, 255, 253, 255,}, + {115, 160, 233, 162, 183, 251, 197, 255, 253, 255,}, + {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,}, + {117, 162, 234, 164, 184, 252, 198, 255, 254, 255,}, + {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,}, + {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,}, + {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,}, + {121, 166, 236, 167, 186, 252, 201, 255, 254, 255,}, + {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,}, + {123, 168, 237, 168, 186, 253, 203, 255, 254, 255,}, + {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,}, + {125, 170, 238, 169, 187, 253, 204, 255, 254, 255,}, + {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,}, + {127, 172, 239, 171, 188, 253, 206, 255, 254, 255,}, + {128, 173, 240, 
171, 189, 253, 207, 255, 255, 255,}, + {129, 174, 240, 172, 189, 253, 207, 255, 255, 255,}, + {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,}, + {131, 176, 241, 174, 190, 254, 209, 255, 255, 255,}, + {132, 177, 241, 174, 191, 254, 209, 255, 255, 255,}, + {133, 178, 242, 175, 191, 254, 210, 255, 255, 255,}, + {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,}, + {135, 180, 243, 176, 192, 254, 212, 255, 255, 255,}, + {136, 180, 243, 177, 193, 254, 212, 255, 255, 255,}, + {137, 181, 243, 178, 193, 254, 213, 255, 255, 255,}, + {138, 182, 244, 179, 194, 254, 214, 255, 255, 255,}, + {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,}, + {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,}, + {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,}, + {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,}, + {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,}, + {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,}, + {145, 189, 246, 183, 197, 255, 218, 255, 255, 255,}, + {146, 190, 246, 184, 198, 255, 219, 255, 255, 255,}, + {147, 191, 247, 185, 198, 255, 220, 255, 255, 255,}, + {148, 191, 247, 186, 199, 255, 220, 255, 255, 255,}, + {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,}, + {150, 193, 248, 187, 200, 255, 221, 255, 255, 255,}, + {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,}, + {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,}, + {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,}, + {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,}, + {155, 198, 249, 191, 202, 255, 224, 255, 255, 255,}, + {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,}, + {157, 199, 249, 192, 203, 255, 226, 255, 255, 255,}, + {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,}, + {159, 201, 250, 193, 204, 255, 227, 255, 255, 255,}, + {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,}, + {161, 203, 250, 195, 206, 255, 228, 255, 255, 255,}, + {162, 203, 250, 196, 206, 255, 228, 255, 255, 255,}, + {163, 204, 251, 196, 207, 255, 229, 255, 255, 255,}, + {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,}, + {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,}, + {166, 207, 251, 198, 208, 255, 231, 255, 255, 255,}, + {167, 207, 251, 199, 209, 255, 231, 255, 255, 255,}, + {168, 208, 252, 200, 209, 255, 232, 255, 255, 255,}, + {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,}, + {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,}, + {171, 211, 252, 202, 211, 255, 233, 255, 255, 255,}, + {172, 211, 252, 203, 211, 255, 234, 255, 255, 255,}, + {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,}, + {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,}, + {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,}, + {176, 214, 253, 206, 213, 255, 236, 255, 255, 255,}, + {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,}, + {178, 216, 253, 207, 214, 255, 237, 255, 255, 255,}, + {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,}, + {180, 217, 253, 208, 216, 255, 237, 255, 255, 255,}, + {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,}, + {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,}, + {183, 220, 254, 211, 217, 255, 239, 255, 255, 255,}, + {184, 220, 254, 211, 218, 255, 239, 255, 255, 255,}, + {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,}, + {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,}, + {187, 222, 254, 213, 219, 255, 241, 255, 255, 255,}, + {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,}, + {189, 224, 254, 215, 220, 255, 241, 255, 255, 255,}, + {190, 225, 254, 215, 221, 255, 242, 255, 255, 255,}, + {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,}, + {192, 226, 254, 217, 222, 255, 243, 255, 255, 
255,}, + {193, 227, 255, 218, 223, 255, 243, 255, 255, 255,}, + {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,}, + {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,}, + {196, 229, 255, 220, 224, 255, 244, 255, 255, 255,}, + {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,}, + {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,}, + {199, 230, 255, 222, 226, 255, 245, 255, 255, 255,}, + {200, 231, 255, 222, 226, 255, 246, 255, 255, 255,}, + {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,}, + {202, 232, 255, 224, 228, 255, 246, 255, 255, 255,}, + {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,}, + {204, 234, 255, 225, 229, 255, 247, 255, 255, 255,}, + {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,}, + {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,}, + {207, 235, 255, 227, 230, 255, 248, 255, 255, 255,}, + {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,}, + {209, 237, 255, 229, 231, 255, 248, 255, 255, 255,}, + {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,}, + {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,}, + {212, 238, 255, 231, 233, 255, 249, 255, 255, 255,}, + {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,}, + {214, 239, 255, 232, 234, 255, 250, 255, 255, 255,}, + {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,}, + {216, 241, 255, 233, 235, 255, 250, 255, 255, 255,}, + {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,}, + {218, 242, 255, 235, 236, 255, 251, 255, 255, 255,}, + {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,}, + {220, 243, 255, 236, 237, 255, 251, 255, 255, 255,}, + {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,}, + {222, 244, 255, 237, 239, 255, 252, 255, 255, 255,}, + {223, 244, 255, 238, 239, 255, 252, 255, 255, 255,}, + {224, 245, 255, 238, 240, 255, 252, 255, 255, 255,}, + {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,}, + {226, 246, 255, 240, 241, 255, 253, 255, 255, 255,}, + {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,}, + {228, 247, 255, 241, 242, 255, 253, 255, 255, 255,}, + {229, 247, 255, 242, 242, 255, 253, 255, 255, 255,}, + {230, 248, 255, 242, 243, 255, 253, 255, 255, 255,}, + {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,}, + {232, 248, 255, 243, 244, 255, 254, 255, 255, 255,}, + {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,}, + {234, 249, 255, 245, 245, 255, 254, 255, 255, 255,}, + {235, 250, 255, 245, 246, 255, 254, 255, 255, 255,}, + {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,}, + {237, 251, 255, 246, 247, 255, 255, 255, 255, 255,}, + {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,}, + {239, 251, 255, 248, 248, 255, 255, 255, 255, 255,}, + {240, 252, 255, 248, 248, 255, 255, 255, 255, 255,}, + {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,}, + {242, 252, 255, 249, 249, 255, 255, 255, 255, 255,}, + {243, 253, 255, 250, 250, 255, 255, 255, 255, 255,}, + {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,}, + {245, 253, 255, 251, 251, 255, 255, 255, 255, 255,}, + {246, 254, 255, 251, 251, 255, 255, 255, 255, 255,}, + {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,}, + {248, 254, 255, 252, 252, 255, 255, 255, 255, 255,}, + {249, 255, 255, 253, 253, 255, 255, 255, 255, 255,}, + {250, 255, 255, 253, 253, 255, 255, 255, 255, 255,}, + {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, + {252, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,}, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,} +}; + +const vp9_prob vp9_modelcoefprobs_gg625[COEFPROB_MODELS][ENTROPY_NODES - 1] 
= { + // Probs generated with a Generalized Gaussian (with shape parameter 0.625) + // source model with varying quantizer step size for a uniform quantizer + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, // do not use + {1, 2, 6, 88, 130, 10, 88, 35, 94, 40,}, + {2, 4, 11, 89, 131, 19, 90, 60, 99, 67,}, + {3, 6, 15, 90, 132, 27, 92, 80, 103, 88,}, + {4, 7, 20, 91, 132, 34, 93, 97, 107, 105,}, + {5, 9, 24, 92, 133, 41, 94, 112, 110, 120,}, + {6, 11, 28, 93, 134, 48, 95, 125, 113, 132,}, + {7, 13, 33, 93, 134, 54, 97, 136, 116, 143,}, + {8, 14, 36, 94, 135, 60, 98, 146, 119, 152,}, + {9, 16, 40, 95, 135, 65, 99, 155, 122, 161,}, + {10, 18, 44, 95, 136, 70, 100, 163, 125, 168,}, + {11, 19, 48, 96, 136, 75, 101, 170, 127, 175,}, + {12, 21, 51, 97, 137, 80, 102, 176, 130, 181,}, + {13, 23, 55, 97, 137, 85, 102, 182, 132, 187,}, + {14, 24, 58, 98, 138, 89, 103, 188, 135, 192,}, + {15, 26, 61, 99, 138, 94, 104, 193, 137, 196,}, + {16, 27, 64, 99, 139, 98, 105, 197, 140, 201,}, + {17, 29, 67, 100, 139, 102, 106, 201, 142, 205,}, + {18, 30, 70, 101, 140, 106, 107, 205, 144, 208,}, + {19, 32, 73, 101, 140, 109, 108, 209, 146, 211,}, + {20, 34, 76, 102, 140, 113, 109, 212, 148, 214,}, + {21, 35, 79, 102, 141, 116, 109, 215, 151, 217,}, + {22, 37, 82, 103, 141, 120, 110, 218, 153, 220,}, + {23, 38, 85, 103, 142, 123, 111, 220, 155, 222,}, + {24, 40, 87, 104, 142, 126, 112, 223, 157, 224,}, + {25, 41, 90, 105, 143, 129, 113, 225, 159, 226,}, + {26, 42, 93, 105, 143, 132, 113, 227, 161, 228,}, + {27, 44, 95, 106, 143, 135, 114, 229, 162, 230,}, + {28, 45, 98, 106, 144, 138, 115, 230, 164, 232,}, + {29, 47, 100, 107, 144, 141, 116, 232, 166, 233,}, + {30, 48, 103, 107, 145, 144, 117, 234, 168, 235,}, + {31, 50, 105, 108, 145, 146, 117, 235, 170, 236,}, + {32, 51, 107, 108, 145, 149, 118, 236, 171, 237,}, + {33, 52, 110, 109, 146, 151, 119, 238, 173, 238,}, + {34, 54, 112, 110, 146, 154, 120, 239, 175, 239,}, + {35, 55, 114, 110, 147, 156, 120, 240, 176, 240,}, + {36, 57, 116, 111, 147, 158, 121, 241, 178, 241,}, + {37, 58, 119, 111, 147, 161, 122, 242, 180, 242,}, + {38, 59, 121, 112, 148, 163, 123, 243, 181, 243,}, + {39, 61, 123, 112, 148, 165, 123, 244, 183, 244,}, + {40, 62, 125, 113, 148, 167, 124, 244, 184, 245,}, + {41, 63, 127, 113, 149, 169, 125, 245, 186, 245,}, + {42, 65, 129, 114, 149, 171, 126, 246, 187, 246,}, + {43, 66, 131, 114, 150, 173, 126, 246, 188, 247,}, + {44, 67, 133, 115, 150, 175, 127, 247, 190, 247,}, + {45, 69, 135, 115, 150, 177, 128, 247, 191, 248,}, + {46, 70, 136, 116, 151, 178, 129, 248, 193, 248,}, + {47, 71, 138, 116, 151, 180, 129, 248, 194, 249,}, + {48, 73, 140, 117, 151, 182, 130, 249, 195, 249,}, + {49, 74, 142, 118, 152, 184, 131, 249, 197, 250,}, + {50, 75, 144, 118, 152, 185, 131, 250, 198, 250,}, + {51, 76, 145, 119, 153, 187, 132, 250, 199, 250,}, + {52, 78, 147, 119, 153, 188, 133, 251, 200, 251,}, + {53, 79, 149, 120, 153, 190, 134, 251, 201, 251,}, + {54, 80, 151, 120, 154, 192, 134, 251, 203, 251,}, + {55, 82, 152, 121, 154, 193, 135, 251, 204, 252,}, + {56, 83, 154, 121, 154, 194, 136, 252, 205, 252,}, + {57, 84, 155, 122, 155, 196, 136, 252, 206, 252,}, + {58, 85, 157, 122, 155, 197, 137, 252, 207, 252,}, + {59, 86, 158, 123, 156, 199, 138, 252, 208, 252,}, + {60, 88, 160, 123, 156, 200, 139, 253, 209, 253,}, + {61, 89, 162, 124, 156, 201, 139, 253, 210, 253,}, + {62, 90, 163, 124, 157, 202, 140, 253, 211, 253,}, + {63, 91, 164, 125, 157, 204, 141, 253, 212, 253,}, + {64, 93, 166, 125, 157, 205, 141, 253, 213, 253,}, + {65, 94, 167, 126, 158, 206, 142, 254, 214, 
254,}, + {66, 95, 169, 126, 158, 207, 143, 254, 215, 254,}, + {67, 96, 170, 127, 158, 208, 143, 254, 216, 254,}, + {68, 97, 172, 127, 159, 209, 144, 254, 217, 254,}, + {69, 98, 173, 128, 159, 210, 145, 254, 218, 254,}, + {70, 100, 174, 128, 160, 212, 146, 254, 219, 254,}, + {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,}, + {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,}, + {73, 103, 178, 130, 161, 215, 148, 255, 221, 255,}, + {74, 104, 179, 131, 161, 216, 148, 255, 222, 255,}, + {75, 105, 181, 131, 161, 217, 149, 255, 223, 255,}, + {76, 107, 182, 132, 162, 217, 150, 255, 224, 255,}, + {77, 108, 183, 132, 162, 218, 150, 255, 224, 255,}, + {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,}, + {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,}, + {80, 111, 187, 134, 163, 221, 153, 255, 227, 255,}, + {81, 112, 188, 134, 164, 222, 153, 255, 227, 255,}, + {82, 113, 189, 135, 164, 223, 154, 255, 228, 255,}, + {83, 115, 190, 135, 164, 223, 155, 255, 229, 255,}, + {84, 116, 191, 136, 165, 224, 155, 255, 229, 255,}, + {85, 117, 192, 136, 165, 225, 156, 255, 230, 255,}, + {86, 118, 193, 137, 165, 226, 157, 255, 231, 255,}, + {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,}, + {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,}, + {89, 121, 196, 139, 167, 228, 159, 255, 232, 255,}, + {90, 122, 197, 139, 167, 229, 159, 255, 233, 255,}, + {91, 123, 198, 140, 167, 229, 160, 255, 234, 255,}, + {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,}, + {93, 125, 200, 141, 168, 231, 162, 255, 235, 255,}, + {94, 127, 201, 141, 168, 231, 162, 255, 235, 255,}, + {95, 128, 202, 142, 169, 232, 163, 255, 236, 255,}, + {96, 129, 203, 142, 169, 232, 164, 255, 236, 255,}, + {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,}, + {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,}, + {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,}, + {100, 133, 207, 144, 171, 235, 166, 255, 238, 255,}, + {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,}, + {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,}, + {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,}, + {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,}, + {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,}, + {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,}, + {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,}, + {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,}, + {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,}, + {110, 143, 215, 150, 175, 240, 173, 255, 242, 255,}, + {111, 144, 216, 151, 175, 240, 174, 255, 243, 255,}, + {112, 145, 217, 151, 175, 240, 174, 255, 243, 255,}, + {113, 146, 217, 152, 176, 241, 175, 255, 244, 255,}, + {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,}, + {115, 148, 219, 153, 176, 242, 177, 255, 244, 255,}, + {116, 149, 219, 153, 177, 242, 177, 255, 245, 255,}, + {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,}, + {118, 151, 221, 155, 178, 243, 179, 255, 245, 255,}, + {119, 152, 222, 155, 178, 243, 179, 255, 245, 255,}, + {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,}, + {121, 154, 223, 156, 179, 244, 181, 255, 246, 255,}, + {122, 155, 224, 157, 179, 244, 181, 255, 246, 255,}, + {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,}, + {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,}, + {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,}, + {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,}, + {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,}, + {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,}, + {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,}, + {130, 163, 228, 161, 182, 247, 187, 255, 248, 255,}, + {131, 
164, 229, 162, 183, 247, 187, 255, 249, 255,}, + {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,}, + {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,}, + {134, 166, 231, 164, 184, 248, 189, 255, 249, 255,}, + {135, 167, 231, 164, 184, 248, 190, 255, 250, 255,}, + {136, 168, 232, 165, 185, 248, 191, 255, 250, 255,}, + {137, 169, 232, 166, 185, 248, 191, 255, 250, 255,}, + {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,}, + {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,}, + {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,}, + {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,}, + {142, 174, 235, 169, 187, 250, 194, 255, 251, 255,}, + {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,}, + {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,}, + {145, 177, 236, 170, 189, 250, 196, 255, 251, 255,}, + {146, 177, 237, 171, 189, 250, 197, 255, 252, 255,}, + {147, 178, 237, 172, 189, 251, 198, 255, 252, 255,}, + {148, 179, 238, 172, 190, 251, 198, 255, 252, 255,}, + {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,}, + {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,}, + {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,}, + {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,}, + {153, 184, 240, 175, 192, 252, 202, 255, 252, 255,}, + {154, 184, 240, 176, 193, 252, 202, 255, 253, 255,}, + {155, 185, 240, 177, 193, 252, 203, 255, 253, 255,}, + {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,}, + {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,}, + {158, 188, 242, 178, 194, 252, 205, 255, 253, 255,}, + {159, 189, 242, 179, 195, 252, 205, 255, 253, 255,}, + {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,}, + {161, 190, 243, 180, 196, 253, 207, 255, 253, 255,}, + {162, 191, 243, 181, 196, 253, 207, 255, 254, 255,}, + {163, 192, 243, 182, 197, 253, 208, 255, 254, 255,}, + {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,}, + {165, 194, 244, 183, 197, 253, 209, 255, 254, 255,}, + {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,}, + {167, 196, 245, 184, 198, 253, 210, 255, 254, 255,}, + {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,}, + {169, 197, 245, 186, 199, 254, 212, 255, 254, 255,}, + {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,}, + {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,}, + {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,}, + {173, 200, 246, 188, 201, 254, 214, 255, 254, 255,}, + {174, 201, 247, 189, 202, 254, 215, 255, 254, 255,}, + {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,}, + {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,}, + {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,}, + {178, 204, 248, 191, 204, 254, 217, 255, 255, 255,}, + {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,}, + {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,}, + {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,}, + {182, 208, 249, 194, 205, 255, 220, 255, 255, 255,}, + {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,}, + {184, 209, 249, 196, 206, 255, 221, 255, 255, 255,}, + {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,}, + {186, 211, 250, 197, 207, 255, 222, 255, 255, 255,}, + {187, 211, 250, 198, 208, 255, 223, 255, 255, 255,}, + {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,}, + {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,}, + {190, 214, 251, 200, 209, 255, 224, 255, 255, 255,}, + {191, 215, 251, 200, 210, 255, 225, 255, 255, 255,}, + {192, 215, 251, 201, 211, 255, 225, 255, 255, 255,}, + {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,}, + {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,}, + {195, 218, 252, 203, 212, 255, 227, 255, 
255, 255,},
+  {196, 218, 252, 204, 213, 255, 228, 255, 255, 255,},
+  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},
+  {198, 220, 252, 205, 214, 255, 229, 255, 255, 255,},
+  {199, 221, 252, 206, 214, 255, 229, 255, 255, 255,},
+  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},
+  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},
+  {202, 223, 253, 208, 216, 255, 231, 255, 255, 255,},
+  {203, 223, 253, 209, 216, 255, 232, 255, 255, 255,},
+  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},
+  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},
+  {206, 226, 253, 211, 218, 255, 233, 255, 255, 255,},
+  {207, 226, 253, 212, 219, 255, 234, 255, 255, 255,},
+  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},
+  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},
+  {210, 228, 254, 214, 220, 255, 236, 255, 255, 255,},
+  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},
+  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},
+  {213, 230, 254, 217, 222, 255, 237, 255, 255, 255,},
+  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},
+  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},
+  {216, 233, 254, 219, 224, 255, 239, 255, 255, 255,},
+  {217, 233, 254, 220, 225, 255, 239, 255, 255, 255,},
+  {218, 234, 255, 220, 225, 255, 240, 255, 255, 255,},
+  {219, 235, 255, 221, 226, 255, 240, 255, 255, 255,},
+  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},
+  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},
+  {222, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
+  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
+  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},
+  {225, 238, 255, 226, 230, 255, 243, 255, 255, 255,},
+  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},
+  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},
+  {228, 240, 255, 228, 232, 255, 245, 255, 255, 255,},
+  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},
+  {230, 242, 255, 230, 233, 255, 246, 255, 255, 255,},
+  {231, 242, 255, 231, 234, 255, 246, 255, 255, 255,},
+  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},
+  {233, 243, 255, 233, 235, 255, 247, 255, 255, 255,},
+  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},
+  {235, 245, 255, 234, 236, 255, 248, 255, 255, 255,},
+  {236, 245, 255, 235, 237, 255, 248, 255, 255, 255,},
+  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},
+  {238, 247, 255, 237, 239, 255, 249, 255, 255, 255,},
+  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},
+  {240, 248, 255, 239, 240, 255, 250, 255, 255, 255,},
+  {241, 248, 255, 240, 241, 255, 251, 255, 255, 255,},
+  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},
+  {243, 249, 255, 241, 243, 255, 251, 255, 255, 255,},
+  {244, 250, 255, 242, 243, 255, 252, 255, 255, 255,},
+  {245, 251, 255, 243, 244, 255, 252, 255, 255, 255,},
+  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},
+  {247, 252, 255, 245, 246, 255, 253, 255, 255, 255,},
+  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},
+  {249, 253, 255, 247, 248, 255, 254, 255, 255, 255,},
+  {250, 253, 255, 248, 249, 255, 254, 255, 255, 255,},
+  {251, 254, 255, 249, 250, 255, 254, 255, 255, 255,},
+  {252, 254, 255, 251, 251, 255, 255, 255, 255, 255,},
+  {253, 255, 255, 252, 252, 255, 255, 255, 255, 255,},
+  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
+  {255, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
+};
+
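The header comments on these tables state how the numbers were produced: a generalized-Gaussian source is passed through a uniform quantizer, and one row of token-tree probabilities is tabulated per quantizer step size. The generator itself is not part of this change, so the short C sketch below only illustrates the stated construction. The shape parameter follows the table comments; the step sizes, the integration cutoff, the bin layout, and the two nodes computed (ZERO, and ONE conditioned on a nonzero level) are illustrative assumptions, not the exact procedure behind the 256-row tables.

#include <math.h>
#include <stdio.h>

/* Unnormalized generalized-Gaussian density, unit scale, shape 'beta'. */
static double gg_pdf(double x, double beta) {
  return exp(-pow(fabs(x), beta));
}

/* Trapezoidal integral of gg_pdf over [a, b]. */
static double gg_int(double a, double b, double beta) {
  const int n = 2000;
  const double h = (b - a) / n;
  double sum = 0.0;
  int i;
  for (i = 0; i < n; ++i)
    sum += 0.5 * h * (gg_pdf(a + i * h, beta) + gg_pdf(a + (i + 1) * h, beta));
  return sum;
}

int main(void) {
  const double beta = 0.75;   /* shape parameter named in the table comments */
  const double tail = 200.0;  /* integration cutoff standing in for infinity */
  double qstep;
  /* One table row per quantizer step size: magnitude 0 falls in
   * [0, qstep/2), magnitude 1 in [qstep/2, 3*qstep/2), and so on. */
  for (qstep = 1.0; qstep <= 4.0; qstep += 1.0) {
    const double total = gg_int(0.0, tail, beta);
    const double p0 = gg_int(0.0, 0.5 * qstep, beta) / total;
    const double p1 = gg_int(0.5 * qstep, 1.5 * qstep, beta) / total;
    /* Tree-node probabilities, scaled to the 1..255 range of vp9_prob:
     * node ZERO is P(level == 0); node ONE is P(level == 1 | level > 0). */
    printf("qstep %.0f: ZERO %3.0f  ONE %3.0f\n", qstep,
           255.0 * p0, 255.0 * p1 / (1.0 - p0));
  }
  return 0;
}

+const vp9_prob vp9_modelcoefprobs_gg875p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
+  // Probs generated with a Generalized Gaussian (with shape parameter 0.875)
+  // source model with varying quantizer step size for a uniform quantizer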
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, // do not use + {1, 1, 3, 86, 128, 6, 86, 22, 89, 28,}, + {1, 2, 6, 86, 129, 11, 87, 42, 92, 52,}, + {2, 3, 9, 87, 129, 17, 88, 59, 94, 73,}, + {2, 4, 12, 87, 129, 22, 89, 75, 97, 92,}, + {3, 5, 14, 88, 130, 27, 89, 90, 100, 108,}, + {3, 6, 17, 88, 130, 33, 90, 103, 102, 122,}, + {4, 7, 20, 88, 130, 37, 91, 115, 105, 135,}, + {4, 8, 23, 89, 131, 42, 92, 126, 108, 147,}, + {5, 9, 25, 89, 131, 47, 92, 137, 110, 157,}, + {5, 10, 28, 90, 131, 52, 93, 146, 113, 167,}, + {6, 11, 31, 90, 132, 56, 94, 154, 115, 175,}, + {6, 12, 33, 90, 132, 60, 94, 162, 118, 183,}, + {7, 13, 36, 91, 132, 65, 95, 170, 120, 190,}, + {7, 14, 39, 91, 132, 69, 96, 176, 123, 196,}, + {8, 15, 41, 92, 133, 73, 96, 182, 125, 201,}, + {8, 16, 44, 92, 133, 77, 97, 188, 128, 206,}, + {9, 17, 46, 92, 133, 81, 98, 193, 130, 211,}, + {9, 18, 49, 93, 134, 85, 99, 198, 133, 215,}, + {10, 19, 51, 93, 134, 89, 99, 203, 135, 219,}, + {10, 20, 54, 93, 134, 92, 100, 207, 137, 222,}, + {11, 21, 56, 94, 134, 96, 101, 211, 140, 226,}, + {12, 22, 58, 94, 135, 100, 101, 214, 142, 228,}, + {12, 23, 61, 95, 135, 103, 102, 217, 145, 231,}, + {13, 24, 63, 95, 135, 106, 103, 220, 147, 233,}, + {13, 25, 66, 95, 136, 110, 103, 223, 149, 235,}, + {14, 26, 68, 96, 136, 113, 104, 226, 151, 237,}, + {14, 27, 70, 96, 136, 116, 105, 228, 154, 239,}, + {15, 28, 72, 97, 136, 119, 106, 230, 156, 241,}, + {15, 29, 75, 97, 137, 122, 106, 232, 158, 242,}, + {16, 30, 77, 97, 137, 125, 107, 234, 160, 243,}, + {17, 31, 79, 98, 137, 128, 108, 236, 163, 245,}, + {17, 32, 81, 98, 138, 131, 108, 237, 165, 246,}, + {18, 33, 83, 99, 138, 134, 109, 239, 167, 247,}, + {18, 34, 86, 99, 138, 137, 110, 240, 169, 248,}, + {19, 35, 88, 99, 138, 140, 111, 242, 171, 248,}, + {19, 36, 90, 100, 139, 142, 111, 243, 173, 249,}, + {20, 37, 92, 100, 139, 145, 112, 244, 175, 250,}, + {20, 38, 94, 101, 139, 148, 113, 245, 177, 250,}, + {21, 39, 96, 101, 140, 150, 113, 246, 179, 251,}, + {22, 40, 98, 101, 140, 153, 114, 246, 181, 251,}, + {22, 41, 100, 102, 140, 155, 115, 247, 183, 252,}, + {23, 42, 102, 102, 140, 157, 116, 248, 185, 252,}, + {23, 43, 104, 103, 141, 160, 116, 249, 186, 253,}, + {24, 44, 106, 103, 141, 162, 117, 249, 188, 253,}, + {25, 45, 108, 103, 141, 164, 118, 250, 190, 253,}, + {25, 46, 110, 104, 142, 166, 119, 250, 192, 253,}, + {26, 47, 112, 104, 142, 168, 119, 251, 193, 254,}, + {26, 48, 114, 105, 142, 171, 120, 251, 195, 254,}, + {27, 49, 116, 105, 143, 173, 121, 252, 197, 254,}, + {27, 50, 118, 105, 143, 175, 122, 252, 198, 254,}, + {28, 51, 119, 106, 143, 177, 122, 252, 200, 254,}, + {29, 52, 121, 106, 143, 179, 123, 253, 201, 255,}, + {29, 53, 123, 107, 144, 180, 124, 253, 203, 255,}, + {30, 54, 125, 107, 144, 182, 125, 253, 204, 255,}, + {30, 55, 127, 108, 144, 184, 125, 253, 206, 255,}, + {31, 56, 128, 108, 145, 186, 126, 254, 207, 255,}, + {32, 57, 130, 108, 145, 188, 127, 254, 209, 255,}, + {32, 58, 132, 109, 145, 189, 128, 254, 210, 255,}, + {33, 59, 134, 109, 146, 191, 128, 254, 211, 255,}, + {33, 60, 135, 110, 146, 193, 129, 254, 213, 255,}, + {34, 61, 137, 110, 146, 194, 130, 254, 214, 255,}, + {35, 62, 139, 111, 146, 196, 131, 255, 215, 255,}, + {35, 63, 140, 111, 147, 197, 131, 255, 216, 255,}, + {36, 64, 142, 112, 147, 199, 132, 255, 218, 255,}, + {37, 65, 144, 112, 147, 200, 133, 255, 219, 255,}, + {37, 66, 145, 112, 148, 202, 134, 255, 220, 255,}, + {38, 67, 147, 113, 148, 203, 135, 255, 221, 255,}, + {38, 68, 148, 113, 148, 204, 135, 255, 222, 255,}, + {39, 69, 150, 114, 149, 206, 136, 255, 223, 255,}, + {40, 
70, 151, 114, 149, 207, 137, 255, 224, 255,}, + {40, 71, 153, 115, 149, 208, 138, 255, 225, 255,}, + {41, 72, 154, 115, 150, 210, 138, 255, 226, 255,}, + {42, 73, 156, 116, 150, 211, 139, 255, 227, 255,}, + {42, 74, 157, 116, 150, 212, 140, 255, 228, 255,}, + {43, 75, 159, 117, 151, 213, 141, 255, 229, 255,}, + {44, 76, 160, 117, 151, 214, 142, 255, 230, 255,}, + {44, 77, 162, 117, 151, 216, 142, 255, 231, 255,}, + {45, 78, 163, 118, 152, 217, 143, 255, 231, 255,}, + {45, 79, 165, 118, 152, 218, 144, 255, 232, 255,}, + {46, 80, 166, 119, 152, 219, 145, 255, 233, 255,}, + {47, 81, 167, 119, 153, 220, 146, 255, 234, 255,}, + {47, 82, 169, 120, 153, 221, 146, 255, 235, 255,}, + {48, 83, 170, 120, 153, 222, 147, 255, 235, 255,}, + {49, 84, 171, 121, 154, 223, 148, 255, 236, 255,}, + {49, 85, 173, 121, 154, 224, 149, 255, 237, 255,}, + {50, 86, 174, 122, 154, 225, 150, 255, 237, 255,}, + {51, 87, 175, 122, 155, 225, 150, 255, 238, 255,}, + {51, 88, 177, 123, 155, 226, 151, 255, 239, 255,}, + {52, 89, 178, 123, 155, 227, 152, 255, 239, 255,}, + {53, 90, 179, 124, 156, 228, 153, 255, 240, 255,}, + {53, 91, 180, 124, 156, 229, 154, 255, 240, 255,}, + {54, 92, 182, 125, 156, 230, 154, 255, 241, 255,}, + {55, 93, 183, 125, 157, 230, 155, 255, 241, 255,}, + {55, 94, 184, 126, 157, 231, 156, 255, 242, 255,}, + {56, 95, 185, 126, 157, 232, 157, 255, 242, 255,}, + {57, 96, 187, 127, 158, 233, 158, 255, 243, 255,}, + {57, 97, 188, 127, 158, 233, 159, 255, 243, 255,}, + {58, 98, 189, 128, 158, 234, 159, 255, 244, 255,}, + {59, 99, 190, 128, 159, 235, 160, 255, 244, 255,}, + {60, 100, 191, 129, 159, 235, 161, 255, 245, 255,}, + {60, 101, 192, 129, 160, 236, 162, 255, 245, 255,}, + {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,}, + {62, 103, 194, 131, 160, 237, 164, 255, 246, 255,}, + {62, 104, 196, 131, 161, 238, 164, 255, 246, 255,}, + {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,}, + {64, 106, 198, 132, 161, 239, 166, 255, 247, 255,}, + {64, 107, 199, 133, 162, 239, 167, 255, 247, 255,}, + {65, 108, 200, 133, 162, 240, 168, 255, 248, 255,}, + {66, 109, 201, 134, 163, 241, 168, 255, 248, 255,}, + {67, 110, 202, 134, 163, 241, 169, 255, 248, 255,}, + {67, 111, 203, 135, 163, 242, 170, 255, 249, 255,}, + {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,}, + {69, 113, 205, 136, 164, 242, 172, 255, 249, 255,}, + {69, 114, 206, 137, 164, 243, 173, 255, 250, 255,}, + {70, 115, 207, 137, 165, 243, 173, 255, 250, 255,}, + {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,}, + {72, 117, 208, 138, 166, 244, 175, 255, 250, 255,}, + {72, 118, 209, 139, 166, 245, 176, 255, 251, 255,}, + {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,}, + {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,}, + {75, 121, 212, 141, 167, 246, 178, 255, 251, 255,}, + {75, 122, 213, 141, 168, 246, 179, 255, 251, 255,}, + {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,}, + {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,}, + {78, 125, 215, 143, 169, 247, 182, 255, 252, 255,}, + {78, 126, 216, 144, 169, 247, 182, 255, 252, 255,}, + {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,}, + {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,}, + {81, 129, 219, 145, 170, 248, 185, 255, 253, 255,}, + {82, 130, 219, 146, 171, 249, 186, 255, 253, 255,}, + {82, 131, 220, 147, 171, 249, 187, 255, 253, 255,}, + {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,}, + {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,}, + {85, 134, 222, 148, 173, 250, 189, 255, 253, 255,}, + {85, 135, 223, 149, 173, 250, 190, 255, 254, 255,}, + {86, 136, 224, 150, 173, 
250, 191, 255, 254, 255,}, + {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,}, + {88, 138, 225, 151, 174, 251, 192, 255, 254, 255,}, + {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,}, + {89, 140, 227, 152, 175, 251, 194, 255, 254, 255,}, + {90, 141, 227, 153, 176, 251, 195, 255, 254, 255,}, + {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,}, + {92, 143, 229, 154, 176, 252, 196, 255, 254, 255,}, + {93, 144, 229, 155, 177, 252, 197, 255, 254, 255,}, + {93, 145, 230, 155, 177, 252, 198, 255, 255, 255,}, + {94, 146, 231, 156, 178, 252, 199, 255, 255, 255,}, + {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,}, + {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,}, + {97, 149, 232, 158, 179, 253, 201, 255, 255, 255,}, + {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,}, + {99, 151, 234, 159, 180, 253, 202, 255, 255, 255,}, + {99, 152, 234, 160, 181, 253, 203, 255, 255, 255,}, + {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,}, + {101, 154, 235, 162, 182, 253, 205, 255, 255, 255,}, + {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,}, + {103, 156, 236, 163, 183, 254, 206, 255, 255, 255,}, + {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,}, + {105, 158, 237, 164, 183, 254, 208, 255, 255, 255,}, + {105, 159, 238, 165, 184, 254, 209, 255, 255, 255,}, + {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,}, + {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,}, + {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,}, + {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,}, + {110, 164, 240, 169, 186, 254, 212, 255, 255, 255,}, + {111, 165, 241, 169, 187, 254, 213, 255, 255, 255,}, + {112, 166, 241, 170, 187, 255, 214, 255, 255, 255,}, + {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,}, + {114, 168, 242, 172, 189, 255, 215, 255, 255, 255,}, + {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,}, + {115, 170, 243, 173, 190, 255, 217, 255, 255, 255,}, + {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,}, + {117, 172, 244, 175, 191, 255, 218, 255, 255, 255,}, + {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,}, + {119, 174, 244, 176, 192, 255, 220, 255, 255, 255,}, + {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,}, + {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,}, + {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,}, + {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,}, + {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,}, + {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,}, + {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,}, + {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,}, + {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,}, + {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,}, + {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,}, + {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,}, + {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,}, + {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,}, + {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,}, + {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,}, + {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,}, + {137, 192, 250, 191, 202, 255, 231, 255, 255, 255,}, + {138, 193, 250, 191, 203, 255, 232, 255, 255, 255,}, + {139, 194, 250, 192, 203, 255, 233, 255, 255, 255,}, + {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,}, + {142, 196, 251, 194, 204, 255, 234, 255, 255, 255,}, + {143, 197, 251, 195, 205, 255, 234, 255, 255, 255,}, + {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,}, + {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,}, + {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,}, + {147, 201, 252, 198, 
208, 255, 237, 255, 255, 255,},
+  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},
+  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},
+  {151, 204, 253, 201, 210, 255, 238, 255, 255, 255,},
+  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},
+  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},
+  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},
+  {155, 208, 253, 205, 212, 255, 241, 255, 255, 255,},
+  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},
+  {158, 210, 253, 206, 214, 255, 242, 255, 255, 255,},
+  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},
+  {160, 212, 254, 208, 215, 255, 243, 255, 255, 255,},
+  {162, 213, 254, 209, 216, 255, 243, 255, 255, 255,},
+  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},
+  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},
+  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},
+  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},
+  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},
+  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},
+  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},
+  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},
+  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},
+  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},
+  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
+  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},
+  {179, 226, 255, 222, 226, 255, 249, 255, 255, 255,},
+  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},
+  {182, 228, 255, 224, 227, 255, 250, 255, 255, 255,},
+  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},
+  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
+  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},
+  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},
+  {190, 233, 255, 229, 232, 255, 251, 255, 255, 255,},
+  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},
+  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},
+  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
+  {197, 237, 255, 233, 235, 255, 253, 255, 255, 255,},
+  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},
+  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},
+  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},
+  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
+  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
+  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
+  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
+  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
+  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
+  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},
+  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},
+  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
+  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},
+  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
+  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
+  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
+  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
+  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
+};
+
+const vp9_prob vp9_modelcoefprobs_gg75p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
+  // Probs generated with a Generalized Gaussian (with shape parameter 0.75)
+  // source model with varying quantizer step size for a uniform quantizer
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},  // do not use
+  {1, 1, 3, 86, 129, 6, 87, 21, 90, 26,},
+  {1, 2, 6, 87, 129, 11, 88, 39, 93, 47,},
+  {2, 3, 9, 87, 130, 16, 89, 55, 96, 65,},
+  {2, 4, 11, 88, 130, 21, 89, 69, 98, 81,},
+  {3, 5, 14, 88, 130, 26, 90, 82, 101, 95,},
+  {3, 6, 17, 89, 131, 31, 91, 94, 103, 107,},
+ {4, 7, 20, 89, 131, 35, 92, 105, 105, 119,}, + {4, 8, 22, 90, 131, 40, 92, 115, 108, 129,}, + {5, 9, 25, 90, 132, 44, 93, 124, 110, 138,}, + {5, 10, 27, 91, 132, 48, 94, 133, 112, 147,}, + {6, 11, 30, 91, 132, 52, 95, 141, 114, 155,}, + {6, 12, 32, 92, 133, 56, 95, 148, 116, 162,}, + {7, 13, 35, 92, 133, 60, 96, 155, 118, 168,}, + {7, 14, 37, 92, 133, 64, 97, 161, 121, 174,}, + {8, 15, 40, 93, 134, 68, 97, 167, 123, 180,}, + {9, 16, 42, 93, 134, 71, 98, 173, 125, 185,}, + {9, 17, 44, 94, 134, 75, 99, 178, 127, 190,}, + {10, 18, 47, 94, 135, 78, 99, 182, 129, 195,}, + {10, 19, 49, 94, 135, 82, 100, 187, 131, 199,}, + {11, 20, 51, 95, 135, 85, 100, 191, 133, 202,}, + {11, 21, 54, 95, 135, 88, 101, 195, 135, 206,}, + {12, 22, 56, 96, 136, 92, 102, 199, 137, 209,}, + {13, 23, 58, 96, 136, 95, 102, 202, 138, 213,}, + {13, 24, 61, 96, 136, 98, 103, 206, 140, 215,}, + {14, 25, 63, 97, 137, 101, 104, 209, 142, 218,}, + {14, 26, 65, 97, 137, 104, 104, 211, 144, 221,}, + {15, 27, 67, 98, 137, 107, 105, 214, 146, 223,}, + {15, 28, 69, 98, 138, 110, 106, 217, 148, 225,}, + {16, 29, 71, 98, 138, 113, 106, 219, 150, 227,}, + {17, 30, 73, 99, 138, 115, 107, 221, 151, 229,}, + {17, 31, 76, 99, 138, 118, 107, 223, 153, 231,}, + {18, 32, 78, 100, 139, 121, 108, 225, 155, 232,}, + {18, 33, 80, 100, 139, 123, 109, 227, 157, 234,}, + {19, 34, 82, 100, 139, 126, 109, 229, 158, 235,}, + {20, 35, 84, 101, 140, 128, 110, 231, 160, 237,}, + {20, 36, 86, 101, 140, 131, 111, 232, 162, 238,}, + {21, 37, 88, 102, 140, 133, 111, 234, 164, 239,}, + {21, 38, 90, 102, 140, 136, 112, 235, 165, 240,}, + {22, 39, 92, 102, 141, 138, 112, 236, 167, 241,}, + {23, 40, 94, 103, 141, 140, 113, 237, 169, 242,}, + {23, 41, 95, 103, 141, 143, 114, 238, 170, 243,}, + {24, 42, 97, 103, 142, 145, 114, 240, 172, 244,}, + {25, 43, 99, 104, 142, 147, 115, 241, 173, 245,}, + {25, 44, 101, 104, 142, 149, 116, 242, 175, 246,}, + {26, 45, 103, 105, 142, 151, 116, 242, 176, 246,}, + {26, 46, 105, 105, 143, 153, 117, 243, 178, 247,}, + {27, 47, 107, 105, 143, 156, 117, 244, 180, 248,}, + {28, 48, 108, 106, 143, 158, 118, 245, 181, 248,}, + {28, 49, 110, 106, 144, 159, 119, 245, 182, 249,}, + {29, 50, 112, 107, 144, 161, 119, 246, 184, 249,}, + {30, 51, 114, 107, 144, 163, 120, 247, 185, 250,}, + {30, 52, 115, 108, 144, 165, 121, 247, 187, 250,}, + {31, 53, 117, 108, 145, 167, 121, 248, 188, 250,}, + {32, 54, 119, 108, 145, 169, 122, 248, 190, 251,}, + {32, 55, 121, 109, 145, 171, 123, 249, 191, 251,}, + {33, 56, 122, 109, 146, 172, 123, 249, 192, 251,}, + {34, 57, 124, 110, 146, 174, 124, 250, 194, 252,}, + {34, 58, 126, 110, 146, 176, 125, 250, 195, 252,}, + {35, 59, 127, 110, 147, 177, 125, 250, 196, 252,}, + {36, 60, 129, 111, 147, 179, 126, 251, 197, 253,}, + {36, 61, 130, 111, 147, 181, 127, 251, 199, 253,}, + {37, 62, 132, 112, 147, 182, 127, 251, 200, 253,}, + {38, 63, 134, 112, 148, 184, 128, 252, 201, 253,}, + {38, 64, 135, 112, 148, 185, 128, 252, 202, 253,}, + {39, 65, 137, 113, 148, 187, 129, 252, 204, 254,}, + {40, 66, 138, 113, 149, 188, 130, 253, 205, 254,}, + {40, 67, 140, 114, 149, 190, 130, 253, 206, 254,}, + {41, 68, 141, 114, 149, 191, 131, 253, 207, 254,}, + {42, 69, 143, 115, 150, 192, 132, 253, 208, 254,}, + {42, 70, 144, 115, 150, 194, 132, 253, 209, 254,}, + {43, 71, 146, 115, 150, 195, 133, 254, 210, 254,}, + {44, 72, 147, 116, 150, 197, 134, 254, 211, 255,}, + {44, 73, 149, 116, 151, 198, 134, 254, 212, 255,}, + {45, 74, 150, 117, 151, 199, 135, 254, 213, 255,}, + {46, 75, 152, 117, 151, 200, 136, 254, 214, 255,}, + 
{46, 76, 153, 118, 152, 202, 136, 254, 215, 255,}, + {47, 77, 154, 118, 152, 203, 137, 254, 216, 255,}, + {48, 78, 156, 119, 152, 204, 138, 254, 217, 255,}, + {49, 79, 157, 119, 153, 205, 139, 255, 218, 255,}, + {49, 80, 159, 119, 153, 206, 139, 255, 219, 255,}, + {50, 81, 160, 120, 153, 207, 140, 255, 220, 255,}, + {51, 82, 161, 120, 154, 208, 141, 255, 221, 255,}, + {51, 83, 163, 121, 154, 210, 141, 255, 222, 255,}, + {52, 84, 164, 121, 154, 211, 142, 255, 223, 255,}, + {53, 85, 165, 122, 154, 212, 143, 255, 223, 255,}, + {54, 86, 166, 122, 155, 213, 143, 255, 224, 255,}, + {54, 87, 168, 123, 155, 214, 144, 255, 225, 255,}, + {55, 88, 169, 123, 155, 215, 145, 255, 226, 255,}, + {56, 89, 170, 123, 156, 216, 145, 255, 227, 255,}, + {57, 90, 172, 124, 156, 217, 146, 255, 227, 255,}, + {57, 91, 173, 124, 156, 218, 147, 255, 228, 255,}, + {58, 92, 174, 125, 157, 218, 147, 255, 229, 255,}, + {59, 93, 175, 125, 157, 219, 148, 255, 230, 255,}, + {60, 94, 176, 126, 157, 220, 149, 255, 230, 255,}, + {60, 95, 178, 126, 158, 221, 150, 255, 231, 255,}, + {61, 96, 179, 127, 158, 222, 150, 255, 232, 255,}, + {62, 97, 180, 127, 158, 223, 151, 255, 232, 255,}, + {63, 98, 181, 128, 159, 224, 152, 255, 233, 255,}, + {63, 99, 182, 128, 159, 224, 152, 255, 234, 255,}, + {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,}, + {65, 101, 184, 129, 160, 226, 154, 255, 235, 255,}, + {66, 102, 186, 130, 160, 227, 154, 255, 235, 255,}, + {66, 103, 187, 130, 160, 227, 155, 255, 236, 255,}, + {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,}, + {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,}, + {69, 106, 190, 132, 161, 230, 157, 255, 238, 255,}, + {69, 107, 191, 132, 162, 230, 158, 255, 238, 255,}, + {70, 108, 192, 133, 162, 231, 159, 255, 239, 255,}, + {71, 109, 193, 133, 163, 232, 159, 255, 239, 255,}, + {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,}, + {73, 111, 195, 134, 163, 233, 161, 255, 240, 255,}, + {73, 112, 196, 135, 164, 233, 162, 255, 241, 255,}, + {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,}, + {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,}, + {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,}, + {77, 116, 200, 137, 165, 236, 165, 255, 242, 255,}, + {77, 117, 201, 137, 165, 236, 165, 255, 243, 255,}, + {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,}, + {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,}, + {80, 120, 204, 139, 166, 238, 167, 255, 244, 255,}, + {81, 121, 205, 139, 167, 238, 168, 255, 244, 255,}, + {82, 122, 206, 140, 167, 239, 169, 255, 245, 255,}, + {82, 123, 206, 141, 168, 239, 170, 255, 245, 255,}, + {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,}, + {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,}, + {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,}, + {86, 127, 210, 143, 169, 241, 173, 255, 246, 255,}, + {87, 128, 211, 143, 169, 242, 173, 255, 247, 255,}, + {87, 129, 212, 144, 170, 242, 174, 255, 247, 255,}, + {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,}, + {89, 131, 213, 145, 171, 243, 176, 255, 248, 255,}, + {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,}, + {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,}, + {92, 134, 216, 147, 172, 244, 178, 255, 248, 255,}, + {93, 135, 216, 147, 172, 244, 179, 255, 249, 255,}, + {93, 136, 217, 148, 173, 245, 179, 255, 249, 255,}, + {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,}, + {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,}, + {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,}, + {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,}, + {98, 141, 221, 151, 175, 246, 183, 255, 250, 255,}, + {99, 142, 222, 
151, 175, 247, 184, 255, 250, 255,}, + {100, 143, 222, 152, 175, 247, 184, 255, 251, 255,}, + {100, 144, 223, 153, 176, 247, 185, 255, 251, 255,}, + {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,}, + {102, 146, 224, 154, 177, 248, 187, 255, 251, 255,}, + {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,}, + {104, 148, 226, 155, 178, 248, 188, 255, 252, 255,}, + {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,}, + {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,}, + {107, 151, 228, 157, 179, 249, 190, 255, 252, 255,}, + {108, 152, 228, 158, 179, 249, 191, 255, 252, 255,}, + {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,}, + {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,}, + {111, 155, 230, 159, 181, 250, 193, 255, 253, 255,}, + {111, 156, 231, 160, 181, 250, 194, 255, 253, 255,}, + {112, 157, 231, 161, 181, 251, 195, 255, 253, 255,}, + {113, 158, 232, 161, 182, 251, 196, 255, 253, 255,}, + {114, 159, 233, 162, 182, 251, 196, 255, 253, 255,}, + {115, 160, 233, 163, 183, 251, 197, 255, 253, 255,}, + {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,}, + {117, 162, 234, 164, 184, 252, 199, 255, 254, 255,}, + {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,}, + {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,}, + {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,}, + {121, 166, 236, 167, 186, 252, 202, 255, 254, 255,}, + {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,}, + {123, 168, 237, 168, 187, 253, 203, 255, 254, 255,}, + {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,}, + {125, 170, 238, 169, 188, 253, 205, 255, 254, 255,}, + {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,}, + {127, 172, 239, 171, 189, 253, 206, 255, 254, 255,}, + {128, 173, 240, 172, 189, 253, 207, 255, 255, 255,}, + {129, 174, 240, 172, 190, 253, 208, 255, 255, 255,}, + {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,}, + {131, 176, 241, 174, 191, 254, 209, 255, 255, 255,}, + {132, 177, 242, 175, 191, 254, 210, 255, 255, 255,}, + {133, 178, 242, 175, 192, 254, 210, 255, 255, 255,}, + {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,}, + {135, 180, 243, 177, 193, 254, 212, 255, 255, 255,}, + {137, 181, 243, 177, 193, 254, 213, 255, 255, 255,}, + {138, 182, 244, 178, 194, 254, 213, 255, 255, 255,}, + {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,}, + {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,}, + {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,}, + {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,}, + {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,}, + {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,}, + {145, 189, 246, 184, 197, 255, 218, 255, 255, 255,}, + {146, 190, 247, 184, 198, 255, 219, 255, 255, 255,}, + {147, 191, 247, 185, 199, 255, 220, 255, 255, 255,}, + {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,}, + {150, 193, 247, 187, 200, 255, 221, 255, 255, 255,}, + {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,}, + {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,}, + {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,}, + {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,}, + {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,}, + {157, 199, 249, 192, 203, 255, 225, 255, 255, 255,}, + {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,}, + {159, 201, 250, 193, 205, 255, 227, 255, 255, 255,}, + {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,}, + {162, 203, 250, 195, 206, 255, 228, 255, 255, 255,}, + {163, 204, 251, 196, 206, 255, 229, 255, 255, 255,}, + {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,}, + {165, 206, 251, 198, 208, 255, 230, 255, 255, 
255,}, + {166, 207, 251, 199, 208, 255, 231, 255, 255, 255,}, + {168, 208, 251, 200, 209, 255, 231, 255, 255, 255,}, + {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,}, + {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,}, + {172, 211, 252, 202, 211, 255, 233, 255, 255, 255,}, + {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,}, + {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,}, + {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,}, + {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,}, + {178, 216, 253, 207, 215, 255, 237, 255, 255, 255,}, + {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,}, + {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,}, + {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,}, + {184, 220, 254, 211, 217, 255, 239, 255, 255, 255,}, + {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,}, + {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,}, + {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,}, + {189, 224, 254, 215, 221, 255, 241, 255, 255, 255,}, + {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,}, + {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,}, + {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,}, + {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,}, + {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,}, + {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,}, + {200, 231, 255, 222, 226, 255, 245, 255, 255, 255,}, + {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,}, + {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,}, + {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,}, + {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,}, + {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,}, + {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,}, + {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,}, + {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,}, + {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,}, + {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,}, + {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,}, + {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,}, + {223, 244, 255, 237, 239, 255, 252, 255, 255, 255,}, + {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,}, + {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,}, + {229, 247, 255, 241, 242, 255, 253, 255, 255, 255,}, + {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,}, + {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,}, + {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,}, + {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,}, + {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,}, + {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,}, + {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,}, + {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,}, +}; + +const vp9_prob vp9_modelcoefprobs_gg625p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = { + // Probs generated with a Generalized Gaussian (with shape parameter 0.625) + // source model with varying quantizer step size for a uniform quantizer + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}, // do not use + {1, 1, 3, 87, 129, 6, 87, 20, 91, 24,}, + {1, 2, 6, 88, 130, 11, 89, 36, 94, 41,}, + {2, 3, 8, 88, 130, 15, 90, 50, 97, 56,}, + {2, 4, 11, 89, 131, 20, 90, 62, 99, 70,}, + {3, 5, 14, 90, 131, 24, 91, 74, 102, 81,}, + {3, 6, 16, 90, 132, 29, 92, 84, 104, 92,}, + {4, 7, 19, 91, 132, 33, 93, 93, 106, 101,}, + {4, 8, 21, 91, 132, 37, 93, 102, 108, 110,}, + {5, 9, 24, 92, 133, 40, 94, 110, 110, 118,}, + {5, 10, 26, 92, 133, 44, 95, 118, 111, 125,}, + {6, 11, 29, 93, 134, 48, 96, 125, 113, 132,}, + {7, 12, 31, 93, 134, 51, 96, 132, 115, 139,}, + {7, 13, 33, 
93, 134, 55, 97, 138, 117, 145,}, + {8, 14, 36, 94, 135, 58, 97, 144, 119, 150,}, + {8, 15, 38, 94, 135, 62, 98, 149, 120, 155,}, + {9, 16, 40, 95, 135, 65, 99, 154, 122, 160,}, + {10, 17, 42, 95, 136, 68, 99, 159, 124, 165,}, + {10, 18, 45, 96, 136, 71, 100, 164, 125, 169,}, + {11, 19, 47, 96, 136, 74, 100, 168, 127, 174,}, + {11, 20, 49, 96, 136, 77, 101, 173, 128, 177,}, + {12, 21, 51, 97, 137, 80, 102, 176, 130, 181,}, + {13, 22, 53, 97, 137, 83, 102, 180, 131, 185,}, + {13, 23, 55, 98, 137, 86, 103, 184, 133, 188,}, + {14, 24, 57, 98, 138, 89, 103, 187, 135, 191,}, + {14, 25, 59, 98, 138, 91, 104, 190, 136, 194,}, + {15, 26, 61, 99, 138, 94, 104, 193, 138, 197,}, + {16, 27, 64, 99, 139, 97, 105, 196, 139, 200,}, + {16, 28, 66, 100, 139, 99, 106, 199, 141, 202,}, + {17, 29, 68, 100, 139, 102, 106, 201, 142, 205,}, + {18, 30, 69, 100, 139, 104, 107, 204, 143, 207,}, + {18, 31, 71, 101, 140, 107, 107, 206, 145, 209,}, + {19, 32, 73, 101, 140, 109, 108, 209, 146, 211,}, + {20, 33, 75, 102, 140, 112, 108, 211, 148, 213,}, + {20, 34, 77, 102, 141, 114, 109, 213, 149, 215,}, + {21, 35, 79, 102, 141, 116, 109, 215, 150, 217,}, + {22, 36, 81, 103, 141, 119, 110, 217, 152, 219,}, + {22, 37, 83, 103, 141, 121, 110, 218, 153, 220,}, + {23, 38, 85, 103, 142, 123, 111, 220, 155, 222,}, + {24, 39, 87, 104, 142, 125, 112, 222, 156, 224,}, + {24, 40, 88, 104, 142, 127, 112, 223, 157, 225,}, + {25, 41, 90, 105, 143, 129, 113, 225, 159, 226,}, + {26, 42, 92, 105, 143, 131, 113, 226, 160, 228,}, + {26, 43, 94, 105, 143, 133, 114, 227, 161, 229,}, + {27, 44, 95, 106, 143, 135, 114, 229, 162, 230,}, + {28, 45, 97, 106, 144, 137, 115, 230, 164, 231,}, + {28, 46, 99, 107, 144, 139, 115, 231, 165, 232,}, + {29, 47, 101, 107, 144, 141, 116, 232, 166, 233,}, + {30, 48, 102, 107, 145, 143, 116, 233, 168, 234,}, + {31, 49, 104, 108, 145, 145, 117, 234, 169, 235,}, + {31, 50, 106, 108, 145, 147, 118, 235, 170, 236,}, + {32, 51, 107, 108, 145, 149, 118, 236, 171, 237,}, + {33, 52, 109, 109, 146, 150, 119, 237, 172, 238,}, + {33, 53, 111, 109, 146, 152, 119, 238, 174, 239,}, + {34, 54, 112, 110, 146, 154, 120, 239, 175, 240,}, + {35, 55, 114, 110, 146, 156, 120, 240, 176, 240,}, + {36, 56, 115, 110, 147, 157, 121, 240, 177, 241,}, + {36, 57, 117, 111, 147, 159, 121, 241, 178, 242,}, + {37, 58, 119, 111, 147, 161, 122, 242, 180, 242,}, + {38, 59, 120, 112, 148, 162, 122, 242, 181, 243,}, + {38, 60, 122, 112, 148, 164, 123, 243, 182, 244,}, + {39, 61, 123, 112, 148, 165, 124, 244, 183, 244,}, + {40, 62, 125, 113, 148, 167, 124, 244, 184, 245,}, + {41, 63, 126, 113, 149, 168, 125, 245, 185, 245,}, + {41, 64, 128, 114, 149, 170, 125, 245, 186, 246,}, + {42, 65, 129, 114, 149, 171, 126, 246, 187, 246,}, + {43, 66, 131, 114, 150, 173, 126, 246, 188, 247,}, + {44, 67, 132, 115, 150, 174, 127, 247, 189, 247,}, + {44, 68, 134, 115, 150, 176, 127, 247, 191, 247,}, + {45, 69, 135, 116, 150, 177, 128, 248, 192, 248,}, + {46, 70, 136, 116, 151, 178, 129, 248, 193, 248,}, + {47, 71, 138, 116, 151, 180, 129, 248, 194, 249,}, + {48, 72, 139, 117, 151, 181, 130, 249, 195, 249,}, + {48, 73, 141, 117, 152, 183, 130, 249, 196, 249,}, + {49, 74, 142, 118, 152, 184, 131, 249, 197, 250,}, + {50, 75, 143, 118, 152, 185, 131, 250, 198, 250,}, + {51, 76, 145, 118, 152, 186, 132, 250, 199, 250,}, + {51, 77, 146, 119, 153, 188, 132, 250, 200, 250,}, + {52, 78, 148, 119, 153, 189, 133, 251, 201, 251,}, + {53, 79, 149, 120, 153, 190, 134, 251, 201, 251,}, + {54, 80, 150, 120, 154, 191, 134, 251, 202, 251,}, + {55, 81, 151, 120, 154, 192, 135, 
251, 203, 251,}, + {55, 82, 153, 121, 154, 194, 135, 252, 204, 252,}, + {56, 83, 154, 121, 155, 195, 136, 252, 205, 252,}, + {57, 84, 155, 122, 155, 196, 136, 252, 206, 252,}, + {58, 85, 157, 122, 155, 197, 137, 252, 207, 252,}, + {59, 86, 158, 123, 155, 198, 138, 252, 208, 252,}, + {59, 87, 159, 123, 156, 199, 138, 253, 209, 253,}, + {60, 88, 160, 123, 156, 200, 139, 253, 210, 253,}, + {61, 89, 162, 124, 156, 201, 139, 253, 210, 253,}, + {62, 90, 163, 124, 157, 202, 140, 253, 211, 253,}, + {63, 91, 164, 125, 157, 203, 140, 253, 212, 253,}, + {64, 92, 165, 125, 157, 204, 141, 253, 213, 253,}, + {64, 93, 166, 126, 158, 205, 142, 254, 214, 253,}, + {65, 94, 168, 126, 158, 206, 142, 254, 214, 254,}, + {66, 95, 169, 126, 158, 207, 143, 254, 215, 254,}, + {67, 96, 170, 127, 158, 208, 143, 254, 216, 254,}, + {68, 97, 171, 127, 159, 209, 144, 254, 217, 254,}, + {69, 98, 172, 128, 159, 210, 145, 254, 218, 254,}, + {69, 99, 173, 128, 159, 211, 145, 254, 218, 254,}, + {70, 100, 175, 129, 160, 212, 146, 254, 219, 254,}, + {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,}, + {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,}, + {73, 103, 178, 130, 161, 214, 148, 255, 221, 255,}, + {74, 104, 179, 130, 161, 215, 148, 255, 222, 255,}, + {75, 105, 180, 131, 161, 216, 149, 255, 223, 255,}, + {75, 106, 181, 131, 162, 217, 149, 255, 223, 255,}, + {76, 107, 182, 132, 162, 218, 150, 255, 224, 255,}, + {77, 108, 183, 132, 162, 219, 151, 255, 225, 255,}, + {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,}, + {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,}, + {80, 111, 186, 134, 163, 221, 152, 255, 226, 255,}, + {81, 112, 187, 134, 164, 222, 153, 255, 227, 255,}, + {82, 113, 188, 135, 164, 222, 154, 255, 228, 255,}, + {83, 114, 189, 135, 164, 223, 154, 255, 228, 255,}, + {83, 115, 190, 136, 165, 224, 155, 255, 229, 255,}, + {84, 116, 191, 136, 165, 224, 156, 255, 230, 255,}, + {85, 117, 192, 137, 165, 225, 156, 255, 230, 255,}, + {86, 118, 193, 137, 166, 226, 157, 255, 231, 255,}, + {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,}, + {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,}, + {89, 121, 196, 138, 167, 228, 159, 255, 232, 255,}, + {90, 122, 197, 139, 167, 228, 159, 255, 233, 255,}, + {91, 123, 198, 139, 167, 229, 160, 255, 233, 255,}, + {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,}, + {93, 125, 200, 140, 168, 230, 161, 255, 234, 255,}, + {93, 126, 201, 141, 168, 231, 162, 255, 235, 255,}, + {94, 127, 202, 141, 169, 231, 163, 255, 235, 255,}, + {95, 128, 203, 142, 169, 232, 163, 255, 236, 255,}, + {96, 129, 203, 142, 169, 233, 164, 255, 236, 255,}, + {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,}, + {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,}, + {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,}, + {100, 133, 207, 145, 171, 235, 166, 255, 238, 255,}, + {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,}, + {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,}, + {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,}, + {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,}, + {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,}, + {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,}, + {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,}, + {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,}, + {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,}, + {110, 143, 215, 150, 174, 240, 173, 255, 242, 255,}, + {111, 144, 216, 150, 175, 240, 174, 255, 243, 255,}, + {112, 145, 216, 151, 175, 240, 174, 255, 243, 255,}, + {113, 146, 217, 152, 176, 241, 175, 255, 243, 255,}, + {114, 147, 218, 152, 176, 
241, 176, 255, 244, 255,}, + {115, 148, 219, 153, 176, 242, 176, 255, 244, 255,}, + {116, 149, 219, 153, 177, 242, 177, 255, 244, 255,}, + {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,}, + {118, 151, 221, 154, 178, 243, 178, 255, 245, 255,}, + {119, 152, 221, 155, 178, 243, 179, 255, 245, 255,}, + {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,}, + {121, 154, 223, 156, 179, 244, 180, 255, 246, 255,}, + {122, 155, 223, 157, 179, 244, 181, 255, 246, 255,}, + {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,}, + {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,}, + {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,}, + {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,}, + {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,}, + {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,}, + {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,}, + {130, 163, 229, 162, 183, 247, 187, 255, 248, 255,}, + {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,}, + {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,}, + {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,}, + {135, 167, 231, 164, 184, 248, 190, 255, 249, 255,}, + {136, 168, 232, 165, 185, 248, 190, 255, 250, 255,}, + {137, 169, 232, 165, 185, 248, 191, 255, 250, 255,}, + {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,}, + {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,}, + {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,}, + {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,}, + {142, 174, 235, 169, 187, 250, 195, 255, 251, 255,}, + {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,}, + {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,}, + {146, 177, 236, 171, 189, 250, 197, 255, 251, 255,}, + {147, 178, 237, 171, 189, 251, 197, 255, 252, 255,}, + {148, 179, 237, 172, 190, 251, 198, 255, 252, 255,}, + {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,}, + {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,}, + {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,}, + {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,}, + {153, 184, 240, 176, 192, 252, 202, 255, 253, 255,}, + {155, 185, 240, 176, 193, 252, 203, 255, 253, 255,}, + {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,}, + {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,}, + {158, 188, 242, 179, 194, 252, 205, 255, 253, 255,}, + {159, 189, 242, 179, 195, 252, 206, 255, 253, 255,}, + {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,}, + {162, 191, 243, 181, 196, 253, 207, 255, 253, 255,}, + {163, 192, 243, 182, 196, 253, 208, 255, 254, 255,}, + {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,}, + {165, 194, 244, 183, 198, 253, 209, 255, 254, 255,}, + {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,}, + {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,}, + {169, 197, 245, 185, 199, 254, 212, 255, 254, 255,}, + {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,}, + {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,}, + {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,}, + {174, 201, 247, 189, 201, 254, 215, 255, 254, 255,}, + {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,}, + {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,}, + {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,}, + {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,}, + {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,}, + {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,}, + {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,}, + {184, 209, 249, 195, 206, 255, 221, 255, 255, 255,}, + {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,}, + {186, 211, 250, 197, 208, 255, 222, 255, 255, 255,}, + {188, 
212, 250, 198, 208, 255, 223, 255, 255, 255,}, + {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,}, + {190, 214, 251, 200, 210, 255, 224, 255, 255, 255,}, + {192, 215, 251, 201, 210, 255, 225, 255, 255, 255,}, + {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,}, + {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,}, + {196, 218, 252, 204, 212, 255, 228, 255, 255, 255,}, + {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,}, + {198, 220, 252, 206, 214, 255, 229, 255, 255, 255,}, + {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,}, + {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,}, + {202, 223, 253, 209, 216, 255, 231, 255, 255, 255,}, + {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,}, + {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,}, + {207, 226, 253, 212, 218, 255, 234, 255, 255, 255,}, + {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,}, + {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,}, + {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,}, + {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,}, + {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,}, + {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,}, + {217, 233, 254, 219, 224, 255, 239, 255, 255, 255,}, + {218, 234, 255, 221, 225, 255, 240, 255, 255, 255,}, + {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,}, + {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,}, + {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,}, + {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,}, + {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,}, + {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,}, + {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,}, + {231, 242, 255, 231, 233, 255, 246, 255, 255, 255,}, + {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,}, + {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,}, + {235, 245, 255, 235, 237, 255, 248, 255, 255, 255,}, + {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,}, + {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,}, + {241, 248, 255, 239, 241, 255, 250, 255, 255, 255,}, + {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,}, + {244, 250, 255, 243, 243, 255, 252, 255, 255, 255,}, + {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,}, + {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,}, + {250, 253, 255, 248, 248, 255, 254, 255, 255, 255,}, + {252, 254, 255, 250, 250, 255, 255, 255, 255, 255,}, + {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,}, +}; + +void vp9_get_model_distribution(vp9_prob p, vp9_prob *tree_probs, + int b, int r) { + const vp9_prob (*model)[ENTROPY_NODES - 1]; +#if UNCONSTRAINED_NODES == 2 + if (r != INTRA_FRAME && b == PLANE_TYPE_UV) + model = vp9_modelcoefprobs_gg75; + else if (r == INTRA_FRAME && b == PLANE_TYPE_UV) + model = vp9_modelcoefprobs_gg75; + else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC) + model = vp9_modelcoefprobs_gg75; + else + model = vp9_modelcoefprobs_gg75; +#else + if (r != INTRA_FRAME && b == PLANE_TYPE_UV) + model = vp9_modelcoefprobs_gg75p1; + else if (r == INTRA_FRAME && b == PLANE_TYPE_UV) + model = vp9_modelcoefprobs_gg75p1; + else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC) + model = vp9_modelcoefprobs_gg75p1; + else + model = vp9_modelcoefprobs_gg75p1; +#endif + vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, + model[p] + UNCONSTRAINED_NODES - 1, + (ENTROPY_NODES - UNCONSTRAINED_NODES) * sizeof(vp9_prob)); +} +#endif + static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; static void init_bit_tree(vp9_tree_index *p, int n) { @@ -937,7 +2094,187 @@ vp9_extra_bit_struct 
vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" -#if CONFIG_NEWCOEFCONTEXT +// This function updates and then returns the AC coefficient context +// This is currently a placeholder function to allow experimentation +// using various context models based on the energy of earlier tokens +// within the current block. +// +// For now it just returns the previously used context. +#define MAX_NEIGHBORS 2 +int vp9_get_coef_context(const int *scan, const int *neighbors, + int nb_pad, uint8_t *token_cache, int c, int l) { + int eob = l; + assert(nb_pad == MAX_NEIGHBORS); + if (c == eob) { + return 0; + } else { + int ctx; + assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0); + if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) { + ctx = (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; + } else { + ctx = token_cache[neighbors[MAX_NEIGHBORS * c + 0]]; + } + return vp9_pt_energy_class[ctx]; + } +} + +void vp9_default_coef_probs(VP9_COMMON *pc) { +#if CONFIG_MODELCOEFPROB + int b, r, c, p; +#endif +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_DEFAULT_COUNTS + int h, g; + for (h = 0; h < MAX_NZC_CONTEXTS; ++h) { + for (g = 0; g < REF_TYPES; ++g) { + int i; + unsigned int branch_ct4x4[NZC4X4_NODES][2]; + unsigned int branch_ct8x8[NZC8X8_NODES][2]; + unsigned int branch_ct16x16[NZC16X16_NODES][2]; + unsigned int branch_ct32x32[NZC32X32_NODES][2]; + for (i = 0; i < BLOCK_TYPES; ++i) { + vp9_tree_probs_from_distribution( + vp9_nzc4x4_tree, + pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4, + default_nzc_counts_4x4[h][g][i], 0); + } + for (i = 0; i < BLOCK_TYPES; ++i) { + vp9_tree_probs_from_distribution( + vp9_nzc8x8_tree, + pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8, + default_nzc_counts_8x8[h][g][i], 0); + } + for (i = 0; i < BLOCK_TYPES; ++i) { + vp9_tree_probs_from_distribution( + vp9_nzc16x16_tree, + pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16, + default_nzc_counts_16x16[h][g][i], 0); + } + for (i = 0; i < BLOCK_TYPES; ++i) { + vp9_tree_probs_from_distribution( + vp9_nzc32x32_tree, + pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32, + default_nzc_counts_32x32[h][g][i], 0); + } + } + } +#else + vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4, + sizeof(pc->fc.nzc_probs_4x4)); + vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8, + sizeof(pc->fc.nzc_probs_8x8)); + vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16, + sizeof(pc->fc.nzc_probs_16x16)); + vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32, + sizeof(pc->fc.nzc_probs_32x32)); +#endif + vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs, + sizeof(pc->fc.nzc_pcat_probs)); +#endif // CONFIG_CODE_NONZEROCOUNT +#if CONFIG_MODELCOEFPROB + for (b = 0; b < BLOCK_TYPES; ++b) + for (r = 0; r < REF_TYPES; ++r) + for (c = 0; c < COEF_BANDS; ++c) + for (p = 0; p < PREV_COEF_CONTEXTS; ++p) { + int t; + for (t = 0; t < UNCONSTRAINED_NODES; t++) + pc->fc.coef_probs_4x4[b][r][c][p][t] = + default_coef_probs_4x4[b][r][c][p][t]; + vp9_get_model_distribution( + default_coef_probs_4x4[b][r][c][p][UNCONSTRAINED_NODES - 1], + pc->fc.coef_probs_4x4[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) + pc->fc.coef_probs_8x8[b][r][c][p][t] = + default_coef_probs_8x8[b][r][c][p][t]; + vp9_get_model_distribution( + default_coef_probs_8x8[b][r][c][p][UNCONSTRAINED_NODES - 1], + pc->fc.coef_probs_8x8[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) + pc->fc.coef_probs_16x16[b][r][c][p][t] = + default_coef_probs_16x16[b][r][c][p][t]; + 
vp9_get_model_distribution( + default_coef_probs_16x16[b][r][c][p][UNCONSTRAINED_NODES - 1], + pc->fc.coef_probs_16x16[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) + pc->fc.coef_probs_32x32[b][r][c][p][t] = + default_coef_probs_32x32[b][r][c][p][t]; + vp9_get_model_distribution( + default_coef_probs_32x32[b][r][c][p][UNCONSTRAINED_NODES - 1], + pc->fc.coef_probs_32x32[b][r][c][p], b, r); + } +#else + vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4, + sizeof(pc->fc.coef_probs_4x4)); + vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8, + sizeof(pc->fc.coef_probs_8x8)); + vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16, + sizeof(pc->fc.coef_probs_16x16)); + vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32, + sizeof(pc->fc.coef_probs_32x32)); +#endif +} + +#if CONFIG_MODELCOEFPROB +// This is a placeholder function that will enable the default coef probs to +// change for key frames based on the base_qindex. If base_qindex is large, +// we can expect probabilities of zeros to be bigger, and vice versa. The rest +// of the probabilities are derived from the model. +void vp9_adjust_default_coef_probs(VP9_COMMON *cm) { + static const int factor_bits = 4; + static const int factor_rnd = 8; // (1 << (factor_bits - 1)) + int b, r, c, p; + int factor = (1 << factor_bits); + /* + if (cm->base_qindex < 32) + factor -= ((32 - cm->base_qindex) >> 4); + */ + if (cm->base_qindex > 128) + factor += ((cm->base_qindex - 128) >> 4); + // printf(" Q %d factor %d\n", cm->base_qindex, factor); + + for (b = 0; b < BLOCK_TYPES; ++b) + for (r = 0; r < REF_TYPES; ++r) + for (c = 0; c < COEF_BANDS; ++c) + for (p = 0; p < PREV_COEF_CONTEXTS; ++p) { + int t, x; + vp9_prob prob; + for (t = 0; t < UNCONSTRAINED_NODES; t++) { + x = (default_coef_probs_4x4[b][r][c][p][t] * factor + factor_rnd) + >> factor_bits; + prob = (x > 255 ? 255 : (x < 1 ? 1 : x)); + cm->fc.coef_probs_4x4[b][r][c][p][t] = prob; + } + vp9_get_model_distribution( + prob, cm->fc.coef_probs_4x4[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) { + x = (default_coef_probs_8x8[b][r][c][p][t] * factor + factor_rnd) + >> factor_bits; + prob = (x > 255 ? 255 : (x < 1 ? 1 : x)); + cm->fc.coef_probs_8x8[b][r][c][p][t] = prob; + } + vp9_get_model_distribution( + prob, cm->fc.coef_probs_8x8[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) { + x = (default_coef_probs_16x16[b][r][c][p][t] * factor + factor_rnd) + >> factor_bits; + prob = (x > 255 ? 255 : (x < 1 ? 1 : x)); + cm->fc.coef_probs_16x16[b][r][c][p][t] = prob; + } + vp9_get_model_distribution( + prob, cm->fc.coef_probs_16x16[b][r][c][p], b, r); + for (t = 0; t < UNCONSTRAINED_NODES; t++) { + x = (default_coef_probs_32x32[b][r][c][p][t] * factor + factor_rnd) + >> factor_bits; + prob = (x > 255 ? 255 : (x < 1 ? 
1 : x)); + cm->fc.coef_probs_32x32[b][r][c][p][t] = prob; + } + vp9_get_model_distribution( + prob, cm->fc.coef_probs_32x32[b][r][c][p], b, r); + } +} +#endif // Neighborhood 5-tuples for various scans and blocksizes, // in {top, left, topleft, topright, bottomleft} order @@ -949,159 +2286,1236 @@ DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]); -static int find_in_scan(const int *scan, int l, int m) { - int i, l2 = l * l; - for (i = 0; i < l2; ++i) { - if (scan[i] == m) - return i; +static int find_in_scan(const int *scan, int l, int idx) { + int n, l2 = l * l; + for (n = 0; n < l2; n++) { + int rc = scan[n]; + if (rc == idx) + return n; } + assert(0); return -1; } - -static void init_scan_neighbors(const int *scan, int l, int *neighbors) { +static void init_scan_neighbors(const int *scan, int l, int *neighbors, + int max_neighbors) { int l2 = l * l; - int m, n, i, j, k; - for (n = 0; n < l2; ++n) { - int locn = find_in_scan(scan, l, n); - int z = -1; - i = n / l; - j = n % l; - for (k = 0; k < MAX_NEIGHBORS; ++k) - neighbors[MAX_NEIGHBORS * n + k] = -1; - if (i - 1 >= 0) { - m = (i - 1) * l + j; - if (find_in_scan(scan, l, m) < locn) { - neighbors[MAX_NEIGHBORS * n] = m; - if (m == 0) z = 0; - } - } - if (j - 1 >= 0) { - m = i * l + j - 1; - if (find_in_scan(scan, l, m) < locn) { - neighbors[MAX_NEIGHBORS * n + 1] = m; - if (m == 0) z = 1; - } - } - if (i - 1 >= 0 && j - 1 >= 0) { - m = (i - 1) * l + j - 1; - if (find_in_scan(scan, l, m) < locn) { - neighbors[MAX_NEIGHBORS * n + 2] = m; - if (m == 0) z = 2; - } - } - if (i - 1 >= 0 && j + 1 < l) { - m = (i - 1) * l + j + 1; - if (find_in_scan(scan, l, m) < locn) { - neighbors[MAX_NEIGHBORS * n + 3] = m; - if (m == 0) z = 3; - } - } - if (i + 1 < l && j - 1 >= 0) { - m = (i + 1) * l + j - 1; - if (find_in_scan(scan, l, m) < locn) { - neighbors[MAX_NEIGHBORS * n + 4] = m; - if (m == 0) z = 4; - } - } - if (z != -1) { // zero exists - int v = 0; - for (k = 0; k < MAX_NEIGHBORS; ++k) - v += (neighbors[MAX_NEIGHBORS * n + k] > 0); - if (v) { - neighbors[MAX_NEIGHBORS * n + z] = -1; + int n, i, j; + + for (n = 0; n < l2; n++) { + int rc = scan[n]; + assert(max_neighbors == MAX_NEIGHBORS); + i = rc / l; + j = rc % l; + if (i > 0 && j > 0) { + // col/row scan is used for adst/dct, and generally means that + // energy decreases to zero much faster in the dimension in + // which ADST is used compared to the direction in which DCT + // is used. Likewise, we find much higher correlation between + // coefficients within the direction in which DCT is used. + // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff + // as a context. If ADST or DCT is used in both directions, we + // use the combination of the two as a context. 
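To make the neighbor rule above concrete, here is a minimal editorial sketch (not part of the patch; pick_context_neighbors, is_col_scan and is_row_scan are hypothetical names standing in for the scan-pointer comparisons below). It works on raster positions for clarity, whereas the real code stores scan-order indices obtained through find_in_scan():

/* Hedged sketch: choose up to two context neighbors for the coefficient
 * at row i, column j of an l x l block, mirroring the preference rule
 * described in the comment above. */
static void pick_context_neighbors(int i, int j, int l,
                                   int is_col_scan, int is_row_scan,
                                   int *n0, int *n1) {
  const int above = (i - 1) * l + j;  /* neighbor directly above */
  const int left = i * l + (j - 1);   /* neighbor directly to the left */
  if (is_col_scan) {          /* column scan: keep only the above neighbor */
    *n0 = above;
    *n1 = -1;
  } else if (is_row_scan) {   /* row scan: keep only the left neighbor */
    *n0 = left;
    *n1 = -1;
  } else {                    /* 2-D scan: combine both neighbors */
    *n0 = above;
    *n1 = left;
  }
}

When both neighbors survive, vp9_get_coef_context() earlier in this file averages their token_cache entries as (1 + t0 + t1) >> 1 before mapping the result through vp9_pt_energy_class.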
+ int a = find_in_scan(scan, l, (i - 1) * l + j); + int b = find_in_scan(scan, l, i * l + j - 1); + if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || + scan == vp9_col_scan_16x16) { + neighbors[max_neighbors * n + 0] = a; + neighbors[max_neighbors * n + 1] = -1; + } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || + scan == vp9_row_scan_16x16) { + neighbors[max_neighbors * n + 0] = b; + neighbors[max_neighbors * n + 1] = -1; + } else { + neighbors[max_neighbors * n + 0] = a; + neighbors[max_neighbors * n + 1] = b; } + } else if (i > 0) { + neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j); + neighbors[max_neighbors * n + 1] = -1; + } else if (j > 0) { + neighbors[max_neighbors * n + 0] = + find_in_scan(scan, l, i * l + j - 1); + neighbors[max_neighbors * n + 1] = -1; + } else { + assert(n == 0); + // dc predictor doesn't use previous tokens + neighbors[max_neighbors * n + 0] = -1; } + assert(neighbors[max_neighbors * n + 0] < n); } } void vp9_init_neighbors() { init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4, - vp9_default_zig_zag1d_4x4_neighbors); + vp9_default_zig_zag1d_4x4_neighbors, MAX_NEIGHBORS); init_scan_neighbors(vp9_row_scan_4x4, 4, - vp9_row_scan_4x4_neighbors); + vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS); init_scan_neighbors(vp9_col_scan_4x4, 4, - vp9_col_scan_4x4_neighbors); + vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS); init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8, - vp9_default_zig_zag1d_8x8_neighbors); + vp9_default_zig_zag1d_8x8_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_row_scan_8x8, 8, + vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_col_scan_8x8, 8, + vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS); init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16, - vp9_default_zig_zag1d_16x16_neighbors); + vp9_default_zig_zag1d_16x16_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_row_scan_16x16, 16, + vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_col_scan_16x16, 16, + vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS); init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32, - vp9_default_zig_zag1d_32x32_neighbors); + vp9_default_zig_zag1d_32x32_neighbors, MAX_NEIGHBORS); } -const int *vp9_get_coef_neighbors_handle(const int *scan) { +const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) { if (scan == vp9_default_zig_zag1d_4x4) { + *pad = MAX_NEIGHBORS; return vp9_default_zig_zag1d_4x4_neighbors; } else if (scan == vp9_row_scan_4x4) { + *pad = MAX_NEIGHBORS; return vp9_row_scan_4x4_neighbors; } else if (scan == vp9_col_scan_4x4) { + *pad = MAX_NEIGHBORS; return vp9_col_scan_4x4_neighbors; } else if (scan == vp9_default_zig_zag1d_8x8) { + *pad = MAX_NEIGHBORS; return vp9_default_zig_zag1d_8x8_neighbors; + } else if (scan == vp9_row_scan_8x8) { + *pad = 2; + return vp9_row_scan_8x8_neighbors; + } else if (scan == vp9_col_scan_8x8) { + *pad = 2; + return vp9_col_scan_8x8_neighbors; } else if (scan == vp9_default_zig_zag1d_16x16) { + *pad = MAX_NEIGHBORS; return vp9_default_zig_zag1d_16x16_neighbors; + } else if (scan == vp9_row_scan_16x16) { + *pad = 2; + return vp9_row_scan_16x16_neighbors; + } else if (scan == vp9_col_scan_16x16) { + *pad = 2; + return vp9_col_scan_16x16_neighbors; } else if (scan == vp9_default_zig_zag1d_32x32) { + *pad = MAX_NEIGHBORS; return vp9_default_zig_zag1d_32x32_neighbors; + } else { + assert(0); + return NULL; } - return vp9_default_zig_zag1d_4x4_neighbors; } -int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc, - 
const int *neigbor_handle, int rc) { - static int neighbors_used = MAX_NEIGHBORS; // maximum is MAX_NEIGHBORS - const int *nb = neigbor_handle + rc * MAX_NEIGHBORS; - int i, v, val = 0, n = 0; - for (i = 0; i < neighbors_used; ++i) { - if (nb[i] == -1 || (nb[i] == 0 && nodc)) { - continue; - } - v = abs(qcoeff_ptr[nb[i]]); - val = (v > val ? v : val); - n++; +void vp9_coef_tree_initialize() { + vp9_init_neighbors(); + init_bit_trees(); + vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); +#if CONFIG_CODE_NONZEROCOUNT + vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree); + vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree); + vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree); + vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree); +#endif +} + +#if CONFIG_CODE_NONZEROCOUNT + +#define mb_in_cur_tile(cm, mb_row, mb_col) \ + ((mb_col) >= (cm)->cur_tile_mb_col_start && \ + (mb_col) <= (cm)->cur_tile_mb_col_end && \ + (mb_row) >= 0) + +#define choose_nzc_context(nzc_exp, t2, t1) \ + ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0) + +#define NZC_T2_32X32 (16 << 6) +#define NZC_T1_32X32 (4 << 6) + +#define NZC_T2_16X16 (12 << 6) +#define NZC_T1_16X16 (3 << 6) + +#define NZC_T2_8X8 (8 << 6) +#define NZC_T1_8X8 (2 << 6) + +#define NZC_T2_4X4 (4 << 6) +#define NZC_T1_4X4 (1 << 6) + +// Transforms a mb16 block index to a sb64 block index +static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) { + int r = (mb_row & 3); + int c = (mb_col & 3); + int b; + if (block < 16) { // Y + int ib = block >> 2; + int jb = block & 3; + ib += r * 4; + jb += c * 4; + b = ib * 16 + jb; + assert(b < 256); + return b; + } else { // UV + int base = block - (block & 3); + int ib = (block - base) >> 1; + int jb = (block - base) & 1; + ib += r * 2; + jb += c * 2; + b = base * 16 + ib * 8 + jb; + assert(b >= 256 && b < 384); + return b; } - if (n == 0) +} + +// Transforms a mb16 block index to a sb32 block index +static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) { + int r = (mb_row & 1); + int c = (mb_col & 1); + int b; + if (block < 16) { // Y + int ib = block >> 2; + int jb = block & 3; + ib += r * 4; + jb += c * 4; + b = ib * 8 + jb; + assert(b < 64); + return b; + } else { // UV + int base = block - (block & 3); + int ib = (block - base) >> 1; + int jb = (block - base) & 1; + ib += r * 2; + jb += c * 2; + b = base * 4 + ib * 4 + jb; + assert(b >= 64 && b < 96); + return b; + } +} + +static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) { + // s is the log of the number of 4x4 blocks in each row/col of larger block + int b, ib, jb, nb; + ib = block >> s; + jb = block - (ib << s); + ib >>= tx_size; + jb >>= tx_size; + nb = 1 << (s - tx_size); + b = (ib * nb + jb) << (2 * tx_size); + return b; +} + +/* BEGIN - Helper functions to get the y nzcs */ +static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) { + int b; + assert(block < 256); + b = block_to_txfm_index(block, mi->txfm_size, 4); + assert(b < 256); + return mi->nzcs[b] << (6 - 2 * mi->txfm_size); +} + +static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) { + int b; + assert(block < 64); + b = block_to_txfm_index(block, mi->txfm_size, 3); + assert(b < 64); + return mi->nzcs[b] << (6 - 2 * mi->txfm_size); +} + +static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) { + int b; + assert(block < 16); + b = block_to_txfm_index(block, mi->txfm_size, 2); + assert(b < 16); + return mi->nzcs[b] << (6 - 2 * mi->txfm_size); +} +/* 
END - Helper functions to get the y nzcs */ + +/* Function to get y nzc where block index is in mb16 terms */ +static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m, + int mb_row, int mb_col, int block) { + // NOTE: All values returned are at 64 times the true value at 4x4 scale + MB_MODE_INFO *const mi = &m->mbmi; + const int mis = cm->mode_info_stride; + if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col)) return 0; - else if (val <= 1) - return val; - else if (val < 4) - return 2; + if (mi->sb_type == BLOCK_SIZE_SB64X64) { + int r = mb_row & 3; + int c = mb_col & 3; + m -= c + r * mis; + if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c)) + return 0; + else + return get_nzc_4x4_y_sb64( + &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block)); + } else if (mi->sb_type == BLOCK_SIZE_SB32X32) { + int r = mb_row & 1; + int c = mb_col & 1; + m -= c + r * mis; + if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c)) + return 0; + else + return get_nzc_4x4_y_sb32( + &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block)); + } else { + if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col)) + return 0; + return get_nzc_4x4_y_mb16(mi, block); + } +} + +/* BEGIN - Helper functions to get the uv nzcs */ +static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) { + int b; + int base, uvtxfm_size; + assert(block >= 256 && block < 384); + uvtxfm_size = mi->txfm_size; + base = 256 + (block & 64); + block -= base; + b = base + block_to_txfm_index(block, uvtxfm_size, 3); + assert(b >= 256 && b < 384); + return mi->nzcs[b] << (6 - 2 * uvtxfm_size); +} + +static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) { + int b; + int base, uvtxfm_size; + assert(block >= 64 && block < 96); + if (mi->txfm_size == TX_32X32) + uvtxfm_size = TX_16X16; else - return 3; + uvtxfm_size = mi->txfm_size; + base = 64 + (block & 16); + block -= base; + b = base + block_to_txfm_index(block, uvtxfm_size, 2); + assert(b >= 64 && b < 96); + return mi->nzcs[b] << (6 - 2 * uvtxfm_size); } -#endif /* CONFIG_NEWCOEFCONTEXT */ -void vp9_default_coef_probs(VP9_COMMON *pc) { - vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4, - sizeof(pc->fc.coef_probs_4x4)); - vpx_memcpy(pc->fc.hybrid_coef_probs_4x4, default_hybrid_coef_probs_4x4, - sizeof(pc->fc.hybrid_coef_probs_4x4)); +static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) { + int b; + int base, uvtxfm_size; + assert(block >= 16 && block < 24); + if (mi->txfm_size == TX_8X8 && + (mi->mode == SPLITMV || mi->mode == I8X8_PRED)) + uvtxfm_size = TX_4X4; + else if (mi->txfm_size == TX_16X16) + uvtxfm_size = TX_8X8; + else + uvtxfm_size = mi->txfm_size; + base = 16 + (block & 4); + block -= base; + b = base + block_to_txfm_index(block, uvtxfm_size, 1); + assert(b >= 16 && b < 24); + return mi->nzcs[b] << (6 - 2 * uvtxfm_size); +} +/* END - Helper functions to get the uv nzcs */ + +/* Function to get uv nzc where block index is in mb16 terms */ +static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m, + int mb_row, int mb_col, int block) { + // NOTE: All values returned are at 64 times the true value at 4x4 scale + MB_MODE_INFO *const mi = &m->mbmi; + const int mis = cm->mode_info_stride; + if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col)) + return 0; + if (mi->sb_type == BLOCK_SIZE_SB64X64) { + int r = mb_row & 3; + int c = mb_col & 3; + m -= c + r * mis; + if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c)) + return 0; + else + return 
get_nzc_4x4_uv_sb64( + &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block)); + } else if (mi->sb_type == BLOCK_SIZE_SB32X32) { + int r = mb_row & 1; + int c = mb_col & 1; + m -= c + r * mis; + if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c)) + return 0; + else + return get_nzc_4x4_uv_sb32( + &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block)); + } else { + return get_nzc_4x4_uv_mb16(mi, block); + } +} - vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8, - sizeof(pc->fc.coef_probs_8x8)); - vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8, - sizeof(pc->fc.hybrid_coef_probs_8x8)); +int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + assert(block < 256); + switch (txfm_size) { + case TX_32X32: + assert((block & 63) == 0); + if (block < 128) { + int o = (block >> 6) * 2; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) + + get_nzc_4x4_y(cm, cur - mis + o + 1, + mb_row - 1, mb_col + o + 1, 12) + + get_nzc_4x4_y(cm, cur - mis + o + 1, + mb_row - 1, mb_col + o + 1, 13) + + get_nzc_4x4_y(cm, cur - mis + o + 1, + mb_row - 1, mb_col + o + 1, 14) + + get_nzc_4x4_y(cm, cur - mis + o + 1, + mb_row - 1, mb_col + o + 1, 15); + } else { + nzc_exp = cur->mbmi.nzcs[block - 128] << 3; + } + if ((block & 127) == 0) { + int o = (block >> 7) * 2; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) + + get_nzc_4x4_y(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, 15); + } else { + nzc_exp += cur->mbmi.nzcs[block - 64] << 3; + } + nzc_exp <<= 2; + // Note nzc_exp is 64 times the average value expected at 32x32 scale + return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32); + break; + + case TX_16X16: + assert((block & 15) == 0); + if (block < 64) { + int o = block >> 4; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15); + } else { + nzc_exp = cur->mbmi.nzcs[block - 64] << 4; + } + if ((block & 63) == 0) { + int o = block >> 6; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15); + } else { + nzc_exp += cur->mbmi.nzcs[block - 16] << 4; + } + nzc_exp <<= 1; + // Note nzc_exp is 64 times the average value expected at 16x16 scale + return choose_nzc_context(nzc_exp, 
NZC_T2_16X16, NZC_T1_16X16); + break; + + case TX_8X8: + assert((block & 3) == 0); + if (block < 32) { + int o = block >> 3; + int p = ((block >> 2) & 1) ? 14 : 12; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1); + } else { + nzc_exp = cur->mbmi.nzcs[block - 32] << 5; + } + if ((block & 31) == 0) { + int o = block >> 6; + int p = ((block >> 5) & 1) ? 11 : 3; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4); + } else { + nzc_exp += cur->mbmi.nzcs[block - 4] << 5; + } + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + break; + + case TX_4X4: + if (block < 16) { + int o = block >> 2; + int p = block & 3; + nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, + 12 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 16] << 6); + } + if ((block & 15) == 0) { + int o = block >> 6; + int p = (block >> 4) & 3; + nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + 3 + 4 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + break; - vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16, - sizeof(pc->fc.coef_probs_16x16)); - vpx_memcpy(pc->fc.hybrid_coef_probs_16x16, - default_hybrid_coef_probs_16x16, - sizeof(pc->fc.hybrid_coef_probs_16x16)); - vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32, - sizeof(pc->fc.coef_probs_32x32)); + default: + return 0; + } } -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); +int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + assert(block < 64); + switch (txfm_size) { + case TX_32X32: + assert(block == 0); + nzc_exp = + (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) + + get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) + + get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) + + get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) + + get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) + + get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2; + // Note nzc_exp is 64 times the average value expected at 32x32 scale + return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32); + break; + + case TX_16X16: + assert((block & 15) == 0); + if (block < 32) { + int o = (block >> 4) & 1; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col 
+ o, 12) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15); + } else { + nzc_exp = cur->mbmi.nzcs[block - 32] << 4; + } + if ((block & 31) == 0) { + int o = block >> 5; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15); + } else { + nzc_exp += cur->mbmi.nzcs[block - 16] << 4; + } + nzc_exp <<= 1; + // Note nzc_exp is 64 times the average value expected at 16x16 scale + return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16); + break; + + case TX_8X8: + assert((block & 3) == 0); + if (block < 16) { + int o = block >> 3; + int p = ((block >> 2) & 1) ? 14 : 12; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) + + get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1); + } else { + nzc_exp = cur->mbmi.nzcs[block - 16] << 5; + } + if ((block & 15) == 0) { + int o = block >> 5; + int p = ((block >> 4) & 1) ? 11 : 3; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) + + get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4); + } else { + nzc_exp += cur->mbmi.nzcs[block - 4] << 5; + } + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + break; + + case TX_4X4: + if (block < 8) { + int o = block >> 2; + int p = block & 3; + nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, + 12 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 8] << 6); + } + if ((block & 7) == 0) { + int o = block >> 5; + int p = (block >> 3) & 3; + nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + 3 + 4 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + break; + + default: + return 0; + break; + } } +int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + assert(block < 16); + switch (txfm_size) { + case TX_16X16: + assert(block == 0); + nzc_exp = + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15); + nzc_exp <<= 1; + // Note nzc_exp is 64 times the average value expected at 16x16 scale + return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16); + + case TX_8X8: + assert((block & 3) == 0); + if (block < 8) { + int p = ((block >> 2) & 1) ? 
14 : 12; + nzc_exp = + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) + + get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1); + } else { + nzc_exp = cur->mbmi.nzcs[block - 8] << 5; + } + if ((block & 7) == 0) { + int p = ((block >> 3) & 1) ? 11 : 3; + nzc_exp += + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) + + get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4); + } else { + nzc_exp += cur->mbmi.nzcs[block - 4] << 5; + } + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + + case TX_4X4: + if (block < 4) { + int p = block & 3; + nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, + 12 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 4] << 6); + } + if ((block & 3) == 0) { + int p = (block >> 2) & 3; + nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, + 3 + 4 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + + default: + return 0; + break; + } +} + +int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + const int base = block - (block & 63); + const int boff = (block & 63); + const int base_mb16 = base >> 4; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + TX_SIZE txfm_size_uv; + + assert(block >= 256 && block < 384); + txfm_size_uv = txfm_size; + + switch (txfm_size_uv) { + case TX_32X32: + assert(block == 256 || block == 320); + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1, + base_mb16 + 3); + nzc_exp <<= 2; + // Note nzc_exp is 64 times the average value expected at 32x32 scale + return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32); + + case TX_16X16: + // uv txfm_size 16x16 + assert((block & 15) == 0); + if (boff < 32) { + int o = (boff >> 4) & 1; + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + o 
+ 1, mb_row - 1, mb_col + o + 1, + base_mb16 + 3); + } else { + nzc_exp = cur->mbmi.nzcs[block - 32] << 4; + } + if ((boff & 31) == 0) { + int o = boff >> 5; + nzc_exp += + get_nzc_4x4_uv(cm, cur - 1 + o * mis, + mb_row + o, mb_col - 1, base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + o * mis, + mb_row + o, mb_col - 1, base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis, + mb_row + o + 1, mb_col - 1, base_mb16 + 3); + } else { + nzc_exp += cur->mbmi.nzcs[block - 16] << 4; + } + nzc_exp <<= 1; + // Note nzc_exp is 64 times the average value expected at 16x16 scale + return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16); + + case TX_8X8: + assert((block & 3) == 0); + if (boff < 16) { + int o = boff >> 2; + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 3); + } else { + nzc_exp = cur->mbmi.nzcs[block - 16] << 5; + } + if ((boff & 15) == 0) { + int o = boff >> 4; + nzc_exp += + get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 3); + } else { + nzc_exp += cur->mbmi.nzcs[block - 4] << 5; + } + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + + case TX_4X4: + if (boff < 8) { + int o = boff >> 1; + int p = boff & 1; + nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 2 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 8] << 6); + } + if ((boff & 7) == 0) { + int o = boff >> 4; + int p = (boff >> 3) & 1; + nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 1 + 2 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + + default: + return 0; + } +} + +int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + const int base = block - (block & 15); + const int boff = (block & 15); + const int base_mb16 = base >> 2; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + TX_SIZE txfm_size_uv; + + assert(block >= 64 && block < 96); + if (txfm_size == TX_32X32) + txfm_size_uv = TX_16X16; + else + txfm_size_uv = txfm_size; + + switch (txfm_size_uv) { + case TX_16X16: + // uv txfm_size 16x16 + assert(block == 64 || block == 80); + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1, + base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, + base_mb16 + 3); + nzc_exp <<= 1; + // Note nzc_exp is 64 times the average value expected at 16x16 scale + return choose_nzc_context(nzc_exp, NZC_T2_16X16, 
NZC_T1_16X16); + break; + + case TX_8X8: + assert((block & 3) == 0); + if (boff < 8) { + int o = boff >> 2; + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 3); + } else { + nzc_exp = cur->mbmi.nzcs[block - 8] << 5; + } + if ((boff & 7) == 0) { + int o = boff >> 3; + nzc_exp += + get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 3); + } else { + nzc_exp += cur->mbmi.nzcs[block - 4] << 5; + } + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + + case TX_4X4: + if (boff < 4) { + int o = boff >> 1; + int p = boff & 1; + nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o, + base_mb16 + 2 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 4] << 6); + } + if ((boff & 3) == 0) { + int o = boff >> 3; + int p = (boff >> 2) & 1; + nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, + base_mb16 + 1 + 2 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + + default: + return 0; + } +} + +int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block) { + // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy + // neighboring blocks are + int mis = cm->mode_info_stride; + int nzc_exp = 0; + const int base = block - (block & 3); + const int boff = (block & 3); + const int base_mb16 = base; + TX_SIZE txfm_size = cur->mbmi.txfm_size; + TX_SIZE txfm_size_uv; + + assert(block >= 16 && block < 24); + if (txfm_size == TX_16X16) + txfm_size_uv = TX_8X8; + else if (txfm_size == TX_8X8 && + (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV)) + txfm_size_uv = TX_4X4; + else + txfm_size_uv = txfm_size; + + switch (txfm_size_uv) { + case TX_8X8: + assert((block & 3) == 0); + nzc_exp = + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) + + get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) + + get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) + + get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3); + // Note nzc_exp is 64 times the average value expected at 8x8 scale + return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8); + + case TX_4X4: + if (boff < 2) { + int p = boff & 1; + nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, + base_mb16 + 2 + p); + } else { + nzc_exp = (cur->mbmi.nzcs[block - 2] << 6); + } + if ((boff & 1) == 0) { + int p = (boff >> 1) & 1; + nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, + base_mb16 + 1 + 2 * p); + } else { + nzc_exp += (cur->mbmi.nzcs[block - 1] << 6); + } + nzc_exp >>= 1; + // Note nzc_exp is 64 times the average value expected at 4x4 scale + return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4); + + default: + return 0; + } +} + +int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) { + if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + assert(block < 384); + if (block < 256) + return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + else + return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + } else if 
(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) { + assert(block < 96); + if (block < 64) + return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + else + return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + } else { + assert(block < 64); + if (block < 16) + return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + else + return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context, + get_mb_row(xd), get_mb_col(xd), block); + } +} + +static void update_nzc(VP9_COMMON *cm, + uint16_t nzc, + int nzc_context, + TX_SIZE tx_size, + int ref, + int type) { + int e, c; + c = codenzc(nzc); + if (tx_size == TX_32X32) + cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++; + else if (tx_size == TX_16X16) + cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++; + else if (tx_size == TX_8X8) + cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++; + else if (tx_size == TX_4X4) + cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++; + else + assert(0); + + if ((e = vp9_extranzcbits[c])) { + int x = nzc - vp9_basenzcvalue[c]; + while (e--) { + int b = (x >> e) & 1; + cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++; + } + } +} + +static void update_nzcs_sb64(VP9_COMMON *cm, + MACROBLOCKD *xd, + int mb_row, + int mb_col) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 256; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0); + } + for (j = 256; j < 384; j += 64) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1); + } + break; + + case TX_16X16: + for (j = 0; j < 256; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0); + } + for (j = 256; j < 384; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1); + } + break; + + case TX_8X8: + for (j = 0; j < 256; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0); + } + for (j = 256; j < 384; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1); + } + break; + + case TX_4X4: + for (j = 0; j < 256; ++j) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0); + } + for (j = 256; j < 384; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1); + } + break; + + default: + break; + } +} + +static void update_nzcs_sb32(VP9_COMMON *cm, + MACROBLOCKD *xd, + int mb_row, + int mb_col) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if 
(mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 64; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1); + } + break; + + case TX_16X16: + for (j = 0; j < 64; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1); + } + break; + + case TX_8X8: + for (j = 0; j < 64; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0); + } + for (j = 64; j < 96; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1); + } + break; + + case TX_4X4: + for (j = 0; j < 64; ++j) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0); + } + for (j = 64; j < 96; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1); + } + break; + + default: + break; + } +} + +static void update_nzcs_mb16(VP9_COMMON *cm, + MACROBLOCKD *xd, + int mb_row, + int mb_col) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_16X16: + for (j = 0; j < 16; j += 16) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0); + } + for (j = 16; j < 24; j += 4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1); + } + break; + + case TX_8X8: + for (j = 0; j < 16; j += 4) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0); + } + if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) { + for (j = 16; j < 24; ++j) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1); + } + } else { + for (j = 16; j < 24; j += 4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1); + } + } + break; + + case TX_4X4: + for (j = 0; j < 16; ++j) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0); + } + for (j = 16; j < 24; ++j) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1); + } + break; + + default: + break; + } +} + +void vp9_update_nzc_counts(VP9_COMMON *cm, + MACROBLOCKD *xd, + int mb_row, + int mb_col) { + if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) + update_nzcs_sb64(cm, xd, mb_row, mb_col); + else if 
(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) + update_nzcs_sb32(cm, xd, mb_row, mb_col); + else + update_nzcs_mb16(cm, xd, mb_row, mb_col); +} +#endif // CONFIG_CODE_NONZEROCOUNT + // #define COEF_COUNT_TESTING #define COEF_COUNT_SAT 24 @@ -1111,42 +3525,55 @@ void vp9_coef_tree_initialize() { #define COEF_COUNT_SAT_AFTER_KEY 24 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 -static void update_coef_probs(vp9_coeff_probs *dst_coef_probs, - vp9_coeff_probs *pre_coef_probs, - int block_types, vp9_coeff_count *coef_counts, - int count_sat, int update_factor) { - int t, i, j, k, count; +static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs, + vp9_coeff_probs *pre_coef_probs, + int block_types, vp9_coeff_count *coef_counts, + unsigned int (*eob_branch_count)[REF_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS], + int count_sat, int update_factor) { + int t, i, j, k, l, count; unsigned int branch_ct[ENTROPY_NODES][2]; vp9_prob coef_probs[ENTROPY_NODES]; int factor; +#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT + int entropy_nodes_adapt = UNCONSTRAINED_ADAPT_NODES; +#else + int entropy_nodes_adapt = ENTROPY_NODES; +#endif for (i = 0; i < block_types; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS, - vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, - coef_counts[i][j][k]); - for (t = 0; t < ENTROPY_NODES; ++t) { - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - dst_coef_probs[i][j][k][t] = weighted_prob(pre_coef_probs[i][j][k][t], - coef_probs[t], factor); + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + vp9_tree_probs_from_distribution(vp9_coef_tree, + coef_probs, branch_ct, + coef_counts[i][j][k][l], 0); + branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; + coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); + for (t = 0; t < entropy_nodes_adapt; ++t) { + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? 
count_sat : count; + factor = (update_factor * count / count_sat); + dst_coef_probs[i][j][k][l][t] = + weighted_prob(pre_coef_probs[i][j][k][l][t], + coef_probs[t], factor); +#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT + if (t == UNCONSTRAINED_NODES - 1) + vp9_get_model_distribution( + dst_coef_probs[i][j][k][l][UNCONSTRAINED_NODES - 1], + dst_coef_probs[i][j][k][l], i, j); +#endif + } } - } } void vp9_adapt_coef_probs(VP9_COMMON *cm) { -#ifdef COEF_COUNT_TESTING - int t, i, j, k; -#endif int count_sat; int update_factor; /* denominator 256 */ - // printf("Frame type: %d\n", cm->frame_type); if (cm->frame_type == KEY_FRAME) { update_factor = COEF_MAX_UPDATE_FACTOR_KEY; count_sat = COEF_COUNT_SAT_KEY; @@ -1158,87 +3585,141 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) { count_sat = COEF_COUNT_SAT; } -#ifdef COEF_COUNT_TESTING - { - printf("static const unsigned int\ncoef_counts" - "[BLOCK_TYPES] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < BLOCK_TYPES; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.coef_counts[i][j][k][t]); - printf("},\n"); + adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4, + BLOCK_TYPES, cm->fc.coef_counts_4x4, + cm->fc.eob_branch_counts[TX_4X4], + count_sat, update_factor); + adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8, + BLOCK_TYPES, cm->fc.coef_counts_8x8, + cm->fc.eob_branch_counts[TX_8X8], + count_sat, update_factor); + adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16, + BLOCK_TYPES, cm->fc.coef_counts_16x16, + cm->fc.eob_branch_counts[TX_16X16], + count_sat, update_factor); + adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32, + BLOCK_TYPES, cm->fc.coef_counts_32x32, + cm->fc.eob_branch_counts[TX_32X32], + count_sat, update_factor); +} + +#if CONFIG_CODE_NONZEROCOUNT +static void adapt_nzc_probs(VP9_COMMON *cm, + int block_size, + int count_sat, + int update_factor) { + int c, r, b, n; + int count, factor; + unsigned int nzc_branch_ct[NZC32X32_NODES][2]; + vp9_prob nzc_probs[NZC32X32_NODES]; + int tokens, nodes; + const vp9_tree_index *nzc_tree; + vp9_prob *dst_nzc_probs; + vp9_prob *pre_nzc_probs; + unsigned int *nzc_counts; + + if (block_size == 32) { + tokens = NZC32X32_TOKENS; + nzc_tree = vp9_nzc32x32_tree; + dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0]; + pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0]; + nzc_counts = cm->fc.nzc_counts_32x32[0][0][0]; + } else if (block_size == 16) { + tokens = NZC16X16_TOKENS; + nzc_tree = vp9_nzc16x16_tree; + dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0]; + pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0]; + nzc_counts = cm->fc.nzc_counts_16x16[0][0][0]; + } else if (block_size == 8) { + tokens = NZC8X8_TOKENS; + nzc_tree = vp9_nzc8x8_tree; + dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0]; + pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0]; + nzc_counts = cm->fc.nzc_counts_8x8[0][0][0]; + } else { + nzc_tree = vp9_nzc4x4_tree; + tokens = NZC4X4_TOKENS; + dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0]; + pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0]; + nzc_counts = cm->fc.nzc_counts_4x4[0][0][0]; + } + nodes = tokens - 1; + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) + for (r = 0; r < REF_TYPES; ++r) + for (b = 0; b < BLOCK_TYPES; ++b) { + int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b; + int offset_nodes = offset * nodes; 
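+ // Editorial note: 'offset' is the [c][r][b] position flattened into one + // index; scaling it by 'nodes' or 'tokens' below indexes the matching + // flat probability and count arrays selected above.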
+ int offset_tokens = offset * tokens; + vp9_tree_probs_from_distribution(nzc_tree, + nzc_probs, nzc_branch_ct, + nzc_counts + offset_tokens, 0); + for (n = 0; n < nodes; ++n) { + count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + dst_nzc_probs[offset_nodes + n] = + weighted_prob(pre_nzc_probs[offset_nodes + n], + nzc_probs[n], factor); } - printf(" },\n"); } - printf(" },\n"); - } - printf("};\n"); - printf("static const unsigned int\ncoef_counts_8x8" - "[BLOCK_TYPES_8X8] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < BLOCK_TYPES_8X8; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]); - printf("},\n"); - } - printf(" },\n"); +} + +static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) { + int c, t; + int count, factor; + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA]; + int b; + for (b = 0; b < bits; ++b) { + vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0], + cm->fc.nzc_pcat_counts[c][t][b][1]); + count = cm->fc.nzc_pcat_counts[c][t][b][0] + + cm->fc.nzc_pcat_counts[c][t][b][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob( + cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor); } - printf(" },\n"); } - printf("};\n"); - printf("static const unsigned int\nhybrid_coef_counts" - "[BLOCK_TYPES] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < BLOCK_TYPES; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]); - printf("},\n"); + } +} + +// #define NZC_COUNT_TESTING +void vp9_adapt_nzc_probs(VP9_COMMON *cm) { + int count_sat; + int update_factor; /* denominator 256 */ +#ifdef NZC_COUNT_TESTING + int c, r, b, t; + printf("\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + printf(" {"); + for (t = 0; t < NZC4X4_TOKENS; ++t) { + printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]); } - printf(" },\n"); + printf("}\n"); } - printf(" },\n"); + printf("\n"); } - printf("};\n"); - } #endif - update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4, - BLOCK_TYPES_4X4, cm->fc.coef_counts_4x4, - count_sat, update_factor); - update_coef_probs(cm->fc.hybrid_coef_probs_4x4, - cm->fc.pre_hybrid_coef_probs_4x4, - BLOCK_TYPES_4X4, cm->fc.hybrid_coef_counts_4x4, - count_sat, update_factor); - update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8, - BLOCK_TYPES_8X8, cm->fc.coef_counts_8x8, - count_sat, update_factor); - update_coef_probs(cm->fc.hybrid_coef_probs_8x8, - cm->fc.pre_hybrid_coef_probs_8x8, - BLOCK_TYPES_8X8, cm->fc.hybrid_coef_counts_8x8, - count_sat, update_factor); - update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16, - BLOCK_TYPES_16X16, cm->fc.coef_counts_16x16, - count_sat, update_factor); - update_coef_probs(cm->fc.hybrid_coef_probs_16x16, - cm->fc.pre_hybrid_coef_probs_16x16, - BLOCK_TYPES_16X16, 
cm->fc.hybrid_coef_counts_16x16, - count_sat, update_factor); - update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32, - BLOCK_TYPES_32X32, cm->fc.coef_counts_32x32, - count_sat, update_factor); + if (cm->frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_KEY; + count_sat = COEF_COUNT_SAT_KEY; + } else if (cm->last_frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ + count_sat = COEF_COUNT_SAT_AFTER_KEY; + } else { + update_factor = COEF_MAX_UPDATE_FACTOR; + count_sat = COEF_COUNT_SAT; + } + + adapt_nzc_probs(cm, 4, count_sat, update_factor); + adapt_nzc_probs(cm, 8, count_sat, update_factor); + adapt_nzc_probs(cm, 16, count_sat, update_factor); + adapt_nzc_probs(cm, 32, count_sat, update_factor); + adapt_nzc_pcat(cm, count_sat, update_factor); } +#endif // CONFIG_CODE_NONZEROCOUNT diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 84e5255c2871ccbc73733cf20105375dde85be51..64f595047ae3f16b49fce4457a5c1f6c22e19fd0 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -15,7 +15,6 @@ #include "vp9/common/vp9_treecoder.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_coefupdateprobs.h" extern const int vp9_i8x8_block[4]; @@ -31,10 +30,10 @@ extern const int vp9_i8x8_block[4]; #define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ #define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ #define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */ #define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ -#define MAX_ENTROPY_TOKENS 12 -#define ENTROPY_NODES 11 +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 #define EOSB_TOKEN 127 /* Not signalled, encoder only */ #define INTER_MODE_CONTEXTS 7 @@ -59,31 +58,20 @@ extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ /* Coefficients are predicted via a 3-dimensional probability table. */ -/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ -#define BLOCK_TYPES_4X4 4 - -#define BLOCK_TYPES_8X8 4 - -#define BLOCK_TYPES_16X16 4 - -#define BLOCK_TYPES_32X32 4 - -/* Middle dimension is a coarsening of the coefficient's - position within the 4x4 DCT. */ +/* Outside dimension. 0 = Y with DC, 1 = UV */ +#define BLOCK_TYPES 2 +#define REF_TYPES 2 // intra=0, inter=1 -#define COEF_BANDS 8 -extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]); -extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]); -extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]); +/* Middle dimension reflects the coefficient position within the transform. */ +#define COEF_BANDS 6 -/* Inside dimension is 3-valued measure of nearby complexity, that is, - the extent to which nearby coefficients are nonzero. For the first - coefficient (DC, unless block type is 0), we look at the (already encoded) - blocks above and to the left of the current block. The context index is - then the number (0,1,or 2) of these blocks having nonzero coefficients. - After decoding a coefficient, the measure is roughly the size of the - most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). +/* Inside dimension is a measure of nearby complexity that reflects the energy + of the nearby nonzero coefficients.
For the first coefficient (DC, unless + block type is 0), we look at the (already encoded) blocks above and to the + left of the current block. The context index is then the number (0,1,or 2) + of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is determined by the size of the + most recently decoded coefficient. Note that the intuitive meaning of this measure changes as coefficients are decoded, e.g., prior to the first token, a zero means that my neighbors are empty while, after the first token, because of the use of end-of-block, @@ -94,21 +82,18 @@ extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]); distinct bands). */ /*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ -#define PREV_COEF_CONTEXTS 4 +#define PREV_COEF_CONTEXTS 6 -typedef unsigned int vp9_coeff_count[COEF_BANDS][PREV_COEF_CONTEXTS] +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -typedef unsigned int vp9_coeff_stats[COEF_BANDS][PREV_COEF_CONTEXTS] +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES][2]; -typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS] +typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES]; #define SUBEXP_PARAM 4 /* Subexponential code parameter */ #define MODULUS_PARAM 13 /* Modulus parameter */ -extern DECLARE_ALIGNED(16, const uint8_t, - vp9_prev_token_class[MAX_ENTROPY_TOKENS]); - struct VP9Common; void vp9_default_coef_probs(struct VP9Common *); extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]); @@ -117,38 +102,168 @@ extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]); extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]); + +extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]); + extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]); + +extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]); + extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]); void vp9_coef_tree_initialize(void); void vp9_adapt_coef_probs(struct VP9Common *); -static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) { +static INLINE void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) { /* Clear entropy contexts */ vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); } -#if CONFIG_NEWCOEFCONTEXT - -#define MAX_NEIGHBORS 5 -#define NEWCOEFCONTEXT_BAND_COND(b) ((b) >= 1) -void vp9_init_neighbors(void); - -const int *vp9_get_coef_neighbors_handle(const int *scan); -int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc, - const int *neigbor_handle, int rc); -extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[ - 16 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[ - 16 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[ - 16 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[ - 64 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[ - 256 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[ - 1024 * MAX_NEIGHBORS]); -#endif // CONFIG_NEWCOEFCONTEXT +static INLINE 
void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd) { + /* Clear entropy contexts */ + vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2); + vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2); +} + +static INLINE void vp9_reset_sb64_tokens_context(MACROBLOCKD* const xd) { + /* Clear entropy contexts */ + vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4); + vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4); +} + +extern const int vp9_coef_bands8x8[64]; +extern const int vp9_coef_bands4x4[16]; + +static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) { + if (tx_size == TX_4X4) { + return vp9_coef_bands4x4[scan[coef_index]]; + } else { + const int pos = scan[coef_index]; + const int sz = 1 << (2 + tx_size); + const int x = pos & (sz - 1), y = pos >> (2 + tx_size); + if (x >= 8 || y >= 8) + return 5; + else + return vp9_coef_bands8x8[y * 8 + x]; + } +} +extern int vp9_get_coef_context(const int *scan, const int *neighbors, + int nb_pad, uint8_t *token_cache, int c, int l); +const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); + +#if CONFIG_MODELCOEFPROB +#define COEFPROB_BITS 8 +#define COEFPROB_MODELS (1 << COEFPROB_BITS) + +// 2 => EOB and Zero nodes are unconstrained, rest are modeled +// 3 => EOB, Zero and One nodes are unconstrained, rest are modeled +#define UNCONSTRAINED_NODES 3 // Choose one of 2 or 3 + +// whether forward updates are model-based +#define MODEL_BASED_UPDATE 0 +// if model-based how many nodes are unconstrained +#define UNCONSTRAINED_UPDATE_NODES 3 +// whether backward updates are model-based +#define MODEL_BASED_ADAPT 0 +#define UNCONSTRAINED_ADAPT_NODES 3 + +// whether to adjust the coef probs for key frames based on qindex +#define ADJUST_KF_COEF_PROBS 0 + +typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] + [PREV_COEF_CONTEXTS][2]; +extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1]; +void vp9_get_model_distribution(vp9_prob model, vp9_prob *tree_probs, + int b, int r); +void vp9_adjust_default_coef_probs(struct VP9Common *cm); +#endif // CONFIG_MODELCOEFPROB + +#if CONFIG_CODE_NONZEROCOUNT +/* Alphabet for number of non-zero symbols in block */ +#define NZC_0 0 /* Used for all blocks */ +#define NZC_1 1 /* Used for all blocks */ +#define NZC_2 2 /* Used for all blocks */ +#define NZC_3TO4 3 /* Used for all blocks */ +#define NZC_5TO8 4 /* Used for all blocks */ +#define NZC_9TO16 5 /* Used for all blocks */ +#define NZC_17TO32 6 /* Used for 8x8 and larger blocks */ +#define NZC_33TO64 7 /* Used for 8x8 and larger blocks */ +#define NZC_65TO128 8 /* Used for 16x16 and larger blocks */ +#define NZC_129TO256 9 /* Used for 16x16 and larger blocks */ +#define NZC_257TO512 10 /* Used for 32x32 and larger blocks */ +#define NZC_513TO1024 11 /* Used for 32x32 and larger blocks */ + +/* Number of tokens for each block size */ +#define NZC4X4_TOKENS 6 +#define NZC8X8_TOKENS 8 +#define NZC16X16_TOKENS 10 +#define NZC32X32_TOKENS 12 + +/* Number of nodes for each block size */ +#define NZC4X4_NODES 5 +#define NZC8X8_NODES 7 +#define NZC16X16_NODES 9 +#define NZC32X32_NODES 11 + +/* Max number of tokens with extra bits */ +#define NZC_TOKENS_EXTRA 9 + +/* Max number of extra bits */ +#define NZC_BITS_EXTRA 9 + +/* Tokens without extra bits */ +#define NZC_TOKENS_NOEXTRA (NZC32X32_TOKENS - NZC_TOKENS_EXTRA) + +#define MAX_NZC_CONTEXTS 3 + +/* whether to update extra bit probabilities */ +#define NZC_PCAT_UPDATE + +/* nzc trees */ 
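+/* The NZC alphabet above bins the number of nonzero coefficients in a block:
+   small counts get their own token, larger ones share a token plus extra
+   bits (coded with the nzc_pcat probabilities adapted earlier). The exact
+   count is rebuilt from the tables declared below; a hypothetical
+   decode-side sketch, assuming read_nzc_bit(c, t, b) returns one decoded
+   extra bit:
+
+     static int decode_nzc_value(int c, int t) {
+       int nzc = vp9_basenzcvalue[t];
+       int b = vp9_extranzcbits[t];
+       while (b--)
+         nzc += read_nzc_bit(c, t, b) << b;
+       return nzc;
+     }
+
+   e.g. NZC_17TO32 spans 16 values, so it needs 4 extra bits on top of its
+   base value of 17. */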
+extern const vp9_tree_index vp9_nzc4x4_tree[]; +extern const vp9_tree_index vp9_nzc8x8_tree[]; +extern const vp9_tree_index vp9_nzc16x16_tree[]; +extern const vp9_tree_index vp9_nzc32x32_tree[]; + +/* nzc encodings */ +extern struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS]; +extern struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS]; +extern struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS]; +extern struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS]; + +#define codenzc(x) (\ + (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \ + (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\ + (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11) + +int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur, + int mb_row, int mb_col, int block); +int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block); +void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd, + int mb_row, int mb_col); +void vp9_adapt_nzc_probs(struct VP9Common *cm); + +/* Extra bits array */ +extern const int vp9_extranzcbits[NZC32X32_TOKENS]; + +/* Base nzc values */ +extern const int vp9_basenzcvalue[NZC32X32_TOKENS]; + +#endif // CONFIG_CODE_NONZEROCOUNT + +#include "vp9/common/vp9_coefupdateprobs.h" + #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index ecae5e057fbf8926554871ff01e816509c0c3ffb..673b35a8f3bb5c43b370caaf4d57cb25675d7c98 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -11,9 +11,10 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_modecont.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_alloccommon.h" #include "vpx_mem/vpx_mem.h" - static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = { /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */ {12, 6, 5, 5, 5, 5, 5, 5, 5, 2, 22, 200}, @@ -114,8 +115,6 @@ int vp9_mv_cont(const int_mv *l, const int_mv *a) { return SUBMVREF_NORMAL; } -const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25}; - const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = { { 147, 136, 18 }, { 106, 145, 1 }, @@ -301,40 +300,32 @@ struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS]; void vp9_init_mbmode_probs(VP9_COMMON *x) { unsigned int bct [VP9_YMODES] [2]; /* num Ymodes > num UV modes */ - vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings, - vp9_ymode_tree, x->fc.ymode_prob, - bct, y_mode_cts); - vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings, - vp9_sb_ymode_tree, x->fc.sb_ymode_prob, - bct, y_mode_cts); + vp9_tree_probs_from_distribution(vp9_ymode_tree, x->fc.ymode_prob, + bct, y_mode_cts, 0); + vp9_tree_probs_from_distribution(vp9_sb_ymode_tree, x->fc.sb_ymode_prob, + bct, y_mode_cts, 0); { int i; for (i = 0; i < 8; i++) { - vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings, - vp9_kf_ymode_tree, x->kf_ymode_prob[i], - bct, kf_y_mode_cts[i]); - 
vp9_tree_probs_from_distribution(VP9_I32X32_MODES, - vp9_sb_kf_ymode_encodings, - vp9_sb_kf_ymode_tree, + vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i], + bct, kf_y_mode_cts[i], 0); + vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree, x->sb_kf_ymode_prob[i], bct, - kf_y_mode_cts[i]); + kf_y_mode_cts[i], 0); } } { int i; for (i = 0; i < VP9_YMODES; i++) { - vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, - vp9_uv_mode_tree, x->kf_uv_mode_prob[i], - bct, kf_uv_mode_cts[i]); - vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, - vp9_uv_mode_tree, x->fc.uv_mode_prob[i], - bct, uv_mode_cts[i]); + vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i], + bct, kf_uv_mode_cts[i], 0); + vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i], + bct, uv_mode_cts[i], 0); } } - vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, - vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob, - bct, i8x8_mode_cts); + vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob, + bct, i8x8_mode_cts, 0); vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2, sizeof(vp9_sub_mv_ref_prob2)); @@ -344,6 +335,9 @@ void vp9_init_mbmode_probs(VP9_COMMON *x) { #if CONFIG_COMP_INTERINTRA_PRED x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB; #endif + x->ref_pred_probs[0] = 120; + x->ref_pred_probs[1] = 80; + x->ref_pred_probs[2] = 40; } @@ -351,8 +345,7 @@ static void intra_bmode_probs_from_distribution( vp9_prob p[VP9_NKF_BINTRAMODES - 1], unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2], const unsigned int events[VP9_NKF_BINTRAMODES]) { - vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, - vp9_bmode_tree, p, branch_ct, events); + vp9_tree_probs_from_distribution(vp9_bmode_tree, p, branch_ct, events, 0); } void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) { @@ -364,8 +357,7 @@ static void intra_kf_bmode_probs_from_distribution( vp9_prob p[VP9_KF_BINTRAMODES - 1], unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2], const unsigned int events[VP9_KF_BINTRAMODES]) { - vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings, - vp9_kf_bmode_tree, p, branch_ct, events); + vp9_tree_probs_from_distribution(vp9_kf_bmode_tree, p, branch_ct, events, 0); } void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES] @@ -419,6 +411,14 @@ const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1}; #else const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1}; #endif +#endif // VP9_SWITCHABLE_FILTERS + +// Indicates if the filter is interpolating or non-interpolating +// Note: currently only EIGHTTAP_SMOOTH is non-interpolating +#if CONFIG_ENABLE_6TAP +const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1}; +#else +const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1}; #endif void vp9_entropy_mode_init() { @@ -480,7 +480,7 @@ void vp9_accum_mv_refs(VP9_COMMON *pc, #define MVREF_COUNT_SAT 20 #define MVREF_MAX_UPDATE_FACTOR 128 -void vp9_update_mode_context(VP9_COMMON *pc) { +void vp9_adapt_mode_context(VP9_COMMON *pc) { int i, j; unsigned int (*mv_ref_ct)[4][2]; int (*mode_context)[4]; @@ -526,17 +526,17 @@ void print_mode_contexts(VP9_COMMON *pc) { #define MODE_COUNT_SAT 20 #define MODE_MAX_UPDATE_FACTOR 144 -static void update_mode_probs(int n_modes, struct vp9_token_struct *encoding, +static void update_mode_probs(int n_modes, const vp9_tree_index *tree, unsigned int
*cnt, - vp9_prob *pre_probs, vp9_prob *dst_probs) { + vp9_prob *pre_probs, vp9_prob *dst_probs, + unsigned int tok0_offset) { #define MAX_PROBS 32 vp9_prob probs[MAX_PROBS]; unsigned int branch_ct[MAX_PROBS][2]; int t, count, factor; assert(n_modes - 1 < MAX_PROBS); - vp9_tree_probs_from_distribution(n_modes, encoding, tree, probs, - branch_ct, cnt); + vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); for (t = 0; t < n_modes - 1; ++t) { count = branch_ct[t][0] + branch_ct[t][1]; count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; @@ -592,31 +592,32 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { #endif #endif - update_mode_probs(VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree, + update_mode_probs(VP9_YMODES, vp9_ymode_tree, cm->fc.ymode_counts, cm->fc.pre_ymode_prob, - cm->fc.ymode_prob); - update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_encodings, vp9_sb_ymode_tree, + cm->fc.ymode_prob, 0); + update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_tree, cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob, - cm->fc.sb_ymode_prob); + cm->fc.sb_ymode_prob, 0); for (i = 0; i < VP9_YMODES; ++i) { - update_mode_probs(VP9_UV_MODES, vp9_uv_mode_encodings, vp9_uv_mode_tree, + update_mode_probs(VP9_UV_MODES, vp9_uv_mode_tree, cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i], - cm->fc.uv_mode_prob[i]); + cm->fc.uv_mode_prob[i], 0); } - update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, vp9_bmode_tree, + update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree, cm->fc.bmode_counts, cm->fc.pre_bmode_prob, - cm->fc.bmode_prob); - update_mode_probs(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, + cm->fc.bmode_prob, 0); + update_mode_probs(VP9_I8X8_MODES, vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts, - cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob); + cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob, 0); for (i = 0; i < SUBMVREF_COUNT; ++i) { - update_mode_probs(VP9_SUBMVREFS, vp9_sub_mv_ref_encoding_array, + update_mode_probs(VP9_SUBMVREFS, vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i], - cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i]); + cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i], + LEFT4X4); } - update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, vp9_mbsplit_tree, + update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree, cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob, - cm->fc.mbsplit_prob); + cm->fc.mbsplit_prob, 0); #if CONFIG_COMP_INTERINTRA_PRED if (cm->use_interintra) { int factor, interintra_prob, count; @@ -631,3 +632,65 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } #endif } + +static void set_default_lf_deltas(MACROBLOCKD *xd) { + xd->mode_ref_lf_delta_enabled = 1; + xd->mode_ref_lf_delta_update = 1; + + xd->ref_lf_deltas[INTRA_FRAME] = 2; + xd->ref_lf_deltas[LAST_FRAME] = 0; + xd->ref_lf_deltas[GOLDEN_FRAME] = -2; + xd->ref_lf_deltas[ALTREF_FRAME] = -2; + + xd->mode_lf_deltas[0] = 4; // BPRED + xd->mode_lf_deltas[1] = -2; // Zero + xd->mode_lf_deltas[2] = 2; // New mv + xd->mode_lf_deltas[3] = 4; // Split mv +} + +void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). 
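+  /* Everything this function touches is an adaptive context: segmentation
+     features, loop-filter deltas, coefficient/mode/MV probabilities, the
+     saved frame contexts and the mode-info arrays. Resetting them all to
+     their built-in defaults is what lets a keyframe (or an error-resilient
+     frame) be decoded with no dependence on earlier frames. */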
+ int i; + vp9_clearall_segfeatures(xd); + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + if (cm->last_frame_seg_map) + vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols)); + + /* reset the mode ref deltas for loop filter */ + vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas)); + vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas)); + set_default_lf_deltas(xd); + + vp9_default_coef_probs(cm); + vp9_init_mbmode_probs(cm); + vp9_default_bmode_probs(cm->fc.bmode_prob); + vp9_kf_default_bmode_probs(cm->kf_bmode_prob); + vp9_init_mv_probs(cm); + // To force update of the sharpness + cm->last_sharpness_level = -1; + + vp9_init_mode_contexts(cm); + + for (i = 0; i < NUM_FRAME_CONTEXTS; i++) { + vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc)); + } + + vpx_memset(cm->prev_mip, 0, + (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, + (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO)); + + vp9_update_mode_info_border(cm, cm->mip); + vp9_update_mode_info_in_image(cm, cm->mi); + +#if CONFIG_NEW_MVREF + // Default probabilities for encoding the MV ref id signal + vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB, + sizeof(xd->mb_mv_ref_probs)); +#endif + cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0; + cm->ref_frame_sign_bias[ALTREF_FRAME] = 0; + + cm->frame_context_idx = 0; +} diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index e03c6fe6de73f91e52aacd1364d5ec4da619b467..8b0caf6eb38d04479bf921b33b89334c8be182e5 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -34,8 +34,6 @@ extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1]; extern int vp9_mv_cont(const int_mv *l, const int_mv *a); -extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1]; - extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES] @@ -76,11 +74,14 @@ void vp9_entropy_mode_init(void); struct VP9Common; +/* sets up common features to forget past dependence */ +void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); + void vp9_init_mbmode_probs(struct VP9Common *x); extern void vp9_init_mode_contexts(struct VP9Common *pc); -extern void vp9_update_mode_context(struct VP9Common *pc); +extern void vp9_adapt_mode_context(struct VP9Common *pc); extern void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, @@ -101,6 +102,8 @@ extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; +extern const int vp9_is_interpolating_filter[SWITCHABLE + 1]; + extern const vp9_tree_index vp9_switchable_interp_tree [2 * (VP9_SWITCHABLE_FILTERS - 1)]; diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 99e3c2e8c2424658fa8432d70bccff2bec43e540..a4a9d5465ed43af0aff68ca4aa93a6fe77216dca 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -42,7 +42,10 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { -MV_CLASS_2, -MV_CLASS_3, 10, 12, -MV_CLASS_4, -MV_CLASS_5, - -MV_CLASS_6, -MV_CLASS_7, + -MV_CLASS_6, 14, + 16, 18, + -MV_CLASS_7, -MV_CLASS_8, + -MV_CLASS_9, -MV_CLASS_10, }; struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES]; @@ -62,24 +65,24 @@ const nmv_context vp9_default_nmv_context = { {32, 64, 96}, { { /* vert component */ - 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192}, /* class */ - {216}, /* class0 */ - {136, 140, 148,
160, 176, 192, 224}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + 128, /* sign */ + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ + {216}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ }, { /* hor component */ - 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192}, /* class */ - {208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + 128, /* sign */ + {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ + {208}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ } }, }; @@ -103,6 +106,9 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; + else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; + else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; + else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; else assert(0); if (offset) *offset = z - mv_class_base(c); @@ -110,11 +116,8 @@ } int vp9_use_nmv_hp(const MV *ref) { - if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && - (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH) - return 1; - else - return 0; + return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && + (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; } int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { @@ -134,6 +137,7 @@ static void increment_nmv_component(int v, int incr, int usehp) { int s, z, c, o, d, e, f; + if (!incr) return; assert (v != 0); /* should not be zero */ s = v < 0; mvcomp->sign[s] += incr; @@ -211,24 +215,26 @@ void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, } } -static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp, +static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { int count = ct[0] + ct[1]; - if (count) { + if (count) { + vp9_prob newp = get_binary_prob(ct[0], ct[1]); count = count > MV_COUNT_SAT ?
MV_COUNT_SAT : count; *dest = weighted_prob(prep, newp, MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); + } else { + *dest = prep; } } -void vp9_counts_process(nmv_context_counts *NMVcount, int usehp) { - counts_to_context(&NMVcount->comps[0], usehp); - counts_to_context(&NMVcount->comps[1], usehp); +void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { + counts_to_context(&nmv_count->comps[0], usehp); + counts_to_context(&nmv_count->comps[1], usehp); } void vp9_counts_to_nmv_context( - nmv_context_counts *NMVcount, + nmv_context_counts *nmv_count, nmv_context *prob, int usehp, unsigned int (*branch_ct_joint)[2], @@ -241,81 +247,90 @@ void vp9_counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]) { int i, j, k; - vp9_counts_process(NMVcount, usehp); - vp9_tree_probs_from_distribution(MV_JOINTS, - vp9_mv_joint_encodings, - vp9_mv_joint_tree, + vp9_counts_process(nmv_count, usehp); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, prob->joints, branch_ct_joint, - NMVcount->joints); + nmv_count->joints, 0); for (i = 0; i < 2; ++i) { - prob->comps[i].sign = get_binary_prob(NMVcount->comps[i].sign[0], - NMVcount->comps[i].sign[1]); - branch_ct_sign[i][0] = NMVcount->comps[i].sign[0]; - branch_ct_sign[i][1] = NMVcount->comps[i].sign[1]; - vp9_tree_probs_from_distribution(MV_CLASSES, - vp9_mv_class_encodings, - vp9_mv_class_tree, + prob->comps[i].sign = get_binary_prob(nmv_count->comps[i].sign[0], + nmv_count->comps[i].sign[1]); + branch_ct_sign[i][0] = nmv_count->comps[i].sign[0]; + branch_ct_sign[i][1] = nmv_count->comps[i].sign[1]; + vp9_tree_probs_from_distribution(vp9_mv_class_tree, prob->comps[i].classes, branch_ct_classes[i], - NMVcount->comps[i].classes); - vp9_tree_probs_from_distribution(CLASS0_SIZE, - vp9_mv_class0_encodings, - vp9_mv_class0_tree, + nmv_count->comps[i].classes, 0); + vp9_tree_probs_from_distribution(vp9_mv_class0_tree, prob->comps[i].class0, branch_ct_class0[i], - NMVcount->comps[i].class0); + nmv_count->comps[i].class0, 0); for (j = 0; j < MV_OFFSET_BITS; ++j) { - prob->comps[i].bits[j] = get_binary_prob(NMVcount->comps[i].bits[j][0], - NMVcount->comps[i].bits[j][1]); - branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0]; - branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1]; + prob->comps[i].bits[j] = get_binary_prob(nmv_count->comps[i].bits[j][0], + nmv_count->comps[i].bits[j][1]); + branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0]; + branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1]; } } for (i = 0; i < 2; ++i) { for (k = 0; k < CLASS0_SIZE; ++k) { - vp9_tree_probs_from_distribution(4, - vp9_mv_fp_encodings, - vp9_mv_fp_tree, + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, prob->comps[i].class0_fp[k], branch_ct_class0_fp[i][k], - NMVcount->comps[i].class0_fp[k]); + nmv_count->comps[i].class0_fp[k], 0); } - vp9_tree_probs_from_distribution(4, - vp9_mv_fp_encodings, - vp9_mv_fp_tree, + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, prob->comps[i].fp, branch_ct_fp[i], - NMVcount->comps[i].fp); + nmv_count->comps[i].fp, 0); } if (usehp) { for (i = 0; i < 2; ++i) { prob->comps[i].class0_hp = - get_binary_prob(NMVcount->comps[i].class0_hp[0], - NMVcount->comps[i].class0_hp[1]); - branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0]; - branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1]; - - prob->comps[i].hp = get_binary_prob(NMVcount->comps[i].hp[0], - NMVcount->comps[i].hp[1]); - branch_ct_hp[i][0] = NMVcount->comps[i].hp[0]; - branch_ct_hp[i][1] = 
NMVcount->comps[i].hp[1]; + get_binary_prob(nmv_count->comps[i].class0_hp[0], + nmv_count->comps[i].class0_hp[1]); + branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0]; + branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1]; + + prob->comps[i].hp = get_binary_prob(nmv_count->comps[i].hp[0], + nmv_count->comps[i].hp[1]); + branch_ct_hp[i][0] = nmv_count->comps[i].hp[0]; + branch_ct_hp[i][1] = nmv_count->comps[i].hp[1]; } } } +static unsigned int adapt_probs(unsigned int i, + vp9_tree tree, + vp9_prob this_probs[], + const vp9_prob last_probs[], + const unsigned int num_events[]) { + vp9_prob this_prob; + + const uint32_t left = tree[i] <= 0 + ? num_events[-tree[i]] + : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); + + const uint32_t right = tree[i + 1] <= 0 + ? num_events[-tree[i + 1]] + : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); + + uint32_t weight = left + right; + if (weight) { + this_prob = get_binary_prob(left, right); + weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight; + this_prob = weighted_prob(last_probs[i >> 1], this_prob, + MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT); + } else { + this_prob = last_probs[i >> 1]; + } + this_probs[i >> 1] = this_prob; + return left + right; +} + + void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { - int i, j, k; - nmv_context prob; - unsigned int branch_ct_joint[MV_JOINTS - 1][2]; - unsigned int branch_ct_sign[2][2]; - unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; - unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; - unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; - unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; - unsigned int branch_ct_fp[2][4 - 1][2]; - unsigned int branch_ct_class0_hp[2][2]; - unsigned int branch_ct_hp[2][2]; + int i, j; #ifdef MV_COUNT_TESTING printf("joints count: "); for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); @@ -376,75 +391,48 @@ void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { smooth_counts(&cm->fc.NMVcount.comps[0]); smooth_counts(&cm->fc.NMVcount.comps[1]); #endif - vp9_counts_to_nmv_context(&cm->fc.NMVcount, - &prob, - usehp, - branch_ct_joint, - branch_ct_sign, - branch_ct_classes, - branch_ct_class0, - branch_ct_bits, - branch_ct_class0_fp, - branch_ct_fp, - branch_ct_class0_hp, - branch_ct_hp); - - for (j = 0; j < MV_JOINTS - 1; ++j) { - adapt_prob(&cm->fc.nmvc.joints[j], - cm->fc.pre_nmvc.joints[j], - prob.joints[j], - branch_ct_joint[j]); - } + vp9_counts_process(&cm->fc.NMVcount, usehp); + + adapt_probs(0, vp9_mv_joint_tree, + cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints, + cm->fc.NMVcount.joints); + for (i = 0; i < 2; ++i) { adapt_prob(&cm->fc.nmvc.comps[i].sign, cm->fc.pre_nmvc.comps[i].sign, - prob.comps[i].sign, - branch_ct_sign[i]); - for (j = 0; j < MV_CLASSES - 1; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].classes[j], - cm->fc.pre_nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - branch_ct_classes[i][j]); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].class0[j], - cm->fc.pre_nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - branch_ct_class0[i][j]); - } + cm->fc.NMVcount.comps[i].sign); + adapt_probs(0, vp9_mv_class_tree, + cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes, + cm->fc.NMVcount.comps[i].classes); + adapt_probs(0, vp9_mv_class0_tree, + cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0, + cm->fc.NMVcount.comps[i].class0); for (j = 0; j < MV_OFFSET_BITS; ++j) { 
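+      /* Each offset bit is an independent binary event, so it is adapted
+         directly from its 0/1 counts via adapt_prob() rather than through
+         the recursive adapt_probs() tree walk used for the joint, class and
+         class0 alphabets above. */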
adapt_prob(&cm->fc.nmvc.comps[i].bits[j], cm->fc.pre_nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - branch_ct_bits[i][j]); + cm->fc.NMVcount.comps[i].bits[j]); } } for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) { - for (k = 0; k < 3; ++k) { - adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k], - cm->fc.pre_nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - branch_ct_class0_fp[i][j][k]); - } - } - for (j = 0; j < 3; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].fp[j], - cm->fc.pre_nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - branch_ct_fp[i][j]); + adapt_probs(0, vp9_mv_fp_tree, + cm->fc.nmvc.comps[i].class0_fp[j], + cm->fc.pre_nmvc.comps[i].class0_fp[j], + cm->fc.NMVcount.comps[i].class0_fp[j]); } + adapt_probs(0, vp9_mv_fp_tree, + cm->fc.nmvc.comps[i].fp, + cm->fc.pre_nmvc.comps[i].fp, + cm->fc.NMVcount.comps[i].fp); } if (usehp) { for (i = 0; i < 2; ++i) { adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, cm->fc.pre_nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - branch_ct_class0_hp[i]); + cm->fc.NMVcount.comps[i].class0_hp); adapt_prob(&cm->fc.nmvc.comps[i].hp, cm->fc.pre_nmvc.comps[i].hp, - prob.comps[i].hp, - branch_ct_hp[i]); + cm->fc.NMVcount.comps[i].hp); } } } diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index f5cfee9379e0affd7f1d337865eaa230ce839055..162d2b44ff78cbed15324019e922f180e4e5a4af 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -49,7 +49,7 @@ extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS]; /* Symbols for coding magnitude class of nonzero components */ -#define MV_CLASSES 8 +#define MV_CLASSES 11 typedef enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ @@ -59,6 +59,9 @@ typedef enum { MV_CLASS_5 = 5, /* (32, 64] integer pel */ MV_CLASS_6 = 6, /* (64, 128] integer pel */ MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ } MV_CLASS_TYPE; extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; diff --git a/vp9/common/vp9_extend.c b/vp9/common/vp9_extend.c index d3e66f696de31a29d46c60a4ab9eb23654e86112..6aac905378a8ef86c0b9f28243a2dd8aa9bf06f8 100644 --- a/vp9/common/vp9_extend.c +++ b/vp9/common/vp9_extend.c @@ -11,159 +11,137 @@ #include "vp9/common/vp9_extend.h" #include "vpx_mem/vpx_mem.h" -static void copy_and_extend_plane(uint8_t *s, /* source */ - int sp, /* source pitch */ - uint8_t *d, /* destination */ - int dp, /* destination pitch */ - int h, /* height */ - int w, /* width */ - int et, /* extend top border */ - int el, /* extend left border */ - int eb, /* extend bottom border */ - int er) { /* extend right border */ - int i; - uint8_t *src_ptr1, *src_ptr2; - uint8_t *dest_ptr1, *dest_ptr2; - int linesize; - - /* copy the left and right most columns out */ - src_ptr1 = s; - src_ptr2 = s + w - 1; - dest_ptr1 = d - el; - dest_ptr2 = d + w; +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, + int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + w - 1; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; for (i = 0; i < h; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], el); - 
vpx_memcpy(dest_ptr1 + el, src_ptr1, w); - vpx_memset(dest_ptr2, src_ptr2[0], er); - src_ptr1 += sp; - src_ptr2 += sp; - dest_ptr1 += dp; - dest_ptr2 += dp; + vpx_memset(dst_ptr1, src_ptr1[0], extend_left); + vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w); + vpx_memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; } - /* Now copy the top and bottom lines into each line of the respective - * borders - */ - src_ptr1 = d - el; - src_ptr2 = d + dp * (h - 1) - el; - dest_ptr1 = d + dp * (-et) - el; - dest_ptr2 = d + dp * (h) - el; - linesize = el + er + w; - - for (i = 0; i < et; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, linesize); - dest_ptr1 += dp; + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h) - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; } - for (i = 0; i < eb; i++) { - vpx_memcpy(dest_ptr2, src_ptr2, linesize); - dest_ptr2 += dp; + for (i = 0; i < extend_bottom; i++) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; } } -void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { - int et = dst->border; - int el = dst->border; - int eb = dst->border + dst->y_height - src->y_height; - int er = dst->border + dst->y_width - src->y_width; + const int et_y = dst->border; + const int el_y = dst->border; + const int eb_y = dst->border + dst->y_height - src->y_height; + const int er_y = dst->border + dst->y_width - src->y_width; + + const int et_uv = dst->border >> 1; + const int el_uv = dst->border >> 1; + const int eb_uv = (dst->border >> 1) + dst->uv_height - src->uv_height; + const int er_uv = (dst->border >> 1) + dst->uv_width - src->uv_width; copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, - src->y_height, src->y_width, - et, el, eb, er); - - et = dst->border >> 1; - el = dst->border >> 1; - eb = (dst->border >> 1) + dst->uv_height - src->uv_height; - er = (dst->border >> 1) + dst->uv_width - src->uv_width; + src->y_width, src->y_height, + et_y, el_y, eb_y, er_y); copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, - src->uv_height, src->uv_width, - et, el, eb, er); + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, - src->uv_height, src->uv_width, - et, el, eb, er); + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); } -void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int srcy, int srcx, int srch, int srcw) { - int et = dst->border; - int el = dst->border; - int eb = dst->border + dst->y_height - src->y_height; - int er = dst->border + dst->y_width - src->y_width; - int src_y_offset = srcy * src->y_stride + srcx; - int dst_y_offset = srcy * dst->y_stride + srcx; - int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - // If the side is not touching the border then don't extend.
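+  /* The replicated border exists so that sub-pel interpolation filters can
+     read up to their tap radius outside the visible picture without bounds
+     checks. For 4:2:0 content the chroma planes are half-sized in each
+     dimension, which is why every luma extension amount is halved (rounding
+     up) before being applied to the u and v planes; e.g. a 32-pixel luma
+     border becomes a 16-pixel chroma border. */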
- if (srcy) - et = 0; - if (srcx) - el = 0; - if (srcy + srch != src->y_height) - eb = 0; - if (srcx + srcw != src->y_width) - er = 0; - - copy_and_extend_plane(src->y_buffer + src_y_offset, - src->y_stride, - dst->y_buffer + dst_y_offset, - dst->y_stride, - srch, srcw, - et, el, eb, er); - - et = (et + 1) >> 1; - el = (el + 1) >> 1; - eb = (eb + 1) >> 1; - er = (er + 1) >> 1; - srch = (srch + 1) >> 1; - srcw = (srcw + 1) >> 1; - - copy_and_extend_plane(src->u_buffer + src_uv_offset, - src->uv_stride, - dst->u_buffer + dst_uv_offset, - dst->uv_stride, - srch, srcw, - et, el, eb, er); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, - src->uv_stride, - dst->v_buffer + dst_uv_offset, - dst->uv_stride, - srch, srcw, - et, el, eb, er); + const int et_y = srcy ? 0 : dst->border; + const int el_y = srcx ? 0 : dst->border; + const int eb_y = srcy + srch != src->y_height ? 0 : + dst->border + dst->y_height - src->y_height; + const int er_y = srcx + srcw != src->y_width ? 0 : + dst->border + dst->y_width - src->y_width; + const int src_y_offset = srcy * src->y_stride + srcx; + const int dst_y_offset = srcy * dst->y_stride + srcx; + + const int et_uv = (et_y + 1) >> 1; + const int el_uv = (el_y + 1) >> 1; + const int eb_uv = (eb_y + 1) >> 1; + const int er_uv = (er_y + 1) >> 1; + const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + const int srch_uv = (srch + 1) >> 1; + const int srcw_uv = (srcw + 1) >> 1; + + copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, + dst->y_buffer + dst_y_offset, dst->y_stride, + srcw, srch, + et_y, el_y, eb_y, er_y); + + copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, + dst->u_buffer + dst_uv_offset, dst->uv_stride, + srcw_uv, srch_uv, + et_uv, el_uv, eb_uv, er_uv); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, + dst->v_buffer + dst_uv_offset, dst->uv_stride, + srcw_uv, srch_uv, + et_uv, el_uv, eb_uv, er_uv); } -/* note the extension is only for the last row, for intra prediction purpose */ -void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr, - uint8_t *UPtr, uint8_t *VPtr) { +// Note: the extension is only for the last row, for intra prediction purposes +void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf, + uint8_t *y, uint8_t *u, uint8_t *v) { int i; - YPtr += ybf->y_stride * 14; - UPtr += ybf->uv_stride * 6; - VPtr += ybf->uv_stride * 6; + y += buf->y_stride * 14; + u += buf->uv_stride * 6; + v += buf->uv_stride * 6; for (i = 0; i < 4; i++) { - YPtr[i] = YPtr[-1]; - UPtr[i] = UPtr[-1]; - VPtr[i] = VPtr[-1]; + y[i] = y[-1]; + u[i] = u[-1]; + v[i] = v[-1]; } - YPtr += ybf->y_stride; - UPtr += ybf->uv_stride; - VPtr += ybf->uv_stride; + y += buf->y_stride; + u += buf->uv_stride; + v += buf->uv_stride; for (i = 0; i < 4; i++) { - YPtr[i] = YPtr[-1]; - UPtr[i] = UPtr[-1]; - VPtr[i] = VPtr[-1]; + y[i] = y[-1]; + u[i] = u[-1]; + v[i] = v[-1]; } } diff --git a/vp9/common/vp9_extend.h b/vp9/common/vp9_extend.h index 847c2c5b95401060bedff56150a537d980ca4764..6ec75c992afa627185c796f1c1328084ad0b656c 100644 --- a/vp9/common/vp9_extend.h +++ b/vp9/common/vp9_extend.h @@ -14,15 +14,17 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" -void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr, - uint8_t *UPtr, uint8_t *VPtr); -void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); -void
vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int srcy, int srcx, int srch, int srcw); +void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf, + uint8_t *y, uint8_t *u, uint8_t *v); + + #endif // VP9_COMMON_VP9_EXTEND_H_ diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 07d8a169f6dbdd1332d1873ff27fe3d2c3c29f36..6c1ea21a1b594f7e4c276373e7e5e222bf259351 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -15,28 +15,30 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_common.h" -DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = { - { 128, 0 }, - { 120, 8 }, - { 112, 16 }, - { 104, 24 }, - { 96, 32 }, - { 88, 40 }, - { 80, 48 }, - { 72, 56 }, - { 64, 64 }, - { 56, 72 }, - { 48, 80 }, - { 40, 88 }, - { 32, 96 }, - { 24, 104 }, - { 16, 112 }, - { 8, 120 } +DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, + { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, + { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, + { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, + { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, + { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, + { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, + { 0, 0, 0, 8, 120, 0, 0, 0 } }; -#define FILTER_ALPHA 0 -#define FILTER_ALPHA_SHARP 1 -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { +#define FILTER_ALPHA 0 +#define FILTER_ALPHA_SHARP 0 +#define FILTER_ALPHA_SMOOTH 50 +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) + = { #if FILTER_ALPHA == 0 /* Lagrangian interpolation filter */ { 0, 0, 0, 128, 0, 0, 0, 0}, @@ -55,6 +57,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { { -1, 3, -9, 27, 118, -13, 4, -1}, { 0, 2, -6, 18, 122, -10, 3, -1}, { 0, 1, -3, 8, 126, -5, 1, 0} + #elif FILTER_ALPHA == 50 /* Generated using MATLAB: * alpha = 0.5; @@ -79,11 +82,13 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { { 0, 3, -9, 27, 118, -13, 3, -1}, { 0, 2, -6, 18, 122, -10, 2, 0}, { 0, 1, -3, 8, 126, -5, 1, 0} + #endif /* FILTER_ALPHA */ }; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = { -#if FILTER_ALPHA_SHARP == 1 +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) + = { +#if FILTER_ALPHA_SHARP == 0 /* dct based filter */ {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, @@ -101,31 +106,34 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = { {-2, 5, -10, 27, 121, -17, 7, -3}, {-1, 3, -6, 17, 125, -13, 5, -2}, {0, 1, -3, 8, 127, -7, 3, -1} -#elif FILTER_ALPHA_SHARP == 75 - /* alpha = 0.75 */ - {0, 0, 0, 128, 0, 0, 0, 0}, - {-1, 2, -6, 126, 9, -3, 2, -1}, - {-1, 4, -11, 123, 18, -7, 3, -1}, - {-2, 6, -16, 119, 28, -10, 5, -2}, - {-2, 7, -19, 113, 38, -13, 6, -2}, - {-3, 8, -21, 106, 49, -16, 7, -2}, - {-3, 9, -22, 99, 59, -19, 8, -3}, - {-3, 9, -23, 90, 70, -21, 9, -3}, - {-3, 9, -22, 80, 80, -22, 9, -3}, - {-3, 9, -21, 70, 90, -23, 9, -3}, - {-3, 8, -19, 59, 99, -22, 9, -3}, - {-2, 7, -16, 49, 106, -21, 8, -3}, - {-2, 6, -13, 38, 113, -19, 7, -2}, - {-2, 5, -10, 28, 119, -16, 6, -2}, - {-1, 3, -7, 18, 123, -11, 4, -1}, - {-1, 2, -3, 9, 126, -6, 2, -1} + 
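+/* All of the kernels in this file are 16-phase, 8-tap filters in Q7: the
+   taps of each row sum to 128, so flat image regions pass through unchanged.
+   A minimal sketch of how one horizontal output pixel is formed, assuming
+   VP9_FILTER_SHIFT is 7 and clip_pixel() clamps to 0..255 as in
+   vp9_common.h:
+
+     int k, sum = VP9_FILTER_WEIGHT >> 1;   // rounding term (64)
+     for (k = 0; k < 8; ++k)
+       sum += src[k - 3] * filter[k];       // taps centered on src[0]
+     dst[0] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+ */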
+#elif FILTER_ALPHA_SHARP == 80 + /* alpha = 0.80 */ + { 0, 0, 0, 128, 0, 0, 0, 0}, + {-1, 2, -6, 127, 9, -4, 2, -1}, + {-2, 5, -12, 124, 18, -7, 4, -2}, + {-2, 7, -16, 119, 28, -11, 5, -2}, + {-3, 8, -19, 114, 38, -14, 7, -3}, + {-3, 9, -22, 107, 49, -17, 8, -3}, + {-4, 10, -23, 99, 60, -20, 10, -4}, + {-4, 11, -23, 90, 70, -22, 10, -4}, + {-4, 11, -23, 80, 80, -23, 11, -4}, + {-4, 10, -22, 70, 90, -23, 11, -4}, + {-4, 10, -20, 60, 99, -23, 10, -4}, + {-3, 8, -17, 49, 107, -22, 9, -3}, + {-3, 7, -14, 38, 114, -19, 8, -3}, + {-2, 5, -11, 28, 119, -16, 7, -2}, + {-2, 4, -7, 18, 124, -12, 5, -2}, + {-1, 2, -4, 9, 127, -6, 2, -1} #endif /* FILTER_ALPHA_SHARP */ }; -DECLARE_ALIGNED(16, const int16_t, +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = { /* 8-tap lowpass filter */ /* Hamming window */ + /* freqmultiplier = 0.625 */ +#if FILTER_ALPHA_SMOOTH == 625 {-1, -7, 32, 80, 32, -7, -1, 0}, {-1, -8, 28, 80, 37, -7, -2, 1}, { 0, -8, 24, 79, 41, -7, -2, 1}, @@ -142,1074 +150,44 @@ DECLARE_ALIGNED(16, const int16_t, { 1, -3, -5, 45, 78, 20, -8, 0}, { 1, -2, -7, 41, 79, 24, -8, 0}, { 1, -2, -7, 37, 80, 28, -8, -1} -}; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = { - {0, 0, 128, 0, 0, 0}, - {1, -5, 125, 8, -2, 1}, - {1, -8, 122, 17, -5, 1}, - {2, -11, 116, 27, -8, 2}, - {3, -14, 110, 37, -10, 2}, - {3, -15, 103, 47, -12, 2}, - {3, -16, 95, 57, -14, 3}, - {3, -16, 86, 67, -15, 3}, - {3, -16, 77, 77, -16, 3}, - {3, -15, 67, 86, -16, 3}, - {3, -14, 57, 95, -16, 3}, - {2, -12, 47, 103, -15, 3}, - {2, -10, 37, 110, -14, 3}, - {2, -8, 27, 116, -11, 2}, - {1, -5, 17, 122, -8, 1}, - {1, -2, 8, 125, -5, 1} +#elif FILTER_ALPHA_SMOOTH == 50 + /* freqmultiplier = 0.5 */ + {-3, 0, 35, 64, 35, 0, -3, 0}, + {-3, -1, 32, 64, 38, 1, -3, 0}, + {-2, -2, 29, 63, 41, 2, -3, 0}, + {-2, -2, 26, 63, 43, 4, -4, 0}, + {-2, -3, 24, 62, 46, 5, -4, 0}, + {-2, -3, 21, 60, 49, 7, -4, 0}, + {-1, -4, 18, 59, 51, 9, -4, 0}, + {-1, -4, 16, 57, 53, 12, -4, -1}, + {-1, -4, 14, 55, 55, 14, -4, -1}, + {-1, -4, 12, 53, 57, 16, -4, -1}, + {0, -4, 9, 51, 59, 18, -4, -1}, + {0, -4, 7, 49, 60, 21, -3, -2}, + {0, -4, 5, 46, 62, 24, -3, -2}, + {0, -4, 4, 43, 63, 26, -2, -2}, + {0, -3, 2, 41, 63, 29, -2, -2}, + {0, -3, 1, 38, 64, 32, -1, -3} +#endif }; -static void filter_block2d_first_pass_6(uint8_t *src_ptr, - int *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... 
*/ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -static void filter_block2d_second_pass_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -/* - * The only functional difference between filter_block2d_second_pass() - * and this function is that filter_block2d_second_pass() does a sixtap - * filter on the input and stores it in the output. This function - * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, - * and then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_second_pass_avg_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) + - output_ptr[j] + 1) >> 1; - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -#define Interp_Extend 3 -static void filter_block2d_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - - -void vp9_sixtap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, - VFilter); -} - -/* - * The difference between filter_block2d_6() and filter_block2d_avg_6 is - * that filter_block2d_6() does a 6-tap filter and stores it in the output - * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and - * then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_avg_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - -void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter); -} - -void vp9_sixtap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); - -} - -void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); -} - -void vp9_sixtap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 4, 8, VFilter); -} - -void vp9_sixtap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -typedef enum { - VPX_FILTER_4x4 = 0, - VPX_FILTER_8x8 = 1, - VPX_FILTER_8x4 = 2, - VPX_FILTER_16x16 = 3, -} filter_size_t; - -static const unsigned int filter_size_to_wh[][2] = { - {4, 4}, - {8, 8}, - {8, 4}, - {16,16}, +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) + = { + {0, 0, 0, 128, 0, 0, 0, 0}, + {0, 1, -5, 125, 8, -2, 1, 0}, + {0, 1, -8, 122, 17, -5, 1, 0}, + {0, 2, -11, 116, 27, -8, 2, 0}, + {0, 3, -14, 110, 37, -10, 2, 0}, + {0, 3, -15, 103, 47, -12, 2, 0}, + {0, 3, -16, 95, 57, -14, 3, 0}, + {0, 3, -16, 86, 67, -15, 3, 0}, + {0, 3, -16, 77, 77, -16, 3, 0}, + {0, 3, -15, 67, 86, -16, 3, 0}, + {0, 3, -14, 57, 95, -16, 3, 0}, + {0, 2, -12, 47, 103, -15, 3, 0}, + {0, 2, -10, 37, 110, -14, 3, 0}, + {0, 2, -8, 27, 116, -11, 2, 0}, + {0, 1, -5, 17, 122, -8, 1, 0}, + {0, 1, -2, 8, 125, -5, 1, 0} }; - -static void filter_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - const filter_size_t filter_size, - uint8_t *dst_ptr, - unsigned int dst_stride) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - // Between passes, we use an intermediate buffer whose height is extended to - // have enough horizontally filtered values as input for the vertical pass. - // This buffer is allocated to be big enough for the largest block type we - // support. - const int kInterp_Extend = 4; - const unsigned int intermediate_height = - (kInterp_Extend - 1) + output_height + kInterp_Extend; - - /* Size of intermediate_buffer is max_intermediate_height * filter_max_width, - * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height - * + kInterp_Extend - * = 3 + 16 + 4 - * = 23 - * and filter_max_width = 16 - */ - uint8_t intermediate_buffer[23 * 16]; - const int intermediate_next_stride = 1 - intermediate_height * output_width; - - // Horizontal pass (src -> transposed intermediate). - { - uint8_t *output_ptr = intermediate_buffer; - const int src_next_row_stride = src_stride - output_width; - unsigned int i, j; - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - for (i = 0; i < intermediate_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * HFilter[0]) + - ((int)src_ptr[1] * HFilter[1]) + - ((int)src_ptr[2] * HFilter[2]) + - ((int)src_ptr[3] * HFilter[3]) + - ((int)src_ptr[4] * HFilter[4]) + - ((int)src_ptr[5] * HFilter[5]) + - ((int)src_ptr[6] * HFilter[6]) + - ((int)src_ptr[7] * HFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - output_ptr += intermediate_height; - } - src_ptr += src_next_row_stride; - output_ptr += intermediate_next_stride; - } - } - - // Vertical pass (transposed intermediate -> dst). - { - uint8_t *src_ptr = intermediate_buffer; - const int dst_next_row_stride = dst_stride - output_width; - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... 
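- // (The horizontal pass stored the intermediate buffer transposed, so the eight taps for one output pixel are the consecutive elements src_ptr[0..7]; advancing src_ptr by intermediate_height moves to the next column.)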
- int temp = ((int)src_ptr[0] * VFilter[0]) + - ((int)src_ptr[1] * VFilter[1]) + - ((int)src_ptr[2] * VFilter[2]) + - ((int)src_ptr[3] * VFilter[3]) + - ((int)src_ptr[4] * VFilter[4]) + - ((int)src_ptr[5] * VFilter[5]) + - ((int)src_ptr[6] * VFilter[6]) + - ((int)src_ptr[7] * VFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr += intermediate_height; - } - src_ptr += intermediate_next_stride; - dst_ptr += dst_next_row_stride; - } - } -} - -void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_4x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x8, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_16x16, dst_ptr, dst_stride); -} - -static void block2d_average_c(uint8_t *src, - unsigned int src_stride, - uint8_t *output_ptr, - unsigned int output_stride, - const filter_size_t filter_size) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; - } - output_ptr += output_stride; - } -} - -#define block2d_average block2d_average_c - -void vp9_eighttap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8[xoffset]; - VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8s[xoffset]; - 
VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8lp[xoffset]; - VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - - -void vp9_eighttap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr, 
- int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - 
tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_stride : Stride of source block. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the horizontal direction to produce the filtered output - * block. Used to implement first-pass of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * - ****************************************************************************/ -static void filter_block2d_bil_first_pass(uint8_t *src_ptr, - uint16_t *dst_ptr, - unsigned int src_stride, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply bilinear filter */ - dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[1] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - src_ptr++; - } - - /* Next row... */ - src_ptr += src_stride - width; - dst_ptr += width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t dst_pitch : Destination block pitch. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the vertical direction to produce the filtered output - * block. Used to implement second-pass of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * - ****************************************************************************/ -static void filter_block2d_bil_second_pass(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... 
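(only dst_ptr needs advancing: the inner loop has already stepped src_ptr forward by one full intermediate row of width entries)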
*/ - dst_ptr += dst_pitch; - } -} - -/* - * As before for filter_block2d_second_pass_avg(), the functional difference - * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg() - * is that filter_block2d_bil_second_pass() does a bilinear filter on input - * and stores the result in output; filter_block2d_bil_second_pass_avg(), - * instead, does a bilinear filter on input, averages the resulting value - * with the values already present in the output and stores the result of - * that back into the output ((filter_result + dest + 1) >> 1). - */ -static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1); - src_ptr++; - } - - /* Next row... */ - dst_ptr += dst_pitch; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pitch : Stride of source block. - * uint32_t dst_pitch : Stride of destination block. - * int32_t *HFilter : Array of 2 horizontal filter taps. - * int32_t *VFilter : Array of 2 vertical filter taps. - * int32_t Width : Block width - * int32_t Height : Block height - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : 2-D filters an input block by applying a 2-tap - * bi-linear filter horizontally followed by a 2-tap - * bi-linear filter vertically on the result. - * - * SPECIAL NOTES : The largest block size can be handled here is 16x16 - * - ****************************************************************************/ -static void filter_block2d_bil(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... */ - filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -static void filter_block2d_bil_avg(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... 
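(the avg second pass rounds the filtered value into the pixels already in dst: (filter_result + dst + 1) >> 1)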
*/ - filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -void vp9_bilinear_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); - -} - -void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 8, 8); -} - -void vp9_bilinear_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); - -} - -void vp9_bilinear_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} - -void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index cd666578d3cd53a7cd1d062b8553338dbbf41b79..1ccfdaac25c57f45ebe02179cb1458ba6abf62a1 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -21,10 +21,17 @@ #define SUBPEL_SHIFTS 16 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]; +extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; extern const int16_t 
vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; +// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear +// filter kernel as a 2 tap filter. +#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ + sizeof(vp9_bilinear_filters[0][0])) +#define BF_OFFSET (BF_LENGTH / 2 - 1) +#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 1d11f4244ff3e853bffc10c616bd319d405c3493..f6d6932cc5ba5b39a6b78480a1e3bad20ec58b6f 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -9,10 +9,11 @@ */ +#include <limits.h> + #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_sadmxn.h" #include "vp9/common/vp9_subpelvar.h" -#include <limits.h> const uint8_t vp9_mbsplit_offset[4][16] = { { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -32,8 +33,7 @@ static void lower_mv_precision(int_mv *mv, int usehp) } vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, - vp9_prob p[4], const int context - ) { + vp9_prob p[4], const int context) { p[0] = pc->fc.vp9_mode_contexts[context][0]; p[1] = pc->fc.vp9_mode_contexts[context][1]; p[2] = pc->fc.vp9_mode_contexts[context][2]; @@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 3, 16, HFilter); @@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 2, HFilter); @@ -118,10 +118,12 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr, return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse); } +#if CONFIG_USESELECTREFMV /* check a list of motion vectors by sad score using a number rows of pixels * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector */ + void vp9_find_best_ref_mvs(MACROBLOCKD *xd, uint8_t *ref_y_buffer, int ref_y_stride, @@ -141,130 +143,140 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv sorted_mvs[MAX_MV_REF_CANDIDATES]; int zero_seen = FALSE; - // Default all to 0,0 if nothing else available - nearest->as_int = near->as_int = 0; - vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); + if (ref_y_buffer) { + + // Default all to 0,0 if nothing else available + nearest->as_int = near->as_int = 0; + vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); - above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; - above_ref = ref_y_buffer - ref_y_stride * 2; + above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; + above_ref = ref_y_buffer - ref_y_stride * 2; #if CONFIG_ABOVESPREFMV - above_src -= 4; - above_ref -= 4; + above_src -= 4; + above_ref -= 4; #else - left_src = xd->dst.y_buffer - 2; - left_ref = ref_y_buffer - 2; + left_src = xd->dst.y_buffer - 2; + left_ref = ref_y_buffer - 2; #endif - // Limit search to the predicted best few candidates - for(i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - int_mv this_mv; - int offset = 0; - 
int row_offset, col_offset; + // Limit search to the predicted best few candidates + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + int_mv this_mv; + int offset = 0; + int row_offset, col_offset; - this_mv.as_int = mvlist[i].as_int; + this_mv.as_int = mvlist[i].as_int; - // If we see a 0,0 vector for a second time we have reached the end of - // the list of valid candidate vectors. - if (!this_mv.as_int && zero_seen) - break; + // If we see a 0,0 vector for a second time we have reached the end of + // the list of valid candidate vectors. + if (!this_mv.as_int && zero_seen) + break; - zero_seen = zero_seen || !this_mv.as_int; + zero_seen = zero_seen || !this_mv.as_int; #if !CONFIG_ABOVESPREFMV - clamp_mv(&this_mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + clamp_mv(&this_mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); #else - clamp_mv(&this_mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + clamp_mv(&this_mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); #endif - row_offset = this_mv.as_mv.row >> 3; - col_offset = this_mv.as_mv.col >> 3; - offset = ref_y_stride * row_offset + col_offset; - score = 0; - if (xd->up_available) { - vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src, xd->dst.y_stride, &sse); - score += sse; - if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { - vp9_sub_pixel_variance16x2(above_ref + offset + 16, - ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src + 16, xd->dst.y_stride, &sse); - score += sse; - } - if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) { - vp9_sub_pixel_variance16x2(above_ref + offset + 32, - ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src + 32, xd->dst.y_stride, &sse); - score += sse; - vp9_sub_pixel_variance16x2(above_ref + offset + 48, - ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src + 48, xd->dst.y_stride, &sse); - score += sse; - } - } + row_offset = this_mv.as_mv.row >> 3; + col_offset = this_mv.as_mv.col >> 3; + offset = ref_y_stride * row_offset + col_offset; + score = 0; #if !CONFIG_ABOVESPREFMV - if (xd->left_available) { - vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride, + if (xd->up_available) { +#else + if (xd->up_available && xd->left_available) { +#endif + vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - left_src, xd->dst.y_stride, &sse); - score += sse; - if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { - vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16, + above_src, xd->dst.y_stride, &sse); + score += sse; + if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { + vp9_sub_pixel_variance16x2(above_ref + offset + 16, ref_y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - left_src + xd->dst.y_stride * 16, - xd->dst.y_stride, &sse); - score += sse; - } - if 
(xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) { - vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32, + above_src + 16, xd->dst.y_stride, &sse); + score += sse; + } + if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) { + vp9_sub_pixel_variance16x2(above_ref + offset + 32, ref_y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - left_src + xd->dst.y_stride * 32, - xd->dst.y_stride, &sse); - score += sse; - vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48, + above_src + 32, xd->dst.y_stride, &sse); + score += sse; + vp9_sub_pixel_variance16x2(above_ref + offset + 48, ref_y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - left_src + xd->dst.y_stride * 48, - xd->dst.y_stride, &sse); + above_src + 48, xd->dst.y_stride, &sse); + score += sse; + } + } +#if !CONFIG_ABOVESPREFMV + if (xd->left_available) { + vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src, xd->dst.y_stride, &sse); score += sse; + if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { + vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16, + ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src + xd->dst.y_stride * 16, + xd->dst.y_stride, &sse); + score += sse; + } + if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) { + vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32, + ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src + xd->dst.y_stride * 32, + xd->dst.y_stride, &sse); + score += sse; + vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48, + ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src + xd->dst.y_stride * 48, + xd->dst.y_stride, &sse); + score += sse; + } } - } #endif - // Add the entry to our list and then resort the list on score. - ref_scores[i] = score; - sorted_mvs[i].as_int = this_mv.as_int; - j = i; - while (j > 0) { - if (ref_scores[j] < ref_scores[j-1]) { - ref_scores[j] = ref_scores[j-1]; - sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; - ref_scores[j-1] = score; - sorted_mvs[j-1].as_int = this_mv.as_int; - j--; - } else - break; + // Add the entry to our list and then resort the list on score. + ref_scores[i] = score; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (ref_scores[j] < ref_scores[j-1]) { + ref_scores[j] = ref_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + ref_scores[j-1] = score; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else { + break; + } + } } + } else { + vpx_memcpy(sorted_mvs, mvlist, sizeof(sorted_mvs)); } // Make sure all the candidates are properly clamped etc @@ -273,23 +285,35 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, clamp_mv2(&sorted_mvs[i], xd); } - // Provided that there are non zero vectors available there will not - // be more than one 0,0 entry in the sorted list. - // The best ref mv is always set to the first entry (which gave the best - // results. The nearest is set to the first non zero vector if available and - // near to the second non zero vector if available. - // We do not use 0,0 as a nearest or near as 0,0 has its own mode. - if ( sorted_mvs[0].as_int ) { - nearest->as_int = sorted_mvs[0].as_int; - if ( sorted_mvs[1].as_int ) - near->as_int = sorted_mvs[1].as_int; - else - near->as_int = sorted_mvs[2].as_int; + // Nearest may be a 0,0 or non zero vector and now matches the chosen + // "best reference". 
This has advantages when it is used as part of a + compound predictor as it means a non zero vector can be paired using + this mode with a 0 vector. The Near vector is still forced to be a + non zero candidate if one is available. + nearest->as_int = sorted_mvs[0].as_int; + if ( sorted_mvs[1].as_int ) { + near->as_int = sorted_mvs[1].as_int; } else { - nearest->as_int = sorted_mvs[1].as_int; - near->as_int = sorted_mvs[2].as_int; + near->as_int = sorted_mvs[2].as_int; } // Copy back the re-ordered mv list vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs)); } +#else +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + uint8_t *ref_y_buffer, + int ref_y_stride, + int_mv *mvlist, + int_mv *nearest, + int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); + clamp_mv2(&mvlist[i], xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} +#endif diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index a66a7de27ccce853b062590801817082e1db4bc7..6887b044f6d6559b965d550c7426b82eb92fe265 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -17,6 +17,9 @@ #include "vp9/common/vp9_treecoder.h" #include "vp9/common/vp9_onyxc_int.h" +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) + /* check a list of motion vectors by sad score using a number rows of pixels * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector @@ -28,9 +31,9 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv *nearest, int_mv *near); -static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) { - MV xmv; - xmv = mvp->as_mv; +static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, + int_mv *mvp, const int *ref_frame_sign_bias) { + MV xmv = mvp->as_mv; if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { xmv.row *= -1; @@ -40,8 +43,6 @@ static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, co mvp->as_mv = xmv; } -#define LEFT_TOP_MARGIN (16 << 3) -#define RIGHT_BOTTOM_MARGIN (16 << 3) static void clamp_mv(int_mv *mv, int mb_to_left_edge, @@ -71,10 +72,10 @@ static unsigned int check_mv_bounds(int_mv *mv, int mb_to_right_edge, int mb_to_top_edge, int mb_to_bottom_edge) { - return (mv->as_mv.col < mb_to_left_edge) || - (mv->as_mv.col > mb_to_right_edge) || - (mv->as_mv.row < mb_to_top_edge) || - (mv->as_mv.row > mb_to_bottom_edge); + return mv->as_mv.col < mb_to_left_edge || + mv->as_mv.col > mb_to_right_edge || + mv->as_mv.row < mb_to_top_edge || + mv->as_mv.row > mb_to_bottom_edge; } vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, @@ -83,21 +84,30 @@ vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, extern const uint8_t vp9_mbsplit_offset[4][16]; -static int left_block_mv(const MODE_INFO *cur_mb, int b) { +static int left_block_mv(const MACROBLOCKD *xd, + const MODE_INFO *cur_mb, int b) { if (!(b & 3)) { - /* On L edge, get from MB to left of us */ + if (!xd->left_available) + return 0; + + // On L edge, get from MB to left of us --cur_mb; if (cur_mb->mbmi.mode != SPLITMV) return cur_mb->mbmi.mv[0].as_int; + b += 4; } - return (cur_mb->bmi + b - 1)->as_mv.first.as_int; + return (cur_mb->bmi + b - 1)->as_mv[0].as_int; } -static int left_block_second_mv(const MODE_INFO *cur_mb, int b) { +static int left_block_second_mv(const MACROBLOCKD *xd, + const MODE_INFO *cur_mb, int b) { if (!(b & 3)) 
{ + if (!xd->left_available) + return 0; + /* On L edge, get from MB to left of us */ --cur_mb; @@ -108,8 +118,8 @@ static int left_block_second_mv(const MODE_INFO *cur_mb, int b) { } return cur_mb->mbmi.second_ref_frame > 0 ? - (cur_mb->bmi + b - 1)->as_mv.second.as_int : - (cur_mb->bmi + b - 1)->as_mv.first.as_int; + (cur_mb->bmi + b - 1)->as_mv[1].as_int : + (cur_mb->bmi + b - 1)->as_mv[0].as_int; } static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { @@ -122,7 +132,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { b += 16; } - return (cur_mb->bmi + b - 4)->as_mv.first.as_int; + return (cur_mb->bmi + b - 4)->as_mv[0].as_int; } static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { @@ -137,8 +147,8 @@ static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) } return cur_mb->mbmi.second_ref_frame > 0 ? - (cur_mb->bmi + b - 4)->as_mv.second.as_int : - (cur_mb->bmi + b - 4)->as_mv.first.as_int; + (cur_mb->bmi + b - 4)->as_mv[1].as_int : + (cur_mb->bmi + b - 4)->as_mv[0].as_int; } static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c new file mode 100644 index 0000000000000000000000000000000000000000..3ec093f735a7a2ecc8eff6f99737392c36678e95 --- /dev/null +++ b/vp9/common/vp9_idct.c @@ -0,0 +1,1307 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <math.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { + int i; + int a1, b1, c1, d1; + int16_t *ip = input; + int16_t *op = output; + const int half_pitch = pitch >> 1; + + for (i = 0; i < 4; i++) { + a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR; + b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR; + c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR; + d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR; + + op[0] = (a1 + b1 + 1) >> 1; + op[1] = (c1 + d1) >> 1; + op[2] = (a1 - b1) >> 1; + op[3] = (d1 - c1) >> 1; + + ip += 4; + op += half_pitch; + } + + ip = output; + op = output; + for (i = 0; i < 4; i++) { + a1 = ip[half_pitch * 0] + ip[half_pitch * 3]; + b1 = ip[half_pitch * 1] + ip[half_pitch * 2]; + c1 = ip[half_pitch * 1] - ip[half_pitch * 2]; + d1 = ip[half_pitch * 0] - ip[half_pitch * 3]; + + + op[half_pitch * 0] = (a1 + b1 + 1) >> 1; + op[half_pitch * 1] = (c1 + d1) >> 1; + op[half_pitch * 2] = (a1 - b1) >> 1; + op[half_pitch * 3] = (d1 - c1) >> 1; + + ip++; + op++; + } +} + +void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) { + int i; + int16_t tmp[4]; + int16_t *ip = in; + int16_t *op = tmp; + const int half_pitch = pitch >> 1; + + op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; + op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1; + + ip = tmp; + op = out; + for (i = 0; i < 4; i++) { + op[half_pitch * 0] = (ip[0] + 1) >> 1; + op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1; + ip++; + op++; + } +} + +void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, + int pitch, int stride) { + int r, c; + int16_t dc = input_dc; + int16_t tmp[4 * 4]; + vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1); + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) + dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]); + + dst_ptr += stride; + pred_ptr += pitch; + } +} + +void vp9_idct4_1d_c(int16_t *input, int16_t *output) { + int16_t step[4]; + int temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = dct_const_round_shift(temp1); + step[1] = dct_const_round_shift(temp2); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = dct_const_round_shift(temp1); + step[3] = dct_const_round_shift(temp2); + + // stage 2 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = step[1] - step[2]; + output[3] = step[0] - step[3]; +} + +void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[4 * 4]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j]; + vp9_idct4_1d(temp_in, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vp9_idct4_1d(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); + } +} + +void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) { + int i; + int a1; + int16_t *op = output; + const int half_pitch = pitch >> 1; + int16_t out = dct_const_round_shift(input[0] 
* cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + op[0] = op[1] = op[2] = op[3] = a1; + op += half_pitch; + } +} + +void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int r, c; + int16_t out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) + dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); + + dst_ptr += stride; + pred_ptr += pitch; + } +} + +static void idct8_1d(int16_t *input, int16_t *output) { + int16_t step1[8], step2[8]; + int temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + // stage 2 & stage 3 - even half + vp9_idct4_1d(step1, step1); + + // stage 2 - odd half + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + // stage 4 + output[0] = step1[0] + step1[7]; + output[1] = step1[1] + step1[6]; + output[2] = step1[2] + step1[5]; + output[3] = step1[3] + step1[4]; + output[4] = step1[3] - step1[4]; + output[5] = step1[2] - step1[5]; + output[6] = step1[1] - step1[6]; + output[7] = step1[0] - step1[7]; +} + +void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[8 * 8]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[8], temp_out[8]; + + // Rows + for (i = 0; i < 8; ++i) { + idct8_1d(input, outptr); + input += 8; + outptr += 8; + } + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_1d(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); + } +} + +static void iadst4_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[0]; + int x1 = input[1]; + int x2 = input[2]; + int x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
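+ // (dct_const_round_shift() rounds off the 14b multiplication scaling noted above: (x + (1 << 13)) >> 14.)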
+ output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_short_iht4x4_c(int16_t *input, int16_t *output, + int pitch, int tx_type) { + const transform_2d IHT_4[] = { + { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 + { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 + { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2 + { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + }; + + int i, j; + int16_t out[4 * 4]; + int16_t *outptr = out; + int16_t temp_in[4], temp_out[4]; + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); + } +} + +static void iadst8_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[7]; + int x1 = input[0]; + int x2 = input[5]; + int x3 = input[2]; + int x4 = input[3]; + int x5 = input[4]; + int x6 = input[1]; + int x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = dct_const_round_shift(s0 + s4); + x1 = dct_const_round_shift(s1 + s5); + x2 = dct_const_round_shift(s2 + s6); + x3 = dct_const_round_shift(s3 + s7); + x4 = dct_const_round_shift(s0 - s4); + x5 = dct_const_round_shift(s1 - s5); + x6 = dct_const_round_shift(s2 - s6); + x7 = dct_const_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} + +static const transform_2d IHT_8[] = { + { idct8_1d, idct8_1d }, // DCT_DCT = 0 + { iadst8_1d, idct8_1d }, // ADST_DCT = 1 + { idct8_1d, iadst8_1d }, // DCT_ADST = 2 + { iadst8_1d, iadst8_1d } // ADST_ADST = 3 +}; + +void vp9_short_iht8x8_c(int16_t *input, int16_t *output, + int pitch, int tx_type) { + int i, j; + int16_t out[8 * 8]; + int16_t *outptr = out; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = IHT_8[tx_type]; + + // inverse transform row vectors + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr); + input += 8; + outptr += 8; + } + + // inverse 
transform column vectors + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); + } +} + +void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[8 * 8]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[8], temp_out[8]; + + vpx_memset(out, 0, sizeof(out)); + // First transform rows + // only the first 4 rows have non-zero coefs + for (i = 0; i < 4; ++i) { + idct8_1d(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_1d(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5); + } +} + +void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 5); +} + +static void idct16_1d(int16_t *input, int16_t *output) { + int16_t step1[16], step2[16]; + int temp1, temp2; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = dct_const_round_shift(temp1); + step2[15] = dct_const_round_shift(temp2); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + step1[8] = step2[8] + step2[9]; + step1[9] = step2[8] - step2[9]; + step1[10] = -step2[10] + step2[11]; + step1[11] = step2[10] + step2[11]; + step1[12] = step2[12] + step2[13]; + step1[13] = step2[12] - step2[13]; + step1[14] = -step2[14] + step2[15]; + step1[15] = step2[14] + step2[15]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = 
(step1[0] - step1[1]) * cospi_16_64; + step2[0] = dct_const_round_shift(temp1); + step2[1] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = step2[0] + step2[3]; + step1[1] = step2[1] + step2[2]; + step1[2] = step2[1] - step2[2]; + step1[3] = step2[0] - step2[3]; + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + step1[8] = step2[8] + step2[11]; + step1[9] = step2[9] + step2[10]; + step1[10] = step2[9] - step2[10]; + step1[11] = step2[8] - step2[11]; + step1[12] = -step2[12] + step2[15]; + step1[13] = -step2[13] + step2[14]; + step1[14] = step2[13] + step2[14]; + step1[15] = step2[12] + step2[15]; + + // stage 6 + step2[0] = step1[0] + step1[7]; + step2[1] = step1[1] + step1[6]; + step2[2] = step1[2] + step1[5]; + step2[3] = step1[3] + step1[4]; + step2[4] = step1[3] - step1[4]; + step2[5] = step1[2] - step1[5]; + step2[6] = step1[1] - step1[6]; + step2[7] = step1[0] - step1[7]; + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = step2[0] + step2[15]; + output[1] = step2[1] + step2[14]; + output[2] = step2[2] + step2[13]; + output[3] = step2[3] + step2[12]; + output[4] = step2[4] + step2[11]; + output[5] = step2[5] + step2[10]; + output[6] = step2[6] + step2[9]; + output[7] = step2[7] + step2[8]; + output[8] = step2[7] - step2[8]; + output[9] = step2[6] - step2[9]; + output[10] = step2[5] - step2[10]; + output[11] = step2[4] - step2[11]; + output[12] = step2[3] - step2[12]; + output[13] = step2[2] - step2[13]; + output[14] = step2[1] - step2[14]; + output[15] = step2[0] - step2[15]; +} + +void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[16 * 16]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_1d(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + idct16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j * half_pitch + i] = 
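+  // Final rounding shift. The per-size convention in this file is
+  // >> 4 for 4x4, >> 5 for 8x8 and >> 6 for 16x16/32x32, which
+  // undoes the scaling applied by the matching forward transform.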
ROUND_POWER_OF_TWO(temp_out[j], 6); + } +} + +void iadst16_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + 
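+  // the x15 term below completes the (x14, x15) pair; stage 3
+  // applies the same cospi_8_64/cospi_24_64 rotation to (x4, x5),
+  // (x6, x7) and (x12, x13), mirroring stage 2 of iadst8_1d().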
x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +static const transform_2d IHT_16[] = { + { idct16_1d, idct16_1d }, // DCT_DCT = 0 + { iadst16_1d, idct16_1d }, // ADST_DCT = 1 + { idct16_1d, iadst16_1d }, // DCT_ADST = 2 + { iadst16_1d, iadst16_1d } // ADST_ADST = 3 +}; + +void vp9_short_iht16x16_c(int16_t *input, int16_t *output, + int pitch, int tx_type) { + int i, j; + int16_t out[16 * 16]; + int16_t *outptr = out; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = IHT_16[tx_type]; + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + } +} + +void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[16 * 16]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[16], temp_out[16]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. 
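+   * Skipping rows 4..15 is safe: their inputs are all zero, and the
+   * vpx_memset() below clears out[] so the column pass still reads a
+   * fully initialized buffer.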
+ */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + idct16_1d(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + idct16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + } +} + + +void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 6); +} + +static void idct32_1d(int16_t *input, int16_t *output) { + int16_t step1[32], step2[32]; + int temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = dct_const_round_shift(temp1); + step1[31] = dct_const_round_shift(temp2); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = dct_const_round_shift(temp1); + step1[30] = dct_const_round_shift(temp2); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = dct_const_round_shift(temp1); + step1[28] = dct_const_round_shift(temp2); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = dct_const_round_shift(temp1); + step1[24] = dct_const_round_shift(temp2); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = dct_const_round_shift(temp1); + step2[15] = dct_const_round_shift(temp2); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = 
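+  // dct_const_round_shift(x) is ROUND_POWER_OF_TWO(x, 14) per the
+  // DCT_CONST_BITS definition in vp9_idct.h; it brings each
+  // 16-bit-by-14-bit cospi product back to int16_t range after a
+  // rotation.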
dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + + step2[16] = step1[16] + step1[17]; + step2[17] = step1[16] - step1[17]; + step2[18] = -step1[18] + step1[19]; + step2[19] = step1[18] + step1[19]; + step2[20] = step1[20] + step1[21]; + step2[21] = step1[20] - step1[21]; + step2[22] = -step1[22] + step1[23]; + step2[23] = step1[22] + step1[23]; + step2[24] = step1[24] + step1[25]; + step2[25] = step1[24] - step1[25]; + step2[26] = -step1[26] + step1[27]; + step2[27] = step1[26] + step1[27]; + step2[28] = step1[28] + step1[29]; + step2[29] = step1[28] - step1[29]; + step2[30] = -step1[30] + step1[31]; + step2[31] = step1[30] + step1[31]; + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + step1[8] = step2[8] + step2[9]; + step1[9] = step2[8] - step2[9]; + step1[10] = -step2[10] + step2[11]; + step1[11] = step2[10] + step2[11]; + step1[12] = step2[12] + step2[13]; + step1[13] = step2[12] - step2[13]; + step1[14] = -step2[14] + step2[15]; + step1[15] = step2[14] + step2[15]; + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = dct_const_round_shift(temp1); + step1[30] = dct_const_round_shift(temp2); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = dct_const_round_shift(temp1); + step2[1] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + temp1 = -step1[10] * 
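+  // from here the step1[0..15] lanes repeat the idct16_1d()
+  // rotations; step1[16..31] carry the extra odd-frequency half
+  // introduced by the 32-point transform.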
cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = step1[16] + step1[19]; + step2[17] = step1[17] + step1[18]; + step2[18] = step1[17] - step1[18]; + step2[19] = step1[16] - step1[19]; + step2[20] = -step1[20] + step1[23]; + step2[21] = -step1[21] + step1[22]; + step2[22] = step1[21] + step1[22]; + step2[23] = step1[20] + step1[23]; + + step2[24] = step1[24] + step1[27]; + step2[25] = step1[25] + step1[26]; + step2[26] = step1[25] - step1[26]; + step2[27] = step1[24] - step1[27]; + step2[28] = -step1[28] + step1[31]; + step2[29] = -step1[29] + step1[30]; + step2[30] = step1[29] + step1[30]; + step2[31] = step1[28] + step1[31]; + + // stage 5 + step1[0] = step2[0] + step2[3]; + step1[1] = step2[1] + step2[2]; + step1[2] = step2[1] - step2[2]; + step1[3] = step2[0] - step2[3]; + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + step1[8] = step2[8] + step2[11]; + step1[9] = step2[9] + step2[10]; + step1[10] = step2[9] - step2[10]; + step1[11] = step2[8] - step2[11]; + step1[12] = -step2[12] + step2[15]; + step1[13] = -step2[13] + step2[14]; + step1[14] = step2[13] + step2[14]; + step1[15] = step2[12] + step2[15]; + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = dct_const_round_shift(temp1); + step1[28] = dct_const_round_shift(temp2); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = step1[0] + step1[7]; + step2[1] = step1[1] + step1[6]; + step2[2] = step1[2] + step1[5]; + step2[3] = step1[3] + step1[4]; + step2[4] = step1[3] - step1[4]; + step2[5] = step1[2] - step1[5]; + step2[6] = step1[1] - step1[6]; + step2[7] = step1[0] - step1[7]; + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = step1[16] + step1[23]; + step2[17] = step1[17] + step1[22]; + step2[18] = step1[18] + step1[21]; + step2[19] = step1[19] + step1[20]; + step2[20] = step1[19] - step1[20]; + step2[21] = step1[18] - step1[21]; + step2[22] = step1[17] - 
step1[22]; + step2[23] = step1[16] - step1[23]; + + step2[24] = -step1[24] + step1[31]; + step2[25] = -step1[25] + step1[30]; + step2[26] = -step1[26] + step1[29]; + step2[27] = -step1[27] + step1[28]; + step2[28] = step1[27] + step1[28]; + step2[29] = step1[26] + step1[29]; + step2[30] = step1[25] + step1[30]; + step2[31] = step1[24] + step1[31]; + + // stage 7 + step1[0] = step2[0] + step2[15]; + step1[1] = step2[1] + step2[14]; + step1[2] = step2[2] + step2[13]; + step1[3] = step2[3] + step2[12]; + step1[4] = step2[4] + step2[11]; + step1[5] = step2[5] + step2[10]; + step1[6] = step2[6] + step2[9]; + step1[7] = step2[7] + step2[8]; + step1[8] = step2[7] - step2[8]; + step1[9] = step2[6] - step2[9]; + step1[10] = step2[5] - step2[10]; + step1[11] = step2[4] - step2[11]; + step1[12] = step2[3] - step2[12]; + step1[13] = step2[2] - step2[13]; + step1[14] = step2[1] - step2[14]; + step1[15] = step2[0] - step2[15]; + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = dct_const_round_shift(temp1); + step1[24] = dct_const_round_shift(temp2); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = step1[0] + step1[31]; + output[1] = step1[1] + step1[30]; + output[2] = step1[2] + step1[29]; + output[3] = step1[3] + step1[28]; + output[4] = step1[4] + step1[27]; + output[5] = step1[5] + step1[26]; + output[6] = step1[6] + step1[25]; + output[7] = step1[7] + step1[24]; + output[8] = step1[8] + step1[23]; + output[9] = step1[9] + step1[22]; + output[10] = step1[10] + step1[21]; + output[11] = step1[11] + step1[20]; + output[12] = step1[12] + step1[19]; + output[13] = step1[13] + step1[18]; + output[14] = step1[14] + step1[17]; + output[15] = step1[15] + step1[16]; + output[16] = step1[15] - step1[16]; + output[17] = step1[14] - step1[17]; + output[18] = step1[13] - step1[18]; + output[19] = step1[12] - step1[19]; + output[20] = step1[11] - step1[20]; + output[21] = step1[10] - step1[21]; + output[22] = step1[9] - step1[22]; + output[23] = step1[8] - step1[23]; + output[24] = step1[7] - step1[24]; + output[25] = step1[6] - step1[25]; + output[26] = step1[5] - step1[26]; + output[27] = step1[4] - step1[27]; + output[28] = step1[3] - step1[28]; + output[29] = step1[2] - step1[29]; + output[30] = step1[1] - step1[30]; + output[31] = step1[0] - step1[31]; +} + +void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[32 * 32]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 
32; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + } +} + +void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 6); +} + +void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[32 * 32]; + int16_t *outptr = out; + const int half_pitch = pitch >> 1; + int i, j; + int16_t temp_in[32], temp_out[32]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. + */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6); + } +} diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h new file mode 100644 index 0000000000000000000000000000000000000000..176bf5da4311ad2fbf30921f852cc1772ff17a5a --- /dev/null +++ b/vp9/common/vp9_idct.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_IDCT_H_ +#define VP9_COMMON_VP9_IDCT_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) + +// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31. 
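+// For example, cospi_1_64 = round(16384 * cos(1*Pi/64))
+//                         = round(16364.27) = 16364.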
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const int cospi_1_64 = 16364; +static const int cospi_2_64 = 16305; +static const int cospi_3_64 = 16207; +static const int cospi_4_64 = 16069; +static const int cospi_5_64 = 15893; +static const int cospi_6_64 = 15679; +static const int cospi_7_64 = 15426; +static const int cospi_8_64 = 15137; +static const int cospi_9_64 = 14811; +static const int cospi_10_64 = 14449; +static const int cospi_11_64 = 14053; +static const int cospi_12_64 = 13623; +static const int cospi_13_64 = 13160; +static const int cospi_14_64 = 12665; +static const int cospi_15_64 = 12140; +static const int cospi_16_64 = 11585; +static const int cospi_17_64 = 11003; +static const int cospi_18_64 = 10394; +static const int cospi_19_64 = 9760; +static const int cospi_20_64 = 9102; +static const int cospi_21_64 = 8423; +static const int cospi_22_64 = 7723; +static const int cospi_23_64 = 7005; +static const int cospi_24_64 = 6270; +static const int cospi_25_64 = 5520; +static const int cospi_26_64 = 4756; +static const int cospi_27_64 = 3981; +static const int cospi_28_64 = 3196; +static const int cospi_29_64 = 2404; +static const int cospi_30_64 = 1606; +static const int cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const int sinpi_1_9 = 5283; +static const int sinpi_2_9 = 9929; +static const int sinpi_3_9 = 13377; +static const int sinpi_4_9 = 15212; + +static INLINE int dct_const_round_shift(int input) { + int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + assert(INT16_MIN <= rv && rv <= INT16_MAX); + return rv; +} + +static INLINE int dct_32_round(int input) { + int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + assert(-131072 <= rv && rv <= 131071); + return rv; +} + +typedef void (*transform_1d)(int16_t*, int16_t*); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c deleted file mode 100644 index 106ef9c1993807799f5a876b9b3e7d62476b03c2..0000000000000000000000000000000000000000 --- a/vp9/common/vp9_idctllm.c +++ /dev/null @@ -1,2670 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/**************************************************************************** - * Notes: - * - * This implementation makes use of 16 bit fixed point verio of two multiply - * constants: - * 1. sqrt(2) * cos (pi/8) - * 2. sqrt(2) * sin (pi/8) - * Becuase the first constant is bigger than 1, to maintain the same 16 bit - * fixed point precision as the second one, we use a trick of - * x * a = x + x*(a-1) - * so - * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 
- **************************************************************************/ -#include <assert.h> -#include <math.h> -#include "./vpx_config.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_common.h" - -static const int cospi8sqrt2minus1 = 20091; -static const int sinpi8sqrt2 = 35468; -static const int rounding = 0; - -static const int16_t idct_i4[16] = { - 8192, 10703, 8192, 4433, - 8192, 4433, -8192, -10703, - 8192, -4433, -8192, 10703, - 8192, -10703, 8192, -4433 -}; - -static const int16_t iadst_i4[16] = { - 3736, 9459, 10757, 7021, - 7021, 9459, -3736, -10757, - 9459, 0, -9459, 9459, - 10757, -9459, 7021, -3736 -}; - -static const int16_t idct_i8[64] = { - 5793, 8035, 7568, 6811, - 5793, 4551, 3135, 1598, - 5793, 6811, 3135, -1598, - -5793, -8035, -7568, -4551, - 5793, 4551, -3135, -8035, - -5793, 1598, 7568, 6811, - 5793, 1598, -7568, -4551, - 5793, 6811, -3135, -8035, - 5793, -1598, -7568, 4551, - 5793, -6811, -3135, 8035, - 5793, -4551, -3135, 8035, - -5793, -1598, 7568, -6811, - 5793, -6811, 3135, 1598, - -5793, 8035, -7568, 4551, - 5793, -8035, 7568, -6811, - 5793, -4551, 3135, -1598 -}; - -static const int16_t iadst_i8[64] = { - 1460, 4184, 6342, 7644, - 7914, 7114, 5354, 2871, - 2871, 7114, 7644, 4184, - -1460, -6342, -7914, -5354, - 4184, 7914, 2871, -5354, - -7644, -1460, 6342, 7114, - 5354, 6342, -4184, -7114, - 2871, 7644, -1460, -7914, - 6342, 2871, -7914, 1460, - 7114, -5354, -4184, 7644, - 7114, -1460, -5354, 7914, - -4184, -2871, 7644, -6342, - 7644, -5354, 1460, 2871, - -6342, 7914, -7114, 4184, - 7914, -7644, 7114, -6342, - 5354, -4184, 2871, -1460 -}; - - - -static const int16_t idct_i16[256] = { - 4096, 5765, 5681, 5543, 5352, 5109, 4816, 4478, - 4096, 3675, 3218, 2731, 2217, 1682, 1130, 568, - 4096, 5543, 4816, 3675, 2217, 568, -1130, -2731, - -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682, - 4096, 5109, 3218, 568, -2217, -4478, -5681, -5543, - -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731, - 4096, 4478, 1130, -2731, -5352, -5543, -3218, 568, - 4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675, - 4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765, - 4096, -568, -4816, -5543, -2217, 2731, 5681, 4478, - 4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682, - -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109, - 4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109, - -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543, - 4096, 568, -5681, -1682, 5352, 2731, -4816, -3675, - 4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765, - 4096, -568, -5681, 1682, 5352, -2731, -4816, 3675, - 4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765, - 4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109, - -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543, - 4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682, - -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109, - 4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765, - 4096, 568, -4816, 5543, -2217, -2731, 5681, -4478, - 4096, -4478, 1130, 2731, -5352, 5543, -3218, -568, - 4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675, - 4096, -5109, 3218, -568, -2217, 4478, -5681, 5543, - -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731, - 4096, -5543, 4816, -3675, 2217, -568, -1130, 2731, - -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682, - 4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478, - 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568 -}; - -static const int16_t iadst_i16[256] = { - 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646, - 5698, 5543, 5189, 4646, 3936, 3084, 2120, 
1080, - 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607, - -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120, - 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189, - -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084, - 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084, - 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936, - 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311, - 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646, - 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311, - -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189, - 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084, - -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543, - 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189, - 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698, - 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607, - 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646, - 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646, - -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390, - 4940, 0, -4940, 4940, 0, -4940, 4940, 0, - -4940, 4940, 0, -4940, 4940, 0, -4940, 4940, - 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646, - 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311, - 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607, - 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526, - 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, - -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, - 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, - -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, - 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, - 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 -}; - - -/* Converted the transforms to integer form. */ -#define HORIZONTAL_SHIFT 14 // 16 -#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) -#define VERTICAL_SHIFT 17 // 15 -#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) -void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, - TX_TYPE tx_type, int tx_dim, uint16_t eobs) { - int i, j, k; - int nz_dim; - int16_t imbuf[256]; - - const int16_t *ip = input; - int16_t *op = output; - int16_t *im = &imbuf[0]; - - /* pointers to vertical and horizontal transforms. */ - const int16_t *ptv = NULL, *pth = NULL; - int shortpitch = pitch >> 1; - - switch (tx_type) { - case ADST_ADST : - ptv = pth = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] - : &iadst_i16[0]); - break; - case ADST_DCT : - ptv = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); - pth = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]); - break; - case DCT_ADST : - ptv = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]); - pth = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); - break; - case DCT_DCT : - ptv = pth = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? &idct_i8[0] - : &idct_i16[0]); - break; - default: - assert(0); - break; - } - - nz_dim = tx_dim; - if(tx_dim > 4) { - if(eobs < 36) { - vpx_memset(im, 0, 512); - nz_dim = 8; - if(eobs < 3) { - nz_dim = 2; - } else if(eobs < 10) { - nz_dim = 4; - } - } - } - - /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps - * from right to left: - * 1. horizontal transform: Y= Z*Transposed_M2 - * 2. vertical transform: X = M1*Y - * In SIMD, doing this way could eliminate the transpose needed if it is - * calculated from left to right. 
- */ - /* Horizontal transformation */ - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < nz_dim; i++) { - int temp = 0; - - for (k = 0; k < nz_dim; k++) { - temp += ip[k] * pth[k]; - } - - /* Calculate im and store it in its transposed position. */ - im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); - ip += tx_dim; - } - im += tx_dim; - pth += tx_dim; - ip = input; - } - - /* Vertical transformation */ - im = &imbuf[0]; - - for (i = 0; i < tx_dim; i++) { - for (j = 0; j < tx_dim; j++) { - int temp = 0; - - for (k = 0; k < nz_dim; k++) { - temp += ptv[k] * im[k]; - } - - op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); - im += tx_dim; - } - im = &imbuf[0]; - ptv += tx_dim; - op += shortpitch; - } -} - -void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { - int i; - int a1, b1, c1, d1; - - int16_t *ip = input; - int16_t *op = output; - int temp1, temp2; - int shortpitch = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[8]; - b1 = ip[0] - ip[8]; - - temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); - c1 = temp1 - temp2; - - temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16; - d1 = temp1 + temp2; - - op[shortpitch * 0] = a1 + d1; - op[shortpitch * 3] = a1 - d1; - - op[shortpitch * 1] = b1 + c1; - op[shortpitch * 2] = b1 - c1; - - ip++; - op++; - } - - ip = output; - op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[2]; - b1 = ip[0] - ip[2]; - - temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16); - c1 = temp1 - temp2; - - temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16; - d1 = temp1 + temp2; - - op[0] = (a1 + d1 + 16) >> 5; - op[3] = (a1 - d1 + 16) >> 5; - - op[1] = (b1 + c1 + 16) >> 5; - op[2] = (b1 - c1 + 16) >> 5; - - ip += shortpitch; - op += shortpitch; - } -} - -void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) { - int i; - int a1; - int16_t *op = output; - int shortpitch = pitch >> 1; - a1 = ((input[0] + 16) >> 5); - for (i = 0; i < 4; i++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += shortpitch; - } -} - -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, int pitch, int stride) { - int a1 = ((input_dc + 16) >> 5); - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); - } - - dst_ptr += stride; - pred_ptr += pitch; - } -} - -void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) { - int i; - int a1, b1, c1, d1; - int16_t *ip = input; - int16_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])); - b1 = ((ip[1] + ip[2])); - c1 = ((ip[1] - ip[2])); - d1 = ((ip[0] - ip[3])); - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += 4; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; - b1 = ip[4] + ip[8]; - c1 = ip[4] - ip[8]; - d1 = ip[0] - ip[12]; - op[0] = (a1 + b1 + 1) >> 1; - op[4] = (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) { - int i; - int16_t tmp[4]; - int16_t *ip = in; - int16_t *op = tmp; - - op[0] = (ip[0] + 1) >> 1; - op[1] = op[2] = op[3] = (ip[0] >> 1); - - ip = tmp; - 
op = out; - for (i = 0; i < 4; i++) { - op[0] = (ip[0] + 1) >> 1; - op[4] = op[8] = op[12] = (ip[0] >> 1); - ip++; - op++; - } -} - -#if CONFIG_LOSSLESS -void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) { - int i; - int a1, b1, c1, d1; - int16_t *ip = input; - int16_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; - b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; - c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; - d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += 4; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; - b1 = ip[4] + ip[8]; - c1 = ip[4] - ip[8]; - d1 = ip[0] - ip[12]; - - - op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) { - int i; - int16_t tmp[4]; - int16_t *ip = in; - int16_t *op = tmp; - - op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; - op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); - - ip = tmp; - op = out; - for (i = 0; i < 4; i++) { - op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) { - int i; - int a1, b1, c1, d1; - int16_t *ip = input; - int16_t *op = output; - int shortpitch = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; - b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; - c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; - d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += shortpitch; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[shortpitch * 0] + ip[shortpitch * 3]; - b1 = ip[shortpitch * 1] + ip[shortpitch * 2]; - c1 = ip[shortpitch * 1] - ip[shortpitch * 2]; - d1 = ip[shortpitch * 0] - ip[shortpitch * 3]; - - - op[shortpitch * 0] = (a1 + b1 + 1) >> 1; - op[shortpitch * 1] = (c1 + d1) >> 1; - op[shortpitch * 2] = (a1 - b1) >> 1; - op[shortpitch * 3] = (d1 - c1) >> 1; - - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) { - int i; - int16_t tmp[4]; - int16_t *ip = in; - int16_t *op = tmp; - int shortpitch = pitch >> 1; - - op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; - op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); - - - ip = tmp; - op = out; - for (i = 0; i < 4; i++) { - op[shortpitch * 0] = (ip[0] + 1) >> 1; - op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; - ip++; - op++; - } -} - -void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, - int pitch, int stride) { - int r, c; - short tmp[16]; - vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]); - } - - dst_ptr += stride; - pred_ptr += pitch; - } -} -#endif - -void vp9_dc_only_idct_add_8x8_c(short input_dc, - uint8_t *pred_ptr, - uint8_t *dst_ptr, - int pitch, int stride) { - int a1 = ((input_dc 
+ 16) >> 5); - int r, c, b; - uint8_t *orig_pred = pred_ptr; - uint8_t *orig_dst = dst_ptr; - for (b = 0; b < 4; b++) { - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); - } - - dst_ptr += stride; - pred_ptr += pitch; - } - dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride; - pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch; - } -} - -#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */ -#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */ -#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */ -#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */ -#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */ -#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */ - -/* row (horizontal) IDCT - * - * 7 pi 1 dst[k] = sum c[l] * src[l] * cos( -- * - * ( k + - ) * l ) l=0 8 2 - * - * where: c[0] = 128 c[1..7] = 128*sqrt(2) */ - -static void idctrow(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - /* shortcut */ - if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | - (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { - blk[0] = blk[1] = blk[2] = blk[3] = blk[4] - = blk[5] = blk[6] = blk[7] = blk[0] << 3; - return; - } - - x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ - /* first stage */ - x8 = W7 * (x4 + x5); - x4 = x8 + (W1 - W7) * x4; - x5 = x8 - (W1 + W7) * x5; - x8 = W3 * (x6 + x7); - x6 = x8 - (W3 - W5) * x6; - x7 = x8 - (W3 + W5) * x7; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2); - x2 = x1 - (W2 + W6) * x2; - x3 = x1 + (W2 - W6) * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[0] = (x7 + x1) >> 8; - blk[1] = (x3 + x2) >> 8; - blk[2] = (x0 + x4) >> 8; - blk[3] = (x8 + x6) >> 8; - blk[4] = (x8 - x6) >> 8; - blk[5] = (x0 - x4) >> 8; - blk[6] = (x3 - x2) >> 8; - blk[7] = (x7 - x1) >> 8; -} - -/* column (vertical) IDCT - * - * 7 pi 1 dst[8*k] = sum c[l] * src[8*l] * - * cos( -- * ( k + - ) * l ) l=0 8 2 - * - * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */ -static void idctcol(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | - (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | - (x7 = blk[8 * 3]))) { - blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] - = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] - = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); - return; - } - - x0 = (blk[8 * 0] << 8) + 16384; - - /* first stage */ - x8 = W7 * (x4 + x5) + 4; - x4 = (x8 + (W1 - W7) * x4) >> 3; - x5 = (x8 - (W1 + W7) * x5) >> 3; - x8 = W3 * (x6 + x7) + 4; - x6 = (x8 - (W3 - W5) * x6) >> 3; - x7 = (x8 - (W3 + W5) * x7) >> 3; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2) + 4; - x2 = (x1 - (W2 + W6) * x2) >> 3; - x3 = (x1 + (W2 - W6) * x3) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[8 * 0] = (x7 + x1) >> 14; - blk[8 * 1] = (x3 + x2) >> 14; - blk[8 * 2] = (x0 + x4) >> 14; - blk[8 * 3] = (x8 + x6) >> 14; - blk[8 * 4] = (x8 - x6) >> 14; - blk[8 * 5] = (x0 - x4) >> 14; - blk[8 * 6] = (x3 - x2) >> 14; - blk[8 * 7] = (x7 - x1) >> 14; -} - -#define TX_DIM 8 -void 
vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) { - int X[TX_DIM * TX_DIM]; - int i, j; - int shortpitch = pitch >> 1; - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 - + (coefs[i * TX_DIM + j] < 0)) >> 2; - } - } - for (i = 0; i < 8; i++) - idctrow(X + 8 * i); - - for (i = 0; i < 8; i++) - idctcol(X + i); - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; - } - } -} - -/* Row IDCT when only first 4 coefficients are non-zero. */ -static void idctrow10(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | - (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { - blk[0] = blk[1] = blk[2] = blk[3] = blk[4] - = blk[5] = blk[6] = blk[7] = blk[0] << 3; - return; - } - - x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ - /* first stage */ - x5 = W7 * x4; - x4 = W1 * x4; - x6 = W3 * x7; - x7 = -W5 * x7; - - /* second stage */ - x2 = W6 * x3; - x3 = W2 * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x0 + x3; - x8 = x0 - x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[0] = (x7 + x1) >> 8; - blk[1] = (x3 + x2) >> 8; - blk[2] = (x0 + x4) >> 8; - blk[3] = (x8 + x6) >> 8; - blk[4] = (x8 - x6) >> 8; - blk[5] = (x0 - x4) >> 8; - blk[6] = (x3 - x2) >> 8; - blk[7] = (x7 - x1) >> 8; -} - -/* Column (vertical) IDCT when only first 4 coefficients are non-zero. */ -static void idctcol10(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | - (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | - (x7 = blk[8 * 3]))) { - blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] - = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] - = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); - return; - } - - x0 = (blk[8 * 0] << 8) + 16384; - - /* first stage */ - x5 = (W7 * x4 + 4) >> 3; - x4 = (W1 * x4 + 4) >> 3; - x6 = (W3 * x7 + 4) >> 3; - x7 = (-W5 * x7 + 4) >> 3; - - /* second stage */ - x2 = (W6 * x3 + 4) >> 3; - x3 = (W2 * x3 + 4) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x0 + x3; - x8 = x0 - x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[8 * 0] = (x7 + x1) >> 14; - blk[8 * 1] = (x3 + x2) >> 14; - blk[8 * 2] = (x0 + x4) >> 14; - blk[8 * 3] = (x8 + x6) >> 14; - blk[8 * 4] = (x8 - x6) >> 14; - blk[8 * 5] = (x0 - x4) >> 14; - blk[8 * 6] = (x3 - x2) >> 14; - blk[8 * 7] = (x7 - x1) >> 14; -} - -void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) { - int X[TX_DIM * TX_DIM]; - int i, j; - int shortpitch = pitch >> 1; - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 - + (coefs[i * TX_DIM + j] < 0)) >> 2; - } - } - - /* Do first 4 row idct only since non-zero dct coefficients are all in - * upper-left 4x4 area. 
*/ - for (i = 0; i < 4; i++) - idctrow10(X + 8 * i); - - for (i = 0; i < 8; i++) - idctcol10(X + i); - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; - } - } -} - -void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) { - int i; - int16_t *ip = input; // 0, 1, 4, 8 - int16_t *op = output; - for (i = 0; i < 16; i++) { - op[i] = 0; - } - - op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; - op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; - op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; - op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; -} - - -#if 0 -// Keep a really bad float version as reference for now. -void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double x; - const int short_pitch = pitch >> 1; - int i, j, k, l; - for (l = 0; l < 16; ++l) { - for (k = 0; k < 16; ++k) { - double s = 0; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) { - x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; - if (i != 0) - x *= sqrt(2.0); - if (j != 0) - x *= sqrt(2.0); - s += x; - } - } - output[k*short_pitch+l] = (short)round(s); - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -#endif - -#define TEST_INT_16x16_IDCT 1 -#if !TEST_INT_16x16_IDCT - -static void butterfly_16x16_idct_1d(double input[16], double output[16]) { - - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - - // step 1 and 2 - step[ 0] = input[0] + input[8]; - step[ 1] = input[0] - input[8]; - - temp1 = input[4]*C12; - temp2 = input[12]*C4; - - temp1 -= temp2; - temp1 *= C8; - - step[ 2] = 2*(temp1); - - temp1 = input[4]*C4; - temp2 = input[12]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - step[ 3] = 2*(temp1); - - temp1 = input[2]*C8; - temp1 = 2*(temp1); - temp2 = input[6] + input[10]; - - step[ 4] = temp1 + temp2; - step[ 5] = temp1 - temp2; - - temp1 = input[14]*C8; - temp1 = 2*(temp1); - temp2 = input[6] - input[10]; - - step[ 6] = temp2 - temp1; - step[ 7] = temp2 + temp1; - - // for odd input - temp1 = input[3]*C12; - temp2 = input[13]*C4; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[ 8] = 2*(temp1); - - temp1 = input[3]*C4; - temp2 = input[13]*C12; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[ 9] = 2*(temp2); - - intermediate[10] = 2*(input[9]*C8); - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - intermediate[13] = 2*((input[7]*C8)); - - temp1 = input[11]*C12; - temp2 = input[5]*C4; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[14] = 2*(temp2); - - temp1 = input[11]*C4; - temp2 = input[5]*C12; - temp1 += temp2; - temp1 = 
(temp1); - temp1 *= C8; - intermediate[15] = 2*(temp1); - - step[ 8] = intermediate[ 8] + intermediate[14]; - step[ 9] = intermediate[ 9] + intermediate[15]; - step[10] = intermediate[10] + intermediate[11]; - step[11] = intermediate[10] - intermediate[11]; - step[12] = intermediate[12] + intermediate[13]; - step[13] = intermediate[12] - intermediate[13]; - step[14] = intermediate[ 8] - intermediate[14]; - step[15] = intermediate[ 9] - intermediate[15]; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4]*C14; - temp2 = step[ 7]*C2; - temp1 -= temp2; - output[4] = (temp1); - - temp1 = step[ 4]*C2; - temp2 = step[ 7]*C14; - temp1 += temp2; - output[7] = (temp1); - - temp1 = step[ 5]*C10; - temp2 = step[ 6]*C6; - temp1 -= temp2; - output[5] = (temp1); - - temp1 = step[ 5]*C6; - temp2 = step[ 6]*C10; - temp1 += temp2; - output[6] = (temp1); - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8]*C7; - temp2 = output[15]*C9; - temp1 -= temp2; - step[ 8] = (temp1); - - temp1 = output[9]*C11; - temp2 = output[14]*C5; - temp1 += temp2; - step[ 9] = (temp1); - - temp1 = output[10]*C3; - temp2 = output[13]*C13; - temp1 -= temp2; - step[10] = (temp1); - - temp1 = output[11]*C15; - temp2 = output[12]*C1; - temp1 += temp2; - step[11] = (temp1); - - temp1 = output[11]*C1; - temp2 = output[12]*C15; - temp2 -= temp1; - step[12] = (temp2); - - temp1 = output[10]*C13; - temp2 = output[13]*C3; - temp1 += temp2; - step[13] = (temp1); - - temp1 = output[9]*C5; - temp2 = output[14]*C11; - temp2 -= temp1; - step[14] = (temp2); - - temp1 = output[8]*C9; - temp2 = output[15]*C7; - temp1 += temp2; - step[15] = (temp1); - - // step 5 - output[0] = (step[0] + step[15]); - output[1] = (step[1] + step[14]); - output[2] = (step[2] + step[13]); - output[3] = (step[3] + step[12]); - output[4] = (step[4] + step[11]); - output[5] = (step[5] + step[10]); - output[6] = (step[6] + step[ 9]); - output[7] = (step[7] + step[ 8]); - - output[15] = (step[0] - step[15]); - output[14] = (step[1] - step[14]); - output[13] = (step[2] - step[13]); - output[12] = (step[3] - step[12]); - output[11] = (step[4] - step[11]); - output[10] = (step[5] - step[10]); - output[9] = (step[6] - step[ 9]); - output[8] = (step[7] - step[ 8]); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -// Remove once an int version of iDCT is written -#if 0 -void reference_16x16_idct_1d(double input[16], double output[16]) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - const double kPi = 3.141592653589793238462643383279502884; - const double kSqrt2 = 1.414213562373095048801688724209698; - for (int k = 0; k < 16; k++) { - output[k] = 0.0; - for (int n = 0; n < 16; n++) { - output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); - if (n == 0) - output[k] = output[k]/kSqrt2; - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -#endif 
- -void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double out[16*16], out2[16*16]; - const int short_pitch = pitch >> 1; - int i, j; - // First transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = input[j + i*short_pitch]; - butterfly_16x16_idct_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out[j + i*16] = temp_out[j]; - } - // Then transform columns - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out2[j*16 + i] = temp_out[j]; - } - for (i = 0; i < 16*16; ++i) - output[i] = round(out2[i]/128); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#else - -#define INITIAL_SHIFT 2 -#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1)) -#define RIGHT_SHIFT 14 -#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1)) - -static const int16_t C1 = 16305; -static const int16_t C2 = 16069; -static const int16_t C3 = 15679; -static const int16_t C4 = 15137; -static const int16_t C5 = 14449; -static const int16_t C6 = 13623; -static const int16_t C7 = 12665; -static const int16_t C8 = 11585; -static const int16_t C9 = 10394; -static const int16_t C10 = 9102; -static const int16_t C11 = 7723; -static const int16_t C12 = 6270; -static const int16_t C13 = 4756; -static const int16_t C14 = 3196; -static const int16_t C15 = 1606; - -static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16]; - int intermediate[16]; - int temp1, temp2; - - int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT; - int step1_rounding = 1 << (step1_shift - 1); - int last_rounding = 0; - - if (last_shift_bits > 0) - last_rounding = 1 << (last_shift_bits - 1); - - // step 1 and 2 - step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = input[4] * C12; - temp2 = input[12] * C4; - temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift; - - temp1 = input[4] * C4; - temp2 = input[12] * C12; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift; - - temp1 = input[2] * C8; - temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 = input[6] + input[10]; - step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = input[14] * C8; - temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 = input[6] - input[10]; - step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // for odd input - temp1 = input[3] * C12; - temp2 = input[13] * C4; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[3] * C4; - temp2 = input[13] * C12; - temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 *= C8; - intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - 
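/* The Q14 constants C1..C15 used in this function are
 * round(cos(k * pi / 32) * (1 << 14)) -- e.g. C4 = round(0.923880 * 16384)
 * = 15137 and C8 = round(0.707107 * 16384) = 11585 -- so each
 * (x * Ck + RIGHT_ROUNDING) >> RIGHT_SHIFT is a round-to-nearest multiply
 * by cos(k * pi / 32), while INITIAL_SHIFT pre-scales the inputs by 1/4 for
 * 16-bit headroom. A throwaway snippet that regenerates the table (assumes
 * M_PI from <math.h>):
 *
 *   for (k = 1; k <= 15; k++)
 *     printf("C%d = %d\n", k,
 *            (int)floor(cos(k * M_PI / 32) * 16384 + 0.5));
 */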
intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[11] * C12; - temp2 = input[5] * C4; - temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 *= C8; - intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[11] * C4; - temp2 = input[5] * C12; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4] * C14; - temp2 = step[ 7] * C2; - output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 4] * C2; - temp2 = step[ 7] * C14; - output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C10; - temp2 = step[ 6] * C6; - output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C6; - temp2 = step[ 6] * C10; - output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8] * C7; - temp2 = output[15] * C9; - step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C11; - temp2 = output[14] * C5; - step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C3; - temp2 = output[13] * C13; - step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C15; - temp2 = output[12] * C1; - step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C1; - temp2 = output[12] * C15; - step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C13; - temp2 = output[13] * C3; - step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C5; - temp2 = output[14] * C11; - step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[8] * C9; - temp2 = output[15] * C7; - step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - // step 5 - output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; - output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; - output[2] = (step[2] + step[13] + last_rounding) >> 
last_shift_bits; - output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; - output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; - output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; - output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; - output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; - - output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; - output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; - output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; - output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; - output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; - output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; - output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; - output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; -} - -void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { - int16_t out[16 * 16]; - int16_t *outptr = &out[0]; - const int short_pitch = pitch >> 1; - int i, j; - int16_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - butterfly_16x16_idct_1d(input, outptr, 0); - input += short_pitch; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - butterfly_16x16_idct_1d(temp_in, temp_out, 3); - for (j = 0; j < 16; ++j) - output[j * 16 + i] = temp_out[j]; - } -} - -/* The following function is called when we know the maximum number of non-zero - * dct coefficients is less than or equal to 10. - */ -static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16] = {0}; - int intermediate[16] = {0}; - int temp1, temp2; - int last_rounding = 0; - - if (last_shift_bits > 0) - last_rounding = 1 << (last_shift_bits - 1); - - // step 1 and 2 - step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // for odd input - temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // step 3 - output[0] = step[ 0]; - output[1] = step[ 1]; - output[2] = step[ 1]; - output[3] = step[ 0]; - - temp1 = step[ 4] * C14; - output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 4] * C2; - output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C10; - output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C6; - output[6] = (temp1 + RIGHT_ROUNDING) >>
RIGHT_SHIFT; - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8] * C7; - temp2 = output[15] * C9; - step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C11; - temp2 = output[14] * C5; - step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C3; - temp2 = output[13] * C13; - step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C15; - temp2 = output[12] * C1; - step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C1; - temp2 = output[12] * C15; - step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C13; - temp2 = output[13] * C3; - step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C5; - temp2 = output[14] * C11; - step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[8] * C9; - temp2 = output[15] * C7; - step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - // step 5 - output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; - output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; - output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; - output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; - output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; - output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; - output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; - output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; - - output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; - output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; - output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; - output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; - output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; - output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; - output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; - output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; -} - -void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { - int16_t out[16 * 16]; - int16_t *outptr = &out[0]; - const int short_pitch = pitch >> 1; - int i, j; - int16_t temp_in[16], temp_out[16]; - - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. 
- */ - vpx_memset(out, 0, sizeof(out)); - for (i = 0; i < 4; ++i) { - butterfly_16x16_idct10_1d(input, outptr, 0); - input += short_pitch; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct10_1d(temp_in, temp_out, 3); - for (j = 0; j < 16; ++j) - output[j*16 + i] = temp_out[j]; - } -} -#undef INITIAL_SHIFT -#undef INITIAL_ROUNDING -#undef RIGHT_SHIFT -#undef RIGHT_ROUNDING -#endif - -#if !CONFIG_DWTDCTHYBRID -#define DownshiftMultiplyBy2(x) x * 2 -#define DownshiftMultiply(x) x - -static void idct16(double *input, double *output, int stride) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 and 2 - step[ 0] = input[stride*0] + input[stride*8]; - step[ 1] = input[stride*0] - input[stride*8]; - - temp1 = input[stride*4]*C12; - temp2 = input[stride*12]*C4; - - temp1 -= temp2; - temp1 = DownshiftMultiply(temp1); - temp1 *= C8; - - step[ 2] = DownshiftMultiplyBy2(temp1); - - temp1 = input[stride*4]*C4; - temp2 = input[stride*12]*C12; - temp1 += temp2; - temp1 = DownshiftMultiply(temp1); - temp1 *= C8; - step[ 3] = DownshiftMultiplyBy2(temp1); - - temp1 = input[stride*2]*C8; - temp1 = DownshiftMultiplyBy2(temp1); - temp2 = input[stride*6] + input[stride*10]; - - step[ 4] = temp1 + temp2; - step[ 5] = temp1 - temp2; - - temp1 = input[stride*14]*C8; - temp1 = DownshiftMultiplyBy2(temp1); - temp2 = input[stride*6] - input[stride*10]; - - step[ 6] = temp2 - temp1; - step[ 7] = temp2 + temp1; - - // for odd input - temp1 = input[stride*3]*C12; - temp2 = input[stride*13]*C4; - temp1 += temp2; - temp1 = DownshiftMultiply(temp1); - temp1 *= C8; - intermediate[ 8] = DownshiftMultiplyBy2(temp1); - - temp1 = input[stride*3]*C4; - temp2 = input[stride*13]*C12; - temp2 -= temp1; - temp2 = DownshiftMultiply(temp2); - temp2 *= C8; - intermediate[ 9] = DownshiftMultiplyBy2(temp2); - - intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8); - intermediate[11] = input[stride*15] - input[stride*1]; - intermediate[12] = input[stride*15] + input[stride*1]; - intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8)); - - temp1 = input[stride*11]*C12; - temp2 = input[stride*5]*C4; - temp2 -= temp1; - temp2 = DownshiftMultiply(temp2); - temp2 *= C8; - intermediate[14] = DownshiftMultiplyBy2(temp2); - - temp1 = input[stride*11]*C4; - temp2 = input[stride*5]*C12; - temp1 += temp2; - temp1 = DownshiftMultiply(temp1); - temp1 *= C8; - intermediate[15] = DownshiftMultiplyBy2(temp1); - - step[ 8] = intermediate[ 8] + intermediate[14]; - step[ 9] = intermediate[ 9] + intermediate[15]; - step[10] = intermediate[10] + intermediate[11]; - step[11] = intermediate[10] - intermediate[11]; - step[12] = intermediate[12] + intermediate[13]; - step[13] = intermediate[12] - intermediate[13]; - step[14] = 
intermediate[ 8] - intermediate[14]; - step[15] = intermediate[ 9] - intermediate[15]; - - // step 3 - output[stride*0] = step[ 0] + step[ 3]; - output[stride*1] = step[ 1] + step[ 2]; - output[stride*2] = step[ 1] - step[ 2]; - output[stride*3] = step[ 0] - step[ 3]; - - temp1 = step[ 4]*C14; - temp2 = step[ 7]*C2; - temp1 -= temp2; - output[stride*4] = DownshiftMultiply(temp1); - - temp1 = step[ 4]*C2; - temp2 = step[ 7]*C14; - temp1 += temp2; - output[stride*7] = DownshiftMultiply(temp1); - - temp1 = step[ 5]*C10; - temp2 = step[ 6]*C6; - temp1 -= temp2; - output[stride*5] = DownshiftMultiply(temp1); - - temp1 = step[ 5]*C6; - temp2 = step[ 6]*C10; - temp1 += temp2; - output[stride*6] = DownshiftMultiply(temp1); - - output[stride*8] = step[ 8] + step[11]; - output[stride*9] = step[ 9] + step[10]; - output[stride*10] = step[ 9] - step[10]; - output[stride*11] = step[ 8] - step[11]; - output[stride*12] = step[12] + step[15]; - output[stride*13] = step[13] + step[14]; - output[stride*14] = step[13] - step[14]; - output[stride*15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[stride*0] + output[stride*7]; - step[ 1] = output[stride*1] + output[stride*6]; - step[ 2] = output[stride*2] + output[stride*5]; - step[ 3] = output[stride*3] + output[stride*4]; - step[ 4] = output[stride*3] - output[stride*4]; - step[ 5] = output[stride*2] - output[stride*5]; - step[ 6] = output[stride*1] - output[stride*6]; - step[ 7] = output[stride*0] - output[stride*7]; - - temp1 = output[stride*8]*C7; - temp2 = output[stride*15]*C9; - temp1 -= temp2; - step[ 8] = DownshiftMultiply(temp1); - - temp1 = output[stride*9]*C11; - temp2 = output[stride*14]*C5; - temp1 += temp2; - step[ 9] = DownshiftMultiply(temp1); - - temp1 = output[stride*10]*C3; - temp2 = output[stride*13]*C13; - temp1 -= temp2; - step[10] = DownshiftMultiply(temp1); - - temp1 = output[stride*11]*C15; - temp2 = output[stride*12]*C1; - temp1 += temp2; - step[11] = DownshiftMultiply(temp1); - - temp1 = output[stride*11]*C1; - temp2 = output[stride*12]*C15; - temp2 -= temp1; - step[12] = DownshiftMultiply(temp2); - - temp1 = output[stride*10]*C13; - temp2 = output[stride*13]*C3; - temp1 += temp2; - step[13] = DownshiftMultiply(temp1); - - temp1 = output[stride*9]*C5; - temp2 = output[stride*14]*C11; - temp2 -= temp1; - step[14] = DownshiftMultiply(temp2); - - temp1 = output[stride*8]*C9; - temp2 = output[stride*15]*C7; - temp1 += temp2; - step[15] = DownshiftMultiply(temp1); - - // step 5 - output[stride*0] = step[0] + step[15]; - output[stride*1] = step[1] + step[14]; - output[stride*2] = step[2] + step[13]; - output[stride*3] = step[3] + step[12]; - output[stride*4] = step[4] + step[11]; - output[stride*5] = step[5] + step[10]; - output[stride*6] = step[6] + step[ 9]; - output[stride*7] = step[7] + step[ 8]; - - output[stride*15] = step[0] - step[15]; - output[stride*14] = step[1] - step[14]; - output[stride*13] = step[2] - step[13]; - output[stride*12] = step[3] - step[12]; - output[stride*11] = step[4] - step[11]; - output[stride*10] = step[5] - step[10]; - output[stride*9] = step[6] - step[ 9]; - output[stride*8] = step[7] - step[ 8]; -} - -static void butterfly_32_idct_1d(double *input, double *output, int stride) { - static const double C1 = 0.998795456205; // cos(pi * 1 / 64) - static const double C3 = 0.989176509965; // cos(pi * 3 / 64) - static const double C5 = 0.970031253195; // cos(pi * 5 / 64) - static const double C7 = 0.941544065183; // cos(pi * 7 / 64) - static const double C9 = 0.903989293123; // cos(pi * 9 / 64) - 
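/* C1..C31 here are cos(k * pi / 64) for odd k, plus C16 = cos(pi / 4). They
 * support the standard split of the 32-point inverse DCT into two 16-point
 * transforms: the odd-index inputs are folded pairwise below
 * (step1[17] = input[3] + input[1], ...) before the second idct16() call,
 * and that call's outputs are then renormalized by
 * 1 / (2 * cos((2k + 1) * pi / 64)), which is why only odd multiples of
 * pi / 64 appear in this table. */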
static const double C11 = 0.857728610000; // cos(pi * 11 / 64) - static const double C13 = 0.803207531481; // cos(pi * 13 / 64) - static const double C15 = 0.740951125355; // cos(pi * 15 / 64) - static const double C16 = 0.707106781187; // cos(pi * 16 / 64) - static const double C17 = 0.671558954847; // cos(pi * 17 / 64) - static const double C19 = 0.595699304492; // cos(pi * 19 / 64) - static const double C21 = 0.514102744193; // cos(pi * 21 / 64) - static const double C23 = 0.427555093430; // cos(pi * 23 / 64) - static const double C25 = 0.336889853392; // cos(pi * 25 / 64) - static const double C27 = 0.242980179903; // cos(pi * 27 / 64) - static const double C29 = 0.146730474455; // cos(pi * 29 / 64) - static const double C31 = 0.049067674327; // cos(pi * 31 / 64) - - double step1[32]; - double step2[32]; - - step1[ 0] = input[stride*0]; - step1[ 1] = input[stride*2]; - step1[ 2] = input[stride*4]; - step1[ 3] = input[stride*6]; - step1[ 4] = input[stride*8]; - step1[ 5] = input[stride*10]; - step1[ 6] = input[stride*12]; - step1[ 7] = input[stride*14]; - step1[ 8] = input[stride*16]; - step1[ 9] = input[stride*18]; - step1[10] = input[stride*20]; - step1[11] = input[stride*22]; - step1[12] = input[stride*24]; - step1[13] = input[stride*26]; - step1[14] = input[stride*28]; - step1[15] = input[stride*30]; - - step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16); - step1[17] = (input[stride*3] + input[stride*1]); - step1[18] = (input[stride*5] + input[stride*3]); - step1[19] = (input[stride*7] + input[stride*5]); - step1[20] = (input[stride*9] + input[stride*7]); - step1[21] = (input[stride*11] + input[stride*9]); - step1[22] = (input[stride*13] + input[stride*11]); - step1[23] = (input[stride*15] + input[stride*13]); - step1[24] = (input[stride*17] + input[stride*15]); - step1[25] = (input[stride*19] + input[stride*17]); - step1[26] = (input[stride*21] + input[stride*19]); - step1[27] = (input[stride*23] + input[stride*21]); - step1[28] = (input[stride*25] + input[stride*23]); - step1[29] = (input[stride*27] + input[stride*25]); - step1[30] = (input[stride*29] + input[stride*27]); - step1[31] = (input[stride*31] + input[stride*29]); - - idct16(step1, step2, 1); - idct16(step1 + 16, step2 + 16, 1); - - step2[16] = DownshiftMultiply(step2[16] / (2*C1)); - step2[17] = DownshiftMultiply(step2[17] / (2*C3)); - step2[18] = DownshiftMultiply(step2[18] / (2*C5)); - step2[19] = DownshiftMultiply(step2[19] / (2*C7)); - step2[20] = DownshiftMultiply(step2[20] / (2*C9)); - step2[21] = DownshiftMultiply(step2[21] / (2*C11)); - step2[22] = DownshiftMultiply(step2[22] / (2*C13)); - step2[23] = DownshiftMultiply(step2[23] / (2*C15)); - step2[24] = DownshiftMultiply(step2[24] / (2*C17)); - step2[25] = DownshiftMultiply(step2[25] / (2*C19)); - step2[26] = DownshiftMultiply(step2[26] / (2*C21)); - step2[27] = DownshiftMultiply(step2[27] / (2*C23)); - step2[28] = DownshiftMultiply(step2[28] / (2*C25)); - step2[29] = DownshiftMultiply(step2[29] / (2*C27)); - step2[30] = DownshiftMultiply(step2[30] / (2*C29)); - step2[31] = DownshiftMultiply(step2[31] / (2*C31)); - - output[stride* 0] = step2[ 0] + step2[16]; - output[stride* 1] = step2[ 1] + step2[17]; - output[stride* 2] = step2[ 2] + step2[18]; - output[stride* 3] = step2[ 3] + step2[19]; - output[stride* 4] = step2[ 4] + step2[20]; - output[stride* 5] = step2[ 5] + step2[21]; - output[stride* 6] = step2[ 6] + step2[22]; - output[stride* 7] = step2[ 7] + step2[23]; - output[stride* 8] = step2[ 8] + step2[24]; - output[stride* 9] = step2[ 9] + step2[25]; - 
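/* step2[0..15] holds the even-frequency half and step2[16..31] the
 * renormalized odd half. Even cosine basis vectors are symmetric about the
 * block centre and odd ones antisymmetric, so sample n comes out as
 * even + odd while its mirror 31 - n is even - odd -- hence the sum block
 * here and the matching difference block just below. */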
output[stride*10] = step2[10] + step2[26]; - output[stride*11] = step2[11] + step2[27]; - output[stride*12] = step2[12] + step2[28]; - output[stride*13] = step2[13] + step2[29]; - output[stride*14] = step2[14] + step2[30]; - output[stride*15] = step2[15] + step2[31]; - output[stride*16] = step2[15] - step2[(31 - 0)]; - output[stride*17] = step2[14] - step2[(31 - 1)]; - output[stride*18] = step2[13] - step2[(31 - 2)]; - output[stride*19] = step2[12] - step2[(31 - 3)]; - output[stride*20] = step2[11] - step2[(31 - 4)]; - output[stride*21] = step2[10] - step2[(31 - 5)]; - output[stride*22] = step2[ 9] - step2[(31 - 6)]; - output[stride*23] = step2[ 8] - step2[(31 - 7)]; - output[stride*24] = step2[ 7] - step2[(31 - 8)]; - output[stride*25] = step2[ 6] - step2[(31 - 9)]; - output[stride*26] = step2[ 5] - step2[(31 - 10)]; - output[stride*27] = step2[ 4] - step2[(31 - 11)]; - output[stride*28] = step2[ 3] - step2[(31 - 12)]; - output[stride*29] = step2[ 2] - step2[(31 - 13)]; - output[stride*30] = step2[ 1] - step2[(31 - 14)]; - output[stride*31] = step2[ 0] - step2[(31 - 15)]; -} - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double out[32*32], out2[32*32]; - const int short_pitch = pitch >> 1; - int i, j; - // First transform rows - for (i = 0; i < 32; ++i) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j + i*short_pitch]; - butterfly_32_idct_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i*32] = temp_out[j]; - } - // Then transform columns - for (i = 0; i < 32; ++i) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = out[j*32 + i]; - butterfly_32_idct_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out2[j*32 + i] = temp_out[j]; - } - for (i = 0; i < 32*32; ++i) - output[i] = round(out2[i]/128); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#else // !CONFIG_DWTDCTHYBRID - -#if DWT_TYPE == 53 - -// Note: block length must be even for this implementation -static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, *a, *b; - int n; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ -= (r + (*b) + 1) >> 1; - r = *b++; - } - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *x++ = ((r = *a++) + 1) >> 1; - *x++ = *b++ + ((r + (*a) + 2) >> 2); - } - *x++ = ((r = *a) + 1) >> 1; - *x++ = *b + ((r + 1) >> 1); -} - -static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, *a, *b; - int n; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ -= (r + (*b) + 1) >> 1; - r = *b++; - } - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - r = *a++; - *x++ = r; - *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1); - } - *x++ = *a; - *x++ = ((*b) << 1) + *a; -} - -static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; 
i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_53_col(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); - synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]); - } - } - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? - ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : - -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); - } - } -} - -#elif DWT_TYPE == 26 - -// Note: block length must be even for this implementation -static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, s, *a, *b; - int i, n = length >> 1; - - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ += (r - a[1] + 4) >> 3; - r = *a++; - } - *b += (r - *a + 4) >> 3; - } - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - s = *b++; - r = *a++; - *x++ = (r + s + 1) >> 1; - *x++ = (r - s + 1) >> 1; - } -} - -static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, s, *a, *b; - int i, n = length >> 1; - - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ += (r - a[1] + 4) >> 3; - r = *a++; - } - *b += (r - *a + 4) >> 3; - } - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - s = *b++; - r = *a++; - *x++ = r + s; - *x++ = r - s; - } -} - -static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - int16_t buffer[2 * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_26_col(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); - synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]); - } - } - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? 
- ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : - -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); - } - } -} - -#elif DWT_TYPE == 97 - -static void synthesis_97(int length, double *lowpass, double *highpass, - double *x) { - static const double a_predict1 = -1.586134342; - static const double a_update1 = -0.05298011854; - static const double a_predict2 = 0.8829110762; - static const double a_update2 = 0.4435068522; - static const double s_low = 1.149604398; - static const double s_high = 1/1.149604398; - static const double inv_s_low = 1 / s_low; - static const double inv_s_high = 1 / s_high; - int i; - double y[DWT_MAX_LENGTH]; - // Undo pack and scale - for (i = 0; i < length / 2; i++) { - y[i * 2] = lowpass[i] * inv_s_low; - y[i * 2 + 1] = highpass[i] * inv_s_high; - } - memcpy(x, y, sizeof(*y) * length); - // Undo update 2 - for (i = 2; i < length; i += 2) { - x[i] -= a_update2 * (x[i-1] + x[i+1]); - } - x[0] -= 2 * a_update2 * x[1]; - // Undo predict 2 - for (i = 1; i < length - 2; i += 2) { - x[i] -= a_predict2 * (x[i - 1] + x[i + 1]); - } - x[length - 1] -= 2 * a_predict2 * x[length - 2]; - // Undo update 1 - for (i = 2; i < length; i += 2) { - x[i] -= a_update1 * (x[i - 1] + x[i + 1]); - } - x[0] -= 2 * a_update1 * x[1]; - // Undo predict 1 - for (i = 1; i < length - 2; i += 2) { - x[i] -= a_predict1 * (x[i - 1] + x[i + 1]); - } - x[length - 1] -= 2 * a_predict1 * x[length - 2]; -} - -static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - double buffer[2 * DWT_MAX_LENGTH]; - double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_97(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - y[i * DWT_MAX_LENGTH + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); - synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]); - } - } - for (i = 0; i < height; i++) - for (j = 0; j < width; j++) - x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] / - (1 << DWT_PRECISION_BITS)); -} - -#endif // DWT_TYPE - -// TODO(debargha): Implement scaling differently so as not to have to use the -// floating point 16x16 dct -static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double 
intermediate[16]; - double temp1, temp2; - - - // step 1 and 2 - step[ 0] = input[0] + input[8]; - step[ 1] = input[0] - input[8]; - - temp1 = input[4]*C12; - temp2 = input[12]*C4; - - temp1 -= temp2; - temp1 *= C8; - - step[ 2] = 2*(temp1); - - temp1 = input[4]*C4; - temp2 = input[12]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - step[ 3] = 2*(temp1); - - temp1 = input[2]*C8; - temp1 = 2*(temp1); - temp2 = input[6] + input[10]; - - step[ 4] = temp1 + temp2; - step[ 5] = temp1 - temp2; - - temp1 = input[14]*C8; - temp1 = 2*(temp1); - temp2 = input[6] - input[10]; - - step[ 6] = temp2 - temp1; - step[ 7] = temp2 + temp1; - - // for odd input - temp1 = input[3]*C12; - temp2 = input[13]*C4; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[ 8] = 2*(temp1); - - temp1 = input[3]*C4; - temp2 = input[13]*C12; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[ 9] = 2*(temp2); - - intermediate[10] = 2*(input[9]*C8); - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - intermediate[13] = 2*((input[7]*C8)); - - temp1 = input[11]*C12; - temp2 = input[5]*C4; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[14] = 2*(temp2); - - temp1 = input[11]*C4; - temp2 = input[5]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[15] = 2*(temp1); - - step[ 8] = intermediate[ 8] + intermediate[14]; - step[ 9] = intermediate[ 9] + intermediate[15]; - step[10] = intermediate[10] + intermediate[11]; - step[11] = intermediate[10] - intermediate[11]; - step[12] = intermediate[12] + intermediate[13]; - step[13] = intermediate[12] - intermediate[13]; - step[14] = intermediate[ 8] - intermediate[14]; - step[15] = intermediate[ 9] - intermediate[15]; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4]*C14; - temp2 = step[ 7]*C2; - temp1 -= temp2; - output[4] = (temp1); - - temp1 = step[ 4]*C2; - temp2 = step[ 7]*C14; - temp1 += temp2; - output[7] = (temp1); - - temp1 = step[ 5]*C10; - temp2 = step[ 6]*C6; - temp1 -= temp2; - output[5] = (temp1); - - temp1 = step[ 5]*C6; - temp2 = step[ 6]*C10; - temp1 += temp2; - output[6] = (temp1); - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8]*C7; - temp2 = output[15]*C9; - temp1 -= temp2; - step[ 8] = (temp1); - - temp1 = output[9]*C11; - temp2 = output[14]*C5; - temp1 += temp2; - step[ 9] = (temp1); - - temp1 = output[10]*C3; - temp2 = output[13]*C13; - temp1 -= temp2; - step[10] = (temp1); - - temp1 = output[11]*C15; - temp2 = output[12]*C1; - temp1 += temp2; - step[11] = (temp1); - - temp1 = output[11]*C1; - temp2 = output[12]*C15; - temp2 -= temp1; - step[12] = (temp2); - - temp1 = output[10]*C13; - temp2 = output[13]*C3; - temp1 += temp2; - step[13] = (temp1); - - temp1 = output[9]*C5; - temp2 = output[14]*C11; - temp2 -= temp1; - step[14] = (temp2); - - temp1 = output[8]*C9; - temp2 = 
output[15]*C7; - temp1 += temp2; - step[15] = (temp1); - - // step 5 - output[0] = (step[0] + step[15]); - output[1] = (step[1] + step[14]); - output[2] = (step[2] + step[13]); - output[3] = (step[3] + step[12]); - output[4] = (step[4] + step[11]); - output[5] = (step[5] + step[10]); - output[6] = (step[6] + step[ 9]); - output[7] = (step[7] + step[ 8]); - - output[15] = (step[0] - step[15]); - output[14] = (step[1] - step[14]); - output[13] = (step[2] - step[13]); - output[12] = (step[3] - step[12]); - output[11] = (step[4] - step[11]); - output[10] = (step[5] - step[10]); - output[9] = (step[6] - step[ 9]); - output[8] = (step[7] - step[ 8]); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch, - int scale) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double out[16*16], out2[16*16]; - const int short_pitch = pitch >> 1; - int i, j; - // First transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = input[j + i*short_pitch]; - butterfly_16x16_idct_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out[j + i*16] = temp_out[j]; - } - // Then transform columns - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out2[j*16 + i] = temp_out[j]; - } - for (i = 0; i < 16*16; ++i) - output[i] = round(out2[i] / (128 >> scale)); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -static void idct8_1d(double *x) { - int i, j; - double t[8]; - static const double idctmat[64] = { - 0.35355339059327, 0.49039264020162, 0.46193976625564, 0.41573480615127, - 0.35355339059327, 0.2777851165098, 0.19134171618254, 0.097545161008064, - 0.35355339059327, 0.41573480615127, 0.19134171618254, -0.097545161008064, - -0.35355339059327, -0.49039264020161, -0.46193976625564, -0.2777851165098, - 0.35355339059327, 0.2777851165098, -0.19134171618254, -0.49039264020162, - -0.35355339059327, 0.097545161008064, 0.46193976625564, 0.41573480615127, - 0.35355339059327, 0.097545161008063, -0.46193976625564, -0.2777851165098, - 0.35355339059327, 0.41573480615127, -0.19134171618254, -0.49039264020162, - 0.35355339059327, -0.097545161008063, -0.46193976625564, 0.2777851165098, - 0.35355339059327, -0.41573480615127, -0.19134171618255, 0.49039264020162, - 0.35355339059327, -0.2777851165098, -0.19134171618254, 0.49039264020161, - -0.35355339059327, -0.097545161008064, 0.46193976625564, -0.41573480615127, - 0.35355339059327, -0.41573480615127, 0.19134171618254, 0.097545161008065, - -0.35355339059327, 0.49039264020162, -0.46193976625564, 0.2777851165098, - 0.35355339059327, -0.49039264020162, 0.46193976625564, -0.41573480615127, - 0.35355339059327, -0.2777851165098, 0.19134171618255, -0.097545161008064 - }; - for (i = 0; i < 8; ++i) { - t[i] = 0; - for (j = 0; j < 8; ++j) - t[i] += idctmat[i * 8 + j] * x[j]; - } - for (i = 0; i < 8; ++i) { - x[i] = t[i]; - } -} - -static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch, - int scale) { - double X[8 * 8], Y[8]; - int i, j; - int shortpitch = pitch >> 1; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - X[i * 8 + j] = (double)coefs[i * shortpitch + j]; - } - } - for (i = 0; i < 8; i++) - idct8_1d(X + 8 * i); - for (i = 0; i < 8; i++) { - for 
(j = 0; j < 8; ++j) - Y[j] = X[i + 8 * j]; - idct8_1d(Y); - for (j = 0; j < 8; ++j) - X[i + 8 * j] = Y[j]; - } - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale)); - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#define multiply_bits(d, n) ((n) < 0 ? (d) >> (-(n)) : (d) << (n)) - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); - } - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); -#endif -} - -#elif DWTDCT_TYPE == DWTDCT16X16 - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16, - sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16, - sizeof(*buffer2) * 16); - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); -#endif -} - -#elif DWTDCT_TYPE == DWTDCT8X8 - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold an 8x8 block for 8x8 inverse dct - int16_t buffer[8 * 8]; - //
Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct8x8 function - vp9_short_idct8x8_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8, - sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8, - sizeof(*buffer2) * 8); - } - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32); -#endif -} - -#endif - -#if CONFIG_TX64X64 -void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 64x64 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 64x64 block for inverse 64x64 dwt - int16_t buffer2[64 * 64]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16); - } -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - for (i = 0; i < 16; ++i) { - for (j = 16; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 16; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } -#elif DWTDCT_TYPE == DWTDCT16X16 - vp9_short_idct16x16_c_f(input + 16, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16, - sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16, - sizeof(*buffer2) * 16); - } - - // Copying and scaling highest bands into buffer2 - for (i = 0; i < 32; ++i) { - for (j = 32; j < 64; ++j) { - buffer2[i * 64 + j] = -
multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 32; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } -#endif // DWTDCT_TYPE - -#if DWT_TYPE == 26 - dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64); -#endif -} -#endif // CONFIG_TX64X64 -#endif // !CONFIG_DWTDCTHYBRID diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index b5e6e3cc286f4812cdc35af28b009fe95761ca03..a03a66e338809eab9c250dc918472de9d673cc30 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -11,50 +11,25 @@ #include "vp9/common/vp9_invtrans.h" #include "./vp9_rtcd.h" -static void recon_dcblock(MACROBLOCKD *xd) { - BLOCKD *b = &xd->block[24]; - int i; - - for (i = 0; i < 16; i++) { - xd->block[i].dqcoeff[0] = b->diff[i]; - } -} - -static void recon_dcblock_8x8(MACROBLOCKD *xd) { - BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10 - - xd->block[0].dqcoeff[0] = b->diff[0]; - xd->block[4].dqcoeff[0] = b->diff[1]; - xd->block[8].dqcoeff[0] = b->diff[4]; - xd->block[12].dqcoeff[0] = b->diff[8]; -} - -void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) { - BLOCKD *b = &xd->block[block]; - if (b->eob <= 1) - xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch); +void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, + int16_t *dqcoeff, int16_t *diff, + int pitch) { + if (eob <= 1) + xd->inv_txm4x4_1(dqcoeff, diff, pitch); else - xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch); + xd->inv_txm4x4(dqcoeff, diff, pitch); } void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { int i; - BLOCKD *blockd = xd->block; - int has_2nd_order = get_2nd_order_usage(xd); - - if (has_2nd_order) { - /* do 2nd order transform on the dc block */ - vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff); - recon_dcblock(xd); - } for (i = 0; i < 16; i++) { - TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { - vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, - tx_type, 4, xd->block[i].eob); + vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); } else { - vp9_inverse_transform_b_4x4(xd, i, 32); + vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff, + xd->block[i].diff, 32); } } } @@ -63,7 +38,8 @@ void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) { int i; for (i = 16; i < 24; i++) { - vp9_inverse_transform_b_4x4(xd, i, 16); + vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff, + xd->block[i].diff, 16); } } @@ -80,29 +56,21 @@ void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff, void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { int i; BLOCKD *blockd = xd->block; - int has_2nd_order = get_2nd_order_usage(xd); - - if (has_2nd_order) { - // do 2nd order transform on the dc block - vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8); - recon_dcblock_8x8(xd); // need to change for 8x8 - } for (i = 0; i < 9; i += 8) { - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8, - xd->block[i].eob); + vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); } else { 
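/* Only DCT_DCT reaches this branch; hybrid (ADST-based) transforms were
 * dispatched above. The two loops (i = 0, 8 and i = 2, 10) visit the four
 * 8x8 sub-blocks of the 16x16 luma macroblock through the 4x4-block index
 * of each sub-block's top-left corner. */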
vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], &blockd[i].diff[0], 32); } } for (i = 2; i < 11; i += 8) { - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8, - xd->block[i + 2].eob); + vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, + 16, tx_type); } else { vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], &blockd[i].diff[0], 32); @@ -132,9 +100,9 @@ void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff, void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) { BLOCKD *bd = &xd->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, bd); + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); if (tx_type != DCT_DCT) { - vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob); + vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type); } else { vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0], &xd->block[0].diff[0], 32); @@ -146,13 +114,208 @@ void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) { vp9_inverse_transform_mbuv_8x8(xd); } -void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) { - vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64); +void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) { + vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64); } -void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) { - vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024, - xd_sb->diff + 1024, 32); - vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280, - xd_sb->diff + 1280, 32); +void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 32 * 16, + 64); + } else { + vp9_short_iht16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type); + } + } +} + +void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 32 * 8, 64); + } else { + vp9_short_iht8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type); + } + } +} + +void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 32, 64); + } else { + vp9_short_iht4x4(xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type); + } + } +} + +void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) { + vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024, + xd->diff + 1024, 32); + vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280, + xd->diff + 1280, 32); +} + +void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64, + xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8, + 32); + vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64, + xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8, + 32); + } 
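/* Layout assumed throughout these sbuv helpers: a 32x32 superblock stores
 * its 32 * 32 = 1024 luma coefficients first, so the U plane starts at
 * offset 1024 and V at 1024 + 16 * 16 = 1280. Each 16x16 chroma plane is
 * then covered as a 2x2 grid of 8x8 transforms, with
 * (x_idx, y_idx) = (n & 1, n >> 1) selecting the tile. */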
+} + +void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + + vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n], + xd->dqcoeff + 1024 + n * 16, + xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4, + 32); + vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n], + xd->dqcoeff + 1280 + n * 16, + xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4, + 32); + } +} + +void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + vp9_short_idct32x32(xd->dqcoeff + n * 1024, + xd->diff + x_idx * 32 + y_idx * 32 * 64, 128); + } +} + +void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 64 * 16, + 128); + } else { + vp9_short_iht16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type); + } + } +} + +void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 64 * 8, 128); + } else { + vp9_short_iht8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type); + } + } +} + +void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 256; n++) { + const int x_idx = n & 15, y_idx = n >> 4; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); + + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 64, 128); + } else { + vp9_short_iht4x4(xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type); + } + } +} + +void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) { + vp9_short_idct32x32(xd->dqcoeff + 4096, + xd->diff + 4096, 64); + vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024, + xd->diff + 4096 + 1024, 64); +} + +void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16; + + vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256, + xd->diff + 4096 + off, 64); + vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256, + xd->diff + 4096 + 1024 + off, 64); + } +} + +void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8; + + vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64, + xd->diff + 4096 + off, 64); + vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64, + xd->diff + 4096 + 1024 + off, 64); + } +} + +void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) { + int n; + + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4; + + vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n], + xd->dqcoeff + 4096 + n * 16, + xd->diff + 4096 + off, 64); + vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n], + xd->dqcoeff + 4096 + 1024 + n * 16, + xd->diff + 4096 + 1024 + off, 64); + } } diff --git a/vp9/common/vp9_invtrans.h 
b/vp9/common/vp9_invtrans.h index fd0eb30203d3743ae6a7ee6fb798cfcf9e34b10a..89916570d0d9dfe0f22e50437a9e89578784671a 100644 --- a/vp9/common/vp9_invtrans.h +++ b/vp9/common/vp9_invtrans.h @@ -15,31 +15,47 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch); +void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob, + int16_t *dqcoeff, int16_t *diff, + int pitch); -extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd); +void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd); -extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd); +void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd); -extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd); +void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd); -extern void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, +void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff, int pitch); -extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd); -extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd); -extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd); -extern void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff, +void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff, int16_t *output_coeff, int pitch); -extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb); -extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb); +void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd); + +void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd); + +void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd); +void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd); +void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd); +void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd); +void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd); + +void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd); +void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd); #endif // VP9_COMMON_VP9_INVTRANS_H_ diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 7633887a3b8f8fb7e4612eb38f14303e388755be..cbdb273b0401c2ad4a411b292e94ff8062dc2d3d 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -109,6 +109,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, loop_filter_info_n *lfi = &cm->lf_info; /* update limits if sharpness has changed */ + // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl); + // printf("sharpness level: %d [%d]\n", + // cm->sharpness_level, cm->last_sharpness_level); if (cm->last_sharpness_level != cm->sharpness_level) { vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); cm->last_sharpness_level = cm->sharpness_level; @@ -126,7 +129,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, lvl_seg = vp9_get_segdata(xd, seg, 
SEG_LVL_ALT_LF); } else { /* Delta Value */ lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; + lvl_seg = clamp(lvl_seg, 0, 63); } } @@ -149,13 +152,12 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, /* Apply delta for Intra modes */ mode = 0; /* B_PRED */ /* Only the split mode BPRED has a further special case */ - lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + lvl_mode = clamp(lvl_ref + xd->mode_lf_deltas[mode], 0, 63); lfi->lvl[seg][ref][mode] = lvl_mode; mode = 1; /* all the rest of Intra modes */ - lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ + lvl_mode = clamp(lvl_ref, 0, 63); lfi->lvl[seg][ref][mode] = lvl_mode; /* LAST, GOLDEN, ALT */ @@ -167,9 +169,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, /* Apply delta for Inter modes */ for (mode = 1; mode < 4; mode++) { - lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ - + lvl_mode = clamp(lvl_ref + xd->mode_lf_deltas[mode], 0, 63); lfi->lvl[seg][ref][mode] = lvl_mode; } } @@ -202,10 +202,12 @@ static int sb_mb_lf_skip(const MODE_INFO *const mip0, mbmi1->mv[mbmi1->ref_frame].as_int) && mbmi0->ref_frame != INTRA_FRAME; } + void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, int frame_filter_level, - int y_only) { + int y_only, + int dering) { YV12_BUFFER_CONFIG *post = cm->frame_to_show; loop_filter_info_n *lfi_n = &cm->lf_info; struct loop_filter_info lfi; @@ -271,7 +273,6 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); } - } /* don't apply across umv border */ if (mb_row > 0 && @@ -299,6 +300,62 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, post->uv_stride, &lfi); } } +#if CONFIG_LOOP_DERING + if (dering) { + if (mb_row && mb_row < cm->mb_rows - 1 && + mb_col && mb_col < cm->mb_cols - 1) { + vp9_post_proc_down_and_across(y_ptr, y_ptr, + post->y_stride, post->y_stride, + 16, 16, dering); + if (!y_only) { + vp9_post_proc_down_and_across(u_ptr, u_ptr, + post->uv_stride, post->uv_stride, + 8, 8, dering); + vp9_post_proc_down_and_across(v_ptr, v_ptr, + post->uv_stride, post->uv_stride, + 8, 8, dering); + } + } else { + // Adjust the filter so that no out-of-frame data is used. 
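One way to read the adjustment that follows: the deringing pass works on a 16x16 luma (8x8 chroma) window, and the 2-pixel trims imply the kernel reaches two pixels past the window on each side. Each frame edge the macroblock touches therefore shrinks that side of the window by two, and the top/left cases also advance the source pointer past the skipped rows/columns. A minimal sketch of the same window arithmetic for the luma case, with hypothetical names (editor's illustration, not part of the patch):

static void dering_window(int mb_row, int mb_col, int mb_rows, int mb_cols,
                          int stride, int *offset, int *w, int *h) {
  const int w_adjust = (mb_col == 0 ? 2 : 0) + (mb_col == mb_cols - 1 ? 2 : 0);
  const int h_adjust = (mb_row == 0 ? 2 : 0) + (mb_row == mb_rows - 1 ? 2 : 0);
  *offset = (mb_col == 0 ? 2 : 0) + (mb_row == 0 ? 2 * stride : 0);
  *w = 16 - w_adjust;   /* e.g. a top-left corner MB filters only 14x14 */
  *h = 16 - h_adjust;
}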
+ uint8_t *dr_y = y_ptr, *dr_u = u_ptr, *dr_v = v_ptr; + int w_adjust = 0; + int h_adjust = 0; + + if (mb_col == 0) { + dr_y += 2; + dr_u += 2; + dr_v += 2; + w_adjust += 2; + } + if (mb_col == cm->mb_cols - 1) + w_adjust += 2; + if (mb_row == 0) { + dr_y += 2 * post->y_stride; + dr_u += 2 * post->uv_stride; + dr_v += 2 * post->uv_stride; + h_adjust += 2; + } + if (mb_row == cm->mb_rows - 1) + h_adjust += 2; + vp9_post_proc_down_and_across_c(dr_y, dr_y, + post->y_stride, post->y_stride, + 16 - w_adjust, 16 - h_adjust, + dering); + if (!y_only) { + vp9_post_proc_down_and_across_c(dr_u, dr_u, + post->uv_stride, + post->uv_stride, + 8 - w_adjust, 8 - h_adjust, + dering); + vp9_post_proc_down_and_across_c(dr_v, dr_v, + post->uv_stride, + post->uv_stride, + 8 - w_adjust, 8 - h_adjust, + dering); + } + } + } +#endif } else { // FIXME: Not 8x8 aware if (mb_col > 0 && @@ -376,16 +433,13 @@ void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd, */ if (alt_flt_enabled) { for (i = 0; i < MAX_MB_SEGMENTS; i++) { - /* Abs value */ if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + // Abs value lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); - } - /* Delta Value */ - else { - lvl_seg[i] = default_filt_lvl + - vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); - lvl_seg[i] = (lvl_seg[i] > 0) ? - ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0; + } else { + // Delta Value + lvl_seg[i] = default_filt_lvl + vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); + lvl_seg[i] = clamp(lvl_seg[i], 0, 63); } } } diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 53ec336a454a4307bb95ac97b03de4afb1811ffd..458afc50bc76676b2d00dbf265c97eff81b3813f 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -83,7 +83,8 @@ void vp9_loop_filter_frame_init(struct VP9Common *cm, void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only); + int y_only, + int dering); void vp9_loop_filter_partial_frame(struct VP9Common *cm, struct macroblockd *mbd, diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c index fbce50d05398c081457a5b3814174630c729b6ae..a473cf7421a8152cbec22112b454d4de2f9617e7 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -13,7 +13,7 @@ #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" -static __inline int8_t signed_char_clamp(int t) { +static INLINE int8_t signed_char_clamp(int t) { t = (t < -128 ? -128 : t); t = (t > 127 ? 
127 : t); return (int8_t) t; @@ -21,11 +21,11 @@ static __inline int8_t signed_char_clamp(int t) { /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p3 - p2) > limit) * -1; mask |= (abs(p2 - p1) > limit) * -1; @@ -39,57 +39,46 @@ static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit, } /* is there high variance internal edge ( 11111111 yes, 00000000 no) */ -static __inline int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { +static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { int8_t hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; hev |= (abs(q1 - q0) > thresh) * -1; return hev; } -static __inline void filter(int8_t mask, uint8_t hev, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { - int8_t ps0, qs0; - int8_t ps1, qs1; - int8_t filter, Filter1, Filter2; - int8_t u; +static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; - ps1 = (int8_t) *op1 ^ 0x80; - ps0 = (int8_t) *op0 ^ 0x80; - qs0 = (int8_t) *oq0 ^ 0x80; - qs1 = (int8_t) *oq1 ^ 0x80; + const int8_t ps1 = (int8_t) *op1 ^ 0x80; + const int8_t ps0 = (int8_t) *op0 ^ 0x80; + const int8_t qs0 = (int8_t) *oq0 ^ 0x80; + const int8_t qs1 = (int8_t) *oq1 ^ 0x80; - /* add outer taps if we have high edge variance */ - filter = signed_char_clamp(ps1 - qs1); - filter &= hev; + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; - /* inner taps */ - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); - filter &= mask; + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; - /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way - */ - Filter1 = signed_char_clamp(filter + 4); - Filter2 = signed_char_clamp(filter + 3); - Filter1 >>= 3; - Filter2 >>= 3; - u = signed_char_clamp(qs0 - Filter1); - *oq0 = u ^ 0x80; - u = signed_char_clamp(ps0 + Filter2); - *op0 = u ^ 0x80; - filter = Filter1; - - /* outer tap adjustments */ + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + filter = filter1; + + // outer tap adjustments filter += 1; filter >>= 1; filter &= ~hev; - u = signed_char_clamp(qs1 - filter); - *oq1 = u ^ 0x80; - u = signed_char_clamp(ps1 + filter); - *op1 = u ^ 0x80; + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } void vp9_loop_filter_horizontal_edge_c(uint8_t *s, @@ -143,11 +132,11 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, s += p; } while (++i < count * 8); } -static __inline signed char flatmask(uint8_t thresh, - uint8_t p4, uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, uint8_t q2, - uint8_t 
q3, uint8_t q4) { +static INLINE signed char flatmask4(uint8_t thresh, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { int8_t flat = 0; flat |= (abs(p1 - p0) > thresh) * -1; flat |= (abs(q1 - q0) > thresh) * -1; @@ -155,81 +144,72 @@ static __inline signed char flatmask(uint8_t thresh, flat |= (abs(q0 - q2) > thresh) * -1; flat |= (abs(p3 - p0) > thresh) * -1; flat |= (abs(q3 - q0) > thresh) * -1; + flat = ~flat; + return flat; +} +static INLINE signed char flatmask5(uint8_t thresh, + uint8_t p4, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, uint8_t q2, + uint8_t q3, uint8_t q4) { + int8_t flat = 0; flat |= (abs(p4 - p0) > thresh) * -1; flat |= (abs(q4 - q0) > thresh) * -1; flat = ~flat; - return flat; + return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); } -static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat, - uint8_t *op4, uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, - uint8_t *oq3, uint8_t *oq4) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + +static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line if (flat && mask) { - uint8_t p0, q0; - uint8_t p1, q1; - uint8_t p2, q2; - uint8_t p3, q3; - uint8_t p4, q4; - - p4 = *op4; - p3 = *op3; - p2 = *op2; - p1 = *op1; - p0 = *op0; - q0 = *oq0; - q1 = *oq1; - q2 = *oq2; - q3 = *oq3; - q4 = *oq4; - - *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; - *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; + const uint8_t p3 = *op3; + const uint8_t p2 = *op2; + const uint8_t p1 = *op1; + const uint8_t p0 = *op0; + const uint8_t q0 = *oq0; + const uint8_t q1 = *oq1; + const uint8_t q2 = *oq2; + const uint8_t q3 = *oq3; + + *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; + *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3; *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3; - *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3; - *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3; + *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3; + *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3; } else { - int8_t ps0, qs0; - int8_t ps1, qs1; - int8_t filter, Filter1, Filter2; - int8_t u; - - ps1 = (int8_t) *op1 ^ 0x80; - ps0 = (int8_t) *op0 ^ 0x80; - qs0 = (int8_t) *oq0 ^ 0x80; - qs1 = (int8_t) *oq1 ^ 0x80; - - /* add outer taps if we have high edge variance */ - filter = signed_char_clamp(ps1 - qs1); - filter &= hev; - - /* inner taps */ - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); - filter &= mask; - - Filter1 = signed_char_clamp(filter + 4); - Filter2 = signed_char_clamp(filter + 3); - Filter1 >>= 3; - Filter2 >>= 3; - - u = signed_char_clamp(qs0 - Filter1); - *oq0 = u ^ 0x80; - u = signed_char_clamp(ps0 + Filter2); - *op0 = u ^ 0x80; - filter = Filter1; - - /* outer tap adjustments */ + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t) *op1 ^ 0x80; + const int8_t ps0 = (int8_t) *op0 ^ 0x80; + const int8_t qs0 = (int8_t) *oq0 ^ 0x80; + const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & 
mask; + + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + filter = filter1; + + // outer tap adjustments filter += 1; filter >>= 1; filter &= ~hev; - u = signed_char_clamp(qs1 - filter); - *oq1 = u ^ 0x80; - u = signed_char_clamp(ps1 + filter); - *op1 = u ^ 0x80; + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } } @@ -254,12 +234,11 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); - flat = flatmask(1, - s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], - s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]); + flat = flatmask4(1, s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], + s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]); mbfilter(mask, hev, flat, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p); + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } while (++i < count * 8); @@ -283,53 +262,43 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, s[0], s[1], s[2], s[3]); hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); - flat = flatmask(1, - s[-5], s[-4], s[-3], s[-2], s[-1], - s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]); + flat = flatmask4(1, + s[-4], s[-3], s[-2], s[-1], + s[ 0], s[ 1], s[ 2], s[ 3]); mbfilter(mask, hev, flat, - s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4); + s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += p; } while (++i < count * 8); } /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline int8_t simple_filter_mask(uint8_t blimit, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { - /* Why does this cause problems for win32? 
- * error C2143: syntax error : missing ';' before 'type' - * (void) limit; - */ - int8_t mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; - return mask; +static INLINE int8_t simple_filter_mask(uint8_t blimit, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; } -static __inline void simple_filter(int8_t mask, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1) { - int8_t filter, Filter1, Filter2; - int8_t p1 = (int8_t) *op1 ^ 0x80; - int8_t p0 = (int8_t) *op0 ^ 0x80; - int8_t q0 = (int8_t) *oq0 ^ 0x80; - int8_t q1 = (int8_t) *oq1 ^ 0x80; - int8_t u; - - filter = signed_char_clamp(p1 - q1); +static INLINE void simple_filter(int8_t mask, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + const int8_t p1 = (int8_t) *op1 ^ 0x80; + const int8_t p0 = (int8_t) *op0 ^ 0x80; + const int8_t q0 = (int8_t) *oq0 ^ 0x80; + const int8_t q1 = (int8_t) *oq1 ^ 0x80; + + int8_t filter = signed_char_clamp(p1 - q1); filter = signed_char_clamp(filter + 3 * (q0 - p0)); filter &= mask; - /* save bottom 3 bits so that we round one side +4 and the other +3 */ - Filter1 = signed_char_clamp(filter + 4); - Filter1 >>= 3; - u = signed_char_clamp(q0 - Filter1); - *oq0 = u ^ 0x80; + // save bottom 3 bits so that we round one side +4 and the other +3 + filter1 = signed_char_clamp(filter + 4) >> 3; + *oq0 = signed_char_clamp(q0 - filter1) ^ 0x80; - Filter2 = signed_char_clamp(filter + 3); - Filter2 >>= 3; - u = signed_char_clamp(p0 + Filter2); - *op0 = u ^ 0x80; + filter2 = signed_char_clamp(filter + 3) >> 3; + *op0 = signed_char_clamp(p0 + filter2) ^ 0x80; } void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, @@ -481,41 +450,32 @@ void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride, vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); } -static __inline void wide_mbfilter(int8_t mask, uint8_t hev, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, uint8_t *op5, - uint8_t *op4, uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, uint8_t *oq0, - uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, - uint8_t *oq7) { - /* use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line */ +static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, + uint8_t flat, uint8_t flat2, + uint8_t *op7, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, + uint8_t *oq7) { + // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line if (flat2 && flat && mask) { - uint8_t p0, q0; - uint8_t p1, q1; - uint8_t p2, q2; - uint8_t p3, q3; - uint8_t p4, q4; - uint8_t p5, q5; - uint8_t p6, q6; - uint8_t p7, q7; - - p7 = *op7; - p6 = *op6; - p5 = *op5; - p4 = *op4; - p3 = *op3; - p2 = *op2; - p1 = *op1; - p0 = *op0; - q0 = *oq0; - q1 = *oq1; - q2 = *oq2; - q3 = *oq3; - q4 = *oq4; - q5 = *oq5; - q6 = *oq6; - q7 = *oq7; + const uint8_t p7 = *op7; + const uint8_t p6 = *op6; + const uint8_t p5 = *op5; + const uint8_t p4 = *op4; + const uint8_t p3 = *op3; + const uint8_t p2 = *op2; + const uint8_t p1 = *op1; + const uint8_t p0 = *op0; + const uint8_t q0 = *oq0; + const uint8_t q1 = *oq1; + const uint8_t q2 = *oq2; + const uint8_t q3 = *oq3; + const uint8_t q4 = *oq4; + const uint8_t q5 = *oq5; + const uint8_t q6 = *oq6; + const uint8_t q7 = *oq7; *op6 = (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + 
p1 + p0 + q0 + 8) >> 4; @@ -546,68 +506,48 @@ static __inline void wide_mbfilter(int8_t mask, uint8_t hev, *oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7 + 8) >> 4; } else if (flat && mask) { - unsigned char p0, q0; - unsigned char p1, q1; - unsigned char p2, q2; - unsigned char p3, q3; - unsigned char p4, q4; - - p4 = *op4; - p3 = *op3; - p2 = *op2; - p1 = *op1; - p0 = *op0; - q0 = *oq0; - q1 = *oq1; - q2 = *oq2; - q3 = *oq3; - q4 = *oq4; - - *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; - *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; + const uint8_t p3 = *op3; + const uint8_t p2 = *op2; + const uint8_t p1 = *op1; + const uint8_t p0 = *op0; + const uint8_t q0 = *oq0; + const uint8_t q1 = *oq1; + const uint8_t q2 = *oq2; + const uint8_t q3 = *oq3; + + *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; + *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3; *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3; - *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3; - *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3; + *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3; + *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3; } else { - signed char ps0, qs0; - signed char ps1, qs1; - signed char filter, Filter1, Filter2; - signed char u; - - ps1 = (signed char) * op1 ^ 0x80; - ps0 = (signed char) * op0 ^ 0x80; - qs0 = (signed char) * oq0 ^ 0x80; - qs1 = (signed char) * oq1 ^ 0x80; - - /* add outer taps if we have high edge variance */ - filter = signed_char_clamp(ps1 - qs1); - filter &= hev; - - /* inner taps */ - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); - filter &= mask; - - Filter1 = signed_char_clamp(filter + 4); - Filter2 = signed_char_clamp(filter + 3); - Filter1 >>= 3; - Filter2 >>= 3; - - u = signed_char_clamp(qs0 - Filter1); - *oq0 = u ^ 0x80; - u = signed_char_clamp(ps0 + Filter2); - *op0 = u ^ 0x80; - filter = Filter1; - - /* outer tap adjustments */ + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t) * op1 ^ 0x80; + const int8_t ps0 = (int8_t) * op0 ^ 0x80; + const int8_t qs0 = (int8_t) * oq0 ^ 0x80; + const int8_t qs1 = (int8_t) * oq1 ^ 0x80; + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + filter = filter1; + + // outer tap adjustments filter += 1; filter >>= 1; filter &= ~hev; - u = signed_char_clamp(qs1 - filter); - *oq1 = u ^ 0x80; - u = signed_char_clamp(ps1 + filter); - *op1 = u ^ 0x80; + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } } @@ -636,19 +576,19 @@ void vp9_mb_lpf_horizontal_edge_w hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); - flat = flatmask(1, - s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], - s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]); + flat = flatmask4(1, + s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], + s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]); - flat2 = flatmask(1, - s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p], - s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]); + flat2 = flatmask5(1, + s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p], + s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 
6 * p], s[ 7 * p]); wide_mbfilter(mask, hev, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); ++s; } while (++i < count * 8); @@ -674,18 +614,18 @@ void vp9_mb_lpf_vertical_edge_w s[0], s[1], s[2], s[3]); hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); - flat = flatmask(1, - s[-5], s[-4], s[-3], s[-2], s[-1], - s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]); - flat2 = flatmask(1, - s[-8], s[-7], s[-6], s[-5], s[-1], - s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]); + flat = flatmask4(1, + s[-4], s[-3], s[-2], s[-1], + s[ 0], s[ 1], s[ 2], s[ 3]); + flat2 = flatmask5(1, + s[-8], s[-7], s[-6], s[-5], s[-1], + s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]); wide_mbfilter(mask, hev, flat, flat2, - s - 8, s - 7, s - 6, s - 5, - s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, - s + 4, s + 5, s + 6, s + 7); + s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, + s + 4, s + 5, s + 6, s + 7); s += p; } while (++i < count * 8); } diff --git a/vp9/common/vp9_maskingmv.c b/vp9/common/vp9_maskingmv.c index f1151e3dc2132f0233005a6dfa33cabaa8934f3b..326201bbeb9e7f163b85882d70d83c59ac4e5124 100644 --- a/vp9/common/vp9_maskingmv.c +++ b/vp9/common/vp9_maskingmv.c @@ -11,25 +11,19 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -extern unsigned int vp9_sad16x16_sse3( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - int max_err); -extern void vp9_sad16x16x3_sse3( +unsigned int vp9_sad16x16_sse3( unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride, - int *results); + int max_err); -extern int vp8_growmaskmb_sse3( +int vp8_growmaskmb_sse3( unsigned char *om, unsigned char *nm); -extern void vp8_makemask_sse3( +void vp8_makemask_sse3( unsigned char *y, unsigned char *u, unsigned char *v, @@ -238,6 +232,7 @@ void grow_ymask(unsigned char *ym) { for (i = 0; i < 256; i++) ym[i] = nym[i]; } + void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, unsigned char *ym, unsigned char *uvm, int yp, int uvp, @@ -283,6 +278,7 @@ int compare_masks(unsigned char *sym, unsigned char *ym) { return sad; } + int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, unsigned char *ym) { int i, j; @@ -294,6 +290,7 @@ int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, return sad; } + int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, int yp, int uvp, unsigned char *dy, unsigned char *du, unsigned char *dv, @@ -802,5 +799,5 @@ int mainz(int argc, char *argv[]) { } fclose(f); fclose(g); - return; + return 0; } diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c index e941448134709584240c96007b73f00c79fa5cfd..8d99335d4f39262f373bbc5ee72e67a2879f96f1 100644 --- a/vp9/common/vp9_mbpitch.c +++ b/vp9/common/vp9_mbpitch.c @@ -20,15 +20,15 @@ static void setup_block(BLOCKD *b, int mv_stride, uint8_t **base, uint8_t **base2, - int Stride, + int stride, int offset, BLOCKSET bs) { if (bs == DEST) { - b->dst_stride = Stride; + b->dst_stride = stride; b->dst = offset; b->base_dst = base; } else { - b->pre_stride = Stride; + b->pre_stride = stride; b->pre = offset; b->base_pre = base; b->base_second_pre = base2; @@ -102,9 +102,7 @@ void 
vp9_setup_block_dptrs(MACROBLOCKD *xd) { } } - blockd[24].diff = &xd->diff[384]; - - for (r = 0; r < 25; r++) { + for (r = 0; r < 24; r++) { blockd[r].qcoeff = xd->qcoeff + r * 16; blockd[r].dqcoeff = xd->dqcoeff + r * 16; } diff --git a/vp9/common/vp9_modecont.c b/vp9/common/vp9_modecont.c index f7f2b901322abb2842b9e83da07764b5eb51948b..73cb5e15e39ec97459588a05741cfacbacc85b53 100644 --- a/vp9/common/vp9_modecont.c +++ b/vp9/common/vp9_modecont.c @@ -12,7 +12,7 @@ #include "vp9/common/vp9_entropy.h" const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = { - {223, 1, 1, 237}, // 0,0 best: Only candidate + {1, 223, 1, 237}, // 0,0 best: Only candidate {87, 166, 26, 219}, // 0,0 best: non zero candidates {89, 67, 18, 125}, // 0,0 best: non zero candidates, split {16, 141, 69, 226}, // strong nz candidate(s), no split diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h index 8acd4046b267aabade9f577b50378c7cd9339edc..a1eef4649e05fd6c6eb62852403f25edb74f1c62 100644 --- a/vp9/common/vp9_mv.h +++ b/vp9/common/vp9_mv.h @@ -23,4 +23,14 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ +struct mv32 { + int32_t row; + int32_t col; +}; + +typedef union int_mv32 { + uint64_t as_int; + struct mv32 as_mv; +} int_mv32; /* facilitates faster equality tests and copies */ + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 786b02188d7c517c5e4f4f371de2b26c812eff26..8d376adbf70b8fc83a23015ee3293e6f8fb9ea6e 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -11,64 +11,69 @@ #include "vp9/common/vp9_mvref_common.h" #define MVREF_NEIGHBOURS 8 + static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = { {0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2} }; + static int mb_ref_distance_weight[MVREF_NEIGHBOURS] = { 3, 3, 2, 1, 1, 1, 1, 1 }; + static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = { {0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2} }; + static int sb_ref_distance_weight[MVREF_NEIGHBOURS] = { 3, 3, 2, 2, 2, 1, 1, 1 }; -// clamp_mv -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) { - if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER)) - mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER; - else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER) - mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER; - if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER)) - mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER; - else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER) - mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER; +static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = { + {0, -1}, {-1, 0}, {1, -1}, {-1, 1}, + {2, -1}, {-1, 2}, {3, -1}, {-1,-1} +}; + +static int sb64_ref_distance_weight[MVREF_NEIGHBOURS] = + { 1, 1, 1, 1, 1, 1, 1, 1 }; + + + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { + mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER); + mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } // Gets a candidate refenence motion vector from the given mode info // structure if one exists that matches the given reference frame. 
-static int get_matching_candidate( - const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *c_mv -) { - int ret_val = TRUE; - +static int get_matching_candidate(const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *c_mv) { if (ref_frame == candidate_mi->mbmi.ref_frame) { c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) { c_mv->as_int = candidate_mi->mbmi.mv[1].as_int; } else { - ret_val = FALSE; + return 0; } - return ret_val; + return 1; } // Gets candidate refenence motion vector(s) from the given mode info // structure if they exists and do NOT match the given reference frame. -static void get_non_matching_candidates( - const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - MV_REFERENCE_FRAME *c_ref_frame, - int_mv *c_mv, - MV_REFERENCE_FRAME *c2_ref_frame, - int_mv *c2_mv -) { +static void get_non_matching_candidates(const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME *c_ref_frame, + int_mv *c_mv, + MV_REFERENCE_FRAME *c2_ref_frame, + int_mv *c2_mv) { c_mv->as_int = 0; c2_mv->as_int = 0; @@ -85,73 +90,68 @@ static void get_non_matching_candidates( // Second candidate if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) && - (candidate_mi->mbmi.second_ref_frame != ref_frame)) { // && - // (candidate_mi->mbmi.mv[1].as_int != 0) && - // (candidate_mi->mbmi.mv[1].as_int != - // candidate_mi->mbmi.mv[0].as_int)) { + (candidate_mi->mbmi.second_ref_frame != ref_frame) && + (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) { *c2_ref_frame = candidate_mi->mbmi.second_ref_frame; c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int; } } } -// Performs mv adjustment based on reference frame and clamps the MV -// if it goes off the edge of the buffer. -static void scale_mv( - MACROBLOCKD *xd, - MV_REFERENCE_FRAME this_ref_frame, - MV_REFERENCE_FRAME candidate_ref_frame, - int_mv *candidate_mv, - int *ref_sign_bias -) { - - if (candidate_ref_frame != this_ref_frame) { - //int frame_distances[MAX_REF_FRAMES]; - //int last_distance = 1; - //int gf_distance = xd->frames_since_golden; - //int arf_distance = xd->frames_till_alt_ref_frame; +// Performs mv sign inversion if indicated by the reference frame combination. +static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame, + MV_REFERENCE_FRAME candidate_ref_frame, + int_mv *candidate_mv, int *ref_sign_bias) { + // int frame_distances[MAX_REF_FRAMES]; + // int last_distance = 1; + // int gf_distance = xd->frames_since_golden; + // int arf_distance = xd->frames_till_alt_ref_frame; - // Sign inversion where appropriate. - if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { - candidate_mv->as_mv.row = -candidate_mv->as_mv.row; - candidate_mv->as_mv.col = -candidate_mv->as_mv.col; - } - - // Scale based on frame distance if the reference frames not the same. - /*frame_distances[INTRA_FRAME] = 1; // should never be used - frame_distances[LAST_FRAME] = 1; - frame_distances[GOLDEN_FRAME] = - (xd->frames_since_golden) ? xd->frames_since_golden : 1; - frame_distances[ALTREF_FRAME] = - (xd->frames_till_alt_ref_frame) ? 
xd->frames_till_alt_ref_frame : 1; - - if (frame_distances[this_ref_frame] && - frame_distances[candidate_ref_frame]) { - candidate_mv->as_mv.row = - (short)(((int)(candidate_mv->as_mv.row) * - frame_distances[this_ref_frame]) / - frame_distances[candidate_ref_frame]); - - candidate_mv->as_mv.col = - (short)(((int)(candidate_mv->as_mv.col) * - frame_distances[this_ref_frame]) / - frame_distances[candidate_ref_frame]); - } - */ + // Sign inversion where appropriate. + if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { + candidate_mv->as_mv.row = -candidate_mv->as_mv.row; + candidate_mv->as_mv.col = -candidate_mv->as_mv.col; } + /* + // Scale based on frame distance if the reference frames not the same. + frame_distances[INTRA_FRAME] = 1; // should never be used + frame_distances[LAST_FRAME] = 1; + frame_distances[GOLDEN_FRAME] = + (xd->frames_since_golden) ? xd->frames_since_golden : 1; + frame_distances[ALTREF_FRAME] = + (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1; + + if (frame_distances[this_ref_frame] && + frame_distances[candidate_ref_frame]) { + candidate_mv->as_mv.row = + (short)(((int)(candidate_mv->as_mv.row) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + + candidate_mv->as_mv.col = + (short)(((int)(candidate_mv->as_mv.col) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + } + */ } -// Adds a new candidate reference vector to the list if indeed it is new. -// If it is not new then the score of the existing candidate that it matches -// is increased and the list is resorted. +/* +// Adds a new candidate reference vector to the sorted list. +// If it is a repeat the weight of the existing entry is increased +// and the order of the list is resorted. +// This method of add plus sort has been deprecated for now as there is a +// further sort of the best candidates in vp9_find_best_ref_mvs() and the +// incremental benefit of both is small. If the decision is made to remove +// the sort in vp9_find_best_ref_mvs() for performance reasons then it may be +// worth re-instating some sort of list reordering by weight here. +// static void addmv_and_shuffle( int_mv *mv_list, int *mv_scores, - int *index, + int *refmv_count, int_mv candidate_mv, int weight ) { @@ -162,11 +162,11 @@ static void addmv_and_shuffle( // Check for duplicates. If there is one increase its score. // We only compare vs the current top candidates. - insert_point = (*index < (MAX_MV_REF_CANDIDATES - 1)) - ? *index : (MAX_MV_REF_CANDIDATES - 1); + insert_point = (*refmv_count < (MAX_MV_REF_CANDIDATES - 1)) + ? *refmv_count : (MAX_MV_REF_CANDIDATES - 1); i = insert_point; - if (*index > i) + if (*refmv_count > i) i++; while (i > 0) { i--; @@ -184,7 +184,7 @@ static void addmv_and_shuffle( mv_scores[insert_point] = weight; i = insert_point; } - (*index)++; + (*refmv_count)++; } // Reshuffle the list so that highest scoring mvs at the top. @@ -202,19 +202,42 @@ static void addmv_and_shuffle( break; } } +*/ + +// Adds a new candidate reference vector to the list. +// The mv is thrown out if it is already in the list. +// Unlike the addmv_and_shuffle() this does not reorder the list +// but assumes that candidates are added in the order most likely to // match distance and reference frame bias.
+static void add_candidate_mv(int_mv *mv_list, int *mv_scores, + int *candidate_count, int_mv candidate_mv, + int weight) { + int i; + + // Make sure we dont insert off the end of the list + const int insert_point = MIN(*candidate_count, MAX_MV_REF_CANDIDATES - 1); + + // Look for duplicates + for (i = 0; i <= insert_point; ++i) { + if (candidate_mv.as_int == mv_list[i].as_int) + break; + } + + // Add the candidate. If the list is already full it is only desirable that + // it should overwrite if it has a higher weight than the last entry. + if (i >= insert_point && weight > mv_scores[insert_point]) { + mv_list[insert_point].as_int = candidate_mv.as_int; + mv_scores[insert_point] = weight; + *candidate_count += (*candidate_count < MAX_MV_REF_CANDIDATES); + } +} // This function searches the neighbourhood of a given MB/SB and populates a // list of candidate reference vectors. // -void vp9_find_mv_refs( - MACROBLOCKD *xd, - MODE_INFO *here, - MODE_INFO *lf_here, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int *ref_sign_bias -) { - +void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, + MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int *ref_sign_bias) { int i; MODE_INFO *candidate_mi; MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; @@ -224,17 +247,22 @@ void vp9_find_mv_refs( MV_REFERENCE_FRAME c_ref_frame; MV_REFERENCE_FRAME c2_ref_frame; int candidate_scores[MAX_MV_REF_CANDIDATES]; - int index = 0; + int refmv_count = 0; int split_count = 0; int (*mv_ref_search)[2]; int *ref_distance_weight; + int zero_seen = FALSE; + const int mb_col = (-xd->mb_to_left_edge) >> 7; // Blank the reference vector lists and other local structures. vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); - if (mbmi->sb_type) { + if (mbmi->sb_type == BLOCK_SIZE_SB64X64) { + mv_ref_search = sb64_mv_ref_search; + ref_distance_weight = sb64_ref_distance_weight; + } else if (mbmi->sb_type == BLOCK_SIZE_SB32X32) { mv_ref_search = sb_mv_ref_search; ref_distance_weight = sb_ref_distance_weight; } else { @@ -245,39 +273,44 @@ void vp9_find_mv_refs( // We first scan for candidate vectors that match the current reference frame // Look at nearest neigbours for (i = 0; i < 2; ++i) { - if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + const int mb_search_col = mb_col + mv_ref_search[i][0]; + + if ((mb_search_col >= cm->cur_tile_mb_col_start) && + (mb_search_col < cm->cur_tile_mb_col_end) && ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { candidate_mi = here + mv_ref_search[i][0] + (mv_ref_search[i][1] * xd->mode_info_stride); if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) { - clamp_mv(xd, &c_refmv); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c_refmv, ref_distance_weight[i] + 16); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c_refmv, ref_distance_weight[i] + 16); } split_count += (candidate_mi->mbmi.mode == SPLITMV); } } - // Look in the last frame - candidate_mi = lf_here; - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) { - clamp_mv(xd, &c_refmv); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c_refmv, 18); + // Look in the last frame if it exists + if (lf_here) { + candidate_mi = lf_here; + if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) { + add_candidate_mv(candidate_mvs, candidate_scores, + 
&refmv_count, c_refmv, 18); + } } // More distant neigbours for (i = 2; (i < MVREF_NEIGHBOURS) && - (index < (MAX_MV_REF_CANDIDATES - 1)); ++i) { - if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + (refmv_count < (MAX_MV_REF_CANDIDATES - 1)); ++i) { + const int mb_search_col = mb_col + mv_ref_search[i][0]; + + if ((mb_search_col >= cm->cur_tile_mb_col_start) && + (mb_search_col < cm->cur_tile_mb_col_end) && ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { candidate_mi = here + mv_ref_search[i][0] + (mv_ref_search[i][1] * xd->mode_info_stride); if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) { - clamp_mv(xd, &c_refmv); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c_refmv, ref_distance_weight[i] + 16); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c_refmv, ref_distance_weight[i] + 16); } } } @@ -286,9 +319,12 @@ void vp9_find_mv_refs( // reference frame does not match. Break out when we have // MAX_MV_REF_CANDIDATES candidates. // Look first at spatial neighbours - if (index < (MAX_MV_REF_CANDIDATES - 1)) { + if (refmv_count < (MAX_MV_REF_CANDIDATES - 1)) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + const int mb_search_col = mb_col + mv_ref_search[i][0]; + + if ((mb_search_col >= cm->cur_tile_mb_col_start) && + (mb_search_col < cm->cur_tile_mb_col_end) && ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { candidate_mi = here + mv_ref_search[i][0] + @@ -300,24 +336,24 @@ void vp9_find_mv_refs( if (c_ref_frame != INTRA_FRAME) { scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c_refmv, ref_distance_weight[i]); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c_refmv, ref_distance_weight[i]); } if (c2_ref_frame != INTRA_FRAME) { scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c2_refmv, ref_distance_weight[i]); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c2_refmv, ref_distance_weight[i]); } } - if (index >= (MAX_MV_REF_CANDIDATES - 1)) { + if (refmv_count >= (MAX_MV_REF_CANDIDATES - 1)) { break; } } } - // Look at the last frame - if (index < (MAX_MV_REF_CANDIDATES - 1)) { + // Look at the last frame if it exists + if (refmv_count < (MAX_MV_REF_CANDIDATES - 1) && lf_here) { candidate_mi = lf_here; get_non_matching_candidates(candidate_mi, ref_frame, &c_ref_frame, &c_refmv, @@ -325,14 +361,14 @@ void vp9_find_mv_refs( if (c_ref_frame != INTRA_FRAME) { scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c_refmv, 2); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c_refmv, 2); } if (c2_ref_frame != INTRA_FRAME) { scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - addmv_and_shuffle(candidate_mvs, candidate_scores, - &index, c2_refmv, 2); + add_candidate_mv(candidate_mvs, candidate_scores, + &refmv_count, c2_refmv, 2); } } @@ -340,7 +376,7 @@ void vp9_find_mv_refs( // 0,0 was best if (candidate_mvs[0].as_int == 0) { // 0,0 is only candidate - if (index <= 1) { + if (refmv_count <= 1) { mbmi->mb_mode_context[ref_frame] = 0; // non zero candidates candidates available } else if (split_count == 0) { @@ -348,30 +384,25 @@ void vp9_find_mv_refs( } else { mbmi->mb_mode_context[ref_frame] = 2; } - // Non zero best, No Split MV cases } else if (split_count == 0) { - if 
(candidate_scores[0] >= 32) { - mbmi->mb_mode_context[ref_frame] = 3; - } else { - mbmi->mb_mode_context[ref_frame] = 4; - } - // Non zero best, some split mv + // Non zero best, No Split MV cases + mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 3 : 4; } else { - if (candidate_scores[0] >= 32) { - mbmi->mb_mode_context[ref_frame] = 5; - } else { - mbmi->mb_mode_context[ref_frame] = 6; - } + // Non zero best, some split mv + mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 5 : 6; } - // 0,0 is always a valid reference. + // Scan for 0,0 case and clamp non zero choices for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - if (candidate_mvs[i].as_int == 0) - break; + if (candidate_mvs[i].as_int == 0) { + zero_seen = TRUE; + } else { + clamp_mv_ref(xd, &candidate_mvs[i]); + } } - if (i == MAX_MV_REF_CANDIDATES) { + // 0,0 is always a valid reference. Add it if not already seen. + if (!zero_seen) candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0; - } // Copy over the candidate list. vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs)); diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index ca6d89e9108ad85d7db2cdb097cf7bce80619262..a81366997bdfbf204eabd839509a5b16da0dea7c 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -14,7 +14,8 @@ #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ #define VP9_COMMON_VP9_MVREF_COMMON_H_ -void vp9_find_mv_refs(MACROBLOCKD *xd, +void vp9_find_mv_refs(VP9_COMMON *cm, + MACROBLOCKD *xd, MODE_INFO *here, MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame, diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index e4ad72f21846a84917710699c22da6800e6d4baa..422f3885f28c68ad2371465da929d9d0f12473c9 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -16,6 +16,7 @@ extern "C" { #endif +#include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" #include "vpx_scale/yv12config.h" @@ -62,7 +63,7 @@ extern "C" #include <assert.h> - static __inline void Scale2Ratio(int mode, int *hr, int *hs) { + static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { case NORMAL: *hr = 1; @@ -89,11 +90,13 @@ extern "C" } typedef struct { - int Version; // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode - int Width; // width of data passed to the compressor - int Height; // height of data passed to the compressor + int version; // 4 versions of bitstream defined: + // 0 - best quality/slowest decode, + // 3 - lowest quality/fastest decode + int width; // width of data passed to the compressor + int height; // height of data passed to the compressor double frame_rate; // set to passed in framerate - int target_bandwidth; // bandwidth to be used in kilobits per second + int64_t target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 int Sharpness; // parameter used for sharpening output: recommendation 0: @@ -134,9 +137,9 @@ extern "C" int over_shoot_pct; // buffering parameters - int starting_buffer_level; // in seconds - int optimal_buffer_level; - int maximum_buffer_size; + int64_t starting_buffer_level; // in seconds + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; // controlling quality int fixed_q; @@ -159,10 +162,25 @@ extern "C" int encode_breakout; // early breakout encode threshold : for video conf recommend 800 + /* Bitfield defining the error resiliency features to 
enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. + */ + unsigned int frame_parallel_decoding_mode; + int arnr_max_frames; int arnr_strength; int arnr_type; + int tile_columns; + int tile_rows; + struct vpx_fixed_buf two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -195,8 +213,10 @@ extern "C" int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); - int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); + int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + + int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index ac66e4902f8560d5db6b5f94d0fd762cdfb5c9c7..fdbabc5376a661bdb61231e491e6bc3706b5f432 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -37,7 +37,16 @@ void vp9_initialize_common(void); #define QINDEX_RANGE (MAXQ + 1) -#define NUM_YV12_BUFFERS 4 +#define NUM_REF_FRAMES 3 +#define NUM_REF_FRAMES_LG2 2 + +// 1 scratch frame for the new frame, 3 for scaled references on the encoder +// TODO(jkoleszar): These 3 extra references could probably come from the +// normal reference pool. +#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) + +#define NUM_FRAME_CONTEXTS_LG2 2 +#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2) #define COMP_PRED_CONTEXTS 2 @@ -49,13 +58,23 @@ typedef struct frame_contexts { vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1]; vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1]; - vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32]; + + vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES]; +#if CONFIG_CODE_NONZEROCOUNT + vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC4X4_NODES]; + vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC8X8_NODES]; + vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC16X16_NODES]; + vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC32X32_NODES]; + vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS] + [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA]; +#endif nmv_context nmvc; nmv_context pre_nmvc; @@ -74,21 +93,42 @@ typedef struct frame_contexts { unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS]; unsigned int mbsplit_counts[VP9_NUMMBSPLITS]; - vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs pre_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs pre_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs 
pre_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32]; - - vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32]; + vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES]; + vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES]; +#if CONFIG_CODE_NONZEROCOUNT + vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC4X4_NODES]; + vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC8X8_NODES]; + vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC16X16_NODES]; + vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC32X32_NODES]; + vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS] + [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA]; +#endif + + vp9_coeff_count coef_counts_4x4[BLOCK_TYPES]; + vp9_coeff_count coef_counts_8x8[BLOCK_TYPES]; + vp9_coeff_count coef_counts_16x16[BLOCK_TYPES]; + vp9_coeff_count coef_counts_32x32[BLOCK_TYPES]; + unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + [COEF_BANDS][PREV_COEF_CONTEXTS]; + +#if CONFIG_CODE_NONZEROCOUNT + unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC4X4_TOKENS]; + unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC8X8_TOKENS]; + unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC16X16_TOKENS]; + unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC32X32_TOKENS]; + unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS] + [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2]; +#endif nmv_context_counts NMVcount; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] @@ -128,13 +168,14 @@ typedef struct VP9Common { struct vpx_internal_error_info error; DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, int16_t, Y2dequant[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]); - int Width; - int Height; - int horiz_scale; - int vert_scale; + int width; + int height; + int display_width; + int display_height; + int last_width; + int last_height; YUV_TYPE clr_type; CLAMP_TYPE clamp_type; @@ -142,8 +183,15 @@ typedef struct VP9Common { YV12_BUFFER_CONFIG *frame_to_show; YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; - int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; - int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */ + int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */ + + /* TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and + * roll new_fb_idx into it. 
+ */ + int active_ref_idx[3]; /* each frame can reference 3 buffers */ + int new_fb_idx; + struct scale_factors active_ref_scale[3]; YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG temp_scale_frame; @@ -173,8 +221,6 @@ typedef struct VP9Common { int last_kf_gf_q; /* Q used on the last GF or KF */ int y1dc_delta_q; - int y2dc_delta_q; - int y2ac_delta_q; int uvdc_delta_q; int uvac_delta_q; @@ -201,19 +247,13 @@ typedef struct VP9Common { int filter_level; int last_sharpness_level; int sharpness_level; - - int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ - int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ - int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ - - int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ - int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ + int dering_enabled; int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ - /* Y,U,V,Y2 */ + /* Y,U,V */ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ ENTROPY_CONTEXT_PLANES left_context[4]; /* (up to) 4 contexts "" */ @@ -250,9 +290,9 @@ typedef struct VP9Common { vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS]; - FRAME_CONTEXT lfc_a; /* last alt ref entropy */ - FRAME_CONTEXT lfc; /* last frame entropy */ FRAME_CONTEXT fc; /* this frame entropy */ + FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; + unsigned int frame_context_idx; /* Context to use/update */ unsigned int current_video_frame; int near_boffset[3]; @@ -272,6 +312,60 @@ typedef struct VP9Common { int use_interintra; #endif + int error_resilient_mode; + int frame_parallel_decoding_mode; + + int tile_columns, log2_tile_columns; + int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx; + int tile_rows, log2_tile_rows; + int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx; } VP9_COMMON; +static int get_free_fb(VP9_COMMON *cm) { + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + if (cm->fb_idx_ref_cnt[i] == 0) + break; + + assert(i < NUM_YV12_BUFFERS); + cm->fb_idx_ref_cnt[i] = 1; + return i; +} + +static void ref_cnt_fb(int *buf, int *idx, int new_idx) { + if (buf[*idx] > 0) + buf[*idx]--; + + *idx = new_idx; + + buf[new_idx]++; +} + +// TODO(debargha): merge the two functions +static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd, + int mb_row, int block_size) { + xd->mb_to_top_edge = -((mb_row * 16) << 3); + xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3; + + // Are edges available for intra prediction? + xd->up_available = (mb_row != 0); +} + +static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd, + int mb_col, int block_size) { + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3; + + // Are edges available for intra prediction? 
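With tiling, "available" now means inside the current tile rather than merely inside the frame, so prediction never reaches across a tile's left column. Note also that the edge offsets above are stored in 1/8-pel units (16 pels per macroblock, shifted left by 3), which is why the get_mb_row()/get_mb_col() helpers just below undo the conversion with a shift by 7. A small sketch, assuming that convention (editor's illustration; the helper name is hypothetical):

static int mb_col_from_edge(int mb_to_left_edge) {
  return (-mb_to_left_edge) >> 7;   /* (mb_col * 16) << 3 == mb_col << 7 */
}
/* e.g. mb_col = 3: mb_to_left_edge = -((3 * 16) << 3) = -384, and
 * (-(-384)) >> 7 recovers 3. */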
+ xd->left_available = (mb_col > cm->cur_tile_mb_col_start); + xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end); +} + +static int get_mb_row(const MACROBLOCKD *xd) { + return ((-xd->mb_to_top_edge) >> 7); +} + +static int get_mb_col(const MACROBLOCKD *xd) { + return ((-xd->mb_to_left_edge) >> 7); +} #endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 62c381eb9954b29a83fc15e05e8450fc49ab1cc1..06dadfca5b66af4dcead1fd0b50289515448ddd0 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -336,11 +336,8 @@ void vp9_deblock(YV12_BUFFER_CONFIG *source, source->uv_height, source->uv_width, ppl); } -void vp9_de_noise(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag) { +void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post, + int q, int low_var_thresh, int flag) { double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); (void) post; @@ -424,9 +421,9 @@ static void fillrd(struct postproc_state *state, int q, int a) { * * INPUTS : unsigned char *Start starting address of buffer to * add gaussian noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame + * unsigned int width width of plane + * unsigned int height height of plane + * int pitch distance between subsequent lines of frame * int q quantizer used to determine amount of noise * to add * @@ -439,25 +436,25 @@ static void fillrd(struct postproc_state *state, int q, int a) { * SPECIAL NOTES : None. * ****************************************************************************/ -void vp9_plane_add_noise_c(uint8_t *Start, char *noise, +void vp9_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], - unsigned int Width, unsigned int Height, int Pitch) { + unsigned int width, unsigned int height, int pitch) { unsigned int i, j; - for (i = 0; i < Height; i++) { - uint8_t *Pos = Start + i * Pitch; - char *Ref = (char *)(noise + (rand() & 0xff)); + for (i = 0; i < height; i++) { + uint8_t *pos = start + i * pitch; + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT - for (j = 0; j < Width; j++) { - if (Pos[j] < blackclamp[0]) - Pos[j] = blackclamp[0]; + for (j = 0; j < width; j++) { + if (pos[j] < blackclamp[0]) + pos[j] = blackclamp[0]; - if (Pos[j] > 255 + whiteclamp[0]) - Pos[j] = 255 + whiteclamp[0]; + if (pos[j] > 255 + whiteclamp[0]) + pos[j] = 255 + whiteclamp[0]; - Pos[j] += Ref[j]; + pos[j] += ref[j]; } } } @@ -636,8 +633,8 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, *dest = *oci->frame_to_show; /* handle problem with extending borders */ - dest->y_width = oci->Width; - dest->y_height = oci->Height; + dest->y_width = oci->width; + dest->y_height = oci->height; dest->uv_height = dest->y_height / 2; return 0; @@ -1004,8 +1001,8 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, *dest = oci->post_proc_buffer; /* handle problem with extending borders */ - dest->y_width = oci->Width; - dest->y_height = oci->Height; + dest->y_width = oci->width; + dest->y_height = oci->height; dest->uv_height = dest->y_height / 2; return 0; diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h index 11f55ab0a0145e5b325fc1eeda214b10b056a8ae..c2f556e61d86bafb39feac0923cd8bc3b1e8e631 100644 --- a/vp9/common/vp9_postproc.h +++ b/vp9/common/vp9_postproc.h @@ -13,30 
+13,26 @@ #define VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" + struct postproc_state { - int last_q; - int last_noise; - char noise[3072]; + int last_q; + int last_noise; + char noise[3072]; DECLARE_ALIGNED(16, char, blackclamp[16]); DECLARE_ALIGNED(16, char, whiteclamp[16]); DECLARE_ALIGNED(16, char, bothclamp[16]); }; + #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" + int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); +void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, + int q, int low_var_thresh, int flag); -void vp9_de_noise(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag); - -void vp9_deblock(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag); +void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, + int q, int low_var_thresh, int flag); #endif // VP9_COMMON_VP9_POSTPROC_H_ diff --git a/vp9/common/vp9_pragmas.h b/vp9/common/vp9_pragmas.h index cbeaf5370b1af43a8687cc83efc185d7a176bbbd..f079161d6b43a548d03b110a7a2f11431eb055db 100644 --- a/vp9/common/vp9_pragmas.h +++ b/vp9/common/vp9_pragmas.h @@ -14,6 +14,7 @@ #ifdef __INTEL_COMPILER #pragma warning(disable:997 1011 170) #endif + #ifdef _MSC_VER #pragma warning(disable:4799) #endif diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index 76ae0b36bda6c390253772bca8ec547e5f388fc4..9fe66fc5b12741f53db5fee229334db1be5e5f22 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -29,14 +29,15 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, // The prediction flags in these dummy entries are initialised to 0. switch (pred_id) { case PRED_SEG_ID: - pred_context = (m - 1)->mbmi.seg_id_predicted + - (m - cm->mode_info_stride)->mbmi.seg_id_predicted; + pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted; + if (xd->left_available) + pred_context += (m - 1)->mbmi.seg_id_predicted; break; - case PRED_REF: - pred_context = (m - 1)->mbmi.ref_predicted + - (m - cm->mode_info_stride)->mbmi.ref_predicted; + pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted; + if (xd->left_available) + pred_context += (m - 1)->mbmi.ref_predicted; break; case PRED_COMP: @@ -61,13 +62,14 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, break; case PRED_MBSKIP: - pred_context = (m - 1)->mbmi.mb_skip_coeff + - (m - cm->mode_info_stride)->mbmi.mb_skip_coeff; + pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff; + if (xd->left_available) + pred_context += (m - 1)->mbmi.mb_skip_coeff; break; case PRED_SWITCHABLE_INTERP: { - int left_in_image = (m - 1)->mbmi.mb_in_image; + int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image; int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image; int left_mode = (m - 1)->mbmi.mode; int above_mode = (m - cm->mode_info_stride)->mbmi.mode; @@ -98,8 +100,7 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, break; default: - // TODO *** add error trap code. - pred_context = 0; + pred_context = 0; // *** add error trap code. 
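+ // (The flag-based cases above form a context of 0..2 by adding the left neighbour's flag to the above neighbour's; the left term is guarded by xd->left_available so the first column of a tile never reads (m - 1) from the neighbouring tile.)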
break; } @@ -111,39 +112,23 @@ unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, const MACROBLOCKD *const xd, PRED_ID pred_id) { - vp9_prob pred_probability; - int pred_context; - - // Get the appropriate prediction context - pred_context = vp9_get_pred_context(cm, xd, pred_id); + const int pred_context = vp9_get_pred_context(cm, xd, pred_id); switch (pred_id) { case PRED_SEG_ID: - pred_probability = cm->segment_pred_probs[pred_context]; - break; - + return cm->segment_pred_probs[pred_context]; case PRED_REF: - pred_probability = cm->ref_pred_probs[pred_context]; - break; - + return cm->ref_pred_probs[pred_context]; case PRED_COMP: // In keeping with convention elsewhere the probability returned is // the probability of a "0" outcome which in this case means the // probability of comp pred off. - pred_probability = cm->prob_comppred[pred_context]; - break; - + return cm->prob_comppred[pred_context]; case PRED_MBSKIP: - pred_probability = cm->mbskip_pred_probs[pred_context]; - break; - + return cm->mbskip_pred_probs[pred_context]; default: - // TODO *** add error trap code. - pred_probability = 128; - break; + return 128; // *** add error trap code. } - - return pred_probability; } // This function returns a context probability ptr for coding a given @@ -151,71 +136,41 @@ vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, const MACROBLOCKD *const xd, PRED_ID pred_id) { - const vp9_prob *pred_probability; - int pred_context; - - // Get the appropriate prediction context - pred_context = vp9_get_pred_context(cm, xd, pred_id); + const int pred_context = vp9_get_pred_context(cm, xd, pred_id); switch (pred_id) { case PRED_SEG_ID: - pred_probability = &cm->segment_pred_probs[pred_context]; - break; - + return &cm->segment_pred_probs[pred_context]; case PRED_REF: - pred_probability = &cm->ref_pred_probs[pred_context]; - break; - + return &cm->ref_pred_probs[pred_context]; case PRED_COMP: // In keeping with convention elsewhere the probability returned is // the probability of a "0" outcome which in this case means the // probability of comp pred off. - pred_probability = &cm->prob_comppred[pred_context]; - break; - + return &cm->prob_comppred[pred_context]; case PRED_MBSKIP: - pred_probability = &cm->mbskip_pred_probs[pred_context]; - break; - + return &cm->mbskip_pred_probs[pred_context]; case PRED_SWITCHABLE_INTERP: - pred_probability = &cm->fc.switchable_interp_prob[pred_context][0]; - break; - + return &cm->fc.switchable_interp_prob[pred_context][0]; default: - // TODO *** add error trap code. - pred_probability = NULL; - break; + return NULL; // *** add error trap code. } - - return pred_probability; } // This function returns the status of the given prediction signal. // I.e. is the predicted value for the given signal correct. unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, PRED_ID pred_id) { - unsigned char pred_flag = 0; - switch (pred_id) { case PRED_SEG_ID: - pred_flag = xd->mode_info_context->mbmi.seg_id_predicted; - break; - + return xd->mode_info_context->mbmi.seg_id_predicted; case PRED_REF: - pred_flag = xd->mode_info_context->mbmi.ref_predicted; - break; - + return xd->mode_info_context->mbmi.ref_predicted; case PRED_MBSKIP: - pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff; - break; - + return xd->mode_info_context->mbmi.mb_skip_coeff; default: - // TODO *** add error trap code.
- pred_flag = 0; - break; + return 0; // *** add error trap code. } - - return pred_flag; } // This function sets the status of the given prediction signal. @@ -277,7 +232,7 @@ void vp9_set_pred_flag(MACROBLOCKD *const xd, break; default: - // TODO *** add error trap code. + // *** add error trap code. break; } } @@ -322,7 +277,6 @@ MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, MV_REFERENCE_FRAME pred_ref = LAST_FRAME; int segment_id = xd->mode_info_context->mbmi.segment_id; - int seg_ref_active; int i; unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1}; @@ -333,7 +287,7 @@ unsigned char above_left_in_image; // Is segment coding enabled - seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME); + int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME); // Special case treatment if segment coding is enabled. // Don't allow prediction of a reference frame that the segment @@ -355,9 +309,10 @@ above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame; // Are neighbours in image - left_in_image = (m - 1)->mbmi.mb_in_image; + left_in_image = (m - 1)->mbmi.mb_in_image && xd->left_available; above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image; - above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image; + above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image && + xd->left_available; // Adjust scores for candidate reference frames based on neighbours if (frame_allowed[left] && left_in_image) { @@ -385,9 +340,7 @@ MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, // Functions to compute a set of modified reference frame probabilities // to use when the prediction of the reference frame value fails void vp9_calc_ref_probs(int *count, vp9_prob *probs) { - int tot_count; - - tot_count = count[0] + count[1] + count[2] + count[3]; + int tot_count = count[0] + count[1] + count[2] + count[3]; probs[0] = get_prob(count[0], tot_count); tot_count -= count[0]; @@ -403,19 +356,12 @@ void vp9_calc_ref_probs(int *count, vp9_prob *probs) { // they are not allowed for a given segment. void vp9_compute_mod_refprobs(VP9_COMMON *const cm) { int norm_cnt[MAX_REF_FRAMES]; - int intra_count; - int inter_count; - int last_count; - int gfarf_count; - int gf_count; - int arf_count; - - intra_count = cm->prob_intra_coded; - inter_count = (255 - intra_count); - last_count = (inter_count * cm->prob_last_coded) / 255; - gfarf_count = inter_count - last_count; - gf_count = (gfarf_count * cm->prob_gf_coded) / 255; - arf_count = gfarf_count - gf_count; + const int intra_count = cm->prob_intra_coded; + const int inter_count = (255 - intra_count); + const int last_count = (inter_count * cm->prob_last_coded) / 255; + const int gfarf_count = inter_count - last_count; + const int gf_count = (gfarf_count * cm->prob_gf_coded) / 255; + const int arf_count = gfarf_count - gf_count; // Work out modified reference frame probabilities to use where prediction // of the reference frame fails diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 52c4d42ef8fac5695b73bcfa97f2bf2c7e79291d..49dcf0a4cf2bb523a7d4b450dd9f4ef97589becc 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -8,48 +8,48 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_blockd.h" - #ifndef VP9_COMMON_VP9_PRED_COMMON_H_ #define VP9_COMMON_VP9_PRED_COMMON_H_ +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" // Predicted items typedef enum { - PRED_SEG_ID = 0, // Segment identifier + PRED_SEG_ID = 0, // Segment identifier PRED_REF = 1, PRED_COMP = 2, PRED_MBSKIP = 3, PRED_SWITCHABLE_INTERP = 4 } PRED_ID; -extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); +unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); + +vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); -extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); +const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); -extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); +unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, + PRED_ID pred_id); -extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id); +void vp9_set_pred_flag(MACROBLOCKD *const xd, + PRED_ID pred_id, + unsigned char pred_flag); -extern void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag); +unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + int MbIndex); -extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - int MbIndex); +MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd); -extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd); -extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm); +void vp9_compute_mod_refprobs(VP9_COMMON *const cm); #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 119038121987f92bb8f36fc4e9cc6bee581413fb..a94c772bea1f1a48459fcbdbef3aa9f3c4feb023 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_quant_common.h" static int dc_qlookup[QINDEX_RANGE]; @@ -24,7 +24,7 @@ void vp9_init_quant_tables() { for (i = 0; i < QINDEX_RANGE; i++) { ac_qlookup[i] = current_val; - current_val = (int)((double)current_val * 1.02); + current_val = (int)(current_val * 1.02); if (current_val == last_val) current_val++; last_val = current_val; @@ -38,88 +38,18 @@ void vp9_init_quant_tables() { } } -int vp9_dc_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - return retval; -} - -int vp9_dc2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - - return retval; - -} -int vp9_dc_uv_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - - return retval; +int vp9_dc_quant(int qindex, int delta) { + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; } -int vp9_ac_yquant(int QIndex) { - int retval; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = ac_qlookup[ QIndex ]; - return retval; +int vp9_dc_uv_quant(int qindex, int delta) { + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; } -int vp9_ac2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = (ac_qlookup[ QIndex ] * 775) / 1000; - if (retval < 4) - retval = 4; - - return retval; +int vp9_ac_yquant(int qindex) { + return ac_qlookup[clamp(qindex, 0, MAXQ)]; } -int vp9_ac_uv_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - retval = ac_qlookup[ QIndex ]; - return retval; +int vp9_ac_uv_quant(int qindex, int delta) { + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; } diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h index 871c2b035712b6d114e7d91ae176634e4650213d..1520c37977076f11c7aeaee1de258e39a4faaa91 100644 --- a/vp9/common/vp9_quant_common.h +++ b/vp9/common/vp9_quant_common.h @@ -11,16 +11,15 @@ #ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ #define VP9_COMMON_VP9_QUANT_COMMON_H_ -#include "string.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" -extern void vp9_init_quant_tables(void); -extern int vp9_ac_yquant(int QIndex); -extern int vp9_dc_quant(int QIndex, int Delta); -extern int vp9_dc2quant(int QIndex, int Delta); -extern int vp9_ac2quant(int QIndex, int Delta); -extern int vp9_dc_uv_quant(int QIndex, int Delta); -extern int vp9_ac_uv_quant(int QIndex, int Delta); +void vp9_init_quant_tables(); +int vp9_ac_yquant(int qindex); +int vp9_dc_quant(int qindex, int delta); +int vp9_dc2quant(int qindex, int delta); +int vp9_ac2quant(int qindex, int delta); +int vp9_dc_uv_quant(int qindex, int delta); +int vp9_ac_uv_quant(int qindex, int delta); #endif // VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c index caf7b8d22787eddc2c97dde25aa893bdefa6727a..d67b6d3dfbdd0fe64c0dec179053b0c6a19099e6 100644 --- a/vp9/common/vp9_recon.c +++ b/vp9/common/vp9_recon.c @@ -117,7 +117,7 @@ void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { void 
vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) { int x, y, stride = xd->block[0].dst_stride; - int16_t *diff = xd->sb_coeff_data.diff; + int16_t *diff = xd->diff; for (y = 0; y < 32; y++) { for (x = 0; x < 32; x++) { @@ -130,8 +130,8 @@ void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) { void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { int x, y, stride = xd->block[16].dst_stride; - int16_t *udiff = xd->sb_coeff_data.diff + 1024; - int16_t *vdiff = xd->sb_coeff_data.diff + 1280; + int16_t *udiff = xd->diff + 1024; + int16_t *vdiff = xd->diff + 1280; for (y = 0; y < 16; y++) { for (x = 0; x < 16; x++) { @@ -145,6 +145,36 @@ void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { } } +void vp9_recon_sb64y_s_c(MACROBLOCKD *xd, uint8_t *dst) { + int x, y, stride = xd->block[0].dst_stride; + int16_t *diff = xd->diff; + + for (y = 0; y < 64; y++) { + for (x = 0; x < 64; x++) { + dst[x] = clip_pixel(dst[x] + diff[x]); + } + dst += stride; + diff += 64; + } +} + +void vp9_recon_sb64uv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { + int x, y, stride = xd->block[16].dst_stride; + int16_t *udiff = xd->diff + 4096; + int16_t *vdiff = xd->diff + 4096 + 1024; + + for (y = 0; y < 32; y++) { + for (x = 0; x < 32; x++) { + udst[x] = clip_pixel(udst[x] + udiff[x]); + vdst[x] = clip_pixel(vdst[x] + vdiff[x]); + } + udst += stride; + vdst += stride; + udiff += 32; + vdiff += 32; + } +} + void vp9_recon_mby_c(MACROBLOCKD *xd) { int i; diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 20de7b7f1d8b5f08c189d70384c5bcc92cb2c02b..a654c7df430d7d69ca819b98137c2cb702b2b189 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -8,66 +8,252 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + YV12_BUFFER_CONFIG *other, + int this_w, int this_h) { + int other_h = other->y_crop_height; + int other_w = other->y_crop_width; + + scale->x_num = other_w; + scale->x_den = this_w; + scale->x_offset_q4 = 0; // calculated per-mb + scale->x_step_q4 = 16 * other_w / this_w; + + scale->y_num = other_h; + scale->y_den = this_h; + scale->y_offset_q4 = 0; // calculated per-mb + scale->y_step_q4 = 16 * other_h / this_h; + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT + if (scale->x_step_q4 == 16) { + if (scale->y_step_q4 == 16) { + // No scaling in either direction. 
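+ // (scale->predict is indexed [x_subpel != 0][y_subpel != 0][weight]: each block resolves one fully specialised convolve function up front, so the pixel loops never branch on scaling or sub-pel position.)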
+ scale->predict[0][0][0] = vp9_convolve_copy; + scale->predict[0][0][1] = vp9_convolve_1by8; + scale->predict[0][0][2] = vp9_convolve_qtr; + scale->predict[0][0][3] = vp9_convolve_3by8; + scale->predict[0][0][4] = vp9_convolve_avg; + scale->predict[0][0][5] = vp9_convolve_5by8; + scale->predict[0][0][6] = vp9_convolve_3qtr; + scale->predict[0][0][7] = vp9_convolve_7by8; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_1by8_vert; + scale->predict[0][1][2] = vp9_convolve8_qtr_vert; + scale->predict[0][1][3] = vp9_convolve8_3by8_vert; + scale->predict[0][1][4] = vp9_convolve8_avg_vert; + scale->predict[0][1][5] = vp9_convolve8_5by8_vert; + scale->predict[0][1][6] = vp9_convolve8_3qtr_vert; + scale->predict[0][1][7] = vp9_convolve8_7by8_vert; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_1by8_horiz; + scale->predict[1][0][2] = vp9_convolve8_qtr_horiz; + scale->predict[1][0][3] = vp9_convolve8_3by8_horiz; + scale->predict[1][0][4] = vp9_convolve8_avg_horiz; + scale->predict[1][0][5] = vp9_convolve8_5by8_horiz; + scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz; + scale->predict[1][0][7] = vp9_convolve8_7by8_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + scale->predict[0][0][0] = vp9_convolve8_vert; + scale->predict[0][0][1] = vp9_convolve8_1by8_vert; + scale->predict[0][0][2] = vp9_convolve8_qtr_vert; + scale->predict[0][0][3] = vp9_convolve8_3by8_vert; + scale->predict[0][0][4] = vp9_convolve8_avg_vert; + scale->predict[0][0][5] = vp9_convolve8_5by8_vert; + scale->predict[0][0][6] = vp9_convolve8_3qtr_vert; + scale->predict[0][0][7] = vp9_convolve8_7by8_vert; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_1by8_vert; + scale->predict[0][1][2] = vp9_convolve8_qtr_vert; + scale->predict[0][1][3] = vp9_convolve8_3by8_vert; + scale->predict[0][1][4] = vp9_convolve8_avg_vert; + scale->predict[0][1][5] = vp9_convolve8_5by8_vert; + scale->predict[0][1][6] = vp9_convolve8_3qtr_vert; + scale->predict[0][1][7] = vp9_convolve8_7by8_vert; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_1by8; + scale->predict[1][0][2] = vp9_convolve8_qtr; + scale->predict[1][0][3] = vp9_convolve8_3by8; + scale->predict[1][0][4] = vp9_convolve8_avg; + scale->predict[1][0][5] = vp9_convolve8_5by8; + scale->predict[1][0][6] = vp9_convolve8_3qtr; + scale->predict[1][0][7] = vp9_convolve8_7by8; + } + } else { + if (scale->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. 
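+ // (Once an axis is scaled, even integer positions on it are filtered: predict[0][1] below maps to the full 2D vp9_convolve8 variants rather than the vertical-only ones.)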
+ scale->predict[0][0][0] = vp9_convolve8_horiz; + scale->predict[0][0][1] = vp9_convolve8_1by8_horiz; + scale->predict[0][0][2] = vp9_convolve8_qtr_horiz; + scale->predict[0][0][3] = vp9_convolve8_3by8_horiz; + scale->predict[0][0][4] = vp9_convolve8_avg_horiz; + scale->predict[0][0][5] = vp9_convolve8_5by8_horiz; + scale->predict[0][0][6] = vp9_convolve8_3qtr_horiz; + scale->predict[0][0][7] = vp9_convolve8_7by8_horiz; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_1by8; + scale->predict[0][1][2] = vp9_convolve8_qtr; + scale->predict[0][1][3] = vp9_convolve8_3by8; + scale->predict[0][1][4] = vp9_convolve8_avg; + scale->predict[0][1][5] = vp9_convolve8_5by8; + scale->predict[0][1][6] = vp9_convolve8_3qtr; + scale->predict[0][1][7] = vp9_convolve8_7by8; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_1by8_horiz; + scale->predict[1][0][2] = vp9_convolve8_qtr_horiz; + scale->predict[1][0][3] = vp9_convolve8_3by8_horiz; + scale->predict[1][0][4] = vp9_convolve8_avg_horiz; + scale->predict[1][0][5] = vp9_convolve8_5by8_horiz; + scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz; + scale->predict[1][0][7] = vp9_convolve8_7by8_horiz; + } else { + // Must always scale in both directions. + scale->predict[0][0][0] = vp9_convolve8; + scale->predict[0][0][1] = vp9_convolve8_1by8; + scale->predict[0][0][2] = vp9_convolve8_qtr; + scale->predict[0][0][3] = vp9_convolve8_3by8; + scale->predict[0][0][4] = vp9_convolve8_avg; + scale->predict[0][0][5] = vp9_convolve8_5by8; + scale->predict[0][0][6] = vp9_convolve8_3qtr; + scale->predict[0][0][7] = vp9_convolve8_7by8; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_1by8; + scale->predict[0][1][2] = vp9_convolve8_qtr; + scale->predict[0][1][3] = vp9_convolve8_3by8; + scale->predict[0][1][4] = vp9_convolve8_avg; + scale->predict[0][1][5] = vp9_convolve8_5by8; + scale->predict[0][1][6] = vp9_convolve8_3qtr; + scale->predict[0][1][7] = vp9_convolve8_7by8; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_1by8; + scale->predict[1][0][2] = vp9_convolve8_qtr; + scale->predict[1][0][3] = vp9_convolve8_3by8; + scale->predict[1][0][4] = vp9_convolve8_avg; + scale->predict[1][0][5] = vp9_convolve8_5by8; + scale->predict[1][0][6] = vp9_convolve8_3qtr; + scale->predict[1][0][7] = vp9_convolve8_7by8; + } + } + // 2D subpel motion always gets filtered in both directions + scale->predict[1][1][0] = vp9_convolve8; + scale->predict[1][1][1] = vp9_convolve8_1by8; + scale->predict[1][1][2] = vp9_convolve8_qtr; + scale->predict[1][1][3] = vp9_convolve8_3by8; + scale->predict[1][1][4] = vp9_convolve8_avg; + scale->predict[1][1][5] = vp9_convolve8_5by8; + scale->predict[1][1][6] = vp9_convolve8_3qtr; + scale->predict[1][1][7] = vp9_convolve8_7by8; +} +#else + if (scale->x_step_q4 == 16) { + if (scale->y_step_q4 == 16) { + // No scaling in either direction. + scale->predict[0][0][0] = vp9_convolve_copy; + scale->predict[0][0][1] = vp9_convolve_avg; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. 
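+ // (The step sizes are q4: unity scale is step_q4 == 16, while e.g. a reference twice this frame's width gives x_step_q4 = 16 * 2 = 32, so the == 16 tests select the unscaled fast paths.)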
+ scale->predict[0][0][0] = vp9_convolve8_vert; + scale->predict[0][0][1] = vp9_convolve8_avg_vert; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } else { + if (scale->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + scale->predict[0][0][0] = vp9_convolve8_horiz; + scale->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + scale->predict[0][0][0] = vp9_convolve8; + scale->predict[0][0][1] = vp9_convolve8_avg; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions + scale->predict[1][1][0] = vp9_convolve8; + scale->predict[1][1][1] = vp9_convolve8_avg; +} +#endif + void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERPOLATIONFILTERTYPE mcomp_filter_type, VP9_COMMON *cm) { -#if CONFIG_ENABLE_6TAP - if (mcomp_filter_type == SIXTAP) { - xd->subpixel_predict4x4 = vp9_sixtap_predict4x4; - xd->subpixel_predict8x4 = vp9_sixtap_predict8x4; - xd->subpixel_predict8x8 = vp9_sixtap_predict8x8; - xd->subpixel_predict16x16 = vp9_sixtap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16; - } else { -#endif - if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16; - } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_smooth; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_smooth; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_smooth; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_smooth; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth; - } else if (mcomp_filter_type == EIGHTTAP_SHARP) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_sharp; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c; - } else { - xd->subpixel_predict4x4 = vp9_bilinear_predict4x4; - xd->subpixel_predict8x4 = vp9_bilinear_predict8x4; - xd->subpixel_predict8x8 = vp9_bilinear_predict8x8; - xd->subpixel_predict16x16 = vp9_bilinear_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4; - 
xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16; + int i; + + /* Calculate scaling factors for each of the 3 available references */ + for (i = 0; i < 3; ++i) { + if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) { + memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i])); + continue; + } + + vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i], + &cm->yv12_fb[cm->active_ref_idx[i]], + cm->width, cm->height); } -#if CONFIG_ENABLE_6TAP + + if (xd->mode_info_context) { + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + + set_scale_factors(xd, + mbmi->ref_frame - 1, + mbmi->second_ref_frame - 1, + cm->active_ref_scale); } + + + switch (mcomp_filter_type) { + case EIGHTTAP: + case SWITCHABLE: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; + break; + case EIGHTTAP_SMOOTH: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; + break; + case EIGHTTAP_SHARP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; + break; + case BILINEAR: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; + break; +#if CONFIG_ENABLE_6TAP + case SIXTAP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6; + break; #endif + } + assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } -void vp9_copy_mem16x16_c(uint8_t *src, +void vp9_copy_mem16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -93,10 +279,10 @@ void vp9_copy_mem16x16_c(uint8_t *src, dst[15] = src[15]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((uint32_t *)src)[3]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; + ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; + ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; #endif src += src_stride; @@ -104,25 +290,7 @@ void vp9_copy_mem16x16_c(uint8_t *src, } } -void vp9_avg_mem16x16_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { - int n; - - for (n = 0; n < 16; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(uint8_t *src, +void vp9_copy_mem8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -139,33 +307,15 @@ void vp9_copy_mem8x8_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } } -void vp9_avg_mem8x8_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { - int n; - - for (n = 0; n < 8; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(uint8_t *src, +void vp9_copy_mem8x4_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -182,243 +332,200 @@ void vp9_copy_mem8x4_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } 
} -void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { - int r; - uint8_t *ptr_base; - uint8_t *ptr; - uint8_t *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; +static void set_scaled_offsets(struct scale_factors *scale, + int row, int col) { + const int x_q4 = 16 * col; + const int y_q4 = 16 * row; - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - pred_ptr[0] = ptr[0]; - pred_ptr[1] = ptr[1]; - pred_ptr[2] = ptr[2]; - pred_ptr[3] = ptr[3]; -#else - *(uint32_t *)pred_ptr = *(uint32_t *)ptr; -#endif - pred_ptr += pitch; - ptr += d->pre_stride; - } - } + scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf; + scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf; } -/* - * Similar to vp9_build_inter_predictors_b(), but instead of storing the - * results in d->predictor, we average the contents of d->predictor (which - * come from an earlier call to vp9_build_inter_predictors_b()) with the - * predictor of the second reference frame / motion vector. - */ -void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf) { - int r; - uint8_t *ptr_base; - uint8_t *ptr; - uint8_t *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { - pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1; - pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1; - pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1; - pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1; - pred_ptr += pitch; - ptr += d->pre_stride; - } - } +static int32_t scale_motion_vector_component_q3(int mv_q3, + int num, + int den, + int offset_q4) { + // returns the scaled and offset value of the mv component. + const int32_t mv_q4 = mv_q3 << 1; + + /* TODO(jkoleszar): make fixed point, or as a second multiply? */ + return mv_q4 * num / den + offset_q4; } -void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { - uint8_t *ptr_base; - uint8_t *ptr; - uint8_t *pred_ptr = d->predictor; - int_mv mv; +static int32_t scale_motion_vector_component_q4(int mv_q4, + int num, + int den, + int offset_q4) { + // returns the scaled and offset value of the mv component. + + /* TODO(jkoleszar): make fixed point, or as a second multiply? 
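+ * (As written, this is a q4-domain affine map: mv_q4 * num / den rescales between frame sizes, then offset_q4 translates into the scaled frame.)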
*/ + return mv_q4 * num / den + offset_q4; +} - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); +static int_mv32 scale_motion_vector_q3_to_q4( + const int_mv *src_mv, + const struct scale_factors *scale) { + // returns mv * scale + offset + int_mv32 result; + + result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row, + scale->y_num, + scale->y_den, + scale->y_offset_q4); + result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col, + scale->x_num, + scale->x_den, + scale->x_offset_q4); + return result; +} - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q3, + const struct scale_factors *scale, + int w, int h, int weight, + const struct subpix_fn_table *subpix) { + int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale); + src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4); + scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight]( + src, src_stride, dst, dst_stride, + subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4, + subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4, + w, h); } -/* - * Similar to build_inter_predictors_4b(), but instead of storing the - * results in d->predictor, we average the contents of d->predictor (which - * come from an earlier call to build_inter_predictors_4b()) with the - * predictor of the second reference frame / motion vector. +/* Like vp9_build_inter_predictor, but takes the full-pel part of the + * mv separately, and the fractional part as a q4. 
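+ * The parts are recombined as ((fullpel >> 3) << 4) + (frac & 0xf) before scaling, so the scale factor and offset are applied once to the whole q4 vector.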
*/ -void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, - BLOCKD *d, int pitch) { - uint8_t *ptr_base; - uint8_t *ptr; - uint8_t *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } +void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *fullpel_mv_q3, + const int_mv *frac_mv_q4, + const struct scale_factors *scale, + int w, int h, int weight, + const struct subpix_fn_table *subpix) { + const int mv_row_q4 = ((fullpel_mv_q3->as_mv.row >> 3) << 4) + + (frac_mv_q4->as_mv.row & 0xf); + const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) << 4) + + (frac_mv_q4->as_mv.col & 0xf); + const int scaled_mv_row_q4 = + scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den, + scale->y_offset_q4); + const int scaled_mv_col_q4 = + scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den, + scale->x_offset_q4); + const int subpel_x = scaled_mv_col_q4 & 15; + const int subpel_y = scaled_mv_row_q4 & 15; + + src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4); + scale->predict[!!subpel_x][!!subpel_y][weight]( + src, src_stride, dst, dst_stride, + subpix->filter_x[subpel_x], scale->x_step_q4, + subpix->filter_y[subpel_y], scale->y_step_q4, + w, h); } -static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { - uint8_t *ptr_base; - uint8_t *ptr; - uint8_t *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); +static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1, + struct scale_factors *scale, + uint8_t *predictor, + int block_size, int stride, + int which_mv, int weight, + int width, int height, + const struct subpix_fn_table *subpix, + int row, int col) { + assert(d1->predictor - d0->predictor == block_size); + assert(d1->pre == d0->pre + block_size); + + set_scaled_offsets(&scale[which_mv], row, col); + + if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) { + uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre; + + vp9_build_inter_predictor(*base_pre + d0->pre, + d0->pre_stride, + predictor, stride, + &d0->bmi.as_mv[which_mv], + &scale[which_mv], + width, height, + weight, subpix); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); } else { - vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch); + uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre; + uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre; + + vp9_build_inter_predictor(*base_pre0 + d0->pre, + d0->pre_stride, + predictor, stride, + &d0->bmi.as_mv[which_mv], + &scale[which_mv], + width > block_size ? 
block_size : width, height, + weight, subpix); + + if (width <= block_size) return; + + set_scaled_offsets(&scale[which_mv], row, col + block_size); + + vp9_build_inter_predictor(*base_pre1 + d1->pre, + d1->pre_stride, + predictor + block_size, stride, + &d1->bmi.as_mv[which_mv], + &scale[which_mv], + width - block_size, height, + weight, subpix); } } -/*encoder only*/ -void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { - int i, j; - BLOCKD *blockd = xd->block; - - /* build uv mvs */ - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - int yoffset = i * 8 + j * 2; - int uoffset = 16 + i * 2 + j; - int voffset = 20 + i * 2 + j; - int temp; - - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row; +static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1, + struct scale_factors *scale, + int block_size, int stride, + int which_mv, int weight, + const struct subpix_fn_table *subpix, + int row, int col) { + assert(d1->predictor - d0->predictor == block_size); + assert(d1->pre == d0->pre + block_size); - if (temp < 0) temp -= 4; - else temp += 4; + set_scaled_offsets(&scale[which_mv], row, col); - xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col; - - if (temp < 0) temp -= 4; - else temp += 4; - - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } + if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) { + uint8_t **base_pre = which_mv ? 
d0->base_second_pre : d0->base_pre; - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & - xd->fullpixel_mask; + vp9_build_inter_predictor(*base_pre + d0->pre, + d0->pre_stride, + d0->predictor, stride, + &d0->bmi.as_mv[which_mv], + &scale[which_mv], + 2 * block_size, block_size, + weight, subpix); - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; - } - } - } - - for (i = 16; i < 24; i += 2) { - BLOCKD *d0 = &blockd[i]; - BLOCKD *d1 = &blockd[i + 1]; - - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 8); - else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); - } - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); - } + } else { + uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre; + uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre; + + vp9_build_inter_predictor(*base_pre0 + d0->pre, + d0->pre_stride, + d0->predictor, stride, + &d0->bmi.as_mv[which_mv], + &scale[which_mv], + block_size, block_size, + weight, subpix); + + set_scaled_offsets(&scale[which_mv], row, col + block_size); + + vp9_build_inter_predictor(*base_pre1 + d1->pre, + d1->pre_stride, + d1->predictor, stride, + &d1->bmi.as_mv[which_mv], + &scale[which_mv], + block_size, block_size, + weight, subpix); } } @@ -458,102 +565,538 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row; } -/*encoder only*/ -void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, - uint8_t *dst_y, - int dst_ystride, - int clamp_mvs) { - uint8_t *ptr_base = xd->pre.y_buffer; - uint8_t *ptr; - int pre_stride = xd->block[0].pre_stride; - int_mv ymv; +#define AVERAGE_WEIGHT (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT)) - ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT - if (clamp_mvs) - clamp_mv_to_umv_border(&ymv.as_mv, xd); +// Whether to use implicit weighting for UV +#define USE_IMPLICIT_WEIGHT_UV - ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3); +// Whether to use implicit weighting for SplitMV +// #define USE_IMPLICIT_WEIGHT_SPLITMV - if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { - xd->subpixel_predict16x16(ptr, pre_stride, - (ymv.as_mv.col & 7) << 1, - (ymv.as_mv.row & 7) << 1, - dst_y, dst_ystride); - } else { - vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); +// #define SEARCH_MIN3 +static int64_t get_consistency_metric(MACROBLOCKD *xd, + uint8_t *tmp_y, int tmp_ystride) { + int block_size = 16 << xd->mode_info_context->mbmi.sb_type; + uint8_t *rec_y = xd->dst.y_buffer; + int rec_ystride = xd->dst.y_stride; + int64_t metric = 0; + int i; + if (xd->up_available) { + for (i = 0; i < block_size; ++i) { + int diff = abs(*(rec_y - rec_ystride + i) - + *(tmp_y + i)); +#ifdef SEARCH_MIN3 + // 
Searches for the min abs diff among 3 pixel neighbors in the border + int diff1 = xd->left_available ? + abs(*(rec_y - rec_ystride + i - 1) - *(tmp_y + i)) : diff; + int diff2 = i < block_size - 1 ? + abs(*(rec_y - rec_ystride + i + 1) - *(tmp_y + i)) : diff; + diff = diff <= diff1 ? diff : diff1; + diff = diff <= diff2 ? diff : diff2; +#endif + metric += diff; } + } + if (xd->left_available) { + for (i = 0; i < block_size; ++i) { + int diff = abs(*(rec_y - 1 + i * rec_ystride) - + *(tmp_y + i * tmp_ystride)); +#ifdef SEARCH_MIN3 + // Searches for the min abs diff among 3 pixel neighbors in the border + int diff1 = xd->up_available ? + abs(*(rec_y - 1 + (i - 1) * rec_ystride) - + *(tmp_y + i * tmp_ystride)) : diff; + int diff2 = i < block_size - 1 ? + abs(*(rec_y - 1 + (i + 1) * rec_ystride) - + *(tmp_y + i * tmp_ystride)) : diff; + diff = diff <= diff1 ? diff : diff1; + diff = diff <= diff2 ? diff : diff2; +#endif + metric += diff; + } + } + return metric; } -void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_uvstride) { - int offset; - uint8_t *uptr, *vptr; - int pre_stride = xd->block[0].pre_stride; - int_mv _o16x16mv; - int_mv _16x16mv; - - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; - - if (xd->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); - - _o16x16mv = _16x16mv; - /* calc uv motion vectors */ - if (_16x16mv.as_mv.row < 0) - _16x16mv.as_mv.row -= 1; - else - _16x16mv.as_mv.row += 1; - - if (_16x16mv.as_mv.col < 0) - _16x16mv.as_mv.col -= 1; - else - _16x16mv.as_mv.col += 1; - - _16x16mv.as_mv.row /= 2; - _16x16mv.as_mv.col /= 2; - - _16x16mv.as_mv.row &= xd->fullpixel_mask; - _16x16mv.as_mv.col &= xd->fullpixel_mask; - - pre_stride >>= 1; - offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); - uptr = xd->pre.u_buffer + offset; - vptr = xd->pre.v_buffer + offset; - - if (_o16x16mv.as_int & 0x000f000f) { - xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride); - xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride); - } else { - vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); +static int get_weight(MACROBLOCKD *xd, int64_t metric_1, int64_t metric_2) { + int weight = AVERAGE_WEIGHT; + if (2 * metric_1 < metric_2) + weight = 6; + else if (4 * metric_1 < 3 * metric_2) + weight = 5; + else if (2 * metric_2 < metric_1) + weight = 2; + else if (4 * metric_2 < 3 * metric_1) + weight = 3; + return weight; +} + +#ifdef USE_IMPLICIT_WEIGHT_SPLITMV +static int get_implicit_compoundinter_weight_splitmv( + MACROBLOCKD *xd, int mb_row, int mb_col) { + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + BLOCKD *blockd = xd->block; + const int use_second_ref = mbmi->second_ref_frame > 0; + int64_t metric_2 = 0, metric_1 = 0; + int i, which_mv, weight; + uint8_t tmp_y[256]; + const int tmp_ystride = 16; + + if (!use_second_ref) return 0; + if (!(xd->up_available || xd->left_available)) + return AVERAGE_WEIGHT; + + assert(xd->mode_info_context->mbmi.mode == SPLITMV); + + which_mv = 1; // second predictor + if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { + for (i = 0; i < 16; i += 8) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 2]; + const int y = i & 8; + + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 2].bmi = 
xd->mode_info_context->bmi[i + 2]; + + if (mbmi->need_to_clamp_mvs) { + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd); + } + if (i == 0) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16, + which_mv, 0, 16, 1, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16, + which_mv, 0, 1, 8, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + } else { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16, + 8, 16, which_mv, 0, 1, 8, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + } } + } else { + for (i = 0; i < 16; i += 2) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + const int x = (i & 3) * 4; + const int y = (i >> 2) * 4; + + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; + + if (i >= 4 && (i & 3) != 0) continue; + + if (i == 0) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16, + which_mv, 0, 8, 1, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16, + which_mv, 0, 1, 4, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } else if (i < 4) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16, + which_mv, 0, 8, 1, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } else { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16, + 4, 16, which_mv, 0, 1, 4, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } + } + } + metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride); + + which_mv = 0; // first predictor + if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { + for (i = 0; i < 16; i += 8) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 2]; + const int y = i & 8; + + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2]; + + if (mbmi->need_to_clamp_mvs) { + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd); + } + if (i == 0) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16, + which_mv, 0, 16, 1, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16, + which_mv, 0, 1, 8, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + } else { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16, + 8, 16, which_mv, 0, 1, 8, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + } + } + } else { + for (i = 0; i < 16; i += 2) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + const int x = (i & 3) * 4; + const int y = (i >> 2) * 4; + + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; + + if (i >= 4 && (i & 3) != 0) continue; + + if (i == 0) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16, + which_mv, 0, 8, 1, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16, + which_mv, 0, 1, 4, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } else if (i < 4) { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16, + which_mv, 0, 8, 1, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } else { + build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16, + 4, 
16, which_mv, 0, 1, 4, &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); + } + } + } + metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride); + + // Choose final weight for averaging + weight = get_weight(xd, metric_1, metric_2); + return weight; } +#endif + +static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, + int mb_row, + int mb_col) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + int64_t metric_2 = 0, metric_1 = 0; + int n, clamp_mvs, pre_stride; + uint8_t *base_pre; + int_mv ymv; + uint8_t tmp_y[4096]; + const int tmp_ystride = 64; + int weight; + int edge[4]; + int block_size = 16 << xd->mode_info_context->mbmi.sb_type; + + if (!use_second_ref) return 0; + if (!(xd->up_available || xd->left_available)) + return AVERAGE_WEIGHT; + + edge[0] = xd->mb_to_top_edge; + edge[1] = xd->mb_to_bottom_edge; + edge[2] = xd->mb_to_left_edge; + edge[3] = xd->mb_to_right_edge; + + clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_secondmv; + base_pre = xd->second_pre.y_buffer; + pre_stride = xd->second_pre.y_stride; + ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; + // First generate the second predictor + for (n = 0; n < block_size; n += 16) { + xd->mb_to_left_edge = edge[2] - (n << 3); + xd->mb_to_right_edge = edge[3] + ((16 - n) << 3); + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + set_scaled_offsets(&xd->scale_factor[1], mb_row * 16, mb_col * 16 + n); + // predict a single row of pixels + vp9_build_inter_predictor( + base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[1]), + pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[1], + 16, 1, 0, &xd->subpix); + } + xd->mb_to_left_edge = edge[2]; + xd->mb_to_right_edge = edge[3]; + for (n = 0; n < block_size; n += 16) { + xd->mb_to_top_edge = edge[0] - (n << 3); + xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3); + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + set_scaled_offsets(&xd->scale_factor[1], mb_row * 16 + n, mb_col * 16); + // predict a single col of pixels + vp9_build_inter_predictor( + base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[1]), + pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv, + &xd->scale_factor[1], 1, 16, 0, &xd->subpix); + } + xd->mb_to_top_edge = edge[0]; + xd->mb_to_bottom_edge = edge[1]; + // Compute consistency metric + metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride); + + clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_mvs; + base_pre = xd->pre.y_buffer; + pre_stride = xd->pre.y_stride; + ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; + // Now generate the first predictor + for (n = 0; n < block_size; n += 16) { + xd->mb_to_left_edge = edge[2] - (n << 3); + xd->mb_to_right_edge = edge[3] + ((16 - n) << 3); + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + set_scaled_offsets(&xd->scale_factor[0], mb_row * 16, mb_col * 16 + n); + // predict a single row of pixels + vp9_build_inter_predictor( + base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[0]), + pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[0], + 16, 1, 0, &xd->subpix); + } + xd->mb_to_left_edge = edge[2]; + xd->mb_to_right_edge = edge[3]; + for (n = 0; n < block_size; n += 16) { + xd->mb_to_top_edge = edge[0] - (n << 3); + xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3); + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + set_scaled_offsets(&xd->scale_factor[0], mb_row * 16 + n, mb_col * 16); + // predict a single col of pixels + 
vp9_build_inter_predictor( + base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[0]), + pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv, + &xd->scale_factor[0], 1, 16, 0, &xd->subpix); + } + xd->mb_to_top_edge = edge[0]; + xd->mb_to_bottom_edge = edge[1]; + metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride); + // Choose final weight for averaging + weight = get_weight(xd, metric_1, metric_2); + return weight; +} -void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, int dst_uvstride) { - vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride, - xd->mode_info_context->mbmi.need_to_clamp_mvs); - vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride); +static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd, + uint8_t *dst_y, + int dst_ystride, + int weight, + int mb_row, + int mb_col) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; + + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + const int clamp_mvs = which_mv ? + xd->mode_info_context->mbmi.need_to_clamp_secondmv : + xd->mode_info_context->mbmi.need_to_clamp_mvs; + + uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer; + int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride; + int_mv ymv; + ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; + + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + + set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16); + + vp9_build_inter_predictor(base_pre, pre_stride, + dst_y, dst_ystride, + &ymv, &xd->scale_factor[which_mv], + 16, 16, which_mv ? weight : 0, &xd->subpix); + } } -void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride) { - uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; - uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer, - *v2 = x->second_pre.v_buffer; +void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col); + + build_inter16x16_predictors_mby_w(xd, dst_y, dst_ystride, weight, + mb_row, mb_col); +} + +#else + +void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; + + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + const int clamp_mvs = which_mv ? + xd->mode_info_context->mbmi.need_to_clamp_secondmv : + xd->mode_info_context->mbmi.need_to_clamp_mvs; + + uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer; + int pre_stride = which_mv ? 
xd->second_pre.y_stride : xd->pre.y_stride; + int_mv ymv; + ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; + + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + + set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16); + + vp9_build_inter_predictor(base_pre, pre_stride, + dst_y, dst_ystride, + &ymv, &xd->scale_factor[which_mv], + 16, 16, which_mv, &xd->subpix); + } +} +#endif + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int weight, + int mb_row, + int mb_col) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; + + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + const int clamp_mvs = + which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv + : xd->mode_info_context->mbmi.need_to_clamp_mvs; + uint8_t *uptr, *vptr; + int pre_stride = which_mv ? xd->second_pre.uv_stride + : xd->pre.uv_stride; + int_mv _o16x16mv; + int_mv _16x16mv; + + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; + + if (clamp_mvs) + clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + + _o16x16mv = _16x16mv; + /* calc uv motion vectors */ + if (_16x16mv.as_mv.row < 0) + _16x16mv.as_mv.row -= 1; + else + _16x16mv.as_mv.row += 1; + + if (_16x16mv.as_mv.col < 0) + _16x16mv.as_mv.col -= 1; + else + _16x16mv.as_mv.col += 1; + + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + + _16x16mv.as_mv.row &= xd->fullpixel_mask; + _16x16mv.as_mv.col &= xd->fullpixel_mask; + + uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer); + vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer); + + set_scaled_offsets(&xd->scale_factor_uv[which_mv], + mb_row * 16, mb_col * 16); + + vp9_build_inter_predictor_q4( + uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv, + &xd->scale_factor_uv[which_mv], 8, 8, + which_mv ? weight : 0, &xd->subpix); + + vp9_build_inter_predictor_q4( + vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv, + &xd->scale_factor_uv[which_mv], 8, 8, + which_mv ? weight : 0, &xd->subpix); + } +} + +void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col) { +#ifdef USE_IMPLICIT_WEIGHT_UV + int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col); +#else + int weight = AVERAGE_WEIGHT; +#endif + build_inter16x16_predictors_mbuv_w(xd, dst_u, dst_v, dst_uvstride, + weight, mb_row, mb_col); +} + +#else + +void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; + + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + const int clamp_mvs = + which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv + : xd->mode_info_context->mbmi.need_to_clamp_mvs; + uint8_t *uptr, *vptr; + int pre_stride = which_mv ? 
xd->second_pre.uv_stride + : xd->pre.uv_stride; + int_mv _o16x16mv; + int_mv _16x16mv; + + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; + + if (clamp_mvs) + clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + + _o16x16mv = _16x16mv; + /* calc uv motion vectors */ + if (_16x16mv.as_mv.row < 0) + _16x16mv.as_mv.row -= 1; + else + _16x16mv.as_mv.row += 1; + + if (_16x16mv.as_mv.col < 0) + _16x16mv.as_mv.col -= 1; + else + _16x16mv.as_mv.col += 1; + + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + + _16x16mv.as_mv.row &= xd->fullpixel_mask; + _16x16mv.as_mv.col &= xd->fullpixel_mask; + + uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer); + vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer); + + set_scaled_offsets(&xd->scale_factor_uv[which_mv], + mb_row * 16, mb_col * 16); + + vp9_build_inter_predictor_q4( + uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv, + &xd->scale_factor_uv[which_mv], 8, 8, + which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix); + + vp9_build_inter_predictor_q4( + vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv, + &xd->scale_factor_uv[which_mv], 8, 8, + which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix); + } +} +#endif + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +static void build_inter32x32_predictors_sby_w(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int weight, + int mb_row, + int mb_col) { + uint8_t *y1 = x->pre.y_buffer; + uint8_t *y2 = x->second_pre.y_buffer; int edge[4], n; edge[0] = x->mb_to_top_edge; @@ -569,43 +1112,246 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3); x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3); - x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16; - x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; - x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; - - vp9_build_1st_inter16x16_predictors_mb(x, - dst_y + y_idx * 16 * dst_ystride + x_idx * 16, - dst_u + y_idx * 8 * dst_uvstride + x_idx * 8, - dst_v + y_idx * 8 * dst_uvstride + x_idx * 8, - dst_ystride, dst_uvstride); + x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->pre.y_stride, + &x->scale_factor[0]); if (x->mode_info_context->mbmi.second_ref_frame > 0) { - x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16; - x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; - x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8; + x->second_pre.y_buffer = y2 + + scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->second_pre.y_stride, + &x->scale_factor[1]); + } + build_inter16x16_predictors_mby_w(x, + dst_y + y_idx * 16 * dst_ystride + x_idx * 16, + dst_ystride, weight, mb_row + y_idx, mb_col + x_idx); + } + x->mb_to_top_edge = edge[0]; + x->mb_to_bottom_edge = edge[1]; + x->mb_to_left_edge = edge[2]; + x->mb_to_right_edge = edge[3]; + + x->pre.y_buffer = y1; + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.y_buffer = y2; + } +} + +void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col); + build_inter32x32_predictors_sby_w(x, dst_y, dst_ystride, weight, + mb_row, mb_col); +} + +#else + +// TODO(all): Can we use 32x32 specific implementations of this rather than +// using 16x16 implementations ? 
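The weight picked by get_weight() above depends only on the ratio of the two border-consistency metrics: whichever predictor agrees better with the already-reconstructed top/left border earns the larger share of the blend, with 2 and 6 as the extremes around AVERAGE_WEIGHT. A minimal standalone sketch of that band mapping follows (the unused MACROBLOCKD argument is dropped, the value 4 for AVERAGE_WEIGHT is our assumption rather than something stated in this patch, and the sample metric pairs are purely illustrative):

/* Sketch of the get_weight() band mapping, assuming AVERAGE_WEIGHT == 4;
 * the real constant is defined elsewhere in vp9_reconinter.c. */
#include <stdint.h>
#include <stdio.h>

#define AVERAGE_WEIGHT 4  /* assumed midpoint of the 8-unit blend scale */

static int get_weight_sketch(int64_t metric_1, int64_t metric_2) {
  int weight = AVERAGE_WEIGHT;
  if (2 * metric_1 < metric_2)           /* metric_1 < 0.50 * metric_2 */
    weight = 6;
  else if (4 * metric_1 < 3 * metric_2)  /* metric_1 < 0.75 * metric_2 */
    weight = 5;
  else if (2 * metric_2 < metric_1)      /* metric_2 < 0.50 * metric_1 */
    weight = 2;
  else if (4 * metric_2 < 3 * metric_1)  /* metric_2 < 0.75 * metric_1 */
    weight = 3;
  return weight;
}

int main(void) {
  /* Illustrative metric pairs: expect weights 6, 5, 4, 3, 2 in order. */
  static const int64_t pairs[5][2] = {
    {100, 300}, {200, 300}, {300, 300}, {300, 200}, {300, 100}
  };
  int i;
  for (i = 0; i < 5; i++)
    printf("metric_1=%lld metric_2=%lld -> weight %d\n",
           (long long)pairs[i][0], (long long)pairs[i][1],
           get_weight_sketch(pairs[i][0], pairs[i][1]));
  return 0;
}

Cross-multiplying keeps the ratio comparison exact in integer arithmetic, so neither metric ever needs to be divided.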
+void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + uint8_t *y1 = x->pre.y_buffer; + uint8_t *y2 = x->second_pre.y_buffer; + int edge[4], n; + + edge[0] = x->mb_to_top_edge; + edge[1] = x->mb_to_bottom_edge; + edge[2] = x->mb_to_left_edge; + edge[3] = x->mb_to_right_edge; - vp9_build_2nd_inter16x16_predictors_mb(x, + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + x->mb_to_top_edge = edge[0] - ((y_idx * 16) << 3); + x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3); + x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3); + x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3); + + x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->pre.y_stride, + &x->scale_factor[0]); + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.y_buffer = y2 + + scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->second_pre.y_stride, + &x->scale_factor[1]); + } + vp9_build_inter16x16_predictors_mby(x, dst_y + y_idx * 16 * dst_ystride + x_idx * 16, + dst_ystride, mb_row + y_idx, mb_col + x_idx); + } + x->mb_to_top_edge = edge[0]; + x->mb_to_bottom_edge = edge[1]; + x->mb_to_left_edge = edge[2]; + x->mb_to_right_edge = edge[3]; + + x->pre.y_buffer = y1; + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.y_buffer = y2; + } +} + +#endif + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +static void build_inter32x32_predictors_sbuv_w(MACROBLOCKD *x, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int weight, + int mb_row, + int mb_col) { + uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; + uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer; + int edge[4], n; + + edge[0] = x->mb_to_top_edge; + edge[1] = x->mb_to_bottom_edge; + edge[2] = x->mb_to_left_edge; + edge[3] = x->mb_to_right_edge; + + for (n = 0; n < 4; n++) { + int scaled_uv_offset; + const int x_idx = n & 1, y_idx = n >> 1; + + x->mb_to_top_edge = edge[0] - ((y_idx * 16) << 3); + x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3); + x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3); + x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3); + + scaled_uv_offset = scaled_buffer_offset(x_idx * 8, + y_idx * 8, + x->pre.uv_stride, + &x->scale_factor_uv[0]); + x->pre.u_buffer = u1 + scaled_uv_offset; + x->pre.v_buffer = v1 + scaled_uv_offset; + + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + scaled_uv_offset = scaled_buffer_offset(x_idx * 8, + y_idx * 8, + x->second_pre.uv_stride, + &x->scale_factor_uv[1]); + x->second_pre.u_buffer = u2 + scaled_uv_offset; + x->second_pre.v_buffer = v2 + scaled_uv_offset; + } + + build_inter16x16_predictors_mbuv_w(x, dst_u + y_idx * 8 * dst_uvstride + x_idx * 8, dst_v + y_idx * 8 * dst_uvstride + x_idx * 8, - dst_ystride, dst_uvstride); - } + dst_uvstride, weight, mb_row + y_idx, mb_col + x_idx); } + x->mb_to_top_edge = edge[0]; + x->mb_to_bottom_edge = edge[1]; + x->mb_to_left_edge = edge[2]; + x->mb_to_right_edge = edge[3]; + + x->pre.u_buffer = u1; + x->pre.v_buffer = v1; + + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.u_buffer = u2; + x->second_pre.v_buffer = v2; + } +} + +void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *xd, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col) { +#ifdef USE_IMPLICIT_WEIGHT_UV + int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col); +#else + int weight = AVERAGE_WEIGHT; +#endif + 
build_inter32x32_predictors_sbuv_w(xd, dst_u, dst_v, dst_uvstride, + weight, mb_row, mb_col); +} + +#else + +void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *x, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col) { + uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; + uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer; + int edge[4], n; + + edge[0] = x->mb_to_top_edge; + edge[1] = x->mb_to_bottom_edge; + edge[2] = x->mb_to_left_edge; + edge[3] = x->mb_to_right_edge; + + for (n = 0; n < 4; n++) { + int scaled_uv_offset; + const int x_idx = n & 1, y_idx = n >> 1; + + x->mb_to_top_edge = edge[0] - ((y_idx * 16) << 3); + x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3); + x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3); + x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3); + + scaled_uv_offset = scaled_buffer_offset(x_idx * 8, + y_idx * 8, + x->pre.uv_stride, + &x->scale_factor_uv[0]); + x->pre.u_buffer = u1 + scaled_uv_offset; + x->pre.v_buffer = v1 + scaled_uv_offset; + + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + scaled_uv_offset = scaled_buffer_offset(x_idx * 8, + y_idx * 8, + x->second_pre.uv_stride, + &x->scale_factor_uv[1]); + x->second_pre.u_buffer = u2 + scaled_uv_offset; + x->second_pre.v_buffer = v2 + scaled_uv_offset; + } + vp9_build_inter16x16_predictors_mbuv(x, + dst_u + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_v + y_idx * 8 * dst_uvstride + x_idx * 8, + dst_uvstride, mb_row + y_idx, mb_col + x_idx); + } x->mb_to_top_edge = edge[0]; x->mb_to_bottom_edge = edge[1]; x->mb_to_left_edge = edge[2]; x->mb_to_right_edge = edge[3]; - x->pre.y_buffer = y1; x->pre.u_buffer = u1; x->pre.v_buffer = v1; if (x->mode_info_context->mbmi.second_ref_frame > 0) { - x->second_pre.y_buffer = y2; x->second_pre.u_buffer = u2; x->second_pre.v_buffer = v2; } +} +#endif +void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col) { + vp9_build_inter32x32_predictors_sby(x, dst_y, dst_ystride, + mb_row, mb_col); + vp9_build_inter32x32_predictors_sbuv(x, dst_u, dst_v, dst_uvstride, + mb_row, mb_col); #if CONFIG_COMP_INTERINTRA_PRED if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { vp9_build_interintra_32x32_predictors_sb( @@ -614,15 +1360,15 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, #endif } -void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride) { - uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; - uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer, - *v2 = x->second_pre.v_buffer; +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +static void build_inter64x64_predictors_sby_w(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int weight, + int mb_row, + int mb_col) { + uint8_t *y1 = x->pre.y_buffer; + uint8_t *y2 = x->second_pre.y_buffer; int edge[4], n; edge[0] = x->mb_to_top_edge; @@ -638,21 +1384,22 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, x->mb_to_left_edge = edge[2] - ((x_idx * 32) << 3); x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 32) << 3); - x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride + x_idx * 32; - x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16; - x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16; + x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32, + 
y_idx * 32, + x->pre.y_stride, + &x->scale_factor[0]); if (x->mode_info_context->mbmi.second_ref_frame > 0) { - x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride + x_idx * 32; - x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16; - x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16; + x->second_pre.y_buffer = y2 + + scaled_buffer_offset(x_idx * 32, + y_idx * 32, + x->second_pre.y_stride, + &x->scale_factor[1]); } - vp9_build_inter32x32_predictors_sb(x, + build_inter32x32_predictors_sby_w(x, dst_y + y_idx * 32 * dst_ystride + x_idx * 32, - dst_u + y_idx * 16 * dst_uvstride + x_idx * 16, - dst_v + y_idx * 16 * dst_uvstride + x_idx * 16, - dst_ystride, dst_uvstride); + dst_ystride, weight, mb_row + y_idx * 2, mb_col + x_idx * 2); } x->mb_to_top_edge = edge[0]; @@ -661,324 +1408,392 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, x->mb_to_right_edge = edge[3]; x->pre.y_buffer = y1; - x->pre.u_buffer = u1; - x->pre.v_buffer = v1; if (x->mode_info_context->mbmi.second_ref_frame > 0) { x->second_pre.y_buffer = y2; - x->second_pre.u_buffer = u2; - x->second_pre.v_buffer = v2; } +} -#if CONFIG_COMP_INTERINTRA_PRED - if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { - vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v, - dst_ystride, dst_uvstride); - } -#endif +void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col); + build_inter64x64_predictors_sby_w(x, dst_y, dst_ystride, weight, + mb_row, mb_col); } -/* - * The following functions should be called after an initial - * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv(). - * It will run a second filter on a (different) ref - * frame and average the result with the output of the - * first filter. The second reference frame is stored - * in x->second_pre (the reference frame index is in - * x->mode_info_context->mbmi.second_ref_frame). The second - * motion vector is x->mode_info_context->mbmi.second_mv. - * - * This allows blending prediction from two reference frames - * which sometimes leads to better prediction than from a - * single reference framer. 
- */ -void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, - uint8_t *dst_y, - int dst_ystride) { - uint8_t *ptr; +#else - int_mv _16x16mv; - int mv_row; - int mv_col; +void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col) { + uint8_t *y1 = x->pre.y_buffer; + uint8_t *y2 = x->second_pre.y_buffer; + int edge[4], n; - uint8_t *ptr_base = xd->second_pre.y_buffer; - int pre_stride = xd->block[0].pre_stride; + edge[0] = x->mb_to_top_edge; + edge[1] = x->mb_to_bottom_edge; + edge[2] = x->mb_to_left_edge; + edge[3] = x->mb_to_right_edge; - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; - if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + x->mb_to_top_edge = edge[0] - ((y_idx * 32) << 3); + x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3); + x->mb_to_left_edge = edge[2] - ((x_idx * 32) << 3); + x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 32) << 3); - mv_row = _16x16mv.as_mv.row; - mv_col = _16x16mv.as_mv.col; + x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32, + y_idx * 32, + x->pre.y_stride, + &x->scale_factor[0]); - ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.y_buffer = y2 + + scaled_buffer_offset(x_idx * 32, + y_idx * 32, + x->second_pre.y_stride, + &x->scale_factor[1]); + } - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1, - (mv_row & 7) << 1, dst_y, dst_ystride); - } else { - vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + vp9_build_inter32x32_predictors_sby(x, + dst_y + y_idx * 32 * dst_ystride + x_idx * 32, + dst_ystride, mb_row + y_idx * 2, mb_col + x_idx * 2); + } + + x->mb_to_top_edge = edge[0]; + x->mb_to_bottom_edge = edge[1]; + x->mb_to_left_edge = edge[2]; + x->mb_to_right_edge = edge[3]; + + x->pre.y_buffer = y1; + + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.y_buffer = y2; } } +#endif -void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_uvstride) { - int offset; - uint8_t *uptr, *vptr; +void vp9_build_inter64x64_predictors_sbuv(MACROBLOCKD *x, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col) { + uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer; + uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer; + int edge[4], n; - int_mv _16x16mv; - int mv_row; - int mv_col; - int omv_row, omv_col; + edge[0] = x->mb_to_top_edge; + edge[1] = x->mb_to_bottom_edge; + edge[2] = x->mb_to_left_edge; + edge[3] = x->mb_to_right_edge; - int pre_stride = xd->block[0].pre_stride; + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + int scaled_uv_offset; - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; + x->mb_to_top_edge = edge[0] - ((y_idx * 32) << 3); + x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3); + x->mb_to_left_edge = edge[2] - ((x_idx * 32) << 3); + x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 32) << 3); - if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + scaled_uv_offset = scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->pre.uv_stride, + &x->scale_factor_uv[0]); + x->pre.u_buffer = u1 + scaled_uv_offset; + x->pre.v_buffer = v1 + scaled_uv_offset; - mv_row = _16x16mv.as_mv.row; - 
mv_col = _16x16mv.as_mv.col; + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + scaled_uv_offset = scaled_buffer_offset(x_idx * 16, + y_idx * 16, + x->second_pre.uv_stride, + &x->scale_factor_uv[1]); + x->second_pre.u_buffer = u2 + scaled_uv_offset; + x->second_pre.v_buffer = v2 + scaled_uv_offset; + } - /* calc uv motion vectors */ - omv_row = mv_row; - omv_col = mv_col; - mv_row = (mv_row + (mv_row > 0)) >> 1; - mv_col = (mv_col + (mv_col > 0)) >> 1; + vp9_build_inter32x32_predictors_sbuv(x, + dst_u + y_idx * 16 * dst_uvstride + x_idx * 16, + dst_v + y_idx * 16 * dst_uvstride + x_idx * 16, + dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2); + } - mv_row &= xd->fullpixel_mask; - mv_col &= xd->fullpixel_mask; + x->mb_to_top_edge = edge[0]; + x->mb_to_bottom_edge = edge[1]; + x->mb_to_left_edge = edge[2]; + x->mb_to_right_edge = edge[3]; - pre_stride >>= 1; - offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); - uptr = xd->second_pre.u_buffer + offset; - vptr = xd->second_pre.v_buffer + offset; + x->pre.u_buffer = u1; + x->pre.v_buffer = v1; - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15, - omv_row & 15, dst_u, dst_uvstride); - xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15, - omv_row & 15, dst_v, dst_uvstride); - } else { - vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } + if (x->mode_info_context->mbmi.second_ref_frame > 0) { + x->second_pre.u_buffer = u2; + x->second_pre.v_buffer = v2; + } } -void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride) { - vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride); - vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride); +void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col) { + vp9_build_inter64x64_predictors_sby(x, dst_y, dst_ystride, + mb_row, mb_col); + vp9_build_inter64x64_predictors_sbuv(x, dst_u, dst_v, dst_uvstride, + mb_row, mb_col); +#if CONFIG_COMP_INTERINTRA_PRED + if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { + vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v, + dst_ystride, dst_uvstride); + } +#endif } -static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { +static void build_inter4x4_predictors_mb(MACROBLOCKD *xd, + int mb_row, int mb_col) { int i; MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; BLOCKD *blockd = xd->block; + int which_mv = 0; + const int use_second_ref = mbmi->second_ref_frame > 0; +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && defined(USE_IMPLICIT_WEIGHT_SPLITMV) + int weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col); +#else + int weight = AVERAGE_WEIGHT; +#endif if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { - blockd[ 0].bmi = xd->mode_info_context->bmi[ 0]; - blockd[ 2].bmi = xd->mode_info_context->bmi[ 2]; - blockd[ 8].bmi = xd->mode_info_context->bmi[ 8]; - blockd[10].bmi = xd->mode_info_context->bmi[10]; - - if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd); - if (mbmi->second_ref_frame > 0) { - 
clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd); - } - } + for (i = 0; i < 16; i += 8) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 2]; + const int y = i & 8; + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2]; - vp9_build_inter_predictors4b(xd, &blockd[ 0], 16); - vp9_build_inter_predictors4b(xd, &blockd[ 2], 16); - vp9_build_inter_predictors4b(xd, &blockd[ 8], 16); - vp9_build_inter_predictors4b(xd, &blockd[10], 16); + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + if (mbmi->need_to_clamp_mvs) { + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd); + } - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16); + build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16, which_mv, + which_mv ? weight : 0, + &xd->subpix, mb_row * 16 + y, mb_col * 16); + } } } else { for (i = 0; i < 16; i += 2) { BLOCKD *d0 = &blockd[i]; BLOCKD *d1 = &blockd[i + 1]; + const int x = (i & 3) * 4; + const int y = (i >> 2) * 4; blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; - if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd); - if (mbmi->second_ref_frame > 0) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd); - } - } - - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 16); - else { - vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4); - } - - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4); + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16, which_mv, + which_mv ? weight : 0, + &xd->subpix, + mb_row * 16 + y, mb_col * 16 + x); } } } - +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +#if !defined(USE_IMPLICIT_WEIGHT_UV) + weight = AVERAGE_WEIGHT; +#endif +#endif for (i = 16; i < 24; i += 2) { BLOCKD *d0 = &blockd[i]; BLOCKD *d1 = &blockd[i + 1]; + const int x = 4 * (i & 1); + const int y = ((i - 16) >> 1) * 4; - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 8); - else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); - } - - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv, + which_mv ? 
weight : 0, &xd->subpix, + mb_row * 8 + y, mb_col * 8 + x); } } } -static -void build_4x4uvmvs(MACROBLOCKD *xd) { - int i, j; - BLOCKD *blockd = xd->block; +static INLINE int round_mv_comp(int value) { + return (value < 0 ? value - 4 : value + 4) / 8; +} - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - int yoffset = i * 8 + j * 2; - int uoffset = 16 + i * 2 + j; - int voffset = 20 + i * 2 + j; +static int mi_mv_pred_row(MACROBLOCKD *mb, int off, int idx) { + const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row; + return round_mv_comp(temp) & mb->fullpixel_mask; +} - int temp; +static int mi_mv_pred_col(MACROBLOCKD *mb, int off, int idx) { + const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col; + return round_mv_comp(temp) & mb->fullpixel_mask; +} - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row; +static int b_mv_pred_row(MACROBLOCKD *mb, int off, int idx) { + BLOCKD *const blockd = mb->block; + const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.row + + blockd[off + 1].bmi.as_mv[idx].as_mv.row + + blockd[off + 4].bmi.as_mv[idx].as_mv.row + + blockd[off + 5].bmi.as_mv[idx].as_mv.row; + return round_mv_comp(temp) & mb->fullpixel_mask; +} - if (temp < 0) temp -= 4; - else temp += 4; +static int b_mv_pred_col(MACROBLOCKD *mb, int off, int idx) { + BLOCKD *const blockd = mb->block; + const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.col + + blockd[off + 1].bmi.as_mv[idx].as_mv.col + + blockd[off + 4].bmi.as_mv[idx].as_mv.col + + blockd[off + 5].bmi.as_mv[idx].as_mv.col; + return round_mv_comp(temp) & mb->fullpixel_mask; +} - blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col; +static void build_4x4uvmvs(MACROBLOCKD *xd) { + int i, j; + BLOCKD *blockd = xd->block; - if (temp < 0) temp -= 4; - else temp += 4; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + const int yoffset = i * 8 + j * 2; + const int uoffset = 16 + i * 2 + j; + const int voffset = 20 + i * 2 + j; - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & - xd->fullpixel_mask; + MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv; + MV *v = &blockd[voffset].bmi.as_mv[0].as_mv; + u->row = mi_mv_pred_row(xd, yoffset, 0); + u->col = mi_mv_pred_col(xd, yoffset, 0); // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + clamp_uvmv_to_umv_border(u, xd); // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + clamp_uvmv_to_umv_border(u, xd); - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - 
blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; + v->row = u->row; + v->col = u->col; if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & - xd->fullpixel_mask; + u = &blockd[uoffset].bmi.as_mv[1].as_mv; + v = &blockd[voffset].bmi.as_mv[1].as_mv; + u->row = mi_mv_pred_row(xd, yoffset, 1); + u->col = mi_mv_pred_col(xd, yoffset, 1); // if (mbmi->need_to_clamp_mvs) - clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + clamp_uvmv_to_umv_border(u, xd); // if (mbmi->need_to_clamp_mvs) - clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + clamp_uvmv_to_umv_border(u, xd); - blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; + v->row = u->row; + v->col = u->col; } } } } -void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) { - if (xd->mode_info_context->mbmi.mode != SPLITMV) { - vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - /* 256 = offset of U plane in Y+U+V buffer; - * 320 = offset of V plane in Y+U+V buffer. - * (256=16x16, 320=16x16+8x8). 
*/ - vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - } +void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col) { + vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col); + vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride, + mb_row, mb_col); #if CONFIG_COMP_INTERINTRA_PRED - else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { - vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - } + if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { + vp9_build_interintra_16x16_predictors_mb(xd, dst_y, dst_u, dst_v, + dst_ystride, dst_uvstride); + } #endif +} + +void vp9_build_inter_predictors_mb(MACROBLOCKD *xd, + int mb_row, + int mb_col) { + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + vp9_build_inter16x16_predictors_mb(xd, xd->predictor, + &xd->predictor[256], + &xd->predictor[320], 16, 8, + mb_row, mb_col); + } else { build_4x4uvmvs(xd); - build_inter4x4_predictors_mb(xd); + build_inter4x4_predictors_mb(xd, mb_row, mb_col); + } +} + +/*encoder only*/ +void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd, + int mb_row, int mb_col) { + int i, j, weight; + BLOCKD *const blockd = xd->block; + + /* build uv mvs */ + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + const int yoffset = i * 8 + j * 2; + const int uoffset = 16 + i * 2 + j; + const int voffset = 20 + i * 2 + j; + + MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv; + MV *v = &blockd[voffset].bmi.as_mv[0].as_mv; + + v->row = u->row = b_mv_pred_row(xd, yoffset, 0); + v->col = u->col = b_mv_pred_col(xd, yoffset, 0); + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + u = &blockd[uoffset].bmi.as_mv[1].as_mv; + v = &blockd[voffset].bmi.as_mv[1].as_mv; + + v->row = u->row = b_mv_pred_row(xd, yoffset, 1); + v->col = u->col = b_mv_pred_col(xd, yoffset, 1); + } + } + } + +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && \ + defined(USE_IMPLICIT_WEIGHT_SPLITMV) && \ + defined(USE_IMPLICIT_WEIGHT_UV) + weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col); +#else + weight = AVERAGE_WEIGHT; +#endif + for (i = 16; i < 24; i += 2) { + const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0; + const int x = 4 * (i & 1); + const int y = ((i - 16) >> 1) * 4; + + int which_mv; + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv, + which_mv ? 
weight : 0, + &xd->subpix, mb_row * 8 + y, mb_col * 8 + x); + } } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 89868b95efcadda7411b92d0cc3287e87f3f6950..831ce2a73b8e01531cf7915d7696ecaf54408e15 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -14,71 +14,128 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" -extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, - uint8_t *dst_y, - int dst_ystride, - int clamp_mvs); - -extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_uvstride); - -extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride); - -extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, - uint8_t *dst_y, - int dst_ystride); - -extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_uvstride); - -extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride); - -extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride); - -extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, - uint8_t *dst_y, - uint8_t *dst_u, - uint8_t *dst_v, - int dst_ystride, - int dst_uvstride); - -extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd); - -extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); - -extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); - -extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, - int pitch); - -extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, - BLOCKD *d, int pitch); - -extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd); - -extern void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE filter, - VP9_COMMON *cm); +struct subpix_fn_table; + +void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd, + uint8_t *dst_y, + int dst_ystride, + int mb_row, + int mb_col); + +void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_uvstride, + int mb_row, + int mb_col); + +void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col); + +void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col); + +void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, + uint8_t *dst_y, + uint8_t *dst_u, + uint8_t *dst_v, + int dst_ystride, + int dst_uvstride, + int mb_row, + int mb_col); + +void vp9_build_inter_predictors_mb(MACROBLOCKD *xd, + int mb_row, + int mb_col); + +void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd, + int mb_row, + int mb_col); + +void vp9_setup_interp_filters(MACROBLOCKD *xd, + INTERPOLATIONFILTERTYPE filter, + VP9_COMMON *cm); + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + YV12_BUFFER_CONFIG *other, + int this_w, int this_h); + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q3, + const struct scale_factors *scale, + int 
w, int h, int do_avg, + const struct subpix_fn_table *subpix); + +void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *fullpel_mv_q3, + const int_mv *frac_mv_q4, + const struct scale_factors *scale, + int w, int h, int do_avg, + const struct subpix_fn_table *subpix); + +static int scale_value_x(int val, const struct scale_factors *scale) { + return val * scale->x_num / scale->x_den; +} + +static int scale_value_y(int val, const struct scale_factors *scale) { + return val * scale->y_num / scale->y_den; +} + +static int scaled_buffer_offset(int x_offset, + int y_offset, + int stride, + const struct scale_factors *scale) { + return scale_value_y(y_offset, scale) * stride + + scale_value_x(x_offset, scale); +} + +static void setup_pred_block(YV12_BUFFER_CONFIG *dst, + const YV12_BUFFER_CONFIG *src, + int mb_row, int mb_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + const int recon_y_stride = src->y_stride; + const int recon_uv_stride = src->uv_stride; + int recon_yoffset; + int recon_uvoffset; + + if (scale) { + recon_yoffset = scaled_buffer_offset(16 * mb_col, 16 * mb_row, + recon_y_stride, scale); + recon_uvoffset = scaled_buffer_offset(8 * mb_col, 8 * mb_row, + recon_uv_stride, scale_uv); + } else { + recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col; + recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col; + } + *dst = *src; + dst->y_buffer += recon_yoffset; + dst->u_buffer += recon_uvoffset; + dst->v_buffer += recon_uvoffset; +} + +static void set_scale_factors(MACROBLOCKD *xd, + int ref0, int ref1, + struct scale_factors scale_factor[MAX_REF_FRAMES]) { + + xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0]; + xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0]; + xd->scale_factor_uv[0] = xd->scale_factor[0]; + xd->scale_factor_uv[1] = xd->scale_factor[1]; +} #endif // VP9_COMMON_VP9_RECONINTER_H_ diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 9b2fad5b1e223eff2a87d06dd6d5b49613c692b5..186532c8b10907711a86444d24e16a4eea92ac38 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -9,106 +9,131 @@ */ #include <stdio.h> + #include "./vpx_config.h" #include "vp9_rtcd.h" #include "vp9/common/vp9_reconintra.h" #include "vpx_mem/vpx_mem.h" -/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) - * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd). - */ +// For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) +// and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd). + +// Using multiplication and shifting instead of division in diagonal prediction. +// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i+2) and used as +// ((A + B) * iscale[i] + (1 << 15)) >> 16; +// where A and B are weighted pixel values. 
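Since iscale[] holds a rounded reciprocal, the multiply-and-shift form is an approximation of the rounded division it replaces: it tracks it closely over the 8-bit weighted sums the diagonal predictors produce, but can in principle differ by one at the large-divisor end. A standalone check that counts any disagreements is sketched below; the value ceiling 66 * 255 is our assumed bound on sums of the form 2 * a + (n + 1) * b with 8-bit pixels, not something stated in the patch:

/* Compare (v * iscale + (1 << 15)) >> 16 against the rounded division
 * (v + d / 2) / d that the table replaces, for every divisor d = i + 2
 * covered by iscale[64]. */
#include <stdio.h>

int main(void) {
  int mismatches = 0;
  int d;
  for (d = 2; d <= 65; d++) {
    /* Same construction as described in the table comment above. */
    const unsigned int iscale = ((1u << 16) + d / 2) / d;
    int v;
    for (v = 0; v <= 66 * 255; v++) {
      const int fast = (int)((v * iscale + (1u << 15)) >> 16);
      const int exact = (v + d / 2) / d;
      if (fast != exact) {
        if (++mismatches <= 5)
          printf("d=%d v=%d fast=%d exact=%d\n", d, v, fast, exact);
      }
    }
  }
  printf("%d mismatches in total\n", mismatches);
  return 0;
}

By construction the reciprocal's rounding error is amplified by v, so any off-by-one results cluster at large divisors and large sums, where a one-step difference in a directional predictor is unlikely to matter.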
+static const unsigned int iscale[64] = { + 32768, 21845, 16384, 13107, 10923, 9362, 8192, 7282, + 6554, 5958, 5461, 5041, 4681, 4369, 4096, 3855, + 3641, 3449, 3277, 3121, 2979, 2849, 2731, 2621, + 2521, 2427, 2341, 2260, 2185, 2114, 2048, 1986, + 1928, 1872, 1820, 1771, 1725, 1680, 1638, 1598, + 1560, 1524, 1489, 1456, 1425, 1394, 1365, 1337, + 1311, 1285, 1260, 1237, 1214, 1192, 1170, 1150, + 1130, 1111, 1092, 1074, 1057, 1040, 1024, 1008, +}; + +static INLINE int iscale_round(int value, int i) { + return ROUND_POWER_OF_TWO(value * iscale[i], 16); +} static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n, uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c, h, w, v; - int a, b; + int r, c; + r = 0; for (c = 0; c < n - 2; c++) { - if (c & 1) - a = yleft_col[r + 1]; - else - a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1; - b = yabove_row[c + 2]; - ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); + int a = c & 1 ? yleft_col[r + 1] + : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1); + int b = yabove_row[c + 2]; + ypred_ptr[c] = iscale_round(2 * a + (c + 1) * b, 1 + c); } + for (r = 1; r < n / 2 - 1; r++) { for (c = 0; c < n - 2 - 2 * r; c++) { - if (c & 1) - a = yleft_col[r + 1]; - else - a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1; - b = ypred_ptr[(r - 1) * y_stride + c + 2]; - ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); + int a = c & 1 ? yleft_col[r + 1] + : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1); + int b = ypred_ptr[(r - 1) * y_stride + c + 2]; + ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c); } } - for (; r < n - 1; ++r) { + + for (; r < n - 1; r++) { for (c = 0; c < n; c++) { - v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1); - h = r - c / 2; + int v = c & 1 ? yleft_col[r + 1] + : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1); + int h = r - c / 2; ypred_ptr[h * y_stride + c] = v; } } + c = 0; r = n - 1; - ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] + - yleft_col[r] + 1) >> 1; + ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride] + + yleft_col[r], 1); for (r = n - 2; r >= n / 2; --r) { - w = c + (n - 1 - r) * 2; - ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] + - ypred_ptr[r * y_stride + w - 1] + 1) >> 1; + int w = c + (n - 1 - r) * 2; + ypred_ptr[r * y_stride + w] = + ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] + + ypred_ptr[r * y_stride + w - 1], 1); } + for (c = 1; c < n; c++) { for (r = n - 1; r >= n / 2 + c / 2; --r) { - w = c + (n - 1 - r) * 2; - ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] + - ypred_ptr[r * y_stride + w - 1] + 1) >> 1; + int w = c + (n - 1 - r) * 2; + ypred_ptr[r * y_stride + w] = + ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] + + ypred_ptr[r * y_stride + w - 1], 1); } } } static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n, uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c, h, w, v; - int a, b; + int r, c; + c = 0; for (r = 0; r < n - 2; r++) { - if (r & 1) - a = yabove_row[c + 1]; - else - a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1; - b = yleft_col[r + 2]; - ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3); + int a = r & 1 ? 
yabove_row[c + 1] + : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1); + int b = yleft_col[r + 2]; + ypred_ptr[r * y_stride] = iscale_round(2 * a + (r + 1) * b, 1 + r); } + for (c = 1; c < n / 2 - 1; c++) { for (r = 0; r < n - 2 - 2 * c; r++) { - if (r & 1) - a = yabove_row[c + 1]; - else - a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1; - b = ypred_ptr[(r + 2) * y_stride + c - 1]; - ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); + int a = r & 1 ? yabove_row[c + 1] + : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1); + int b = ypred_ptr[(r + 2) * y_stride + c - 1]; + ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c); } } + for (; c < n - 1; ++c) { for (r = 0; r < n; r++) { - v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1); - w = c - r / 2; + int v = r & 1 ? yabove_row[c + 1] + : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1); + int w = c - r / 2; ypred_ptr[r * y_stride + w] = v; } } + r = 0; c = n - 1; - ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1; + ypred_ptr[c] = ROUND_POWER_OF_TWO(ypred_ptr[(c - 1)] + yabove_row[c], 1); for (c = n - 2; c >= n / 2; --c) { - h = r + (n - 1 - c) * 2; - ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + - ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; + int h = r + (n - 1 - c) * 2; + ypred_ptr[h * y_stride + c] = + ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] + + ypred_ptr[(h - 1) * y_stride + c], 1); } + for (r = 1; r < n; r++) { for (c = n - 1; c >= n / 2 + r / 2; --c) { - h = r + (n - 1 - c) * 2; - ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + - ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; + int h = r + (n - 1 - c) * 2; + ypred_ptr[h * y_stride + c] = + ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] + + ypred_ptr[(h - 1) * y_stride + c], 1); } } } @@ -116,27 +141,28 @@ static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n, static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n, uint8_t *yabove_row, uint8_t *yleft_col) { int r, c; + for (r = 0; r < n - 1; ++r) { for (c = 0; c <= r; ++c) { - ypred_ptr[(r - c) * y_stride + c] = - (yabove_row[r + 1] * (c + 1) + - yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2); + ypred_ptr[(r - c) * y_stride + c] = iscale_round( + yabove_row[r + 1] * (c + 1) + yleft_col[r + 1] * (r - c + 1), r); } } + for (c = 0; c <= r; ++c) { int yabove_ext = yabove_row[r]; // clip_pixel(2 * yabove_row[r] - // yabove_row[r - 1]); int yleft_ext = yleft_col[r]; // clip_pixel(2 * yleft_col[r] - // yleft_col[r-1]); ypred_ptr[(r - c) * y_stride + c] = - (yabove_ext * (c + 1) + - yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2); + iscale_round(yabove_ext * (c + 1) + yleft_ext * (r - c + 1), r); } for (r = 1; r < n; ++r) { for (c = n - r; c < n; ++c) { const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c]; const int yleft_ext = ypred_ptr[r * y_stride + c - 1]; - ypred_ptr[r * y_stride + c] = (yabove_ext + yleft_ext + 1) >> 1; + ypred_ptr[r * y_stride + c] = + ROUND_POWER_OF_TWO(yabove_ext + yleft_ext, 1); } } } @@ -145,7 +171,7 @@ static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n, uint8_t *yabove_row, uint8_t *yleft_col) { int r, c; for (c = 0; c < n; c++) - ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1; + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1); ypred_ptr += y_stride; for (c = 0; c < n; c++) ypred_ptr[c] = yabove_row[c - 1]; @@ -179,9 +205,10 @@ static void d135_predictor(uint8_t 
*ypred_ptr, int y_stride, int n, static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n, uint8_t *yabove_row, uint8_t *yleft_col) { int r, c; - ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1; + ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1); for (r = 1; r < n; r++) - ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1; + ypred_ptr[r * y_stride] = + ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1); ypred_ptr++; ypred_ptr[0] = yabove_row[-1]; for (r = 1; r < n; r++) @@ -248,19 +275,57 @@ void vp9_recon_intra_mbuv(MACROBLOCKD *xd) { } } +static INLINE int log2_minus_1(int n) { + switch (n) { + case 4: return 1; + case 8: return 2; + case 16: return 3; + case 32: return 4; + case 64: return 5; + default: + assert(0); + return 0; + } +} + + void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride, uint8_t *ypred_ptr, int y_stride, int mode, int bsize, - int up_available, int left_available) { - - uint8_t *yabove_row = src - src_stride; - uint8_t yleft_col[64]; - uint8_t ytop_left = yabove_row[-1]; + int up_available, int left_available, + int right_available) { int r, c, i; + uint8_t yleft_col[64], yabove_data[65], ytop_left; + uint8_t *yabove_row = yabove_data + 1; + /* + * 127 127 127 .. 127 127 127 127 127 127 + * 129 A B .. Y Z + * 129 C D .. W X + * 129 E F .. U V + * 129 G H .. S T T T T T + * .. + */ + + if (left_available) { + for (i = 0; i < bsize; i++) + yleft_col[i] = src[i * src_stride - 1]; + } else { + vpx_memset(yleft_col, 129, bsize); + } - for (i = 0; i < bsize; i++) { - yleft_col[i] = src[i * src_stride - 1]; + if (up_available) { + uint8_t *yabove_ptr = src - src_stride; + vpx_memcpy(yabove_row, yabove_ptr, bsize); + if (left_available) { + ytop_left = yabove_ptr[-1]; + } else { + ytop_left = 127; + } + } else { + vpx_memset(yabove_row, 127, bsize); + ytop_left = 127; } + yabove_row[-1] = ytop_left; /* for Y */ switch (mode) { @@ -269,22 +334,7 @@ void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride, int i; int shift; int average = 0; - int log2_bsize_minus_1; - - assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 || - bsize == 64); - if (bsize == 4) { - log2_bsize_minus_1 = 1; - } else if (bsize == 8) { - log2_bsize_minus_1 = 2; - } else if (bsize == 16) { - log2_bsize_minus_1 = 3; - } else if (bsize == 32) { - log2_bsize_minus_1 = 4; - } else { - assert(bsize == 64); - log2_bsize_minus_1 = 5; - } + int log2_bsize_minus_1 = log2_minus_1(bsize); if (up_available || left_available) { if (up_available) { @@ -299,7 +349,7 @@ void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride, } } shift = log2_bsize_minus_1 + up_available + left_available; - expected_dc = (average + (1 << (shift - 1))) >> shift; + expected_dc = ROUND_POWER_OF_TWO(average, shift); } else { expected_dc = 128; } @@ -310,21 +360,19 @@ void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride, } } break; - case V_PRED: { + case V_PRED: for (r = 0; r < bsize; r++) { memcpy(ypred_ptr, yabove_row, bsize); ypred_ptr += y_stride; } - } - break; - case H_PRED: { + break; + case H_PRED: for (r = 0; r < bsize; r++) { vpx_memset(ypred_ptr, yleft_col[r], bsize); ypred_ptr += y_stride; } - } - break; - case TM_PRED: { + break; + case TM_PRED: for (r = 0; r < bsize; r++) { for (c = 0; c < bsize; c++) { ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left); @@ -332,32 +380,25 @@ void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride, ypred_ptr += y_stride; } - } - 
break; - case D45_PRED: { + break; + case D45_PRED: d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D135_PRED: { + break; + case D135_PRED: d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D117_PRED: { + break; + case D117_PRED: d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D153_PRED: { + break; + case D153_PRED: d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D27_PRED: { + break; + case D27_PRED: d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D63_PRED: { + break; + case D63_PRED: d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; + break; case I8X8_PRED: case B_PRED: case NEARESTMV: @@ -383,155 +424,28 @@ static void combine_interintra(MB_PREDICTION_MODE mode, static const int scale_max = 256; // 1 << scale_bits; static const int scale_round = 127; // (1 << (scale_bits - 1)); // This table is a function A + B*exp(-kx), where x is hor. index - static const int weights1d[32] = { - 128, 122, 116, 111, 107, 103, 99, 96, - 93, 90, 88, 85, 83, 81, 80, 78, - 77, 76, 75, 74, 73, 72, 71, 70, - 70, 69, 69, 68, 68, 68, 67, 67, + static const int weights1d[64] = { + 128, 125, 122, 119, 116, 114, 111, 109, + 107, 105, 103, 101, 99, 97, 96, 94, + 93, 91, 90, 89, 88, 86, 85, 84, + 83, 82, 81, 81, 80, 79, 78, 78, + 77, 76, 76, 75, 75, 74, 74, 73, + 73, 72, 72, 71, 71, 71, 70, 70, + 70, 70, 69, 69, 69, 69, 68, 68, + 68, 68, 68, 67, 67, 67, 67, 67, }; - // This table is a function A + B*exp(-k.sqrt(xy)), where x, y are - // hor. and vert. indices - static const int weights2d[1024] = { - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 122, 120, 118, 116, 115, 114, 113, - 112, 111, 111, 110, 109, 109, 108, 107, - 107, 106, 106, 105, 105, 104, 104, 104, - 103, 103, 102, 102, 102, 101, 101, 101, - 128, 120, 116, 114, 112, 111, 109, 108, - 107, 106, 105, 104, 103, 102, 102, 101, - 100, 100, 99, 99, 98, 97, 97, 96, - 96, 96, 95, 95, 94, 94, 93, 93, - 128, 118, 114, 111, 109, 107, 106, 104, - 103, 102, 101, 100, 99, 98, 97, 97, - 96, 95, 95, 94, 93, 93, 92, 92, - 91, 91, 90, 90, 90, 89, 89, 88, - 128, 116, 112, 109, 107, 105, 103, 102, - 100, 99, 98, 97, 96, 95, 94, 93, - 93, 92, 91, 91, 90, 90, 89, 89, - 88, 88, 87, 87, 86, 86, 85, 85, - 128, 115, 111, 107, 105, 103, 101, 99, - 98, 97, 96, 94, 93, 93, 92, 91, - 90, 89, 89, 88, 88, 87, 86, 86, - 85, 85, 84, 84, 84, 83, 83, 82, - 128, 114, 109, 106, 103, 101, 99, 97, - 96, 95, 93, 92, 91, 90, 90, 89, - 88, 87, 87, 86, 85, 85, 84, 84, - 83, 83, 82, 82, 82, 81, 81, 80, - 128, 113, 108, 104, 102, 99, 97, 96, - 94, 93, 92, 91, 90, 89, 88, 87, - 86, 85, 85, 84, 84, 83, 83, 82, - 82, 81, 81, 80, 80, 79, 79, 79, - 128, 112, 107, 103, 100, 98, 96, 94, - 93, 91, 90, 89, 88, 87, 86, 85, - 85, 84, 83, 83, 82, 82, 81, 80, - 80, 80, 79, 79, 78, 78, 78, 77, - 128, 111, 106, 102, 99, 97, 95, 93, - 91, 90, 89, 88, 87, 86, 85, 84, - 83, 83, 82, 81, 81, 80, 80, 79, - 79, 78, 78, 77, 77, 77, 76, 76, - 128, 111, 105, 101, 98, 96, 93, 92, - 90, 89, 88, 86, 85, 84, 84, 83, - 82, 81, 81, 80, 80, 79, 79, 78, - 78, 77, 77, 76, 76, 76, 75, 75, - 128, 110, 104, 100, 97, 94, 92, 91, - 89, 88, 86, 85, 84, 83, 83, 82, - 81, 80, 80, 79, 79, 78, 78, 77, - 77, 76, 76, 75, 75, 75, 74, 74, - 128, 109, 103, 99, 96, 93, 91, 90, - 88, 87, 85, 84, 83, 82, 82, 
81, - 80, 79, 79, 78, 78, 77, 77, 76, - 76, 75, 75, 75, 74, 74, 74, 73, - 128, 109, 102, 98, 95, 93, 90, 89, - 87, 86, 84, 83, 82, 81, 81, 80, - 79, 78, 78, 77, 77, 76, 76, 75, - 75, 75, 74, 74, 73, 73, 73, 73, - 128, 108, 102, 97, 94, 92, 90, 88, - 86, 85, 84, 83, 82, 81, 80, 79, - 78, 78, 77, 77, 76, 76, 75, 75, - 74, 74, 73, 73, 73, 73, 72, 72, - 128, 107, 101, 97, 93, 91, 89, 87, - 85, 84, 83, 82, 81, 80, 79, 78, - 78, 77, 76, 76, 75, 75, 74, 74, - 74, 73, 73, 73, 72, 72, 72, 71, - 128, 107, 100, 96, 93, 90, 88, 86, - 85, 83, 82, 81, 80, 79, 78, 78, - 77, 76, 76, 75, 75, 74, 74, 73, - 73, 73, 72, 72, 72, 71, 71, 71, - 128, 106, 100, 95, 92, 89, 87, 85, - 84, 83, 81, 80, 79, 78, 78, 77, - 76, 76, 75, 75, 74, 74, 73, 73, - 72, 72, 72, 72, 71, 71, 71, 70, - 128, 106, 99, 95, 91, 89, 87, 85, - 83, 82, 81, 80, 79, 78, 77, 76, - 76, 75, 75, 74, 74, 73, 73, 72, - 72, 72, 71, 71, 71, 71, 70, 70, - 128, 105, 99, 94, 91, 88, 86, 84, - 83, 81, 80, 79, 78, 77, 77, 76, - 75, 75, 74, 74, 73, 73, 72, 72, - 72, 71, 71, 71, 70, 70, 70, 70, - 128, 105, 98, 93, 90, 88, 85, 84, - 82, 81, 80, 79, 78, 77, 76, 75, - 75, 74, 74, 73, 73, 72, 72, 71, - 71, 71, 71, 70, 70, 70, 70, 69, - 128, 104, 97, 93, 90, 87, 85, 83, - 82, 80, 79, 78, 77, 76, 76, 75, - 74, 74, 73, 73, 72, 72, 71, 71, - 71, 70, 70, 70, 70, 69, 69, 69, - 128, 104, 97, 92, 89, 86, 84, 83, - 81, 80, 79, 78, 77, 76, 75, 74, - 74, 73, 73, 72, 72, 71, 71, 71, - 70, 70, 70, 70, 69, 69, 69, 69, - 128, 104, 96, 92, 89, 86, 84, 82, - 80, 79, 78, 77, 76, 75, 75, 74, - 73, 73, 72, 72, 71, 71, 71, 70, - 70, 70, 70, 69, 69, 69, 69, 68, - 128, 103, 96, 91, 88, 85, 83, 82, - 80, 79, 78, 77, 76, 75, 74, 74, - 73, 72, 72, 72, 71, 71, 70, 70, - 70, 70, 69, 69, 69, 69, 68, 68, - 128, 103, 96, 91, 88, 85, 83, 81, - 80, 78, 77, 76, 75, 75, 74, 73, - 73, 72, 72, 71, 71, 70, 70, 70, - 70, 69, 69, 69, 69, 68, 68, 68, - 128, 102, 95, 90, 87, 84, 82, 81, - 79, 78, 77, 76, 75, 74, 73, 73, - 72, 72, 71, 71, 71, 70, 70, 70, - 69, 69, 69, 69, 68, 68, 68, 68, - 128, 102, 95, 90, 87, 84, 82, 80, - 79, 77, 76, 75, 75, 74, 73, 73, - 72, 72, 71, 71, 70, 70, 70, 69, - 69, 69, 69, 68, 68, 68, 68, 68, - 128, 102, 94, 90, 86, 84, 82, 80, - 78, 77, 76, 75, 74, 73, 73, 72, - 72, 71, 71, 70, 70, 70, 69, 69, - 69, 69, 68, 68, 68, 68, 68, 67, - 128, 101, 94, 89, 86, 83, 81, 79, - 78, 77, 76, 75, 74, 73, 73, 72, - 71, 71, 71, 70, 70, 69, 69, 69, - 69, 68, 68, 68, 68, 68, 67, 67, - 128, 101, 93, 89, 85, 83, 81, 79, - 78, 76, 75, 74, 74, 73, 72, 72, - 71, 71, 70, 70, 70, 69, 69, 69, - 68, 68, 68, 68, 68, 67, 67, 67, - 128, 101, 93, 88, 85, 82, 80, 79, - 77, 76, 75, 74, 73, 73, 72, 71, - 71, 70, 70, 70, 69, 69, 69, 68, - 68, 68, 68, 68, 67, 67, 67, 67, - }; - int size_scale = (size >= 32 ? 1 : - size == 16 ? 2 : - size == 8 ? 4 : 8); - int size_shift = size == 64 ? 1 : 0; + + int size_scale = (size >= 64 ? 1: + size == 32 ? 2 : + size == 16 ? 4 : + size == 8 ? 
8 : 16); int i, j; switch (mode) { case V_PRED: for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { int k = i * interstride + j; - int scale = weights1d[i * size_scale >> size_shift]; + int scale = weights1d[i * size_scale]; interpred[k] = ((scale_max - scale) * interpred[k] + scale * intrapred[i * intrastride + j] + scale_round) @@ -544,7 +458,7 @@ static void combine_interintra(MB_PREDICTION_MODE mode, for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { int k = i * interstride + j; - int scale = weights1d[j * size_scale >> size_shift]; + int scale = weights1d[j * size_scale]; interpred[k] = ((scale_max - scale) * interpred[k] + scale * intrapred[i * intrastride + j] + scale_round) @@ -558,9 +472,8 @@ static void combine_interintra(MB_PREDICTION_MODE mode, for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { int k = i * interstride + j; - int scale = (weights2d[(i * size_scale * 32 + - j * size_scale) >> size_shift] + - weights1d[i * size_scale >> size_shift]) >> 1; + int scale = (weights1d[i * size_scale] * 3 + + weights1d[j * size_scale]) >> 2; interpred[k] = ((scale_max - scale) * interpred[k] + scale * intrapred[i * intrastride + j] + scale_round) @@ -574,9 +487,8 @@ static void combine_interintra(MB_PREDICTION_MODE mode, for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { int k = i * interstride + j; - int scale = (weights2d[(i * size_scale * 32 + - j * size_scale) >> size_shift] + - weights1d[j * size_scale >> size_shift]) >> 1; + int scale = (weights1d[j * size_scale] * 3 + + weights1d[i * size_scale]) >> 2; interpred[k] = ((scale_max - scale) * interpred[k] + scale * intrapred[i * intrastride + j] + scale_round) @@ -589,8 +501,7 @@ static void combine_interintra(MB_PREDICTION_MODE mode, for (i = 0; i < size; ++i) { for (j = 0; j < size; ++j) { int k = i * interstride + j; - int scale = weights2d[(i * size_scale * 32 + - j * size_scale) >> size_shift]; + int scale = weights1d[(i < j ? 
i : j) * size_scale]; interpred[k] = ((scale_max - scale) * interpred[k] + scale * intrapred[i * intrastride + j] + scale_round) @@ -600,8 +511,21 @@ static void combine_interintra(MB_PREDICTION_MODE mode, break; case D45_PRED: - case DC_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = (weights1d[i * size_scale] + + weights1d[j * size_scale]) >> 1; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + case TM_PRED: + case DC_PRED: default: // simple average for (i = 0; i < size; ++i) { @@ -631,7 +555,7 @@ void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd, xd->dst.y_buffer, xd->dst.y_stride, intrapredictor, 16, xd->mode_info_context->mbmi.interintra_mode, 16, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_mode, ypred, ystride, intrapredictor, 16, 16); } @@ -646,12 +570,12 @@ void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd, xd->dst.u_buffer, xd->dst.uv_stride, uintrapredictor, 8, xd->mode_info_context->mbmi.interintra_uv_mode, 8, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); vp9_build_intra_predictors_internal( xd->dst.v_buffer, xd->dst.uv_stride, vintrapredictor, 8, xd->mode_info_context->mbmi.interintra_uv_mode, 8, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, upred, uvstride, uintrapredictor, 8, 8); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, @@ -666,7 +590,7 @@ void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd, xd->dst.y_buffer, xd->dst.y_stride, intrapredictor, 32, xd->mode_info_context->mbmi.interintra_mode, 32, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_mode, ypred, ystride, intrapredictor, 32, 32); } @@ -681,12 +605,12 @@ void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd, xd->dst.u_buffer, xd->dst.uv_stride, uintrapredictor, 16, xd->mode_info_context->mbmi.interintra_uv_mode, 16, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); vp9_build_intra_predictors_internal( xd->dst.v_buffer, xd->dst.uv_stride, vintrapredictor, 16, xd->mode_info_context->mbmi.interintra_uv_mode, 16, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, upred, uvstride, uintrapredictor, 16, 16); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, @@ -710,7 +634,8 @@ void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd, const int mode = xd->mode_info_context->mbmi.interintra_mode; vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, intrapredictor, 64, mode, 64, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_mode, ypred, ystride, intrapredictor, 64, 64); } @@ -724,10 +649,12 @@ void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd, const int mode = xd->mode_info_context->mbmi.interintra_uv_mode; 
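/*
 * The two rounding helpers that the intra-predictor hunks above now call
 * are defined elsewhere in the tree and are not part of this excerpt.
 * A minimal sketch, inferred from the expressions each call site replaces
 * (indicative only, not the verbatim upstream definitions):
 */

/* Round-to-nearest right shift by n: (value + 2^(n-1)) >> n, so the old
 * (x + y + 1) >> 1 pattern becomes ROUND_POWER_OF_TWO(x + y, 1). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Round-to-nearest division by (n + 2), matching the old pattern
 * (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3), which is now written
 * iscale_round(2 * a + (c + 1) * b, 1 + c). */
static INLINE int iscale_round(int value, int n) {
  return (value + (n + 2) / 2) / (n + 2);
}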
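/*
 * What combine_interintra() is doing above: each pixel becomes a convex
 * combination of the inter and intra predictions in Q8 (scale_max == 256,
 * scale_round == 127, per the constants at the top of the function). The
 * patch drops the 1024-entry 2-D weight table and instead walks the new
 * 64-entry 1-D decay curve at a stride of size_scale, so every block size
 * spans the same curve: size 64 -> stride 1, 32 -> 2, 16 -> 4, 8 -> 8,
 * 4 -> 16 (worst-case index 63 * 1 = 63, still in bounds). A sketch of
 * the per-pixel step, assuming those constants:
 */
static int blend_pixel(int inter, int intra, int scale) {
  /* (256 - scale) parts inter plus scale parts intra, rounded */
  return ((256 - scale) * inter + scale * intra + 127) >> 8;
}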
vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, uintrapredictor, 32, mode, 32, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, vintrapredictor, 32, mode, 32, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, upred, uvstride, uintrapredictor, 32, 32); combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, @@ -749,28 +676,32 @@ void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) { vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, xd->mode_info_context->mbmi.mode, 16, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); } void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) { vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->dst.y_buffer, xd->dst.y_stride, xd->mode_info_context->mbmi.mode, 16, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); } void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) { vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->dst.y_buffer, xd->dst.y_stride, xd->mode_info_context->mbmi.mode, 32, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); } void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) { vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->dst.y_buffer, xd->dst.y_stride, xd->mode_info_context->mbmi.mode, 64, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); } void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, @@ -780,10 +711,12 @@ void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, int mode, int bsize) { vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, upred_ptr, uv_stride, mode, bsize, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, vpred_ptr, uv_stride, mode, bsize, - xd->up_available, xd->left_available); + xd->up_available, xd->left_available, + xd->right_available); } void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) { @@ -815,20 +748,35 @@ void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) { 32); } -void vp9_intra8x8_predict(BLOCKD *xd, +void vp9_intra8x8_predict(MACROBLOCKD *xd, + BLOCKD *b, int mode, uint8_t *predictor) { - vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, - xd->dst_stride, predictor, 16, - mode, 8, 1, 1); + const int block4x4_idx = (b - xd->block); + const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2); + const int have_top = (block_idx >> 1) || xd->up_available; + const int have_left = (block_idx & 1) || xd->left_available; + const int have_right = !(block_idx & 1) || xd->right_available; + + vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst, + b->dst_stride, predictor, 16, + mode, 8, have_top, have_left, + have_right); } -void vp9_intra_uv4x4_predict(BLOCKD *xd, +void vp9_intra_uv4x4_predict(MACROBLOCKD *xd, + BLOCKD *b, int mode, uint8_t *predictor) { - vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, - xd->dst_stride, predictor, 8, - mode, 4, 1, 1); + const int block_idx = (b - 
xd->block) & 3; + const int have_top = (block_idx >> 1) || xd->up_available; + const int have_left = (block_idx & 1) || xd->left_available; + const int have_right = !(block_idx & 1) || xd->right_available; + + vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst, + b->dst_stride, predictor, 8, + mode, 4, have_top, have_left, + have_right); } /* TODO: try different ways of use Y-UV mode correlation diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h index 88584ad3bb16caaa66d6e57711598a4a7efa27eb..b97b6089dd7c0dd975b565f7622f893275c58aeb 100644 --- a/vp9/common/vp9_reconintra.h +++ b/vp9/common/vp9_reconintra.h @@ -14,37 +14,44 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd); -extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n); -extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x); +void vp9_recon_intra_mbuv(MACROBLOCKD *xd); + +B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, + int stride, int n, + int tx, int ty); + +B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x); + #if CONFIG_COMP_INTERINTRA_PRED -extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd, - uint8_t *ypred, - uint8_t *upred, - uint8_t *vpred, - int ystride, - int uvstride); -extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd, - uint8_t *ypred, - int ystride); -extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd, - uint8_t *upred, - uint8_t *vpred, - int uvstride); +void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd, + uint8_t *ypred, + uint8_t *upred, + uint8_t *vpred, + int ystride, + int uvstride); + +void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd, + uint8_t *ypred, + int ystride); + +void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd, + uint8_t *upred, + uint8_t *vpred, + int uvstride); #endif // CONFIG_COMP_INTERINTRA_PRED -extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd, - uint8_t *ypred, - uint8_t *upred, - uint8_t *vpred, - int ystride, - int uvstride); -extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd, - uint8_t *ypred, - uint8_t *upred, - uint8_t *vpred, - int ystride, - int uvstride); +void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd, + uint8_t *ypred, + uint8_t *upred, + uint8_t *vpred, + int ystride, + int uvstride); + +void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd, + uint8_t *ypred, + uint8_t *upred, + uint8_t *vpred, + int ystride, + int uvstride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c index da607e81c2a77a38ced775a9ec76028a5708b082..eab5ab4955e87ffc7cf72869368d5b194ed72f06 100644 --- a/vp9/common/vp9_reconintra4x4.c +++ b/vp9/common/vp9_reconintra4x4.c @@ -15,17 +15,17 @@ #include "vp9_rtcd.h" #if CONFIG_NEWBINTRAMODES -static int find_grad_measure(uint8_t *x, int stride, int n, int t, +static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty, int dx, int dy) { int i, j; int count = 0, gsum = 0, gdiv; /* TODO: Make this code more efficient by breaking up into two loops */ - for (i = -t; i < n; ++i) - for (j = -t; j < n; ++j) { + for (i = -ty; i < n; ++i) + for (j = -tx; j < n; ++j) { int g; if (i >= 0 && j >= 0) continue; if (i + dy >= 0 && j + dx >= 0) continue; - if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue; + if (i + dy < -ty || i + dy >= n || j + dx < -tx || j 
+ dx >= n) continue; g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]); gsum += g * g; count++; @@ -36,14 +36,15 @@ static int find_grad_measure(uint8_t *x, int stride, int n, int t, #if CONTEXT_PRED_REPLACEMENTS == 6 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imin = 1; for (i = 2; i < 8; i += 1 + (i == 3)) imin = (g[i] < g[imin] ? i : imin); @@ -73,12 +74,13 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #elif CONTEXT_PRED_REPLACEMENTS == 4 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imin = 1; for (i = 3; i < 8; i+=2) imin = (g[i] < g[imin] ? i : imin); @@ -104,16 +106,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #elif CONTEXT_PRED_REPLACEMENTS == 0 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0); - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[0] = find_grad_measure(ptr, stride, n, tx, ty, 1, 0); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[4] = find_grad_measure(ptr, stride, n, tx, ty, 0, 1); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imax = 0; for (i = 1; i < 8; i++) imax = (g[i] > g[imax] ? 
i : imax); @@ -144,26 +147,113 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #endif -B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) { +B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) { + const int block_idx = x - xd->block; + const int have_top = (block_idx >> 2) || xd->up_available; + const int have_left = (block_idx & 3) || xd->left_available; uint8_t *ptr = *(x->base_dst) + x->dst; int stride = x->dst_stride; - return vp9_find_dominant_direction(ptr, stride, 4); + int tx = have_left ? 4 : 0; + int ty = have_top ? 4 : 0; + if (!have_left && !have_top) + return B_DC_PRED; + return vp9_find_dominant_direction(ptr, stride, 4, tx, ty); } #endif -void vp9_intra4x4_predict(BLOCKD *x, +void vp9_intra4x4_predict(MACROBLOCKD *xd, + BLOCKD *x, int b_mode, uint8_t *predictor) { int i, r, c; + const int block_idx = x - xd->block; + const int have_top = (block_idx >> 2) || xd->up_available; + const int have_left = (block_idx & 3) || xd->left_available; + const int have_right = (block_idx & 3) != 3 || xd->right_available; + uint8_t left[4], above[8], top_left; + /* + * 127 127 127 .. 127 127 127 127 127 127 + * 129 A B .. Y Z + * 129 C D .. W X + * 129 E F .. U V + * 129 G H .. S T T T T T + * .. + */ + + if (have_left) { + uint8_t *left_ptr = *(x->base_dst) + x->dst - 1; + const int stride = x->dst_stride; + + left[0] = left_ptr[0 * stride]; + left[1] = left_ptr[1 * stride]; + left[2] = left_ptr[2 * stride]; + left[3] = left_ptr[3 * stride]; + } else { + left[0] = left[1] = left[2] = left[3] = 129; + } + + if (have_top) { + uint8_t *above_ptr = *(x->base_dst) + x->dst - x->dst_stride; - uint8_t *above = *(x->base_dst) + x->dst - x->dst_stride; - uint8_t left[4]; - uint8_t top_left = above[-1]; + if (have_left) { + top_left = above_ptr[-1]; + } else { + top_left = 127; + } - left[0] = (*(x->base_dst))[x->dst - 1]; - left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; - left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; - left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + above[0] = above_ptr[0]; + above[1] = above_ptr[1]; + above[2] = above_ptr[2]; + above[3] = above_ptr[3]; + if (((block_idx & 3) != 3) || + (have_right && block_idx == 3 && + ((xd->mb_index != 3 && xd->sb_index != 3) || + ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) { + above[4] = above_ptr[4]; + above[5] = above_ptr[5]; + above[6] = above_ptr[6]; + above[7] = above_ptr[7]; + } else if (have_right) { + uint8_t *above_right = above_ptr + 4; + + if (xd->sb_index == 3 && (xd->mb_index & 1)) + above_right -= 32 * x->dst_stride; + if (xd->mb_index == 3) + above_right -= 16 * x->dst_stride; + above_right -= (block_idx & ~3) * x->dst_stride; + + /* use a more distant above-right (from closest available top-right + * corner), but with a "localized DC" (similar'ish to TM-pred): + * + * A B C D E F G H + * I J K L + * M N O P + * Q R S T + * U V W X x1 x2 x3 x4 + * + * Where: + * x1 = clip_pixel(E + X - D) + * x2 = clip_pixel(F + X - D) + * x3 = clip_pixel(G + X - D) + * x4 = clip_pixel(H + X - D) + * + * This is applied anytime when we use a "distant" above-right edge + * that is not immediately top-right to the block that we're going + * to do intra prediction for. 
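/*
 * A worked instance of the "localized DC" adjustment described in the
 * comment above, with illustrative pixel values (not taken from the
 * patch): if the distant above-right row reads E F G H = 60 62 61 63,
 * the pixel to its left is D = 50, and the bottom-right pixel of the
 * locally available above edge is X = 110, then
 * x1 = clip_pixel(60 + 110 - 50) = 120 and x2 x3 x4 = 122 121 123.
 * The distant edge keeps its shape but is re-levelled by (X - D) to
 * match the local neighborhood, then clipped to [0, 255]:
 */
static uint8_t above_right_example(void) {
  const uint8_t D = 50, E = 60, X = 110;  /* hypothetical samples */
  return clip_pixel(E + X - D);           /* == 120 */
}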
+ */ + above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]); + above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]); + above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]); + above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]); + } else { + // extend edge + above[4] = above[5] = above[6] = above[7] = above[3]; + } + } else { + above[0] = above[1] = above[2] = above[3] = 127; + above[4] = above[5] = above[6] = above[7] = 127; + top_left = 127; + } #if CONFIG_NEWBINTRAMODES if (b_mode == B_CONTEXT_PRED) @@ -411,39 +501,3 @@ void vp9_intra4x4_predict(BLOCKD *x, #endif } } - -/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and - * to the right prediction have filled in pixels to use. - */ -void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) { - int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2; - uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst - - xd->block[0].dst_stride + 16; - uint32_t *dst_ptr0 = (uint32_t *)above_right; - uint32_t *dst_ptr1 = - (uint32_t *)(above_right + 4 * xd->block[0].dst_stride); - uint32_t *dst_ptr2 = - (uint32_t *)(above_right + 8 * xd->block[0].dst_stride); - uint32_t *dst_ptr3 = - (uint32_t *)(above_right + 12 * xd->block[0].dst_stride); - - uint32_t *src_ptr = (uint32_t *) above_right; - - if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) || - (xd->sb_index == 3 && xd->mb_index & 1)) - src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 * - xd->block[0].dst_stride); - if (xd->mb_index == 3 || - (xd->mb_to_right_edge == 0 && xd->mb_index == 2)) - src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 * - xd->block[0].dst_stride); - - if (extend_edge) { - *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U; - } - - *dst_ptr0 = *src_ptr; - *dst_ptr1 = *src_ptr; - *dst_ptr2 = *src_ptr; - *dst_ptr3 = *src_ptr; -} diff --git a/vp9/common/vp9_reconintra4x4.h b/vp9/common/vp9_reconintra4x4.h deleted file mode 100644 index 4e58731e8a84e2fcd9db6c3d493b41c098782e82..0000000000000000000000000000000000000000 --- a/vp9/common/vp9_reconintra4x4.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-
-#ifndef VP9_COMMON_VP9_RECONINTRA4X4_H_
-#define VP9_COMMON_VP9_RECONINTRA4X4_H_
-
-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
-
-#endif // VP9_COMMON_VP9_RECONINTRA4X4_H_
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index 277d5b217ab98fb46834d7b71a3fdfda7b0ccf2d..72613ae079e8b31d1038c686d4d3e7dc0c59626e 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -12,10 +12,9 @@
 #include "vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-extern void vpx_scale_rtcd(void);
+void vpx_scale_rtcd(void);
 
-void vp9_rtcd()
-{
+void vp9_rtcd() {
   vpx_scale_rtcd();
   once(setup_rtcd_internal);
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ddca11931831e032a1ece7558726b9be566546d7..8b6efc384062c163112bc791b8c5ce29504d65cc 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -23,90 +23,50 @@ EOF
 }
 forward_decls vp9_common_forward_decls
 
-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
-# compiles warning free but a dissassembly of generated code show bugs. To be
-# on the safe side, only enabled when compiled with 'gcc'.
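For context on the entries in this file: vp9_rtcd_defs.sh is not executed at run time. The build's rtcd generator turns each prototype/specialize pair into vp9_rtcd.h, and setup_rtcd_internal() -- run exactly once via once() in the vp9_rtcd.c hunk above -- binds each symbol to the best specialization the host CPU reports. Roughly, for one entry from this file (a simplified sketch under runtime CPU detection; the generated header varies with configure flags):

RTCD_EXTERN void (*vp9_copy_mem16x16)(const uint8_t *src, int src_pitch,
                                      uint8_t *dst, int dst_pitch);
void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch,
                         uint8_t *dst, int dst_pitch);
void vp9_copy_mem16x16_mmx(const uint8_t *src, int src_pitch,
                           uint8_t *dst, int dst_pitch);
void vp9_copy_mem16x16_sse2(const uint8_t *src, int src_pitch,
                            uint8_t *dst, int dst_pitch);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();  /* from vpx_ports/x86.h */
  vp9_copy_mem16x16 = vp9_copy_mem16x16_c;
  if (flags & HAS_MMX) vp9_copy_mem16x16 = vp9_copy_mem16x16_mmx;
  if (flags & HAS_SSE2) vp9_copy_mem16x16 = vp9_copy_mem16x16_sse2;
}

Without CONFIG_RUNTIME_CPU_DETECT the indirection disappears and the generated header simply #defines the symbol to the best compiled-in variant.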
-if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp9_filter_block2d_4x4_8 sse4_1 sse2 -fi - specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2 - # # Dequant # -prototype void vp9_dequantize_b "struct blockd *x" -specialize vp9_dequantize_b - -prototype void vp9_dequantize_b_2x2 "struct blockd *x" -specialize vp9_dequantize_b_2x2 - -prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd" -specialize vp9_dequant_dc_idct_add_y_block_8x8 - -prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd" +prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_y_block_8x8 -prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd" +prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_uv_block_8x8 prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob" specialize vp9_dequant_idct_add_16x16 -prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob" +prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob" specialize vp9_dequant_idct_add_8x8 -prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride" +prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob" specialize vp9_dequant_idct_add -prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc" -specialize vp9_dequant_dc_idct_add - -prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs" -specialize vp9_dequant_dc_idct_add_y_block - -prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs" +prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_y_block -prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs" +prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_uv_block prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob" specialize vp9_dequant_idct_add_32x32 -prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, 
uint16_t *eobs" +prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd" specialize vp9_dequant_idct_add_uv_block_16x16 # # RECON # -prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem16x16 mmx sse2 dspr2 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 -prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x8 mmx dspr2 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x4 mmx -prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem16x16 - -prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem8x8 - -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx dspr2 -vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2 - prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride" specialize vp9_recon_b @@ -137,6 +97,12 @@ specialize vp9_recon_sby_s prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst" specialize void vp9_recon_sbuv_s +prototype void vp9_recon_sb64y_s "struct macroblockd *x, uint8_t *dst" +specialize vp9_recon_sb64y_s + +prototype void vp9_recon_sb64uv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst" +specialize void vp9_recon_sb64uv_s + prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x" specialize vp9_build_intra_predictors_mby_s @@ -164,15 +130,38 @@ specialize vp9_build_intra_predictors_sb64y_s; prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x" specialize vp9_build_intra_predictors_sb64uv_s; -prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor" +prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor" specialize vp9_intra4x4_predict; -prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, uint8_t *predictor" +prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor" specialize vp9_intra8x8_predict; -prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor" +prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor" specialize vp9_intra_uv4x4_predict; +if [ "$CONFIG_VP9_DECODER" = "yes" ]; then +prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_residual_4x4 sse2 + +prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_residual_8x8 sse2 + +prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_residual_16x16 sse2 + +prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int 
pitch, uint8_t *dest, int stride" +specialize vp9_add_residual_32x32 sse2 + +prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_8x8 sse2 + +prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_16x16 sse2 + +prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_32x32 sse2 +fi + # # Loopfilter # @@ -263,171 +252,146 @@ specialize vp9_sad16x3 sse2 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride" specialize vp9_sad3x16 sse2 -prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, const int source_stride, const int xoffset, const int yoffset, const uint8_t *ref_ptr, const int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x2 sse2 # # Sub Pixel Filters # -prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16 - -prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8 - -prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16 - -prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8 - -prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4 - -prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4 - -prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4 +prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8 ssse3 -prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_sharp +prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_horiz ssse3 -prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_sharp +prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int 
x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_vert ssse3 -prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_sharp +prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg ssse3 -prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8_sharp +prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_horiz ssse3 -prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_sharp +prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_vert ssse3 -prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_sharp +#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT +prototype void vp9_convolve8_1by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_1by8 -prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4_sharp +prototype void vp9_convolve8_qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_qtr -prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_smooth +prototype void vp9_convolve8_3by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_3by8 -prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_smooth +prototype void vp9_convolve8_5by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_5by8 -prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_smooth +prototype void vp9_convolve8_3qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize 
vp9_convolve8_3qtr -prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8_smooth +prototype void vp9_convolve8_7by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_7by8 -prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_smooth +prototype void vp9_convolve8_1by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_1by8_horiz -prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_smooth +prototype void vp9_convolve8_qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_qtr_horiz -prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4_smooth +prototype void vp9_convolve8_3by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_3by8_horiz -prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict16x16 +prototype void vp9_convolve8_5by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_5by8_horiz -prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict8x8 +prototype void vp9_convolve8_3qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_3qtr_horiz -prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg16x16 +prototype void vp9_convolve8_7by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_7by8_horiz -prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg8x8 +prototype void vp9_convolve8_1by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_1by8_vert -prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, 
int dst_pitch" -specialize vp9_sixtap_predict8x4 +prototype void vp9_convolve8_qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_qtr_vert -prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict4x4 +prototype void vp9_convolve8_3by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_3by8_vert -prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg4x4 +prototype void vp9_convolve8_5by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_5by8_vert -prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict16x16 sse2 +prototype void vp9_convolve8_3qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_3qtr_vert -prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x8 sse2 - -prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg16x16 - -prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg8x8 - -prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x4 - -prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict4x4 - -prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg4x4 +prototype void vp9_convolve8_7by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_7by8_vert +#endif # # dct # -prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4llm_1 +prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct4x4_1 -prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4llm +prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct4x4 sse2 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct8x8 +specialize vp9_short_idct8x8 sse2 prototype void vp9_short_idct10_8x8 
"int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct10_8x8 +specialize vp9_short_idct10_8x8 sse2 -prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_ihaar2x2 +prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_8x8 prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct16x16 +specialize vp9_short_idct16x16 sse2 prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct10_16x16 +specialize vp9_short_idct10_16x16 sse2 + +prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_16x16 + prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct32x32 +specialize vp9_short_idct32x32 sse2 -prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs" -specialize vp9_ihtllm +prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_32x32 -# -# 2nd order -# -prototype void vp9_short_inv_walsh4x4_1 "int16_t *in, int16_t *out" -specialize vp9_short_inv_walsh4x4_1 +prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct10_32x32 + +prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" +specialize vp9_short_iht8x8 -prototype void vp9_short_inv_walsh4x4 "int16_t *in, int16_t *out" -specialize vp9_short_inv_walsh4x4_ +prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" +specialize vp9_short_iht4x4 +prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type" +specialize vp9_short_iht16x16 + +prototype void vp9_idct4_1d "int16_t *input, int16_t *output" +specialize vp9_idct4_1d sse2 # dct and add -prototype void vp9_dc_only_idct_add_8x8 "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_idct_add_8x8 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_idct_add +specialize vp9_dc_only_idct_add sse2 -if [ "$CONFIG_LOSSLESS" = "yes" ]; then -prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch" -prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch" +prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_iwalsh4x4_1 +prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_iwalsh4x4 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out" -prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out" -fi +specialize vp9_dc_only_inv_walsh_add prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" specialize vp9_sad32x3 @@ -475,58 +439,52 @@ specialize vp9_variance4x4 mmx sse2 vp9_variance4x4_sse2=vp9_variance4x4_wmt vp9_variance4x4_mmx=vp9_variance4x4_mmx -prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" -specialize 
vp9_sub_pixel_variance64x64 +prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance64x64 sse2 -prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 +prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance32x32 sse2 -prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 -vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt -prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x16 sse2 mmx vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt -prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt -prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x8 sse2 mmx vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt -prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance4x4 sse2 mmx vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x64 +specialize vp9_sad64x64 sse2 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x32 +specialize vp9_sad32x32 sse2 prototype unsigned int vp9_sad16x16 
"const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x16 mmx sse2 sse3 -vp9_sad16x16_sse2=vp9_sad16x16_wmt +specialize vp9_sad16x16 mmx sse2 prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad16x8 mmx sse2 -vp9_sad16x8_sse2=vp9_sad16x8_wmt prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x16 mmx sse2 -vp9_sad8x16_sse2=vp9_sad8x16_wmt prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x8 mmx sse2 -vp9_sad8x8_sse2=vp9_sad8x8_wmt prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x4 mmx sse2 -vp9_sad4x4_sse2=vp9_sad4x4_wmt +specialize vp9_sad4x4 mmx sse prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_variance_halfpixvar16x16_h mmx sse2 @@ -579,76 +537,64 @@ specialize vp9_sad8x8x3 sse3 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x3 sse3 -prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad64x64x8 -prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad32x32x8 -prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad16x16x8 sse4 -prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad16x8x8 sse4 -prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad8x16x8 sse4 -prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad8x8x8 sse4 -prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint16_t *sad_array" +prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" specialize vp9_sad4x4x8 sse4 -prototype 
void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x64x4d - -prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x32x4d +prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad64x64x4d sse2 -prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x16x4d sse3 +prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad32x32x4d sse2 -prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x8x4d sse3 +prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x16x4d sse2 -prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x16x4d sse3 +prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x8x4d sse2 -prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x8x4d sse3 +prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x16x4d sse2 -prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t **ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x4x4d sse3 - -# -# Block copy -# -case $arch in - x86*) - prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n" - specialize vp9_copy32xn sse2 sse3 - ;; -esac +prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x8x4d sse2 +prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad4x4x4d sse prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" specialize vp9_sub_pixel_mse16x16 sse2 mmx -vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 vp9_mse16x16_sse2=vp9_mse16x16_wmt -prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_mse64x64 -prototype unsigned int 
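The x4d prototypes above also tighten the reference argument from const uint8_t ** to const uint8_t* const ref_ptr[]: one source block is scored against four candidate reference blocks in a single call, which is what lets the new sse2 versions share the source loads across candidates. A plain C reference of that shape, sketched from the declared 16x16 signature (the real C fallback lives in the vp9 SAD code, not here):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x4d pattern declared above: four SADs for the
 * price of one source traversal. Sketch only. */
static void sad16x16x4d_ref(const uint8_t *src_ptr, int src_stride,
                            const uint8_t* const ref_ptr[], int ref_stride,
                            unsigned int *sad_array) {
  int i, r, c;
  for (i = 0; i < 4; ++i) {
    unsigned int sad = 0;
    for (r = 0; r < 16; ++r)
      for (c = 0; c < 16; ++c)
        sad += abs(src_ptr[r * src_stride + c] -
                   ref_ptr[i][r * ref_stride + c]);
    sad_array[i] = sad;
  }
}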
vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse" +prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_mse32x32 prototype unsigned int vp9_get_mb_ss "const int16_t *" specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE -prototype int vp9_mbblock_error "struct macroblock *mb, int dc" +prototype int vp9_mbblock_error "struct macroblock *mb" specialize vp9_mbblock_error mmx sse2 vp9_mbblock_error_sse2=vp9_mbblock_error_xmm @@ -686,14 +632,17 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim" -specialize vp9_fht +prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht4x4 -prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct8x8 +prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht8x8 + +prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht16x16 -prototype void vp9_short_fhaar2x2 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fhaar2x2 +prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct8x8 sse2 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct4x4 @@ -701,23 +650,17 @@ specialize vp9_short_fdct4x4 prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct8x4 -prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4 - prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct32x32 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct16x16 +specialize vp9_short_fdct16x16 sse2 -prototype void vp9_short_walsh4x4_lossless "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4_lossless - -prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4_x8 +prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh4x4 -prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh8x4_x8 +prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh8x4 # # Motion search diff --git a/vp9/common/vp9_sadmxn.h b/vp9/common/vp9_sadmxn.h index fe3cdc2b3ab85c5c779188a8b9ef73317fedae2b..b2dfd63f9b41e5d5ddb0aebe981d3e1f45e64c45 100644 --- a/vp9/common/vp9_sadmxn.h +++ b/vp9/common/vp9_sadmxn.h @@ -11,14 +11,15 @@ #ifndef VP9_COMMON_VP9_SADMXN_H_ #define VP9_COMMON_VP9_SADMXN_H_ +#include "./vpx_config.h" #include "vpx/vpx_integer.h" -static __inline unsigned int sad_mx_n_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - int m, - int n) { +static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr, + int src_stride, + const 
uint8_t *ref_ptr, + int ref_stride, + int m, + int n) { int r, c; unsigned int sad = 0; diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 89c1e458dd851b3fc4aa80b247dc34f8b3470409..44d3172939801ecef3425094e232905af764c88d 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c @@ -12,9 +12,8 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" -static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 }; -static const int seg_feature_data_max[SEG_LVL_MAX] = - { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1}; +static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 0xf, 0xf }; // These functions provide access to new segment level features. // Eventually these function may be "optimized out" but for the moment, @@ -52,7 +51,7 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { } int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { - return (segfeaturedata_signed[feature_id]); + return segfeaturedata_signed[feature_id]; } void vp9_clear_segdata(MACROBLOCKD *xd, @@ -103,10 +102,4 @@ int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) { ~(1 << INTRA_FRAME)) ? 1 : 0; } -int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) { - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM)) - return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM); - else - return TX_4X4; -} // TBD? Functions to read and write segment data with range / validity checking diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h index 681c701ba8869bd553a0539ef051d786929b0cee..2d0018b47fa6282a7678f482a2a382f2d91b9565 100644 --- a/vp9/common/vp9_seg_common.h +++ b/vp9/common/vp9_seg_common.h @@ -57,7 +57,5 @@ int vp9_check_segref(const MACROBLOCKD *xd, int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id); -int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id); - #endif // VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h index 4572655280bdb173399ccf9a477b90a2f87df278..e389f3c91627a236236010b09ab4d9e50f5f69d7 100644 --- a/vp9/common/vp9_setupintrarecon.h +++ b/vp9/common/vp9_setupintrarecon.h @@ -13,6 +13,6 @@ #include "vpx_scale/yv12config.h" -extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); +void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); #endif // VP9_COMMON_VP9_SETUPINTRARECON_H_ diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h deleted file mode 100644 index dc4eadfb190c4650ca1beb965e53f4aeb92b96bd..0000000000000000000000000000000000000000 --- a/vp9/common/vp9_subpixel.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
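The vp9_sadmxn.h hunk above shows only the head of sad_mx_n_c; the body is the standard m-wide, n-tall absolute-difference accumulation. A likely completion, for reference (INLINE is the macro from vpx_config.h that this patch introduces, and abs() comes from <stdlib.h>):

/* Probable continuation of sad_mx_n_c as declared above; the diff truncates
 * the body, and this follows the SAD pattern used throughout libvpx. */
static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      int m, int n) {
  int r, c;
  unsigned int sad = 0;
  for (r = 0; r < n; ++r) {
    for (c = 0; c < m; ++c)
      sad += abs(src_ptr[c] - ref_ptr[c]);  /* per-row absolute differences */
    src_ptr += src_stride;                  /* advance both rows by their strides */
    ref_ptr += ref_stride;
  }
  return sad;
}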
- */ - -#ifndef VP9_COMMON_VP9_SUBPIXEL_H_ -#define VP9_COMMON_VP9_SUBPIXEL_H_ - -#define prototype_subpixel_predict(sym) \ - void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \ - uint8_t *dst, int dst_pitch) - -typedef prototype_subpixel_predict((*vp9_subpix_fn_t)); - -#endif // VP9_COMMON_VP9_SUBPIXEL_H_ diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c index 52c6b87c628519cb1fd0483d9c8fdc03b1e2504b..60e95e08f58cde9f5d7b285772cfb8c9b831f7d8 100644 --- a/vp9/common/vp9_textblit.c +++ b/vp9/common/vp9_textblit.c @@ -12,22 +12,26 @@ #include "vp9/common/vp9_textblit.h" +static const int font[] = { + 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000, + 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110, + 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA, + 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20, + 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF, + 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F, + 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2, + 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731, + 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820 +}; + +static void plot(int x, int y, unsigned char *image, int pitch) { + image[x + y * pitch] ^= 255; +} + void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) { int letter_bitmap; unsigned char *output_pos = address; - int colpos; - const int font[] = { - 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000, - 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110, - 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA, - 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20, - 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF, - 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F, - 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2, - 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731, - 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820 - }; - colpos = 0; + int colpos = 0; while (msg[colpos] != 0) { char letter = msg[colpos]; @@ -50,12 +54,11 @@ void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) { } } -static void plot(const int x, const int y, unsigned char *image, const int pitch) { - image [x + y * pitch] ^= 255; -} + /* Bresenham line algorithm */ -void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) { +void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, + int pitch) { int steep = abs(y1 - y0) > abs(x1 - x0); int deltax, deltay; int error, ystep, y, x; diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h index 8285aa7fd8b19bf160813222bcacfee84becbf53..c968628fe42402631b9646805098d12c03a44646 100644 --- a/vp9/common/vp9_textblit.h +++ b/vp9/common/vp9_textblit.h @@ -11,9 +11,9 @@ #ifndef VP9_COMMON_VP9_TEXTBLIT_H_ #define VP9_COMMON_VP9_TEXTBLIT_H_ -extern void vp9_blit_text(const char *msg, unsigned char *address, - const int pitch); -extern void vp9_blit_line(int x0, int x1, int y0, int y1, - unsigned char *image, const int pitch); +void vp9_blit_text(const char *msg, unsigned char *address, int pitch); + +void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, + int pitch); #endif // VP9_COMMON_VP9_TEXTBLIT_H_ diff --git 
a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6178f27d9646f8b89a1ee744576fbebd2bbe807
--- /dev/null
+++ b/vp9/common/vp9_tile_common.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_tile_common.h"
+
+#define MIN_TILE_WIDTH 256
+#define MAX_TILE_WIDTH 4096
+#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)
+#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)
+
+static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
+                                 int *max_tile_off, int tile_idx,
+                                 int log2_n_tiles, int n_mbs) {
+  const int n_sbs = (n_mbs + 3) >> 2;
+  const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
+  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
+
+  *min_tile_off = MIN(sb_off1 << 2, n_mbs);
+  *max_tile_off = MIN(sb_off2 << 2, n_mbs);
+}
+
+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
+  cm->cur_tile_col_idx = tile_col_idx;
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,
+                       &cm->cur_tile_mb_col_end, tile_col_idx,
+                       cm->log2_tile_columns, cm->mb_cols);
+}
+
+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
+  cm->cur_tile_row_idx = tile_row_idx;
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,
+                       &cm->cur_tile_mb_row_end, tile_row_idx,
+                       cm->log2_tile_rows, cm->mb_rows);
+}
+
+
+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,
+                         int *delta_log2_n_tiles) {
+  const int sb_cols = (cm->mb_cols + 3) >> 2;
+  int min_log2_n_tiles, max_log2_n_tiles;
+
+  for (max_log2_n_tiles = 0;
+       (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;
+       max_log2_n_tiles++) {}
+  for (min_log2_n_tiles = 0;
+       (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;
+       min_log2_n_tiles++) {}
+
+  *min_log2_n_tiles_ptr = min_log2_n_tiles;
+  *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
+}
diff --git a/vp9/decoder/x86/vp9_x86_dsystemdependent.c b/vp9/common/vp9_tile_common.h
similarity index 50%
rename from vp9/decoder/x86/vp9_x86_dsystemdependent.c
rename to vp9/common/vp9_tile_common.h
index 51ee8ec31c53a7ed8a0b2b94a271b97e447757d9..7ea377297c56d2a92670b6aefd52de60f295ae20 100644
--- a/vp9/decoder/x86/vp9_x86_dsystemdependent.c
+++ b/vp9/common/vp9_tile_common.h
@@ -8,19 +8,16 @@
  * be found in the AUTHORS file in the root of the source tree.
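The new vp9_tile_common.c aligns tiles to 64-pixel superblocks (4 macroblocks): superblock columns are split evenly as (tile_idx * n_sbs) >> log2_n_tiles, then scaled back to macroblock units and clamped to the frame edge, so every MB lands in exactly one tile. A standalone check of that arithmetic; the 80-MB-wide frame below is an illustrative value, not taken from the patch:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Same arithmetic as vp9_get_tile_offsets above, extracted so it runs
 * standalone without VP9_COMMON. */
static void tile_offsets(int tile_idx, int log2_n_tiles, int n_mbs,
                         int *mb_start, int *mb_end) {
  const int n_sbs = (n_mbs + 3) >> 2;                      /* MBs -> 64px SBs */
  const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;  /* even SB split   */
  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
  *mb_start = MIN(sb_off1 << 2, n_mbs);                    /* back to MBs,    */
  *mb_end = MIN(sb_off2 << 2, n_mbs);                      /* clamped         */
}

int main(void) {
  int start, end, t;
  /* Hypothetical 1280-pixel-wide frame: 80 MB columns, 4 tile columns. */
  for (t = 0; t < 4; ++t) {
    tile_offsets(t, 2, 80, &start, &end);
    printf("tile %d: mb columns [%d, %d)\n", t, start, end);
    /* prints [0, 20), [20, 40), [40, 60), [60, 80) */
  }
  return 0;
}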
*/ -#include "./vpx_config.h" -#include "vpx_ports/x86.h" -#include "vp9/decoder/vp9_onyxd_int.h" +#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ +#define VP9_COMMON_VP9_TILE_COMMON_H_ -#if HAVE_MMX -void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q); +#include "vp9/common/vp9_onyxc_int.h" -void vp9_dequantize_b_mmx(BLOCKD *d) { - short *sq = (short *) d->qcoeff; - short *dq = (short *) d->dqcoeff; - short *q = (short *) d->dequant; - vp9_dequantize_b_impl_mmx(sq, dq, q); -} -#endif +void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); +void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); +void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles, + int *delta_log2_n_tiles); + +#endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c index fbc8a38cd41df1ed79697059bd6d4dac90149c0b..6e2597954b454e3cdb516e1f5d4df72a4d06ea1d 100644 --- a/vp9/common/vp9_treecoder.c +++ b/vp9/common/vp9_treecoder.c @@ -48,66 +48,37 @@ void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t, tree2tok(p - offset, t, 0, 0, 0); } -static void branch_counts( - int n, /* n = size of alphabet */ - vp9_token tok [ /* n */ ], - vp9_tree tree, - unsigned int branch_ct [ /* n-1 */ ] [2], - const unsigned int num_events[ /* n */ ] -) { - const int tree_len = n - 1; - int t = 0; - -#if CONFIG_DEBUG - assert(tree_len); -#endif - - do { - branch_ct[t][0] = branch_ct[t][1] = 0; - } while (++t < tree_len); - - t = 0; - - do { - int L = tok[t].Len; - const int enc = tok[t].value; - const unsigned int ct = num_events[t]; - - vp9_tree_index i = 0; - - do { - const int b = (enc >> --L) & 1; - const int j = i >> 1; -#if CONFIG_DEBUG - assert(j < tree_len && 0 <= L); -#endif - - branch_ct [j] [b] += ct; - i = tree[ i + b]; - } while (i > 0); - -#if CONFIG_DEBUG - assert(!L); -#endif - } while (++t < n); - +static unsigned int convert_distribution(unsigned int i, + vp9_tree tree, + vp9_prob probs[], + unsigned int branch_ct[][2], + const unsigned int num_events[], + unsigned int tok0_offset) { + unsigned int left, right; + + if (tree[i] <= 0) { + left = num_events[-tree[i] - tok0_offset]; + } else { + left = convert_distribution(tree[i], tree, probs, branch_ct, + num_events, tok0_offset); + } + if (tree[i + 1] <= 0) { + right = num_events[-tree[i + 1] - tok0_offset]; + } else { + right = convert_distribution(tree[i + 1], tree, probs, branch_ct, + num_events, tok0_offset); + } + probs[i>>1] = get_binary_prob(left, right); + branch_ct[i>>1][0] = left; + branch_ct[i>>1][1] = right; + return left + right; } - void vp9_tree_probs_from_distribution( - int n, /* n = size of alphabet */ - vp9_token tok [ /* n */ ], vp9_tree tree, vp9_prob probs [ /* n-1 */ ], unsigned int branch_ct [ /* n-1 */ ] [2], - const unsigned int num_events[ /* n */ ] -) { - const int tree_len = n - 1; - int t = 0; - - branch_counts(n, tok, tree, branch_ct, num_events); - - do { - probs[t] = get_binary_prob(branch_ct[t][0], branch_ct[t][1]); - } while (++t < tree_len); + const unsigned int num_events[ /* n */ ], + unsigned int tok0_offset) { + convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); } diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h index 0c0c5e96e601c975c7ac63eb5cddf399ca7470a8..9297d5280157691ec8b490508079a44566d511a1 100644 --- a/vp9/common/vp9_treecoder.h +++ b/vp9/common/vp9_treecoder.h @@ -11,6 +11,7 @@ #ifndef VP9_COMMON_VP9_TREECODER_H_ #define VP9_COMMON_VP9_TREECODER_H_ +#include "./vpx_config.h" #include 
"vpx/vpx_integer.h" typedef uint8_t vp9_prob; @@ -46,27 +47,35 @@ void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree, taken for each node on the tree; this facilitiates decisions as to probability updates. */ -void vp9_tree_probs_from_distribution(int n, /* n = size of alphabet */ - vp9_token tok[ /* n */ ], - vp9_tree tree, +void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[ /* n - 1 */ ], unsigned int branch_ct[ /* n - 1 */ ][2], - const unsigned int num_events[ /* n */ ]); + const unsigned int num_events[ /* n */ ], + unsigned int tok0_offset); -static __inline vp9_prob clip_prob(int p) { +static INLINE vp9_prob clip_prob(int p) { return (p > 255) ? 255u : (p < 1) ? 1u : p; } -static __inline vp9_prob get_prob(int num, int den) { +// int64 is not needed for normal frame level calculations. +// However when outputing entropy stats accumulated over many frames +// or even clips we can overflow int math. +#ifdef ENTROPY_STATS +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); +} +#else +static INLINE vp9_prob get_prob(int num, int den) { return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den); } +#endif -static __inline vp9_prob get_binary_prob(int n0, int n1) { +static INLINE vp9_prob get_binary_prob(int n0, int n1) { return get_prob(n0, n0 + n1); } /* this function assumes prob1 and prob2 are already within [1,255] range */ -static __inline vp9_prob weighted_prob(int prob1, int prob2, int factor) { +static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8; } diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f09e2d78be86786cbb07a3aeace47d73f02e1172..6d3bb021a7d31ddad0ac8cdce3067abb01121146 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -8,91 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_subpixel.h" - -extern const short vp9_six_tap_mmx[8][6 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - /////////////////////////////////////////////////////////////////////////// // the mmx function that does the bilinear filtering and var calculation // // int one pass // @@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 8, 8, 8, 8, 120, 120, 120, 120 } }; -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned 
char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - 
vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - #if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - 
unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } -} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * 
src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, const unsigned int src_pitch, unsigned char *output_ptr, @@ -551,51 +65,303 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 15, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 8, vfilter_aligned16); +void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char 
*output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; } + while (w >= 8) { + vp9_filter_block1d8_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } } -void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + 
w, h); + } +} - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 11, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 4, vfilter_aligned16); +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d8_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + } + vp9_convolve8_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_avg_ssse3(fdata2, 
16, + dst, dst_stride, + h, filter_y); + return; } } + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); } #endif diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c deleted file mode 100644 index 8e02ac1975eb13597aedab4c70b33189c8fd7954..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_filter_sse2.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <emmintrin.h> // SSE2 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/emmintrin_compat.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, src_ptr, offset) \ - { \ - /* Do shifted load to achieve require shuffles through unpacking */ \ - const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ - const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ - const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ - const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ - const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ - const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ - const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ - const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ - /* Shit by 4 bytes through suffle to get additional shifted loads */ \ - const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ - const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ - const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ - const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ - /* multiply accumulate them */ \ - const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = 
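The convolve8 wrappers above all follow one pattern: when the filter step is unscaled (x_step_q4/y_step_q4 == 16) and the kernel is a real 8-tap (filter[3] != 128 rules out the degenerate copy-only kernel), the block is processed in 16-, 8-, then 4-wide columns with the ssse3 kernels, and vp9_convolve8_*_c handles anything left over; the combined path filters horizontally into a 16x23 scratch buffer (h + 7 rows, since the vertical 8-tap needs 3 rows above and 4 below each output row) and then filters vertically into the destination. A scalar sketch of that two-pass structure, assuming unscaled 8-tap kernels on the usual 7-bit (coefficients sum to 128) scale:

#include <stdint.h>

/* Scalar outline of the two-pass convolve done by vp9_convolve8_ssse3 above;
 * illustration only. w, h <= 16 to match the fixed scratch buffer. */
static void convolve8_two_pass_ref(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int16_t *filter_x,
                                   const int16_t *filter_y, int w, int h) {
  uint8_t temp[16 * 23];
  const uint8_t *s = src - 3 * src_stride - 3;  /* 3 rows up, 3 columns left */
  int r, c, k, sum;

  /* Pass 1: horizontal 8-tap into the scratch buffer, h + 7 rows. */
  for (r = 0; r < h + 7; ++r) {
    for (c = 0; c < w; ++c) {
      sum = 0;
      for (k = 0; k < 8; ++k)
        sum += s[r * src_stride + c + k] * filter_x[k];
      sum = (sum + 64) >> 7;                     /* round on the 7-bit scale */
      temp[r * 16 + c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }
  }

  /* Pass 2: vertical 8-tap from the scratch buffer into dst. */
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      sum = 0;
      for (k = 0; k < 8; ++k)
        sum += temp[(r + k) * 16 + c] * filter_y[k];
      sum = (sum + 64) >> 7;
      dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }
  }
}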
_mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = 
_mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - 
const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c deleted file mode 100644 index 52c35b29689d85f0e33778feaee08139772e2b01..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <smmintrin.h> // SSE4.1 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. 
- -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). 
- { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). 
- { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones used for first column - 
__m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. - *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm similarity index 100% rename from vp9/common/x86/vp9_idctllm_sse2.asm rename to vp9/common/x86/vp9_idct_sse2.asm diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c new file mode 100644 index 0000000000000000000000000000000000000000..811ed9899be21d00d56ca72b3b51bebce0c8b947 --- /dev/null +++ b/vp9/common/x86/vp9_idct_x86.c @@ -0,0 +1,1975 @@ +/* + * Copyright (c) 2012 The WebM project authors. 
All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// In order to improve performance, clip absolute diff values to [0, 255],
+// which allows the additions/subtractions to stay in 8 bits.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+                               uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;
+  int16_t out;
+  uint8_t abs_diff;
+  __m128i p0, p1, p2, p3;
+  unsigned int extended_diff;
+  __m128i diff;
+
+  out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  // Read prediction data.
+  p0 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 0 * pitch));
+  p1 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 1 * pitch));
+  p2 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 2 * pitch));
+  p3 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 3 * pitch));
+
+  // Unpack prediction data, and store the 4x4 array in 1 XMM register.
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p2 = _mm_unpacklo_epi32(p2, p3);
+  p0 = _mm_unpacklo_epi64(p0, p2);
+
+  // Clip the dc value to the [0, 255] range. Then, do addition or
+  // subtraction according to its sign.
+  if (a1 >= 0) {
+    abs_diff = (a1 > 255) ? 255 : a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_adds_epu8(p0, diff);
+  } else {
+    abs_diff = (a1 < -255) ? 255 : -a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_subs_epu8(p0, diff);
+  }
+
+  // Store results to dst.
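+  // The 4x4 result sits in one XMM register, four bytes per row: store the
+  // low 32 bits, then shift the register right by 4 bytes to bring the next
+  // row into the low lane.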
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+}
+
+void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const int half_pitch = pitch >> 1;
+  __m128i input0, input1, input2, input3;
+
+  // Rows
+  input0 = _mm_loadl_epi64((__m128i *)input);
+  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
+  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
+  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);
+  input1 = _mm_shufflelo_epi16(input1, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpacklo_epi32(input1, input1);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Switch columns 2 and 3, and then we get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
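+  // (0x4e is _MM_SHUFFLE(1, 0, 3, 2), i.e. it swaps the two 64-bit halves.)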
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input2, 0xd8);
+  input1 = _mm_shufflehi_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input3, 0xd8);
+  input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input1 = _mm_unpackhi_epi32(input1, input1);
+  input2 = _mm_unpackhi_epi32(input2, input2);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, zero);
+  input1 = _mm_packs_epi32(input1, zero);
+  input2 = _mm_packs_epi32(input2, zero);
+  input3 = _mm_packs_epi32(input3, zero);
+
+  // Transpose
+  input1 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpacklo_epi16(input2, input3);
+  input0 = _mm_unpacklo_epi32(input1, input3);
+  input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Switch columns 2 and 3, and then we get:
+  // input2: column 1, column 0;  input3: column 2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Store results
+  _mm_storel_epi64((__m128i *)output, input2);
+  input2 = _mm_srli_si128(input2, 8);
+  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+
+  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
+  input3 = _mm_srli_si128(input3, 8);
+  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+}
+
+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
+
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i in, temp;
+
+  // Load input data.
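+  // All four 16-bit coefficients fit in the low 64 bits of one register.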
+ in = _mm_loadl_epi64((__m128i *)input); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + in = _mm_shufflelo_epi16(in, 0xd8); + in = _mm_unpacklo_epi32(in, in); + + // Stage 1 + in = _mm_madd_epi16(in, c1); + in = _mm_add_epi32(in, rounding); + in = _mm_srai_epi32(in, DCT_CONST_BITS); + in = _mm_packs_epi32(in, zero); + + // Stage 2 + temp = _mm_shufflelo_epi16(in, 0x9c); + in = _mm_shufflelo_epi16(in, 0xc9); + in = _mm_unpacklo_epi64(temp, in); + in = _mm_madd_epi16(in, c2); + in = _mm_packs_epi32(in, zero); + + // Store results + _mm_storel_epi64((__m128i *)output, in); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = out5 = out6 = out7 = zero; \ + } + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ + in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
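+// For an interleaved 16-bit pair (a, b) and a packed constant pair (c0, c1)
+// in cstN, each 32-bit madd lane below is a * c0 + b * c1; the macro then
+// applies the usual fixed-point rounding, i.e. per element
+//   res = (a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS,
+// and packs the results back to 16 bits with saturation.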
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define IDCT8x8_1D \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + in0 = _mm_adds_epi16(stp1_0, stp2_7); \ + in1 = _mm_adds_epi16(stp1_1, stp1_6); \ + in2 = _mm_adds_epi16(stp1_2, stp1_5); \ + in3 = _mm_adds_epi16(stp1_3, stp2_4); \ + in4 = _mm_subs_epi16(stp1_3, stp2_4); \ + in5 = _mm_subs_epi16(stp1_2, 
stp1_5); \ + in6 = _mm_subs_epi16(stp1_1, stp1_6); \ + in7 = _mm_subs_epi16(stp1_0, stp2_7); + +void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8x8_1D + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, 
cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. + in0 = _mm_load_si128((__m128i *)input); + in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + + // 8x4 Transpose + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) + + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, zero); + stp1_6 = _mm_packs_epi32(tmp2, zero); + } + + // Stage4 + in0 = _mm_adds_epi16(stp1_0, stp2_7); + in1 = _mm_adds_epi16(stp1_1, stp1_6); + in2 = _mm_adds_epi16(stp1_2, stp1_5); + in3 = _mm_adds_epi16(stp1_3, stp2_4); + in4 = _mm_subs_epi16(stp1_3, stp2_4); + in5 = _mm_subs_epi16(stp1_2, stp1_5); + in6 = _mm_subs_epi16(stp1_1, stp1_6); + in7 = 
_mm_subs_epi16(stp1_0, stp2_7); + + // Columns. 4x8 Transpose + TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7) + + // 1D idct8x8 + IDCT8x8_1D + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); +} + +#define IDCT16x16_1D \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, 
stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, 
-cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
+          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
+          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
+          in14 = zero, in15 = zero;
+  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
+          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
+          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
+  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
+          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
+          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // We work on an 8x16 block each time, and loop 4 times for the 2-D 16x16
+  // idct.
+  for (i = 0; i < 4; i++) {
+    // 1-D idct
+    if (i < 2) {
+      if (i == 1) input += 128;
+
+      // Load input data.
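+      // Each 16-wide input row splits into two 8-wide halves: in0-in7
+      // receive columns 0-7 and in8-in15 receive columns 8-15 of the eight
+      // rows covered by this pass.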
+ in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + } + + if (i == 2) { + TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, + in13, in14, in15); + } + + if (i == 3) { + TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, + in12, in13, in14, in15); + } + + IDCT16x16_1D + + // Stage7 + if (i == 0) { + // Left 8x16 + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + } else if (i == 1) { + // Right 8x16 + r0 = _mm_add_epi16(stp2_0, stp1_15); + r1 = _mm_add_epi16(stp2_1, stp1_14); + r2 = _mm_add_epi16(stp2_2, stp2_13); + r3 = _mm_add_epi16(stp2_3, stp2_12); + r4 = _mm_add_epi16(stp2_4, stp2_11); + r5 = _mm_add_epi16(stp2_5, stp2_10); + r6 = _mm_add_epi16(stp2_6, stp1_9); + r7 = _mm_add_epi16(stp2_7, stp1_8); + r8 = _mm_sub_epi16(stp2_7, stp1_8); + r9 = _mm_sub_epi16(stp2_6, stp1_9); + r10 = _mm_sub_epi16(stp2_5, stp2_10); + r11 = _mm_sub_epi16(stp2_4, stp2_11); + r12 = _mm_sub_epi16(stp2_3, stp2_12); + r13 = _mm_sub_epi16(stp2_2, stp2_13); + r14 = _mm_sub_epi16(stp2_1, stp1_14); + r15 = _mm_sub_epi16(stp2_0, stp1_15); + } else { + // 2-D + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = 
_mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + + output += 8; + } + } +} + +void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = 
pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // 1-D idct. Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); + const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); + const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); + const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); + tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); + tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); + tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, zero); + stp2_15 = _mm_packs_epi32(tmp2, zero); + stp2_9 = _mm_packs_epi32(tmp4, zero); + stp2_14 = _mm_packs_epi32(tmp6, zero); + + stp2_10 = _mm_packs_epi32(tmp1, zero); + stp2_13 = _mm_packs_epi32(tmp3, zero); + stp2_11 = _mm_packs_epi32(tmp5, zero); + stp2_12 = _mm_packs_epi32(tmp7, zero); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); + const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + + 
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); + tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); + const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); + tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_9 = _mm_packs_epi32(tmp1, zero); + stp2_14 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp5, zero); + stp2_13 = _mm_packs_epi32(tmp7, zero); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + // Stage5 and Stage6 + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + 
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, zero); + stp1_6 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + } + + // Stage7. Left 8x16 only. + l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + + // 2-D idct. We do 2 8x16 blocks. 
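+  // The 2-D pass below only needs the left 8x16 result of the first pass:
+  // the idct10 variant loads just the first four input rows, so l0..l15
+  // hold everything that can be nonzero.  Each iteration transposes eight
+  // of those rows, zeroes in8..in15, reruns the 1-D idct, and removes the
+  // remaining fixed-point gain with a round-to-nearest shift, i.e. per
+  // value out = (x + 32) >> 6 (the add is saturating in the SIMD version).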
+ for (i = 0; i < 2; i++) { + if (i == 0) + TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + + if (i == 1) + TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + + in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + + IDCT16x16_1D + + // Stage7 + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + output += 8; + } +} + +void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) { + const int half_pitch = pitch >> 1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct 
constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, 
stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j;
+
+  // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
+  for (i = 0; i < 8; i++) {
+    if (i < 4) {
+      // First 1-D idct
+      // Load input data.
+      in0 = _mm_load_si128((__m128i *)input);
+      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
+      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
+      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
+      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
+      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
+      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
+      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
+      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
+      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
+      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
+      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
+      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
+      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
+      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
+      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
+
+      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
+      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
+      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
+      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
+      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
+      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
+      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
+      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
+      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
+      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
+      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
+      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
+      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
+      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
+      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
+      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
+
+      input += 256;
+
+      // Transpose 32x8 block to 8x32 block
+      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                    in4, in5, in6, in7);
+      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                    in10, in11, in12, in13, in14, in15);
+      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+                    in18, in19, in20, in21, in22, in23);
+      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+                    in26, in27, in28, in29, in30, in31);
+    } else {
+      // Second 1-D idct
+      j = i - 4;
+
+      // Transpose 32x8 block to 8x32 block
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+                    in5, in6, in7);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+                    in11, in12, in13, in14, in15);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+                    in19, in20, in21, in22, in23);
+      j += 4;
+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+                    in28, in29, in30, in31);
+    }
+
+    // Stage1
+    {
+      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
+      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
+      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
+      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
+
+      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
+      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
+      const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7);
+      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
+
+      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
+      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
+      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
+      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
+
+      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
+      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
+      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
+      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
+
+      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
+                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
+                             stp1_17, stp1_30)
+      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
+                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
+                             stp1_19, stp1_28)
+      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
+                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
+                             stp1_21, stp1_26)
+      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
+                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
+                             stp1_23, stp1_24)
+    }
+
+    // Stage2
+    {
+      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
+      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
+      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
+      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
+
+      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
+      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
+      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
+      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
+
+      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
+                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
+                             stp2_14)
+      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
+                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
+                             stp2_11, stp2_12)
+
+      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+    }
+
+    // Stage3
+    {
+      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
+      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
+      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
+      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
+
+      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+      const __m128i lo_22_25 =
_mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, + stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, + stp1_18, stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, + stp1_22, stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + // Stage4 + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); + + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, + stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, + stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + // Stage5 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 
= _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, + stp1_21, stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // Stage6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, + stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + // Stage7 + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, 
stp2_24);
+      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+      stp1_16 = stp2_16;
+      stp1_17 = stp2_17;
+      stp1_18 = stp2_18;
+      stp1_19 = stp2_19;
+
+      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
+                             stp1_21, stp1_26)
+      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
+                             stp1_23, stp1_24)
+
+      stp1_28 = stp2_28;
+      stp1_29 = stp2_29;
+      stp1_30 = stp2_30;
+      stp1_31 = stp2_31;
+    }
+
+    // final stage
+    if (i < 4) {
+      // 1-D: Store 32 intermediate results for each 8x32 block.
+      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    } else {
+      // 2-D: Calculate the results and store them to destination.
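+      // Row k pairs with row 31-k in this last butterfly:
+      //   out[k]      = stp1_k + stp1_(31-k)
+      //   out[31 - k] = stp1_k - stp1_(31-k)
+      // after which the same round-to-nearest (x + 32) >> 6 used by the
+      // 16x16 transforms yields the final output for this 8x32 half.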
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + // Store results + _mm_store_si128((__m128i *)output, in0); + _mm_store_si128((__m128i *)(output + half_pitch * 1), in1); + _mm_store_si128((__m128i *)(output + half_pitch * 2), in2); + _mm_store_si128((__m128i *)(output + half_pitch * 3), in3); + _mm_store_si128((__m128i *)(output + half_pitch * 4), in4); + _mm_store_si128((__m128i *)(output + half_pitch * 5), in5); + _mm_store_si128((__m128i *)(output + half_pitch * 6), in6); + _mm_store_si128((__m128i *)(output + half_pitch * 7), in7); + _mm_store_si128((__m128i *)(output + half_pitch * 8), in8); + _mm_store_si128((__m128i *)(output + half_pitch * 9), in9); + _mm_store_si128((__m128i *)(output + half_pitch * 10), in10); + _mm_store_si128((__m128i *)(output + half_pitch * 11), in11); + _mm_store_si128((__m128i *)(output + half_pitch * 12), in12); + _mm_store_si128((__m128i *)(output + half_pitch * 13), in13); + _mm_store_si128((__m128i *)(output + half_pitch * 14), in14); + _mm_store_si128((__m128i *)(output + half_pitch * 15), in15); + _mm_store_si128((__m128i *)(output + half_pitch * 16), in16); + _mm_store_si128((__m128i *)(output + half_pitch * 17), in17); + _mm_store_si128((__m128i *)(output + half_pitch * 18), in18); + _mm_store_si128((__m128i *)(output + half_pitch * 19), in19); + _mm_store_si128((__m128i *)(output + half_pitch * 20), in20); + _mm_store_si128((__m128i *)(output + half_pitch * 21), in21); + _mm_store_si128((__m128i *)(output + half_pitch * 22), in22); + _mm_store_si128((__m128i *)(output + half_pitch * 23), in23); + _mm_store_si128((__m128i *)(output + half_pitch * 24), in24); + _mm_store_si128((__m128i *)(output + half_pitch * 25), in25); + _mm_store_si128((__m128i *)(output + half_pitch * 26), in26); + _mm_store_si128((__m128i *)(output + half_pitch * 27), in27); + _mm_store_si128((__m128i *)(output + half_pitch * 28), in28); + _mm_store_si128((__m128i *)(output + half_pitch * 29), in29); + _mm_store_si128((__m128i *)(output + half_pitch * 30), in30); + _mm_store_si128((__m128i *)(output + half_pitch * 31), in31); + + output += 8; + } + } +} +#endif diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h index 8320cf87de88593ba85e0cf623b3269de39e790e..bd66d8c72407f929beac84ebe2acb672fcee01a3 100644 --- a/vp9/common/x86/vp9_idct_x86.h +++ b/vp9/common/x86/vp9_idct_x86.h @@ -20,23 +20,10 @@ */ #if HAVE_MMX -extern prototype_idct(vp9_short_idct4x4llm_1_mmx); -extern prototype_idct(vp9_short_idct4x4llm_mmx); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx); - extern prototype_second_order(vp9_short_inv_walsh4x4_mmx); extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx - #undef vp9_idct_iwalsh16 #define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm deleted file mode 100644 index 15e81addb3d6628231adccd5f6d2f2ccaa271ecc..0000000000000000000000000000000000000000 --- 
a/vp9/common/x86/vp9_idctllm_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2012 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -x_s1sqr2: times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: times 4 dw 0x4E7B -align 16 -pw_16: times 4 dw 16 - -SECTION .text - - -; /**************************************************************************** -; * Notes: -; * -; * This implementation makes use of 16 bit fixed point version of two multiply -; * constants: -; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Because the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point precision as the second one, we use a trick of -; * x * a = x + x*(a-1) -; * so -; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). -; * -; * For the second constant, because of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative -; * number. -; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x -; * -; **************************************************************************/ - -INIT_MMX - -;void short_idct4x4llm_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit - mova m0, [inpq +0] - mova m1, [inpq +8] - - mova m2, [inpq+16] - mova m3, [inpq+24] - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 
- punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova [outq], m0 - - mova [outq+r2], m1 - mova [outq+pitq*2], m2 - - add outq, pitq - mova [outq+pitq*2], m5 - RET - -;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit - movh m0, [inpq] - paddw m0, [pw_16] - psraw m0, 5 - punpcklwd m0, m0 - punpckldq m0, m0 - - mova [outq], m0 - mova [outq+pitq], m0 - - mova [outq+pitq*2], m0 - add r1, r2 - - mova [outq+pitq*2], m0 - RET - - -;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) -cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride -%if ARCH_X86_64 - movsxd strideq, dword stridem -%else - mov strideq, stridem -%endif - pxor m0, m0 - - movh m5, in_dcq ; dc - paddw m5, [pw_16] - - psraw m5, 5 - - punpcklwd m5, m5 - punpckldq m5, m5 - - movh m1, [predq] - punpcklbw m1, m0 - paddsw m1, m5 - packuswb m1, m0 ; pack and unpack to saturate - movh [dstq], m1 - - movh m2, [predq+pitq] - punpcklbw m2, m0 - paddsw m2, m5 - packuswb m2, m0 ; pack and unpack to saturate - movh [dstq+strideq], m2 - - movh m3, [predq+2*pitq] - punpcklbw m3, m0 - paddsw m3, m5 - packuswb m3, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m3 - - add dstq, strideq - add predq, pitq - movh m4, [predq+2*pitq] - punpcklbw m4, m0 - paddsw m4, m5 - packuswb m4, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m4 - RET - diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index d319bf2d528233fa63a97d9f41d1b53db193b354..08447a62de101406f18fe615bdde1e430417b400 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -26,14 +26,16 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][16]); + DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + + __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; @@ -58,12 +60,24 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + + _mm_store_si128((__m128i *)ap[4], p4); + _mm_store_si128((__m128i *)ap[3], p3); + _mm_store_si128((__m128i *)ap[2], p2); + _mm_store_si128((__m128i *)ap[1], p1); + _mm_store_si128((__m128i *)ap[0], p0); + _mm_store_si128((__m128i *)aq[4], q4); + _mm_store_si128((__m128i *)aq[3], q3); + _mm_store_si128((__m128i *)aq[2], q2); + _mm_store_si128((__m128i *)aq[1], q1); + _mm_store_si128((__m128i *)aq[0], q0); + + { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i one = 
_mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), @@ -95,246 +109,8 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); - - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // calculate flat2 - p4 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 5 * p)); -// p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); -// q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - __m128i work; - flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // calculate flat2 - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - - workp_a = 
                _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // wide flat
-  // TODO(slavarnway): interleave with the flat pixel calculations (see above)
-  {
-    const __m128i eight = _mm_set1_epi16(8);
-    unsigned char *src = s;
-    int i = 0;
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
-      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
-      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
-      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
-      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);
-
-
-      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
-      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
-      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
-      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
-      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // wide flat
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
   // lp filter
   {
     const __m128i t4 = _mm_set1_epi8(4);
@@ -345,14 +121,10 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
-    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                t80);
-    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                t80);
-    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                t80);
-    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                t80);
+    __m128i ps1 = _mm_xor_si128(p1, t80);
+    __m128i ps0 = _mm_xor_si128(p0, t80);
+    __m128i qs0 = _mm_xor_si128(q0, t80);
+    __m128i qs1 = _mm_xor_si128(q1, t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -374,6 +146,7 @@
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
 
     /* Filter2 >> 3 */
     work_a = _mm_cmpgt_epi8(zero, filter2);
@@ -381,6 +154,7 @@
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
 
     /* filt >> 1 */
     filt = _mm_adds_epi8(filter1, t1);
@@ -389,20 +163,265 @@
     work_a = _mm_and_si128(work_a, t80);
     filt = _mm_and_si128(filt, t7f);
     filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
     ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
     qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    // loopfilter done
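
Editor's note: the XOR with t80 (0x80) that brackets the filter arithmetic above is the standard trick for doing signed saturating math on unsigned pixels. A minimal scalar sketch of the idea (illustrative only; the helper names are mine, not from the tree):

    #include <stdint.h>

    /* Sketch: XOR with 0x80 maps unsigned [0,255] onto signed [-128,127],
     * so a saturating signed add (like _mm_adds_epi8) can apply the filter
     * delta; XORing again maps the result back to a pixel value. */
    static uint8_t apply_delta(uint8_t px, int8_t delta) {
      int v = (int8_t)(px ^ 0x80) + delta;
      if (v > 127) v = 127;     /* saturate like _mm_adds_epi8 */
      if (v < -128) v = -128;
      return (uint8_t)((int8_t)v ^ 0x80);
    }
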
+
+    {
+      __m128i work;
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                       _mm_subs_epu8(p0, p2)),
+                          _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                       _mm_subs_epu8(q0, q2)));
+      flat = _mm_max_epu8(work, flat);
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                       _mm_subs_epu8(p0, p3)),
+                          _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                       _mm_subs_epu8(q0, q3)));
+      flat = _mm_max_epu8(work, flat);
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                       _mm_subs_epu8(p0, p4)),
+                          _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                       _mm_subs_epu8(q0, q4)));
+      flat = _mm_subs_epu8(flat, one);
+      flat = _mm_cmpeq_epi8(flat, zero);
+      flat = _mm_and_si128(flat, mask);
+
+      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
+                                        _mm_subs_epu8(p0, p5)),
+                           _mm_or_si128(_mm_subs_epu8(q5, q0),
+                                        _mm_subs_epu8(q0, q5)));
+      _mm_store_si128((__m128i *)ap[5], p5);
+      _mm_store_si128((__m128i *)aq[5], q5);
+      flat2 = _mm_max_epu8(work, flat2);
+      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
+                                       _mm_subs_epu8(p0, p6)),
+                          _mm_or_si128(_mm_subs_epu8(q6, q0),
+                                       _mm_subs_epu8(q0, q6)));
+      _mm_store_si128((__m128i *)ap[6], p6);
+      _mm_store_si128((__m128i *)aq[6], q6);
+      flat2 = _mm_max_epu8(work, flat2);
+
+      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
+                                       _mm_subs_epu8(p0, p7)),
+                          _mm_or_si128(_mm_subs_epu8(q7, q0),
+                                       _mm_subs_epu8(q0, q7)));
+      _mm_store_si128((__m128i *)ap[7], p7);
+      _mm_store_si128((__m128i *)aq[7], q7);
+      flat2 = _mm_max_epu8(work, flat2);
+      flat2 = _mm_subs_epu8(flat2, one);
+      flat2 = _mm_cmpeq_epi8(flat2, zero);
+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    }
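
Editor's note: SSE2 has neither an unsigned-byte absolute difference nor an unsigned-byte compare, hence the idiom above: |a - b| is built from two saturating subtractions (the wrong-order one saturates to zero), and "every difference <= 1" becomes "subtract 1 with saturation, then compare with zero". A scalar model of the idiom (hypothetical helper names):

    #include <stdint.h>

    static uint8_t subs_u8(uint8_t a, uint8_t b) {
      return a > b ? (uint8_t)(a - b) : 0;   /* _mm_subs_epu8 per byte */
    }

    /* |a - b| from two saturating subtracts ORed together. */
    static uint8_t abs_diff_u8(uint8_t a, uint8_t b) {
      return (uint8_t)(subs_u8(a, b) | subs_u8(b, a));
    }

    /* The flat/flat2 test: all tracked |p_i - p0| and |q_i - q0| <= 1.
     * subs_u8(max_diff, 1) == 0 mirrors _mm_subs_epu8 + _mm_cmpeq_epi8. */
    static uint8_t flat_mask_u8(uint8_t max_diff) {
      return subs_u8(max_diff, 1) == 0 ? 0xff : 0x00;
    }
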
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);
+      const __m128i four = _mm_set1_epi16(4);
+      __m128i temp_flat2 = flat2;
+      unsigned char *src = s;
+      int i = 0;
+      do {
+        __m128i workp_shft;
+        __m128i a, b, c;
+
+        unsigned int off = i * 8;
+        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
+        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
+        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
+        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
+        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
+        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
+        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
+        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
+        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
+        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
+        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
+        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
+        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
+        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
+        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
+        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+
+        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
+        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+
+        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
+        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
+        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+
+        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q1, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
+        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q2, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
+        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q3, a);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
+        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        b = _mm_add_epi16(q3, b);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
+        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+
+        c = _mm_add_epi16(q4, c);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        b = _mm_add_epi16(q3, b);
+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
+        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
+                                          , b));
+        a = _mm_add_epi16(q5, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q6, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        a = _mm_add_epi16(q7, a);
+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+                         _mm_packus_epi16(workp_shft, workp_shft));
+
+        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
+        src += 8;
+      } while (++i < 2);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
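
Editor's note: the a/b/c accumulators above are running sums. The 3-bit-shift flat filter and the 4-bit-shift wide filter each slide a window along the edge, so consecutive outputs are produced by removing the taps that leave the window and adding the ones that enter, instead of re-summing every tap. A scalar sketch of the wide (>> 4) op chain, under my expansion of the arithmetic above (not code from the tree):

    /* op[6] = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
     * each further output drops one p7 copy and the old doubled tap,
     * doubles the next tap inward, and pulls in a new q sample. */
    static void wide_flat_op(const int p[8], const int q[8], int op[7]) {
      int sum = 7 * p[7] + 2 * p[6] + p[5] + p[4] + p[3] + p[2]
              + p[1] + p[0] + q[0];
      int k;
      op[6] = (sum + 8) >> 4;
      for (k = 6; k > 0; k--) {
        sum += -p[7] - p[k] + p[k - 1] + q[7 - k];
        op[k - 1] = (sum + 8) >> 4;
      }
    }
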
+
+    work_a = _mm_load_si128((__m128i *)ap[2]);
+    p2 = _mm_load_si128((__m128i *)flat_op[2]);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+    _mm_store_si128((__m128i *)flat_op[2], p2);
+
+    p1 = _mm_load_si128((__m128i *)flat_op[1]);
+    work_a = _mm_andnot_si128(flat, ps1);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+    _mm_store_si128((__m128i *)flat_op[1], p1);
+
+    p0 = _mm_load_si128((__m128i *)flat_op[0]);
+    work_a = _mm_andnot_si128(flat, ps0);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+    _mm_store_si128((__m128i *)flat_op[0], p0);
+
+    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+    work_a = _mm_andnot_si128(flat, qs0);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+    _mm_store_si128((__m128i *)flat_oq[0], q0);
+
+    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+    work_a = _mm_andnot_si128(flat, qs1);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+    _mm_store_si128((__m128i *)flat_oq[1], q1);
+
+    work_a = _mm_load_si128((__m128i *)aq[2]);
+    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+    _mm_store_si128((__m128i *)flat_oq[2], q2);
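
Editor's note: the and/andnot/or triples above are a branchless per-byte select, out = (mask & a) | (~mask & b). It is applied once here with `flat` and again below with `flat2`, so each pixel independently receives the 4-tap, 7-tap, or 15-tap result. A small sketch (helper names assumed, not from the tree):

    #include <emmintrin.h>

    /* mask bytes are 0x00 or 0xff, so this acts as a per-byte ternary. */
    static __m128i select_bytes(__m128i mask, __m128i if_set,
                                __m128i if_clear) {
      return _mm_or_si128(_mm_and_si128(mask, if_set),
                          _mm_andnot_si128(mask, if_clear));
    }

    /* Two-level choice, as in the code: flat picks the 7-tap result over
     * the 4-tap one, then flat2 picks the 15-tap result over that. */
    static __m128i final_pixel(__m128i flat, __m128i flat2, __m128i tap4,
                               __m128i tap7, __m128i tap15) {
      return select_bytes(flat2, tap15, select_bytes(flat, tap7, tap4));
    }
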
     // write out op6 - op3
     {
       unsigned char *dst = (s - 7 * p);
       for (i = 6; i > 2; i--) {
         __m128i flat2_output;
-        work_a = _mm_loadu_si128((__m128i *)dst);
+        work_a = _mm_load_si128((__m128i *)ap[i]);
         flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -412,62 +431,42 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
       }
     }
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    work_a = _mm_or_si128(work_a, p2);
+    work_a = _mm_load_si128((__m128i *)flat_op[2]);
     p2 = _mm_load_si128((__m128i *)flat2_op[2]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p2 = _mm_and_si128(flat2, p2);
     p2 = _mm_or_si128(work_a, p2);
     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
 
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, ps1);
-    p1 = _mm_and_si128(flat, p1);
-    work_a = _mm_or_si128(work_a, p1);
+    work_a = _mm_load_si128((__m128i *)flat_op[1]);
     p1 = _mm_load_si128((__m128i *)flat2_op[1]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p1 = _mm_and_si128(flat2, p1);
     p1 = _mm_or_si128(work_a, p1);
     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
 
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, ps0);
-    p0 = _mm_and_si128(flat, p0);
-    work_a = _mm_or_si128(work_a, p0);
+    work_a = _mm_load_si128((__m128i *)flat_op[0]);
     p0 = _mm_load_si128((__m128i *)flat2_op[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p0 = _mm_and_si128(flat2, p0);
     p0 = _mm_or_si128(work_a, p0);
     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
 
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, qs0);
-    q0 = _mm_and_si128(flat, q0);
-    work_a = _mm_or_si128(work_a, q0);
+    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
     q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q0 = _mm_and_si128(flat2, q0);
     q0 = _mm_or_si128(work_a, q0);
     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
 
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, qs1);
-    q1 = _mm_and_si128(flat, q1);
-    work_a = _mm_or_si128(work_a, q1);
+    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
     q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q1 = _mm_and_si128(flat2, q1);
     q1 = _mm_or_si128(work_a, q1);
     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    work_a = _mm_or_si128(work_a, q2);
+    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
     q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q2 = _mm_and_si128(flat2, q2);
@@ -479,7 +478,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
       unsigned char *dst = (s + 3 * p);
       for (i = 3; i < 7; i++) {
         __m128i flat2_output;
-        work_a = _mm_loadu_si128((__m128i *)dst);
+        work_a = _mm_load_si128((__m128i *)aq[i]);
         flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -504,7 +503,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   __m128i mask, hev, flat;
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
   const unsigned int extended_limit = _limit[0] * 0x01010101u;
   const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
@@ -515,7 +514,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   const __m128i blimit =
       _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -524,7 +522,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
   {
     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                           _mm_subs_epu8(p0, p1));
@@ -573,11 +570,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                          _mm_subs_epu8(q0, q3)));
     flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                        _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                     _mm_subs_epu8(q0, q4)));
-    flat = _mm_max_epu8(work, flat);
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
@@ -588,7 +580,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
     int i = 0;
     do {
       __m128i workp_a, workp_b, workp_shft;
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
@@ -597,11 +588,10 @@
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
 
-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_op2[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
@@ -611,7 +601,7 @@
       _mm_storel_epi64((__m128i *)&flat_op1[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_op0[i*8],
@@ -623,13 +613,13 @@
       _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
 
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
       _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
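
Editor's note: the p4 -> p3 and q4 -> q3 substitutions above reflect that the 8-pixel flat filter's window is clamped at p3/q3, so the outermost tap is simply counted again where p4/q4 used to be read. Expanding the workp_a/workp_b arithmetic gives, as a scalar sketch (my expansion, not code from the tree):

    static void flat_filter8(const int p[4], const int q[4],
                             int op[3], int oq[3]) {
      op[2] = (3*p[3] + 2*p[2] +   p[1] +   p[0] +   q[0] + 4) >> 3;
      op[1] = (2*p[3] +   p[2] + 2*p[1] +   p[0] +   q[0] +   q[1] + 4) >> 3;
      op[0] = (  p[3] +   p[2] +   p[1] + 2*p[0] +   q[0] +   q[1] + q[2] + 4) >> 3;
      oq[0] = (  p[2] +   p[1] +   p[0] + 2*q[0] +   q[1] +   q[2] + q[3] + 4) >> 3;
      oq[1] = (  p[1] +   p[0] +   q[0] + 2*q[1] +   q[2] + 2*q[3] + 4) >> 3;
      oq[2] = (  p[0] +   q[0] +   q[1] + 2*q[2] + 3*q[3] + 4) >> 3;
    }
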
@@ -813,8 +803,8 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                      _mm_loadl_epi64((__m128i *)(src + 120)));
 }
 
-static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
-                                   int in_p, unsigned char *out, int out_p) {
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+                                 int in_p, unsigned char *out, int out_p) {
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
@@ -879,9 +869,9 @@ static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
 }
 
-static __inline void transpose(unsigned char *src[], int in_p,
-                               unsigned char *dst[], int out_p,
-                               int num_8x8_to_transpose) {
+static INLINE void transpose(unsigned char *src[], int in_p,
+                             unsigned char *dst[], int out_p,
+                             int num_8x8_to_transpose) {
   int idx8x8 = 0;
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   do {
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
index 5f06f0ea03bfe5e89d5e809cc61cd1e266d7f38f..c2118dbb74e7d0e0011b042e613eea8a322f1d66 100644
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ b/vp9/common/x86/vp9_postproc_mmx.asm
@@ -459,11 +459,11 @@ sym(vp9_mbpost_proc_down_mmx):
 %undef flimit2
 
 
-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
 ;                              unsigned char blackclamp[16],
 ;                              unsigned char whiteclamp[16],
 ;                              unsigned char bothclamp[16],
-;                              unsigned int Width, unsigned int Height, int Pitch)
+;                              unsigned int width, unsigned int height, int pitch)
 extern sym(rand)
 global sym(vp9_plane_add_noise_mmx) PRIVATE
 sym(vp9_plane_add_noise_mmx):
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index 8bbb3794b8134c07325c5b500504678d41a1ee4d..858fc99b6b4000a5fac3da137aca8d1e2517e4cd 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,11 +624,11 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 %undef flimit4
 
 
-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
 ;                              unsigned char blackclamp[16],
 ;                              unsigned char whiteclamp[16],
 ;                              unsigned char bothclamp[16],
-;                              unsigned int Width, unsigned int Height, int Pitch)
+;                              unsigned int width, unsigned int height, int pitch)
 extern sym(rand)
 global sym(vp9_plane_add_noise_wmt) PRIVATE
 sym(vp9_plane_add_noise_wmt):
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index b644da64c76dbfc9cf5b50597b1a5e5dddbedb4c..32f00e2893dade99ee9085866fee80cbc39b893c 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -21,34 +21,92 @@
 ;
 ;*************************************************************************************/
 
-;void vp9_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
+%macro VERTx4 1
+    mov         rdx, arg(5)             ;filter ptr
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]             ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b          ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                ;pitch * 6
+
+.loop:
+    movd        xmm0, [rsi]             ;A
+    movd        xmm1, [rsi + rdx]       ;B
+    movd        xmm2, [rsi + rdx * 2]   ;C
+    movd        xmm3, [rax + rdx * 2]   ;D
+    movd        xmm4, [rsi + rdx * 4]   ;E
+    movd        xmm5, [rax + rdx * 4]   ;F
+
+    punpcklbw   xmm0, xmm1              ;A B
+    punpcklbw   xmm2, xmm3              ;C D
+    punpcklbw   xmm4, xmm5              ;E F
+
+    movd        xmm6, [rsi + rbx]       ;G
+    movd        xmm7, [rax + rbx]       ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7              ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi, rdx
+    add         rax, rdx
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)   ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .loop
+%endm
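
Editor's note: VERTx4 interleaves pairs of source rows with punpcklbw and multiplies each byte pair against a packed tap pair with pmaddubsw, so the 8-tap kernel is handled as four pair products; krd ends up as 64 in every 16-bit lane (from the 0x0400040 constant), the rounding term before the >> 7 that undoes the 128-scaled taps. A scalar model of one output pixel (ignoring pmaddubsw's intermediate saturation; names are mine):

    #include <stdint.h>

    static uint8_t convolve8_px(const uint8_t *src, int pitch,
                                const int16_t k[8]) {
      int sum = 64, t;            /* krd: rounding before >> 7 */
      for (t = 0; t < 8; t++)
        sum += k[t] * src[t * pitch];
      sum >>= 7;                  /* taps are scaled by 128 */
      if (sum < 0) sum = 0;       /* packuswb clamps to a byte */
      if (sum > 255) sum = 255;
      return (uint8_t)sum;
    }
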
 
+%macro VERTx8 1
     mov         rdx, arg(5)             ;filter ptr
     mov         rsi, arg(0)             ;src_ptr
     mov         rdi, arg(2)             ;output_ptr
@@ -86,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
     lea         rbx, [rdx + rdx*4]
     add         rbx, rdx                ;pitch * 6
 
-.vp9_filter_block1d8_v8_ssse3_loop:
+.loop:
     movq        xmm0, [rsi]             ;A
     movq        xmm1, [rsi + rdx]       ;B
     movq        xmm2, [rsi + rdx * 2]   ;C
@@ -117,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3):
 
     add         rsi, rdx
     add         rax, rdx
-
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
     movq        [rdi], xmm0
 
 %if ABI_IS_32BIT
@@ -126,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3):
     add         rdi, r8
 %endif
     dec         rcx
-    jnz         .vp9_filter_block1d8_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
+    jnz         .loop
+%endm
 
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
+%macro VERTx16 1
     mov         rdx, arg(5)             ;filter ptr
     mov         rsi, arg(0)             ;src_ptr
     mov         rdi, arg(2)             ;output_ptr
@@ -204,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3):
     lea         rbx, [rdx + rdx*4]
     add         rbx, rdx                ;pitch * 6
 
-.vp9_filter_block1d16_v8_ssse3_loop:
+.loop:
     movq        xmm0, [rsi]             ;A
     movq        xmm1, [rsi + rdx]       ;B
     movq        xmm2, [rsi + rdx * 2]   ;C
@@ -232,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
-
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
     movq        [rdi], xmm0
 
     movq        xmm0, [rsi + 8]         ;A
@@ -267,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
 
     add         rsi, rdx
     add         rax, rdx
+%if %1
+    movq        xmm1, [rdi+8]
+    pavgb       xmm0, xmm1
+%endif
     movq        [rdi+8], xmm0
 
 
@@ -276,37 +308,27 @@ sym(vp9_filter_block1d16_v8_ssse3):
     add         rdi, r8
 %endif
     dec         rcx
-    jnz         .vp9_filter_block1d16_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop rbp
-    ret
+    jnz         .loop
+%endm
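
Editor's note: the single macro parameter is what folds the *_avg variants into the same code: %if %1 blends the filter output with what is already at the destination via pavgb, which averages per byte with round-half-up. Sketch of the semantics:

    #include <stdint.h>

    /* pavgb: (a + b + 1) >> 1 per byte; used to average the new
     * prediction against the one already stored at [rdi]. */
    static uint8_t pavgb_px(uint8_t filtered, uint8_t dst) {
      return (uint8_t)((filtered + dst + 1) >> 1);
    }

Each *_avg_ssse3 entry point below is then just the shared prologue/epilogue around a `VERTx* 1` (or later `HORIZx* 1`) expansion instead of the plain `0` form.
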
 
-;void vp9_filter_block1d8_h8_ssse3
+;void vp9_filter_block1d8_v8_ssse3
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     SAVE_XMM 7
-    GET_GOT     rbx
     push        rsi
     push        rdi
+    push        rbx
     ; end prolog
 
     ALIGN_STACK 16, rax
@@ -317,103 +339,37 @@ sym(vp9_filter_block1d8_h8_ssse3):
     %define k6k7 [rsp + 16*3]
     %define krd [rsp + 16*4]
 
-    mov         rdx, arg(5)             ;filter ptr
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]             ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b          ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-.filter_block1d8_h8_rowloop_ssse3:
-    movq        xmm0, [rsi - 3]         ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3, [rsi + 4]        ;  4  5  6  7  8  9 10 11
-    movq        xmm3, [rsi + 5]         ;  5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0, xmm3             ; -3  4 -2  5 -1  6  0  7  1  8  2  9  3 10  4 11
-    punpcklqdq  xmm0, xmm3
-
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0, k0k1
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1, k2k3
-
-    movdqa      xmm4, xmm2
-    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2, k4k5
-
-    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4, k6k7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-    paddsw      xmm0, xmm4
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    lea         rsi, [rsi + rax]
-    movq        [rdi], xmm0
-
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d8_h8_rowloop_ssse3
+    VERTx4 0
 
     add rsp, 16*5
     pop rsp
-
+    pop rbx
     ; begin epilog
     pop rdi
    pop rsi
-    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 
-;void vp9_filter_block1d16_h8_ssse3
+;void vp9_filter_block1d8_v8_ssse3
 ;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_ssse3):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
     SAVE_XMM 7
-    GET_GOT     rbx
     push        rsi
     push        rdi
+    push        rbx
     ; end prolog
 
     ALIGN_STACK 16, rax
@@ -424,21 +380,316 @@ sym(vp9_filter_block1d16_h8_ssse3):
     %define k6k7 [rsp + 16*3]
     %define krd [rsp + 16*4]
 
-    mov         rdx, arg(5)             ;filter ptr
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]             ;load filters
-    movq        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b          ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
+    VERTx8 0
 
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void vp9_filter_block1d16_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx16 0
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx4 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx8 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    VERTx16 1
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+%macro HORIZx4 1
+    mov         rdx, arg(5)             ;filter ptr
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]             ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b          ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+.loop:
+    movq        xmm0, [rsi - 3]         ; -3 -2 -1  0  1  2  3  4
+
+    movq        xmm3, [rsi + 5]         ;  5  6  7  8  9 10 11 12
+    punpcklqdq  xmm0, xmm3
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0, k0k1
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1, k2k3
+
+    movdqa      xmm4, xmm2
+    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2, k4k5
+
+    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4, k6k7
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    lea         rsi, [rsi + rax]
+    movd        [rdi], xmm0
+
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+%endm
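
Editor's note: the horizontal macros load 16 source bytes covering x-3 .. x+12 (two movq joined by punpcklqdq), then the four pshufb tables (shuf_t0t1 .. shuf_t6t7) lay the bytes out in the adjacent-pair order pmaddubsw needs. In scalar terms, pair j of the 8-tap sum for output x is (hypothetical helper, my notation):

    #include <stdint.h>

    /* k[2j] and k[2j+1] multiply the two neighboring samples the shuffle
     * paired up; summing j = 0..3 plus the rounding 64 and shifting >> 7
     * gives the output pixel. */
    static int pair_product(const uint8_t *s, int x, int j,
                            const int16_t k[8]) {
      return k[2 * j] * s[x - 3 + 2 * j] + k[2 * j + 1] * s[x - 2 + 2 * j];
    }
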
+
+%macro HORIZx8 1
+    mov         rdx, arg(5)             ;filter ptr
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]             ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b          ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+.loop:
+    movq        xmm0, [rsi - 3]         ; -3 -2 -1  0  1  2  3  4
+
+    movq        xmm3, [rsi + 5]         ;  5  6  7  8  9 10 11 12
+    punpcklqdq  xmm0, xmm3
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0, k0k1
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1, k2k3
+
+    movdqa      xmm4, xmm2
+    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2, k4k5
+
+    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4, k6k7
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm4
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+
+    lea         rsi, [rsi + rax]
+    movq        [rdi], xmm0
+
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+%endm
+
+%macro HORIZx16 1
+    mov         rdx, arg(5)             ;filter ptr
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]             ;load filters
+    movq        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b          ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b   ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b   ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b   ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
     punpcklqdq  xmm2, xmm2
     punpcklqdq  xmm3, xmm3
@@ -453,13 +704,10 @@ sym(vp9_filter_block1d16_h8_ssse3):
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
     movsxd      rcx, dword ptr arg(4)   ;output_height
 
-.filter_block1d16_h8_rowloop_ssse3:
+.loop:
     movq        xmm0, [rsi - 3]         ; -3 -2 -1  0  1  2  3  4
 
-;    movq        xmm3, [rsi + 4]        ;  4  5  6  7  8  9 10 11
     movq        xmm3, [rsi + 5]         ;  5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0, xmm3             ; -3  4 -2  5 -1  6  0  7  1  8  2  9  3 10  4 11
     punpcklqdq  xmm0, xmm3
 
     movdqa      xmm1, xmm0
@@ -486,10 +734,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
 
     movq        xmm3, [rsi + 5]
 
-;    movq        xmm7, [rsi + 12]
     movq        xmm7, [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3, xmm7
     punpcklqdq  xmm3, xmm7
 
     movdqa      xmm1, xmm3
@@ -508,19 +753,54 @@ sym(vp9_filter_block1d16_h8_ssse3):
     pmaddubsw   xmm4, k6k7
 
     paddsw      xmm3, xmm1
+    paddsw      xmm3, xmm4
     paddsw      xmm3, xmm2
     paddsw      xmm3, krd
-    paddsw      xmm3, xmm4
     psraw       xmm3, 7
     packuswb    xmm3, xmm3
     punpcklqdq  xmm0, xmm3
+%if %1
+    movdqa      xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
 
     lea         rsi, [rsi + rax]
     movdqa      [rdi], xmm0
 
     lea         rdi, [rdi + rdx]
     dec         rcx
-    jnz         .filter_block1d16_h8_rowloop_ssse3
+    jnz         .loop
+%endm
+
+;void vp9_filter_block1d4_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx4 0
 
     add rsp, 16*5
     pop rsp
@@ -534,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3):
     pop rbp
     ret
 
+;void vp9_filter_block1d8_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx8 0
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+;void vp9_filter_block1d16_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx16 0
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx4 1
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx8 1
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    HORIZx16 1
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
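
Editor's note: after this refactor the file exports a uniform family of entry points: {4, 8, 16}-wide, horizontal and vertical, each in a plain and an _avg form, all sharing one prologue/epilogue shape around a macro expansion. The C-side view, following the prototype comments above (the remaining names follow the same pattern; treat these declarations as a sketch, not a header from the tree):

    /* From the assembly header comments; the 4- and 16-wide and the
     * _avg variants share these signatures. */
    void vp9_filter_block1d8_v8_ssse3(unsigned char *src_ptr,
                                      unsigned int src_pitch,
                                      unsigned char *output_ptr,
                                      unsigned int out_pitch,
                                      unsigned int output_height,
                                      short *filter);
    void vp9_filter_block1d8_h8_avg_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          short *filter);
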
 
 SECTION_RODATA
 align 16
 shuf_t0t1:
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
deleted file mode 100644
index dee29b8fbb1d76f89a5ee3701d247c69d282b585..0000000000000000000000000000000000000000
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ /dev/null
@@ -1,268 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT  7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-;    unsigned char   *src_ptr,
-;    unsigned short  *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx) PRIVATE
-sym(vp9_filter_block1d_h6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rdx, arg(6) ;vp9_filter
-
-    movq        mm1, [rdx + 16]         ; do both the negative taps first!!!
-    movq        mm2, [rdx + 32]         ;
-    movq        mm6, [rdx + 48]         ;
-    movq        mm7, [rdx + 64]         ;
-
-    mov         rdi, arg(1) ;output_ptr
-    mov         rsi, arg(0) ;src_ptr
-    movsxd      rcx, dword ptr arg(4) ;output_height
-    movsxd      rax, dword ptr arg(5) ;output_width      ; destination pitch?
-    pxor        mm0, mm0                ; mm0 = 00000000
-
-.nextrow:
-    movq        mm3, [rsi-2]            ; mm3 = p-2..p5
-    movq        mm4, mm3                ; mm4 = p-2..p5
-    psrlq       mm3, 8                  ; mm3 = p-1..p5
-    punpcklbw   mm3, mm0                ; mm3 = p-1..p2
-    pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.
-
-    movq        mm5, mm4                ; mm5 = p-2..p5
-    punpckhbw   mm4, mm0                ; mm5 = p2..p5
-    pmullw      mm4, mm7                ; mm5 *= kernel 4 modifiers
-    paddsw      mm3, mm4                ; mm3 += mm5
-
-    movq        mm4, mm5                ; mm4 = p-2..p5;
-    psrlq       mm5, 16                 ; mm5 = p0..p5;
-    punpcklbw   mm5, mm0                ; mm5 = p0..p3
-    pmullw      mm5, mm2                ; mm5 *= kernel 2 modifiers
-    paddsw      mm3, mm5                ; mm3 += mm5
-
-    movq        mm5, mm4                ; mm5 = p-2..p5
-    psrlq       mm4, 24                 ; mm4 = p1..p5
-    punpcklbw   mm4, mm0                ; mm4 = p1..p4
-    pmullw      mm4, mm6                ; mm5 *= kernel 3 modifiers
-    paddsw      mm3, mm4                ; mm3 += mm5
-
-    ; do outer positive taps
-    movd        mm4, [rsi+3]
-    punpcklbw   mm4, mm0                ; mm5 = p3..p6
-    pmullw      mm4, [rdx+80]           ; mm5 *= kernel 0 modifiers
-    paddsw      mm3, mm4                ; mm3 += mm5
-
-    punpcklbw   mm5, mm0                ; mm5 = p-2..p1
-    pmullw      mm5, [rdx]              ; mm5 *= kernel 5 modifiers
-    paddsw      mm3, mm5                ; mm3 += mm5
-
-    paddsw      mm3, [GLOBAL(rd)]       ; mm3 += round value
-    psraw       mm3, VP9_FILTER_SHIFT   ; mm3 /= 128
-    packuswb    mm3, mm0                ; pack and unpack to saturate
-    punpcklbw   mm3, mm0                ;
-
-    movq        [rdi], mm3              ; store the results in the destination
-
-%if ABI_IS_32BIT
-    add         rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
-    add         rdi, rax;
-%else
-    movsxd      r8, dword ptr arg(2) ;src_pixels_per_line
-    add         rdi, rax;
-
-    add         rsi, r8                 ; next line
-%endif
-
-    dec         rcx                     ; decrement count
-    jnz         .nextrow                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int output_pitch,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
-sym(vp9_filter_block1dc_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movq        mm5, [GLOBAL(rd)]
-    push        rbx
-    mov         rbx, arg(7) ;vp9_filter
-    movq        mm1, [rbx + 16]         ; do both the negative taps first!!!
-    movq        mm2, [rbx + 32]         ;
-    movq        mm6, [rbx + 48]         ;
-    movq        mm7, [rbx + 64]         ;
-
-    movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-    mov         rdi, arg(1) ;output_ptr
-    mov         rsi, arg(0) ;src_ptr
-    sub         rsi, rdx
-    sub         rsi, rdx
-    movsxd      rcx, DWORD PTR arg(5) ;output_height
-    movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
-    pxor        mm0, mm0                ; mm0 = 00000000
-
-
-.nextrow_cv:
-    movq        mm3, [rsi+rdx]          ; mm3 = p0..p8  = row -1
-    pmullw      mm3, mm1                ; mm3 *= kernel 1 modifiers.
-
-
-    movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-    pmullw      mm4, mm7                ; mm4 *= kernel 4 modifiers.
-    paddsw      mm3, mm4                ; mm3 += mm4
-
-    movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3  = row 0
-    pmullw      mm4, mm2                ; mm4 *= kernel 2 modifiers.
-    paddsw      mm3, mm4                ; mm3 += mm4
-
-    movq        mm4, [rsi]              ; mm4 = p0..p3  = row -2
-    pmullw      mm4, [rbx]              ; mm4 *= kernel 0 modifiers.
-    paddsw      mm3, mm4                ; mm3 += mm4
-
-
-    add         rsi, rdx                ; move source forward 1 line to avoid 3 * pitch
-    movq        mm4, [rsi + 2*rdx]      ; mm4 = p0..p3  = row 1
-    pmullw      mm4, mm6                ; mm4 *= kernel 3 modifiers.
-    paddsw      mm3, mm4                ; mm3 += mm4
-
-    movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 3
-    pmullw      mm4, [rbx +80]          ; mm4 *= kernel 3 modifiers.
-    paddsw      mm3, mm4                ; mm3 += mm4
-
-
-    paddsw      mm3, mm5                ; mm3 += round value
-    psraw       mm3, VP9_FILTER_SHIFT   ; mm3 /= 128
-    packuswb    mm3, mm0                ; pack and saturate
-
-    movd        [rdi],mm3               ; store the results in the destination
-    ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
-    ; recon block should be in cache this shouldn't cost much.  Its obviously
-    ; avoidable!!!.
-    lea         rdi,  [rdi+rax];
-    dec         rcx                     ; decrement count
-    jnz         .nextrow_cv             ; next row
-
-    pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-rd:
-    times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 128
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 0
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 123
-    times 8 dw 12
-    times 8 dw -1
-    times 8 dw 0
-
-    times 8 dw 2
-    times 8 dw -11
-    times 8 dw 108
-    times 8 dw 36
-    times 8 dw -8
-    times 8 dw 1
-
-    times 8 dw 0
-    times 8 dw -9
-    times 8 dw 93
-    times 8 dw 50
-    times 8 dw -6
-    times 8 dw 0
-
-    times 8 dw 3
-    times 8 dw -16
-    times 8 dw 77
-    times 8 dw 77
-    times 8 dw -16
-    times 8 dw 3
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 50
-    times 8 dw 93
-    times 8 dw -9
-    times 8 dw 0
-
-    times 8 dw 1
-    times 8 dw -8
-    times 8 dw 36
-    times 8 dw 108
-    times 8 dw -11
-    times 8 dw 2
-
-    times 8 dw 0
-    times 8 dw -1
-    times 8 dw 12
-    times 8 dw 123
-    times 8 dw -6
-    times 8 dw 0
-
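
Editor's note: the vp9_six_tap_mmx table deleted above stores eight phases of the 6-tap kernel, one 8-lane row per tap. Every phase sums to 128 (the file's vp9_filter_weight), which is why each routine ends with a >> VP9_FILTER_SHIFT (7): the kernel has unit gain. A quick sanity check in C, with the values copied from the table:

    static const short six_tap[8][6] = {
      { 0,   0, 128,   0,   0, 0 },  { 0,  -6, 123,  12,  -1, 0 },
      { 2, -11, 108,  36,  -8, 1 },  { 0,  -9,  93,  50,  -6, 0 },
      { 3, -16,  77,  77, -16, 3 },  { 0,  -6,  50,  93,  -9, 0 },
      { 1,  -8,  36, 108, -11, 2 },  { 0,  -1,  12, 123,  -6, 0 },
    };

    static int phases_sum_to_128(void) {
      int i, j, ok = 1;
      for (i = 0; i < 8; i++) {
        int s = 0;
        for (j = 0; j < 6; j++) s += six_tap[i][j];
        ok &= (s == 128);
      }
      return ok;  /* 1: unit gain after >> 7 */
    }
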
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
deleted file mode 100644
index b0c4f12825cbd0e179caf55c7911693762b7eebe..0000000000000000000000000000000000000000
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rdx, arg(6) ;vp9_filter
-    mov         rsi, arg(0) ;src_ptr
-
-    mov         rdi, arg(1) ;output_ptr
-
-    movsxd      rcx, dword ptr arg(4) ;output_height
-    movsxd      rax, dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-    movsxd      r8, dword ptr arg(5) ;output_width
-%endif
-    pxor        xmm0, xmm0              ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
-    movq        xmm3, MMWORD PTR [rsi - 2]
-    movq        xmm1, MMWORD PTR [rsi + 6]
-
-    prefetcht2  [rsi+rax-2]
-
-    pslldq      xmm1, 8
-    por         xmm1, xmm3
-
-    movdqa      xmm4, xmm1
-    movdqa      xmm5, xmm1
-
-    movdqa      xmm6, xmm1
-    movdqa      xmm7, xmm1
-
-    punpcklbw   xmm3, xmm0              ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-    psrldq      xmm4, 1                 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-    pmullw      xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
-    punpcklbw   xmm4, xmm0              ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-    psrldq      xmm5, 2                 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-    pmullw      xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
-    punpcklbw   xmm5, xmm0              ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-    psrldq      xmm6, 3                 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-    pmullw      xmm5, [rdx+32]          ; x[ 0] * H[ 0]; Tap 3
-
-    punpcklbw   xmm6, xmm0              ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-    psrldq      xmm7, 4                 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-    pmullw      xmm6, [rdx+48]          ; x[ 1] * h[ 1] ; Tap 4
-
-    punpcklbw   xmm7, xmm0              ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-    psrldq      xmm1, 5                 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-    pmullw      xmm7, [rdx+64]          ; x[ 2] * h[ 2] ; Tap 5
-
-    punpcklbw   xmm1, xmm0              ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-    pmullw      xmm1, [rdx+80]          ; x[ 3] * h[ 3] ; Tap 6
-
-
-    paddsw      xmm4, xmm7
-    paddsw      xmm4, xmm5
-
-    paddsw      xmm4, xmm3
-    paddsw      xmm4, xmm6
-
-    paddsw      xmm4, xmm1
-    paddsw      xmm4, [GLOBAL(rd)]
-
-    psraw       xmm4, 7
-
-    packuswb    xmm4, xmm0
-    punpcklbw   xmm4, xmm0
-
-    movdqa      XMMWORD Ptr [rdi], xmm4
-    lea         rsi,  [rsi + rax]
-
-%if ABI_IS_32BIT
-    add         rdi,  DWORD Ptr arg(5) ;[output_width]
-%else
-    add         rdi,  r8
-%endif
-    dec         rcx
-
-    jnz         .filter_block1d8_h6_rowloop ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rdx, arg(6) ;vp9_filter
-    mov         rsi, arg(0) ;src_ptr
-
-    mov         rdi, arg(1) ;output_ptr
-
-    movsxd      rcx, dword ptr arg(4) ;output_height
-    movsxd      rax, dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-    movsxd      r8, dword ptr arg(5) ;output_width
-%endif
-
-    pxor        xmm0, xmm0              ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
-    movq        xmm3, MMWORD PTR [rsi - 2]
-    movq        xmm1, MMWORD PTR [rsi + 6]
-
-    movq        xmm2, MMWORD PTR [rsi +14]
-    pslldq      xmm2, 8
-
-    por         xmm2, xmm1
-    prefetcht2  [rsi+rax-2]
-
-    pslldq      xmm1, 8
-    por         xmm1, xmm3
-
-    movdqa      xmm4, xmm1
-    movdqa      xmm5, xmm1
-
-    movdqa      xmm6, xmm1
-    movdqa      xmm7, xmm1
-
-    punpcklbw   xmm3, xmm0              ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-    psrldq      xmm4, 1                 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-    pmullw      xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
-    punpcklbw   xmm4, xmm0              ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-    psrldq      xmm5, 2                 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-    pmullw      xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
-    punpcklbw   xmm5, xmm0              ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-    psrldq      xmm6, 3                 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-    pmullw      xmm5, [rdx+32]          ; x[ 0] * H[ 0]; Tap 3
-
-    punpcklbw   xmm6, xmm0              ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-    psrldq      xmm7, 4                 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-    pmullw      xmm6, [rdx+48]          ; x[ 1] * h[ 1] ; Tap 4
-
-    punpcklbw   xmm7, xmm0              ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-    psrldq      xmm1, 5                 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-    pmullw      xmm7, [rdx+64]          ; x[ 2] * h[ 2] ; Tap 5
-
-    punpcklbw   xmm1, xmm0              ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-    pmullw      xmm1, [rdx+80]          ; x[ 3] * h[ 3] ; Tap 6
-
-    paddsw      xmm4, xmm7
-    paddsw      xmm4, xmm5
-
-    paddsw      xmm4, xmm3
-    paddsw      xmm4, xmm6
-
-    paddsw      xmm4, xmm1
-    paddsw      xmm4, [GLOBAL(rd)]
-
-    psraw       xmm4, 7
-
-    packuswb    xmm4, xmm0
-    punpcklbw   xmm4, xmm0
-
-    movdqa      XMMWORD Ptr [rdi], xmm4
-
-    movdqa      xmm3, xmm2
-    movdqa      xmm4, xmm2
-
-    movdqa      xmm5, xmm2
-    movdqa      xmm6, xmm2
-
-    movdqa      xmm7, xmm2
-
-    punpcklbw   xmm3, xmm0              ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-    psrldq      xmm4, 1                 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-    pmullw      xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
-    punpcklbw   xmm4, xmm0              ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-    psrldq      xmm5, 2                 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-    pmullw      xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
-    punpcklbw   xmm5, xmm0              ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-    psrldq      xmm6, 3                 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-    pmullw      xmm5, [rdx+32]          ; x[ 0] * H[ 0]; Tap 3
-
-    punpcklbw   xmm6, xmm0              ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-    psrldq      xmm7, 4                 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-    pmullw      xmm6, [rdx+48]          ; x[ 1] * h[ 1] ; Tap 4
-
-    punpcklbw   xmm7, xmm0              ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-    psrldq      xmm2, 5                 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-    pmullw      xmm7, [rdx+64]          ; x[ 2] * h[ 2] ; Tap 5
-
-    punpcklbw   xmm2, xmm0              ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-    pmullw      xmm2, [rdx+80]          ; x[ 3] * h[ 3] ; Tap 6
-
-
-    paddsw      xmm4, xmm7
-    paddsw      xmm4, xmm5
-
-    paddsw      xmm4, xmm3
-    paddsw      xmm4, xmm6
-
-    paddsw      xmm4, xmm2
-    paddsw      xmm4, [GLOBAL(rd)]
-
-    psraw       xmm4, 7
-
-    packuswb    xmm4, xmm0
-    punpcklbw   xmm4, xmm0
-
-    movdqa      XMMWORD Ptr [rdi+16], xmm4
-
-    lea         rsi,  [rsi + rax]
-%if ABI_IS_32BIT
-    add         rdi,  DWORD Ptr arg(5) ;[output_width]
-%else
-    add         rdi,  r8
-%endif
-
-    dec         rcx
-    jnz         .filter_block1d16_h6_sse2_rowloop ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rax, arg(7) ;vp9_filter
-    movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-
-    mov         rdi, arg(1) ;output_ptr
-    mov         rsi, arg(0) ;src_ptr
-
-    sub         rsi, rdx
-    sub         rsi, rdx
-
-    movsxd      rcx, DWORD PTR arg(5) ;[output_height]
-    pxor        xmm0, xmm0              ; clear xmm0
-
-    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-    movsxd      r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
-    movdqa      xmm1, XMMWORD PTR [rsi]
-    pmullw      xmm1, [rax]
-
-    movdqa      xmm2, XMMWORD PTR [rsi + rdx]
-    pmullw      xmm2, [rax + 16]
-
-    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]
-    pmullw      xmm3, [rax + 32]
-
-    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]
-    pmullw      xmm5, [rax + 64]
-
-    add         rsi, rdx
-    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]
-
-    pmullw      xmm4, [rax + 48]
-    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]
-
-    pmullw      xmm6, [rax + 80]
-
-    paddsw      xmm2, xmm5
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, xmm4
-
-    paddsw      xmm2, xmm6
-    paddsw      xmm2, xmm7
-
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm0              ; pack and saturate
-
-    movq        QWORD PTR [rdi], xmm2   ; store the results in the destination
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(2)   ;[dst_ptich]
-%else
-    add         rdi, r8
-%endif
-    dec         rcx                     ; decrement count
-    jnz         .vp9_filter_block1d8_v6_sse2_loop ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-;    unsigned short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_ptich,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    const short    *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
-sym(vp9_filter_block1d16_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rax, arg(7) ;vp9_filter
-    movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-
-    mov         rdi, arg(1) ;output_ptr
-    mov         rsi, arg(0) ;src_ptr
-
-    sub         rsi, rdx
-    sub         rsi, rdx
-
-    movsxd      rcx, DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
-    movsxd      r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
-    movdqa      xmm1, XMMWORD PTR [rsi + rdx]       ; line 2
-    movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
-    pmullw      xmm1, [rax + 16]
-    pmullw      xmm2, [rax + 16]
-
-    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]   ; line 5
-    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
-    pmullw      xmm3, [rax + 64]
-    pmullw      xmm4, [rax + 64]
-
-    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]   ; line 3
-    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
-    pmullw      xmm5, [rax + 32]
-    pmullw      xmm6, [rax + 32]
-
-    movdqa      xmm7, XMMWORD PTR [rsi]             ; line 1
-    movdqa      xmm0, XMMWORD PTR [rsi + 16]
-    pmullw      xmm7, [rax]
-    pmullw      xmm0, [rax]
-
-    paddsw      xmm1, xmm3
-    paddsw      xmm2, xmm4
-    paddsw      xmm1, xmm5
-    paddsw      xmm2, xmm6
-    paddsw      xmm1, xmm7
-    paddsw      xmm2, xmm0
-
-    add         rsi, rdx
-
-    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; line 4
-    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
-    pmullw      xmm3, [rax + 48]
-    pmullw      xmm4, [rax + 48]
-
-    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; line 6
-    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
-    pmullw      xmm5, [rax + 80]
-    pmullw      xmm6, [rax + 80]
-
-    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]
-    pxor        xmm0, xmm0              ; clear xmm0
-
-    paddsw      xmm1, xmm3
-    paddsw      xmm2, xmm4
-    paddsw      xmm1, xmm5
-    paddsw      xmm2, xmm6
-
-    paddsw      xmm1, xmm7
-    paddsw      xmm2, xmm7
-
-    psraw       xmm1, 7
-    psraw       xmm2, 7
-
-    packuswb    xmm1, xmm2              ; pack and saturate
-    movdqa      XMMWORD PTR [rdi], xmm1 ; store the results in the destination
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(2)   ;[dst_ptich]
-%else
-    add         rdi, r8
-%endif
-    dec         rcx                     ; decrement count
-    jnz         .vp9_filter_block1d16_v6_sse2_loop  ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_ptich,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rdx, arg(5) ;vp9_filter
-    mov         rsi, arg(0) ;src_ptr
-
-    mov         rdi, arg(2) ;output_ptr
-
-    movsxd      rcx, dword ptr arg(4) ;output_height
-    movsxd      rax, dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-    movsxd      r8, dword ptr arg(3) ;dst_ptich
-%endif
-    pxor        xmm0, xmm0              ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
-    movq        xmm3, MMWORD PTR [rsi - 2]
-    movq        xmm1, MMWORD PTR [rsi + 6]
-
-    prefetcht2  [rsi+rax-2]
-
-    pslldq      xmm1, 8
-    por         xmm1, xmm3
-
-    movdqa      xmm4, xmm1
-    movdqa      xmm5, xmm1
-
-    movdqa      xmm6, xmm1
-    movdqa      xmm7, xmm1
-
-    punpcklbw   xmm3, xmm0              ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
-    psrldq      xmm4, 1                 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-    pmullw      xmm3, XMMWORD PTR [rdx] ; x[-2]
* H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; 
x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack 
and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) PRIVATE -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx 
; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - 
ret - - -;void vp9_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) PRIVATE -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm deleted file mode 100644 index b260480e0364eafc957ea561fcf294a6d22e165c..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea 
rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb 
xmm1, [GLOBAL(shuf2b)] - pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw 
xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx 
- jnz .vp9_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) 
;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 
8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
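The comment just above explains the staging trick: nine rows of unaligned source are copied into an aligned stack area once, so the filter loops that follow can use aligned `movdqa` loads instead of repeated `movdqu`. In outline, a hypothetical C equivalent of that staging step (not the shipped code):

```c
#include <stdint.h>
#include <string.h>

#define ROWS 9        /* 8 output rows plus 1 extra for the second tap */
#define ROW_BYTES 16

/* Copy possibly-unaligned source rows into scratch storage once.
 * 'scratch' is assumed 16-byte aligned, matching ALIGN_STACK 16 above,
 * so every later pass over the data can read with aligned loads. */
static void stage_rows(const uint8_t *src, int src_stride,
                       uint8_t scratch[ROWS][ROW_BYTES]) {
  int r;
  for (r = 0; r < ROWS; ++r)
    memcpy(scratch[r], src + r * src_stride, ROW_BYTES);
}
```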
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 
2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h deleted file mode 100644 index 25bc26d9bddc50a69d8288c2466dc21f8649de24..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_x86.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ -#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2); - - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2 - -#endif -#endif - -#if HAVE_SSSE3 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3 - - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3 - -#endif -#endif - - - -#endif diff --git a/vp9/decoder/vp9_dboolhuff.c b/vp9/decoder/vp9_dboolhuff.c index 
5f1ef04083b86bfe458092fe73bfbf5f59f72739..7e3b4646b2060ae33cfb614e2b0a49455a2e2bbd 100644 --- a/vp9/decoder/vp9_dboolhuff.c +++ b/vp9/decoder/vp9_dboolhuff.c @@ -8,19 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include "vp9/decoder/vp9_dboolhuff.h" #include "vpx_ports/mem.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/decoder/vp9_dboolhuff.h" + int vp9_start_decode(BOOL_DECODER *br, const unsigned char *source, unsigned int source_sz) { br->user_buffer_end = source + source_sz; - br->user_buffer = source; - br->value = 0; - br->count = -8; - br->range = 255; + br->user_buffer = source; + br->value = 0; + br->count = -8; + br->range = 255; if (source_sz && !source) return 1; @@ -33,16 +33,27 @@ int vp9_start_decode(BOOL_DECODER *br, void vp9_bool_decoder_fill(BOOL_DECODER *br) { - const unsigned char *bufptr; - const unsigned char *bufend; - VP9_BD_VALUE value; - int count; - bufend = br->user_buffer_end; - bufptr = br->user_buffer; - value = br->value; - count = br->count; + const unsigned char *bufptr = br->user_buffer; + const unsigned char *bufend = br->user_buffer_end; + VP9_BD_VALUE value = br->value; + int count = br->count; + int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8); + int loop_end = 0; + int bits_left = (int)((bufend - bufptr)*CHAR_BIT); + int x = shift + CHAR_BIT - bits_left; + + if (x >= 0) { + count += VP9_LOTS_OF_BITS; + loop_end = x; + } - VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend); + if (x < 0 || bits_left) { + while (shift >= loop_end) { + count += CHAR_BIT; + value |= (VP9_BD_VALUE)*bufptr++ << shift; + shift -= CHAR_BIT; + } + } br->user_buffer = bufptr; br->value = value; @@ -52,7 +63,9 @@ void vp9_bool_decoder_fill(BOOL_DECODER *br) { static int get_unsigned_bits(unsigned num_values) { int cat = 0; - if ((num_values--) <= 1) return 0; + if (num_values <= 1) + return 0; + num_values--; while (num_values > 0) { cat++; num_values >>= 1; @@ -61,9 +74,12 @@ static int get_unsigned_bits(unsigned num_values) { } int vp9_inv_recenter_nonneg(int v, int m) { - if (v > (m << 1)) return v; - else if ((v & 1) == 0) return (v >> 1) + m; - else return m - ((v + 1) >> 1); + if (v > (m << 1)) + return v; + else if ((v & 1) == 0) + return (v >> 1) + m; + else + return m - ((v + 1) >> 1); } int vp9_decode_uniform(BOOL_DECODER *br, int n) { diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h index 5afdd67c800eb922b5f16e545cfe46ff50d9b165..02ae1d3c8a85f9c91db05aa6a7fa452c99bbd132 100644 --- a/vp9/decoder/vp9_dboolhuff.h +++ b/vp9/decoder/vp9_dboolhuff.h @@ -13,17 +13,18 @@ #include <stddef.h> #include <limits.h> + #include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" typedef size_t VP9_BD_VALUE; -# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) +#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) /*This is meant to be a large, positive constant that can still be efficiently loaded as an immediate (on platforms like ARM, for example). Even relatively modest values like 100 would work fine.*/ -# define VP9_LOTS_OF_BITS (0x40000000) +#define VP9_LOTS_OF_BITS (0x40000000) typedef struct { const unsigned char *user_buffer_end; @@ -45,46 +46,13 @@ int vp9_decode_uniform(BOOL_DECODER *br, int n); int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms); int vp9_inv_recenter_nonneg(int v, int m); -/*The refill loop is used in several places, so define it in a macro to make - sure they're all consistent. 
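The macro deleted here is the counterpart of the loop that the vp9_dboolhuff.c hunk above now writes directly inside vp9_bool_decoder_fill(). As a self-contained sketch of that refill step (simplified types; the real code reads and writes BOOL_DECODER fields rather than taking pointers):

```c
#include <limits.h>
#include <stddef.h>

typedef size_t bd_value;                 /* stand-in for VP9_BD_VALUE */
#define BD_VALUE_BITS ((int)sizeof(bd_value) * CHAR_BIT)
#define LOTS_OF_BITS 0x40000000          /* mirrors VP9_LOTS_OF_BITS */

/* Top up 'value' with whole bytes until fewer than 8 bit positions are
 * free. When the buffer runs out, 'count' is padded with LOTS_OF_BITS;
 * bool_error() later tests for that padding to detect decoding past the
 * end of the stream. */
static void fill(const unsigned char **buf, const unsigned char *end,
                 bd_value *value, int *count) {
  int shift = BD_VALUE_BITS - 8 - (*count + 8);
  int loop_end = 0;
  int bits_left = (int)((end - *buf) * CHAR_BIT);
  int x = shift + CHAR_BIT - bits_left;

  if (x >= 0) {                          /* input nearly/fully exhausted */
    *count += LOTS_OF_BITS;
    loop_end = x;
  }
  if (x < 0 || bits_left) {
    while (shift >= loop_end) {          /* shift in one byte at a time */
      *count += CHAR_BIT;
      *value |= (bd_value)*(*buf)++ << shift;
      shift -= CHAR_BIT;
    }
  }
}
```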
- An inline function would be cleaner, but has a significant penalty, because - multiple BOOL_DECODER fields must be modified, and the compiler is not smart - enough to eliminate the stores to those fields and the subsequent reloads - from them when inlining the function.*/ -#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \ - do \ - { \ - int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \ - int loop_end, x; \ - int bits_left = (int)(((_bufend)-(_bufptr))*CHAR_BIT); \ - \ - x = shift + CHAR_BIT - bits_left; \ - loop_end = 0; \ - if(x >= 0) \ - { \ - (_count) += VP9_LOTS_OF_BITS; \ - loop_end = x; \ - if(!bits_left) break; \ - } \ - while(shift >= loop_end) \ - { \ - (_count) += CHAR_BIT; \ - (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \ - shift -= CHAR_BIT; \ - } \ - } \ - while(0) \ - - static int decode_bool(BOOL_DECODER *br, int probability) { unsigned int bit = 0; VP9_BD_VALUE value; - unsigned int split; VP9_BD_VALUE bigsplit; int count; unsigned int range; - - split = 1 + (((br->range - 1) * probability) >> 8); + unsigned int split = 1 + (((br->range - 1) * probability) >> 8); if (br->count < 0) vp9_bool_decoder_fill(br); @@ -120,36 +88,30 @@ static int decode_value(BOOL_DECODER *br, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) { - z |= (decode_bool(br, 0x80) << bit); + z |= decode_bool(br, 0x80) << bit; } return z; } static int bool_error(BOOL_DECODER *br) { - /* Check if we have reached the end of the buffer. - * - * Variable 'count' stores the number of bits in the 'value' buffer, minus - * 8. The top byte is part of the algorithm, and the remainder is buffered - * to be shifted into it. So if count == 8, the top 16 bits of 'value' are - * occupied, 8 for the algorithm and 8 in the buffer. - * - * When reading a byte from the user's buffer, count is filled with 8 and - * one byte is filled into the value buffer. When we reach the end of the - * data, count is additionally filled with VP9_LOTS_OF_BITS. So when - * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. - */ - if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) { - /* We have tried to decode bits after the end of - * stream was encountered. - */ - return 1; - } - - /* No error. */ - return 0; + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with VP9_LOTS_OF_BITS. So when + // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. 
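  // Editorial worked example (assuming a 64-bit VP9_BD_VALUE, so
  // VP9_BD_VALUE_SIZE == 64): while real input remains, refills keep count
  // in roughly [-8, 56] and the test below stays false. At the end of the
  // buffer the refill adds VP9_LOTS_OF_BITS (0x40000000); each further
  // decoded bit then drains count, and the moment it drops below
  // VP9_LOTS_OF_BITS while still above 64, more bits have been consumed
  // than the buffer ever held -- the window the expression catches.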
+ return br->count > VP9_BD_VALUE_SIZE && br->count < VP9_LOTS_OF_BITS; } -extern int vp9_decode_unsigned_max(BOOL_DECODER *br, int max); +int vp9_decode_unsigned_max(BOOL_DECODER *br, int max); #endif // VP9_DECODER_VP9_DBOOLHUFF_H_ diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index c6c3d1576dc69b955d6dcbcf45c64e8af6ebbd55..353e94fa58d69a24b9a4535b94949a18fab6db55 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -12,6 +12,7 @@ #include "vp9/decoder/vp9_treereader.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" @@ -28,12 +29,13 @@ #ifdef DEBUG_DEC_MV int dec_mvcount = 0; #endif + // #define DEC_DEBUG #ifdef DEC_DEBUG extern int dec_debug; #endif -static int read_bmode(vp9_reader *bc, const vp9_prob *p) { +static B_PREDICTION_MODE read_bmode(vp9_reader *bc, const vp9_prob *p) { B_PREDICTION_MODE m = treed_read(bc, vp9_bmode_tree, p); #if CONFIG_NEWBINTRAMODES if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS) @@ -43,53 +45,63 @@ static int read_bmode(vp9_reader *bc, const vp9_prob *p) { return m; } -static int read_kf_bmode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_kf_bmode_tree, p); +static B_PREDICTION_MODE read_kf_bmode(vp9_reader *bc, const vp9_prob *p) { + return (B_PREDICTION_MODE)treed_read(bc, vp9_kf_bmode_tree, p); } -static int read_ymode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_ymode_tree, p); +static MB_PREDICTION_MODE read_ymode(vp9_reader *bc, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(bc, vp9_ymode_tree, p); } -static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_sb_ymode_tree, p); +static MB_PREDICTION_MODE read_sb_ymode(vp9_reader *bc, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(bc, vp9_sb_ymode_tree, p); } -static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_uv_mode_tree, p); +static MB_PREDICTION_MODE read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p); } -static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_kf_ymode_tree, p); +static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(bc, vp9_kf_ymode_tree, p); } static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) { return treed_read(bc, vp9_i8x8_mode_tree, p); } -static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) { - return treed_read(bc, vp9_uv_mode_tree, p); +static MB_PREDICTION_MODE read_uv_mode(vp9_reader *bc, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p); } // This function reads the current macro block's segnent id from the bitstream // It should only be called if a segment map update is indicated. -static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, - MACROBLOCKD *xd) { - /* Is segmentation enabled */ +static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *xd) { if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { - /* If so then read the segment id. 
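// Editorial aside: the rewrite above folds the old if/else ladder into one
// read over a three-probability binary tree. With four segment ids, p[0]
// selects the {0,1} vs {2,3} subtree and p[1]/p[2] choose within it:
//   segment_id = vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2])
//                                  : vp9_read(r, p[1]);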
*/ - if (vp9_read(r, xd->mb_segment_tree_probs[0])) - mi->segment_id = - (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2])); - else - mi->segment_id = - (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1])); + const vp9_prob *const p = xd->mb_segment_tree_probs; + mi->segment_id = vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2]) + : vp9_read(r, p[1]); + } +} + +// This function reads the current macro block's segnent id from the bitstream +// It should only be called if a segment map update is indicated. +static void read_mb_segid_except(VP9_COMMON *cm, + vp9_reader *r, MB_MODE_INFO *mi, + MACROBLOCKD *xd, int mb_row, int mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, mb_index); + const vp9_prob *const p = xd->mb_segment_tree_probs; + const vp9_prob prob = xd->mb_segment_mispred_tree_probs[pred_seg_id]; + + if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { + mi->segment_id = vp9_read(r, prob) + ? 2 + (pred_seg_id < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2)) + : (pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0)); } } #if CONFIG_NEW_MVREF -int vp9_read_mv_ref_id(vp9_reader *r, - vp9_prob * ref_id_probs) { +int vp9_read_mv_ref_id(vp9_reader *r, vp9_prob *ref_id_probs) { int ref_index = 0; if (vp9_read(r, ref_id_probs[0])) { @@ -111,10 +123,13 @@ static void kfread_modes(VP9D_COMP *pbi, int mb_col, BOOL_DECODER* const bc) { VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const int mis = pbi->common.mode_info_stride; int map_index = mb_row * pbi->common.mb_cols + mb_col; MB_PREDICTION_MODE y_mode; + m->mbmi.ref_frame = INTRA_FRAME; + // Read the Macroblock segmentation map if it is being updated explicitly // this frame (reset to 0 by default). m->mbmi.segment_id = 0; @@ -139,60 +154,52 @@ static void kfread_modes(VP9D_COMP *pbi, m->mbmi.mb_skip_coeff = 0; if (pbi->common.mb_no_coeff_skip && - (!vp9_segfeature_active(&pbi->mb, - m->mbmi.segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(&pbi->mb, - m->mbmi.segment_id, SEG_LVL_EOB) != 0))) { - MACROBLOCKD *const xd = &pbi->mb; - m->mbmi.mb_skip_coeff = - vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); + (!vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))) { + m->mbmi.mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, &pbi->mb, + PRED_MBSKIP)); } else { - if (vp9_segfeature_active(&pbi->mb, - m->mbmi.segment_id, SEG_LVL_EOB) && - (vp9_get_segdata(&pbi->mb, - m->mbmi.segment_id, SEG_LVL_EOB) == 0)) { - m->mbmi.mb_skip_coeff = 1; - } else - m->mbmi.mb_skip_coeff = 0; + m->mbmi.mb_skip_coeff = vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, + SEG_LVL_SKIP); } - if (m->mbmi.sb_type) { - y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc, - pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]); - } else { - y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc, - pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]); - } + y_mode = m->mbmi.sb_type ? + read_kf_sb_ymode(bc, + pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]): + read_kf_mb_ymode(bc, + pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]); m->mbmi.ref_frame = INTRA_FRAME; if ((m->mbmi.mode = y_mode) == B_PRED) { int i = 0; do { - const B_PREDICTION_MODE A = above_block_mode(m, i, mis); - const B_PREDICTION_MODE L = left_block_mode(m, i); + const B_PREDICTION_MODE a = above_block_mode(m, i, mis); + const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ? 
+ left_block_mode(m, i) : B_DC_PRED; - m->bmi[i].as_mode.first = - (B_PREDICTION_MODE) read_kf_bmode( - bc, pbi->common.kf_bmode_prob [A] [L]); + m->bmi[i].as_mode.first = read_kf_bmode(bc, + pbi->common.kf_bmode_prob[a][l]); } while (++i < 16); } + if ((m->mbmi.mode = y_mode) == I8X8_PRED) { int i; - int mode8x8; for (i = 0; i < 4; i++) { - int ib = vp9_i8x8_block[i]; - mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob); + const int ib = vp9_i8x8_block[i]; + const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob); + m->bmi[ib + 0].as_mode.first = mode8x8; m->bmi[ib + 1].as_mode.first = mode8x8; m->bmi[ib + 4].as_mode.first = mode8x8; m->bmi[ib + 5].as_mode.first = mode8x8; } - } else - m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc, - pbi->common.kf_uv_mode_prob[m->mbmi.mode]); + } else { + m->mbmi.uv_mode = read_uv_mode(bc, + pbi->common.kf_uv_mode_prob[m->mbmi.mode]); + } - if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 && + if (cm->txfm_mode == TX_MODE_SELECT && + m->mbmi.mb_skip_coeff == 0 && m->mbmi.mode <= I8X8_PRED) { // FIXME(rbultje) code ternary symbol once all experiments are merged m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]); @@ -215,23 +222,23 @@ static void kfread_modes(VP9D_COMP *pbi, static int read_nmv_component(vp9_reader *r, int rv, const nmv_component *mvcomp) { - int v, s, z, c, o, d; - s = vp9_read(r, mvcomp->sign); - c = treed_read(r, vp9_mv_class_tree, mvcomp->classes); - if (c == MV_CLASS_0) { + int mag, d; + const int sign = vp9_read(r, mvcomp->sign); + const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); + + if (mv_class == MV_CLASS_0) { d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0); } else { - int i, b; + int i; + int n = mv_class + CLASS0_BITS - 1; // number of bits + d = 0; - b = c + CLASS0_BITS - 1; /* number of bits */ - for (i = 0; i < b; ++i) - d |= (vp9_read(r, mvcomp->bits[i]) << i); + for (i = 0; i < n; ++i) + d |= vp9_read(r, mvcomp->bits[i]) << i; } - o = d << 3; - z = vp9_get_mv_mag(c, o); - v = (s ? -(z + 8) : (z + 8)); - return v; + mag = vp9_get_mv_mag(mv_class, d << 3); + return sign ? -(mag + 8) : (mag + 8); } static int read_nmv_component_fp(vp9_reader *r, @@ -239,43 +246,34 @@ static int read_nmv_component_fp(vp9_reader *r, int rv, const nmv_component *mvcomp, int usehp) { - int s, z, c, o, d, e, f; - s = v < 0; - z = (s ? -v : v) - 1; /* magnitude - 1 */ - z &= ~7; + const int sign = v < 0; + int mag = ((sign ? -v : v) - 1) & ~7; // magnitude - 1 + int offset; + const int mv_class = vp9_get_mv_class(mag, &offset); + const int f = mv_class == MV_CLASS_0 ? + treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[offset >> 3]): + treed_read(r, vp9_mv_fp_tree, mvcomp->fp); - c = vp9_get_mv_class(z, &o); - d = o >> 3; - - if (c == MV_CLASS_0) { - f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]); - } else { - f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp); - } - o += (f << 1); + offset += f << 1; if (usehp) { - if (c == MV_CLASS_0) { - e = vp9_read(r, mvcomp->class0_hp); - } else { - e = vp9_read(r, mvcomp->hp); - } - o += e; + const vp9_prob p = mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp; + offset += vp9_read(r, p); } else { - ++o; /* Note if hp is not used, the default value of the hp bit is 1 */ + offset += 1; // If hp is not used, the default value of the hp bit is 1 } - z = vp9_get_mv_mag(c, o); - v = (s ? -(z + 1) : (z + 1)); - return v; + mag = vp9_get_mv_mag(mv_class, offset); + return sign ? 
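// Editorial aside on the magnitude coding in read_nmv_component above: a
// component is (sign, class, offset bits). vp9_get_mv_mag() is assumed to
// combine the class base with the offset; "d << 3" leaves the low three
// bits free for the 1/8-pel pass, and the +8 bias is what this fractional
// reader undoes via ((sign ? -v : v) - 1) & ~7. Integer-part form:
//   mag = vp9_get_mv_mag(mv_class, d << 3);
//   v   = sign ? -(mag + 8) : (mag + 8);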
-(mag + 1) : (mag + 1); } static void read_nmv(vp9_reader *r, MV *mv, const MV *ref, const nmv_context *mvctx) { - MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints); + const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints); mv->row = mv-> col = 0; if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]); } + if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) { mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]); } @@ -283,7 +281,7 @@ static void read_nmv(vp9_reader *r, MV *mv, const MV *ref, static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref, const nmv_context *mvctx, int usehp) { - MV_JOINT_TYPE j = vp9_get_mv_joint(*mv); + const MV_JOINT_TYPE j = vp9_get_mv_joint(*mv); usehp = usehp && vp9_use_nmv_hp(ref); if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0], @@ -293,7 +291,10 @@ static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref, mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1], usehp); } - //printf(" %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col); + /* + printf("MV: %d %d REF: %d %d\n", mv->row + ref->row, mv->col + ref->col, + ref->row, ref->col); + */ } static void update_nmv(vp9_reader *bc, vp9_prob *const p, @@ -310,48 +311,40 @@ static void update_nmv(vp9_reader *bc, vp9_prob *const p, static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx, int usehp) { int i, j, k; + #ifdef MV_GROUP_UPDATE - if (!vp9_read_bit(bc)) return; + if (!vp9_read_bit(bc)) + return; #endif - for (j = 0; j < MV_JOINTS - 1; ++j) { - update_nmv(bc, &mvctx->joints[j], - VP9_NMV_UPDATE_PROB); - } + for (j = 0; j < MV_JOINTS - 1; ++j) + update_nmv(bc, &mvctx->joints[j], VP9_NMV_UPDATE_PROB); + for (i = 0; i < 2; ++i) { - update_nmv(bc, &mvctx->comps[i].sign, - VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) { - update_nmv(bc, &mvctx->comps[i].classes[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - update_nmv(bc, &mvctx->comps[i].class0[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - update_nmv(bc, &mvctx->comps[i].bits[j], - VP9_NMV_UPDATE_PROB); - } + update_nmv(bc, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB); + for (j = 0; j < MV_CLASSES - 1; ++j) + update_nmv(bc, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < CLASS0_SIZE - 1; ++j) + update_nmv(bc, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_nmv(bc, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) { for (k = 0; k < 3; ++k) - update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < 3; ++j) { - update_nmv(bc, &mvctx->comps[i].fp[j], - VP9_NMV_UPDATE_PROB); + update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); } + + for (j = 0; j < 3; ++j) + update_nmv(bc, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(bc, &mvctx->comps[i].class0_hp, - VP9_NMV_UPDATE_PROB); - update_nmv(bc, &mvctx->comps[i].hp, - VP9_NMV_UPDATE_PROB); + update_nmv(bc, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + update_nmv(bc, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB); } } } @@ -361,15 +354,11 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi, vp9_reader *const bc, unsigned char segment_id) { MV_REFERENCE_FRAME ref_frame; - int 
seg_ref_active; - int seg_ref_count = 0; - VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - seg_ref_active = vp9_segfeature_active(xd, - segment_id, - SEG_LVL_REF_FRAME); + int seg_ref_count = 0; + int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME); // If segment coding enabled does the segment allow for more than one // possible reference frame @@ -384,15 +373,13 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi, // multiple reference frame options if (!seg_ref_active || (seg_ref_count > 1)) { // Values used in prediction model coding - unsigned char prediction_flag; - vp9_prob pred_prob; MV_REFERENCE_FRAME pred_ref; // Get the context probability the prediction flag - pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF); + vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF); // Read the prediction status flag - prediction_flag = (unsigned char)vp9_read(bc, pred_prob); + unsigned char prediction_flag = vp9_read(bc, pred_prob); // Store the prediction flag. vp9_set_pred_flag(xd, PRED_REF, prediction_flag); @@ -403,9 +390,8 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi, // If correctly predicted then use the predicted value if (prediction_flag) { ref_frame = pred_ref; - } - // else decode the explicitly coded value - else { + } else { + // decode the explicitly coded value vp9_prob mod_refprobs[PREDICTION_PROBS]; vpx_memcpy(mod_refprobs, cm->mod_refprobs[pred_ref], sizeof(mod_refprobs)); @@ -456,10 +442,8 @@ static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi, } } } - } - - // Segment reference frame features are enabled - else { + } else { + // Segment reference frame features are enabled // The reference frame for the mb is considered as correclty predicted // if it is signaled at the segment level for the purposes of the // common prediction model @@ -492,12 +476,12 @@ unsigned int vp9_mv_cont_count[5][4] = { }; #endif -static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1}; +static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 }; static const unsigned char mbsplit_fill_offset[4][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}, - { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } }; static void read_switchable_interp_probs(VP9D_COMP* const pbi, @@ -506,7 +490,7 @@ static void read_switchable_interp_probs(VP9D_COMP* const pbi, int i, j; for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8); + cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc); } } //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0], @@ -527,13 +511,13 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { #if CONFIG_COMP_INTERINTRA_PRED if (cm->use_interintra) { if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB)) - cm->fc.interintra_prob = (vp9_prob)vp9_read_literal(bc, 8); + cm->fc.interintra_prob = vp9_read_prob(bc); } #endif // Decode the baseline probabilities for decoding reference frame - cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8); - cm->prob_last_coded = (vp9_prob)vp9_read_literal(bc, 8); - cm->prob_gf_coded = 
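// Editorial aside: this hunk consistently replaces
//   (vp9_prob)vp9_read_literal(bc, 8)
// with vp9_read_prob(bc); the helper is assumed to be exactly that thin
// wrapper -- eight raw bits reinterpreted as a probability -- so the swaps
// are cosmetic, not behavioral.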
(vp9_prob)vp9_read_literal(bc, 8); + cm->prob_intra_coded = vp9_read_prob(bc); + cm->prob_last_coded = vp9_read_prob(bc); + cm->prob_gf_coded = vp9_read_prob(bc); // Computes a modified set of probabilities for use when reference // frame prediction fails. @@ -545,14 +529,14 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { if (cm->comp_pred_mode == HYBRID_PREDICTION) { int i; for (i = 0; i < COMP_PRED_CONTEXTS; i++) - cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8); + cm->prob_comppred[i] = vp9_read_prob(bc); } if (vp9_read_bit(bc)) { int i = 0; do { - cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8); + cm->fc.ymode_prob[i] = vp9_read_prob(bc); } while (++i < VP9_YMODES - 1); } @@ -560,7 +544,7 @@ static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) { int i = 0; do { - cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8); + cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc); } while (++i < VP9_I32X32_MODES - 1); } @@ -575,10 +559,10 @@ static void read_mb_segment_id(VP9D_COMP *pbi, int mb_row, int mb_col, BOOL_DECODER* const bc) { VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; MODE_INFO *mi = xd->mode_info_context; MB_MODE_INFO *mbmi = &mi->mbmi; - int index = mb_row * pbi->common.mb_cols + mb_col; + int mb_index = mb_row * pbi->common.mb_cols + mb_col; if (xd->segmentation_enabled) { if (xd->update_mb_segmentation_map) { @@ -586,12 +570,10 @@ static void read_mb_segment_id(VP9D_COMP *pbi, if (cm->temporal_update) { // Get the context based probability for reading the // prediction status flag - vp9_prob pred_prob = - vp9_get_pred_prob(cm, xd, PRED_SEG_ID); + vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID); // Read the prediction status flag - unsigned char seg_pred_flag = - (unsigned char)vp9_read(bc, pred_prob); + unsigned char seg_pred_flag = vp9_read(bc, pred_prob); // Store the prediction flag. vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag); @@ -599,17 +581,16 @@ static void read_mb_segment_id(VP9D_COMP *pbi, // If the value is flagged as correctly predicted // then use the predicted value if (seg_pred_flag) { - mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index); + mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, mb_index); + } else { + // Decode it explicitly + read_mb_segid_except(cm, bc, mbmi, xd, mb_row, mb_col); } - // Else .... 
decode it explicitly - else { - read_mb_segid(bc, mbmi, xd); - } - } - // Normal unpredicted coding mode - else { + } else { + // Normal unpredicted coding mode read_mb_segid(bc, mbmi, xd); } + if (mbmi->sb_type) { const int nmbs = 1 << mbmi->sb_type; const int ymbs = MIN(cm->mb_rows - mb_row, nmbs); @@ -618,12 +599,12 @@ static void read_mb_segment_id(VP9D_COMP *pbi, for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { - cm->last_frame_seg_map[index + x + y * cm->mb_cols] = + cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols] = mbmi->segment_id; } } } else { - cm->last_frame_seg_map[index] = mbmi->segment_id; + cm->last_frame_seg_map[mb_index] = mbmi->segment_id; } } else { if (mbmi->sb_type) { @@ -636,13 +617,12 @@ static void read_mb_segment_id(VP9D_COMP *pbi, for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { segment_id = MIN(segment_id, - cm->last_frame_seg_map[index + x + - y * cm->mb_cols]); + cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols]); } } mbmi->segment_id = segment_id; } else { - mbmi->segment_id = cm->last_frame_seg_map[index]; + mbmi->segment_id = cm->last_frame_seg_map[mb_index]; } } } else { @@ -652,6 +632,27 @@ static void read_mb_segment_id(VP9D_COMP *pbi, } } + +static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, + int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) { + dst->as_int = src->as_int; + clamp_mv(dst, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, + mb_to_bottom_edge); +} + +static INLINE void process_mv(BOOL_DECODER* bc, MV *mv, MV *ref, + nmv_context *nmvc, nmv_context_counts *mvctx, + int usehp) { + read_nmv(bc, mv, ref, nmvc); + read_nmv_fp(bc, mv, ref, nmvc, usehp); + vp9_increment_nmv(mv, ref, mvctx, usehp); + mv->row += ref->row; + mv->col += ref->col; +} + static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, MODE_INFO *prev_mi, int mb_row, int mb_col, @@ -659,124 +660,109 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, VP9_COMMON *const cm = &pbi->common; nmv_context *const nmvc = &pbi->common.fc.nmvc; const int mis = pbi->common.mode_info_stride; - MACROBLOCKD *const xd = &pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; int_mv *const mv = &mbmi->mv[0]; - int mb_to_left_edge; - int mb_to_right_edge; - int mb_to_top_edge; - int mb_to_bottom_edge; const int mb_size = 1 << mi->mbmi.sb_type; - mb_to_top_edge = xd->mb_to_top_edge; - mb_to_bottom_edge = xd->mb_to_bottom_edge; - mb_to_top_edge -= LEFT_TOP_MARGIN; - mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN; + const int use_prev_in_find_mv_refs = cm->width == cm->last_width && + cm->height == cm->last_height && + !cm->error_resilient_mode; + + int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge; + mbmi->need_to_clamp_mvs = 0; mbmi->need_to_clamp_secondmv = 0; mbmi->second_ref_frame = NONE; - /* Distance of Mb to the various image edges. - * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units - */ - xd->mb_to_left_edge = - mb_to_left_edge = -((mb_col * 16) << 3); - mb_to_left_edge -= LEFT_TOP_MARGIN; - xd->mb_to_right_edge = - mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3; - mb_to_right_edge += RIGHT_BOTTOM_MARGIN; // Make sure the MACROBLOCKD mode info pointer is pointed at the // correct entry for the current macroblock. xd->mode_info_context = mi; xd->prev_mode_info_context = prev_mi; + // Distance of Mb to the various image edges. 
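// Editorial sketch: when the map is not refreshed, the loops above give a
// superblock the minimum previous-frame segment id over every macroblock
// it spans (clipped at the frame edge). Stand-alone form, assuming ids run
// 0..3 so 3 is a safe initial value:
//   segment_id = 3;
//   for (y = 0; y < ymbs; y++)
//     for (x = 0; x < xmbs; x++)
//       segment_id = MIN(segment_id,
//                        cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols]);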
+ // These specified to 8th pel as they are always compared to MV values + // that are in 1/8th pel units + set_mb_row(cm, xd, mb_row, mb_size); + set_mb_col(cm, xd, mb_col, mb_size); + + mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; + mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + // Read the macroblock segment id. read_mb_segment_id(pbi, mb_row, mb_col, bc); if (pbi->common.mb_no_coeff_skip && - (!vp9_segfeature_active(xd, - mbmi->segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) { + (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP))) { // Read the macroblock coeff skip flag if this feature is in use, // else default to 0 mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); } else { - if (vp9_segfeature_active(xd, - mbmi->segment_id, SEG_LVL_EOB) && - (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) { - mbmi->mb_skip_coeff = 1; - } else - mbmi->mb_skip_coeff = 0; + mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id, + SEG_LVL_SKIP); } // Read the reference frame - if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE) - && vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE) < NEARESTMV) - mbmi->ref_frame = INTRA_FRAME; - else - mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id); + mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id); + + /* + if (pbi->common.current_video_frame == 1) + printf("ref frame: %d [%d %d]\n", mbmi->ref_frame, mb_row, mb_col); + */ // If reference frame is an Inter frame if (mbmi->ref_frame) { int_mv nearest, nearby, best_mv; int_mv nearest_second, nearby_second, best_mv_second; - vp9_prob mv_ref_p [VP9_MVREFS - 1]; + vp9_prob mv_ref_p[VP9_MVREFS - 1]; - int recon_y_stride, recon_yoffset; - int recon_uv_stride, recon_uvoffset; MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame; + xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1]; { - int ref_fb_idx; + const int use_prev_in_find_best_ref = + xd->scale_factor[0].x_num == xd->scale_factor[0].x_den && + xd->scale_factor[0].y_num == xd->scale_factor[0].y_den && + !cm->error_resilient_mode && + !cm->frame_parallel_decoding_mode; /* Select the appropriate reference frame for this MB */ - if (ref_frame == LAST_FRAME) - ref_fb_idx = cm->lst_fb_idx; - else if (ref_frame == GOLDEN_FRAME) - ref_fb_idx = cm->gld_fb_idx; - else - ref_fb_idx = cm->alt_fb_idx; - - recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride ; - recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; + const int ref_fb_idx = cm->active_ref_idx[ref_frame - 1]; - recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16); - recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8); - - xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], + mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]); #ifdef DEC_DEBUG if (dec_debug) printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row, xd->mode_info_context->mbmi.mv[0].as_mv.col); #endif - vp9_find_mv_refs(xd, mi, prev_mi, + // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5) + // printf("Dello\n"); + vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? 
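// Editorial aside: the lst/gld/alt if/else ladder disappears because the
// three reference indices now sit in one array indexed by (ref_frame - 1):
//   ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
// and setup_pred_block() is assumed to derive the y/u/v buffer offsets
// from (mb_row, mb_col) plus the scale factors, replacing the hand-rolled
// recon_yoffset / recon_uvoffset arithmetic removed here.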
prev_mi : NULL, ref_frame, mbmi->ref_mvs[ref_frame], cm->ref_frame_sign_bias); vp9_mv_ref_probs(&pbi->common, mv_ref_p, mbmi->mb_mode_context[ref_frame]); - // Is the segment level mode feature enabled for this segment - if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) { - mbmi->mode = - vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE); + // If the segment level skip mode enabled + if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) { + mbmi->mode = ZEROMV; } else { - if (mbmi->sb_type) - mbmi->mode = read_sb_mv_ref(bc, mv_ref_p); - else - mbmi->mode = read_mv_ref(bc, mv_ref_p); - + mbmi->mode = mbmi->sb_type ? read_sb_mv_ref(bc, mv_ref_p) + : read_mv_ref(bc, mv_ref_p); vp9_accum_mv_refs(&pbi->common, mbmi->mode, mbmi->mb_mode_context[ref_frame]); } if (mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, - xd->pre.y_buffer, - recon_y_stride, + use_prev_in_find_best_ref ? + xd->pre.y_buffer : NULL, + xd->pre.y_stride, mbmi->ref_mvs[ref_frame], &nearest, &nearby); @@ -791,8 +777,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, #endif } - if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) - { + if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) { if (cm->mcomp_filter_type == SWITCHABLE) { mbmi->interp_filter = vp9_switchable_interp[ treed_read(bc, vp9_switchable_interp_tree, @@ -817,31 +802,31 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mbmi->second_ref_frame = 1; if (mbmi->second_ref_frame > 0) { int second_ref_fb_idx; + int use_prev_in_find_best_ref; + + xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1]; + use_prev_in_find_best_ref = + xd->scale_factor[1].x_num == xd->scale_factor[1].x_den && + xd->scale_factor[1].y_num == xd->scale_factor[1].y_den && + !cm->error_resilient_mode && + !cm->frame_parallel_decoding_mode; + /* Select the appropriate reference frame for this MB */ - if (mbmi->second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cm->lst_fb_idx; - else if (mbmi->second_ref_frame == - GOLDEN_FRAME) - second_ref_fb_idx = cm->gld_fb_idx; - else - second_ref_fb_idx = cm->alt_fb_idx; + second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1]; - xd->second_pre.y_buffer = - cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset; - xd->second_pre.u_buffer = - cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset; - xd->second_pre.v_buffer = - cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx], + mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]); - vp9_find_mv_refs(xd, mi, prev_mi, + vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL, mbmi->second_ref_frame, mbmi->ref_mvs[mbmi->second_ref_frame], cm->ref_frame_sign_bias); if (mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, - xd->second_pre.y_buffer, - recon_y_stride, + use_prev_in_find_best_ref ? 
+ xd->second_pre.y_buffer : NULL, + xd->second_pre.y_stride, mbmi->ref_mvs[mbmi->second_ref_frame], &nearest_second, &nearby_second); @@ -861,12 +846,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, pbi->common.fc.interintra_counts[ mbmi->second_ref_frame == INTRA_FRAME]++; if (mbmi->second_ref_frame == INTRA_FRAME) { - mbmi->interintra_mode = (MB_PREDICTION_MODE)read_ymode( - bc, pbi->common.fc.ymode_prob); + mbmi->interintra_mode = read_ymode(bc, pbi->common.fc.ymode_prob); pbi->common.fc.ymode_counts[mbmi->interintra_mode]++; #if SEPARATE_INTERINTRA_UV - mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)read_uv_mode( - bc, pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]); + mbmi->interintra_uv_mode = read_uv_mode(bc, + pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]); pbi->common.fc.uv_mode_counts[mbmi->interintra_mode] [mbmi->interintra_uv_mode]++; #else @@ -905,28 +889,26 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mbmi->uv_mode = DC_PRED; switch (mbmi->mode) { case SPLITMV: { - const int s = mbmi->partitioning = - treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob); - const int num_p = vp9_mbsplit_count [s]; + const int s = treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob); + const int num_p = vp9_mbsplit_count[s]; int j = 0; - cm->fc.mbsplit_counts[s]++; + cm->fc.mbsplit_counts[s]++; mbmi->need_to_clamp_mvs = 0; - do { /* for each subset j */ + mbmi->partitioning = s; + do { // for each subset j int_mv leftmv, abovemv, second_leftmv, second_abovemv; int_mv blockmv, secondmv; - int k; /* first block in subset j */ int mv_contz; int blockmode; + int k = vp9_mbsplit_offset[s][j]; // first block in subset j - k = vp9_mbsplit_offset[s][j]; - - leftmv.as_int = left_block_mv(mi, k); + leftmv.as_int = left_block_mv(xd, mi, k); abovemv.as_int = above_block_mv(mi, k, mis); second_leftmv.as_int = 0; second_abovemv.as_int = 0; if (mbmi->second_ref_frame > 0) { - second_leftmv.as_int = left_block_second_mv(mi, k); + second_leftmv.as_int = left_block_second_mv(xd, mi, k); second_abovemv.as_int = above_block_second_mv(mi, k, mis); } mv_contz = vp9_mv_cont(&leftmv, &abovemv); @@ -935,23 +917,13 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, switch (blockmode) { case NEW4X4: - read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc); - read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc, - xd->allow_high_precision_mv); - vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv, - &cm->fc.NMVcount, xd->allow_high_precision_mv); - blockmv.as_mv.row += best_mv.as_mv.row; - blockmv.as_mv.col += best_mv.as_mv.col; - - if (mbmi->second_ref_frame > 0) { - read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc); - read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, - xd->allow_high_precision_mv); - vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv, - &cm->fc.NMVcount, xd->allow_high_precision_mv); - secondmv.as_mv.row += best_mv_second.as_mv.row; - secondmv.as_mv.col += best_mv_second.as_mv.col; - } + process_mv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc, + &cm->fc.NMVcount, xd->allow_high_precision_mv); + + if (mbmi->second_ref_frame > 0) + process_mv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, + &cm->fc.NMVcount, xd->allow_high_precision_mv); + #ifdef VPX_MODE_COUNT vp9_mv_cont_count[mv_contz][3]++; #endif @@ -1005,15 +977,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, /* Fill (uniform) modes, mvs of jth subset. 
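// Editorial aside: process_mv() folds the former four-step NEWMV sequence
// -- read_nmv(), read_nmv_fp(), vp9_increment_nmv(), then adding the
// reference -- into one helper, so the residual-to-absolute conversion
//   mv->row += ref->row;  mv->col += ref->col;
// can no longer be forgotten at any of its call sites.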
Must do it here because ensuing subsets can refer back to us via "left" or "above". */ - const unsigned char *fill_offset; unsigned int fill_count = mbsplit_fill_count[s]; - - fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]]; + const unsigned char *fill_offset = + &mbsplit_fill_offset[s][j * fill_count]; do { - mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int; + mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int; if (mbmi->second_ref_frame > 0) - mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int; + mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int; fill_offset++; } while (--fill_count); } @@ -1021,33 +992,35 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, } while (++j < num_p); } - mv->as_int = mi->bmi[15].as_mv.first.as_int; - mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int; + mv->as_int = mi->bmi[15].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int; break; /* done with SPLITMV */ case NEARMV: - mv->as_int = nearby.as_int; - /* Clip "next_nearest" so that it does not extend to far out of image */ - clamp_mv(mv, mb_to_left_edge, mb_to_right_edge, - mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->second_ref_frame > 0) { - mbmi->mv[1].as_int = nearby_second.as_int; - clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge, - mb_to_top_edge, mb_to_bottom_edge); - } + // Clip "next_nearest" so that it does not extend to far out of image + assign_and_clamp_mv(mv, &nearby, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + if (mbmi->second_ref_frame > 0) + assign_and_clamp_mv(&mbmi->mv[1], &nearby_second, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); break; case NEARESTMV: - mv->as_int = nearest.as_int; - /* Clip "next_nearest" so that it does not extend to far out of image */ - clamp_mv(mv, mb_to_left_edge, mb_to_right_edge, - mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->second_ref_frame > 0) { - mbmi->mv[1].as_int = nearest_second.as_int; - clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge, - mb_to_top_edge, mb_to_bottom_edge); - } + // Clip "next_nearest" so that it does not extend to far out of image + assign_and_clamp_mv(mv, &nearest, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + if (mbmi->second_ref_frame > 0) + assign_and_clamp_mv(&mbmi->mv[1], &nearest_second, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); break; case ZEROMV: @@ -1057,21 +1030,13 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, break; case NEWMV: + process_mv(bc, &mv->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount, + xd->allow_high_precision_mv); - read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc); - read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc, - xd->allow_high_precision_mv); - vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount, - xd->allow_high_precision_mv); - - mv->as_mv.row += best_mv.as_mv.row; - mv->as_mv.col += best_mv.as_mv.col; - - /* Don't need to check this on NEARMV and NEARESTMV modes - * since those modes clamp the MV. The NEWMV mode does not, - * so signal to the prediction stage whether special - * handling may be required. - */ + // Don't need to check this on NEARMV and NEARESTMV modes + // since those modes clamp the MV. The NEWMV mode does not, + // so signal to the prediction stage whether special + // handling may be required. 
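// Editorial sketch: check_mv_bounds() is assumed to report whether the MV
// escapes the margin-extended frame, roughly
//   mv->as_mv.col < mb_to_left_edge || mv->as_mv.col > mb_to_right_edge ||
//   mv->as_mv.row < mb_to_top_edge  || mv->as_mv.row > mb_to_bottom_edge
// so clamping is requested from the prediction stage only when needed.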
mbmi->need_to_clamp_mvs = check_mv_bounds(mv, mb_to_left_edge, mb_to_right_edge, @@ -1079,17 +1044,13 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_bottom_edge); if (mbmi->second_ref_frame > 0) { - read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc); - read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc, - xd->allow_high_precision_mv); - vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv, - &cm->fc.NMVcount, xd->allow_high_precision_mv); - mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row; - mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col; - mbmi->need_to_clamp_secondmv |= - check_mv_bounds(&mbmi->mv[1], - mb_to_left_edge, mb_to_right_edge, - mb_to_top_edge, mb_to_bottom_edge); + process_mv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc, + &cm->fc.NMVcount, xd->allow_high_precision_mv); + mbmi->need_to_clamp_secondmv |= check_mv_bounds(&mbmi->mv[1], + mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); } break; default: @@ -1102,16 +1063,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, /* required for left and above block mv */ mbmi->mv[0].as_int = 0; - if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) { - mbmi->mode = (MB_PREDICTION_MODE) - vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE); - } else if (mbmi->sb_type) { - mbmi->mode = (MB_PREDICTION_MODE) - read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob); + if (mbmi->sb_type) { + mbmi->mode = read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob); pbi->common.fc.sb_ymode_counts[mbmi->mode]++; } else { - mbmi->mode = (MB_PREDICTION_MODE) - read_ymode(bc, pbi->common.fc.ymode_prob); + mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob); pbi->common.fc.ymode_counts[mbmi->mode]++; } @@ -1119,9 +1075,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, if (mbmi->mode == B_PRED) { int j = 0; do { - int m; - m = mi->bmi[j].as_mode.first = (B_PREDICTION_MODE) - read_bmode(bc, pbi->common.fc.bmode_prob); + int m = read_bmode(bc, pbi->common.fc.bmode_prob); + mi->bmi[j].as_mode.first = m; #if CONFIG_NEWBINTRAMODES if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS; #endif @@ -1131,10 +1086,10 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, if (mbmi->mode == I8X8_PRED) { int i; - int mode8x8; for (i = 0; i < 4; i++) { - int ib = vp9_i8x8_block[i]; - mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob); + const int ib = vp9_i8x8_block[i]; + const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob); + mi->bmi[ib + 0].as_mode.first = mode8x8; mi->bmi[ib + 1].as_mode.first = mode8x8; mi->bmi[ib + 4].as_mode.first = mode8x8; @@ -1142,11 +1097,14 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, pbi->common.fc.i8x8_mode_counts[mode8x8]++; } } else { - mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode( - bc, pbi->common.fc.uv_mode_prob[mbmi->mode]); + mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob[mbmi->mode]); pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++; } } + /* + if (pbi->common.current_video_frame == 1) + printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff); + */ if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 && ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) || @@ -1182,22 +1140,305 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) { vpx_memset(cm->mbskip_pred_probs, 0, 
sizeof(cm->mbskip_pred_probs)); if (pbi->common.mb_no_coeff_skip) { int k; - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8); + for (k = 0; k < MBSKIP_CONTEXTS; ++k) { + cm->mbskip_pred_probs[k] = vp9_read_prob(bc); + } } mb_mode_mv_init(pbi, bc); } + +#if CONFIG_CODE_NONZEROCOUNT +static uint16_t read_nzc(VP9_COMMON *const cm, + int nzc_context, + TX_SIZE tx_size, + int ref, + int type, + BOOL_DECODER* const bc) { + int c, e; + uint16_t nzc; + if (tx_size == TX_32X32) { + c = treed_read(bc, vp9_nzc32x32_tree, + cm->fc.nzc_probs_32x32[nzc_context][ref][type]); + cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++; + } else if (tx_size == TX_16X16) { + c = treed_read(bc, vp9_nzc16x16_tree, + cm->fc.nzc_probs_16x16[nzc_context][ref][type]); + cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++; + } else if (tx_size == TX_8X8) { + c = treed_read(bc, vp9_nzc8x8_tree, + cm->fc.nzc_probs_8x8[nzc_context][ref][type]); + cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++; + } else if (tx_size == TX_4X4) { + c = treed_read(bc, vp9_nzc4x4_tree, + cm->fc.nzc_probs_4x4[nzc_context][ref][type]); + cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++; + } else { + assert(0); + } + nzc = vp9_basenzcvalue[c]; + if ((e = vp9_extranzcbits[c])) { + int x = 0; + while (e--) { + int b = vp9_read( + bc, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]); + x |= (b << e); + cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++; + } + nzc += x; + } + if (tx_size == TX_32X32) + assert(nzc <= 1024); + else if (tx_size == TX_16X16) + assert(nzc <= 256); + else if (tx_size == TX_8X8) + assert(nzc <= 64); + else if (tx_size == TX_4X4) + assert(nzc <= 16); + return nzc; +} + +static void read_nzcs_sb64(VP9_COMMON *const cm, + MACROBLOCKD* xd, + int mb_row, + int mb_col, + BOOL_DECODER* const bc) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0])); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 256; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc); + } + for (j = 256; j < 384; j += 64) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc); + } + break; + + case TX_16X16: + for (j = 0; j < 256; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 256; j < 384; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 256; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc); + } + for (j = 256; j < 384; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_4X4: + for (j = 0; j < 256; ++j) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 
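// Editorial aside on read_nzc() above: the tree yields a token c whose
// base count is vp9_basenzcvalue[c]; vp9_extranzcbits[c] extra bits are
// then read MSB-first and added on top, equivalent to:
//   nzc = vp9_basenzcvalue[c];
//   for (e = vp9_extranzcbits[c] - 1; e >= 0; e--)
//     nzc += vp9_read(bc, pcat_probs[e]) << e;
// which keeps large nonzero counts cheap to signal.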
0, bc); + } + for (j = 256; j < 384; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} + +static void read_nzcs_sb32(VP9_COMMON *const cm, + MACROBLOCKD* xd, + int mb_row, + int mb_col, + BOOL_DECODER* const bc) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0])); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 64; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_16X16: + for (j = 0; j < 64; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 64; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc); + } + for (j = 64; j < 96; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_4X4: + for (j = 0; j < 64; ++j) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc); + } + for (j = 64; j < 96; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} + +static void read_nzcs_mb16(VP9_COMMON *const cm, + MACROBLOCKD* xd, + int mb_row, + int mb_col, + BOOL_DECODER* const bc) { + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0])); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_16X16: + for (j = 0; j < 16; j += 16) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 16; j < 24; j += 4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 16; j += 4) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc); + } + if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) { + for (j = 16; j < 24; ++j) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc); + } + } else { + for (j = 16; j < 24; j += 
4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc); + } + } + break; + + case TX_4X4: + for (j = 0; j < 16; ++j) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc); + } + for (j = 16; j < 24; ++j) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} +#endif // CONFIG_CODE_NONZEROCOUNT + void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, MACROBLOCKD* const xd, int mb_row, int mb_col, BOOL_DECODER* const bc) { + VP9_COMMON *const cm = &pbi->common; MODE_INFO *mi = xd->mode_info_context; MODE_INFO *prev_mi = xd->prev_mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; - if (pbi->common.frame_type == KEY_FRAME) + if (pbi->common.frame_type == KEY_FRAME) { kfread_modes(pbi, mi, mb_row, mb_col, bc); - else + } else { read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc); + set_scale_factors(xd, + mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1, + pbi->common.active_ref_scale); + } +#if CONFIG_CODE_NONZEROCOUNT + if (mbmi->sb_type == BLOCK_SIZE_SB64X64) + read_nzcs_sb64(cm, xd, mb_row, mb_col, bc); + else if (mbmi->sb_type == BLOCK_SIZE_SB32X32) + read_nzcs_sb32(cm, xd, mb_row, mb_col, bc); + else + read_nzcs_mb16(cm, xd, mb_row, mb_col, bc); +#endif // CONFIG_CODE_NONZEROCOUNT + + if (mbmi->sb_type) { + const int n_mbs = 1 << mbmi->sb_type; + const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row); + const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col); + const int mis = cm->mode_info_stride; + int x, y; + + for (y = 0; y < y_mbs; y++) { + for (x = !y; x < x_mbs; x++) { + mi[y * mis + x] = *mi; + } + } + } else { + update_blockd_bmi(xd); + } } diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index c3b9637a62164732cb47f38e8bf22ab759343db5..fea6433b28e28cde2223e77c17d404cf151c5d5d 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -13,7 +13,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_header.h" #include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_entropy.h" #include "vp9/decoder/vp9_decodframe.h" @@ -32,7 +31,7 @@ #include "vp9/decoder/vp9_dboolhuff.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9_rtcd.h" #include <assert.h> @@ -40,11 +39,25 @@ #define COEFCOUNT_TESTING -//#define DEC_DEBUG +// #define DEC_DEBUG #ifdef DEC_DEBUG int dec_debug = 0; #endif +static int read_le16(const uint8_t *p) { + return (p[1] << 8) | p[0]; +} + +static int read_le32(const uint8_t *p) { + return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; +} + +// len == 0 is not allowed +static int read_is_valid(const unsigned char *start, size_t len, + const unsigned char *end) { + return start + len > start && start + len <= end; +} + static int merge_index(int v, int n, int modulus) { int max1 = (n - 1 - modulus / 2) / modulus + 1; if (v < max1) v = v * modulus + modulus / 2; @@ -62,14 +75,13 @@ static int merge_index(int v, int n, int modulus) { static int inv_remap_prob(int v, int m) { const int n = 256; const int modulus = MODULUS_PARAM; - int i; + v = merge_index(v, n - 1, modulus); if ((m << 1) <= n) { - i = vp9_inv_recenter_nonneg(v + 1, m); + return 
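// Editorial aside: read_le16()/read_le32() above assemble little-endian
// values byte by byte, independent of host endianness; e.g. for
// buf[] = { 0x0d, 0x0c, 0x0b, 0x0a }:
//   read_le16(buf) == 0x0c0d   read_le32(buf) == 0x0a0b0c0d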
vp9_inv_recenter_nonneg(v + 1, m); } else { - i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m); + return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m); } - return i; } static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) { @@ -79,103 +91,78 @@ static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) { void vp9_init_de_quantizer(VP9D_COMP *pbi) { int i; - int Q; + int q; VP9_COMMON *const pc = &pbi->common; - for (Q = 0; Q < QINDEX_RANGE; Q++) { - pc->Y1dequant[Q][0] = (int16_t)vp9_dc_quant(Q, pc->y1dc_delta_q); - pc->Y2dequant[Q][0] = (int16_t)vp9_dc2quant(Q, pc->y2dc_delta_q); - pc->UVdequant[Q][0] = (int16_t)vp9_dc_uv_quant(Q, pc->uvdc_delta_q); + for (q = 0; q < QINDEX_RANGE; q++) { + pc->Y1dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y1dc_delta_q); + pc->UVdequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uvdc_delta_q); /* all the ac values =; */ for (i = 1; i < 16; i++) { int rc = vp9_default_zig_zag1d_4x4[i]; - pc->Y1dequant[Q][rc] = (int16_t)vp9_ac_yquant(Q); - pc->Y2dequant[Q][rc] = (int16_t)vp9_ac2quant(Q, pc->y2ac_delta_q); - pc->UVdequant[Q][rc] = (int16_t)vp9_ac_uv_quant(Q, pc->uvac_delta_q); + pc->Y1dequant[q][rc] = (int16_t)vp9_ac_yquant(q); + pc->UVdequant[q][rc] = (int16_t)vp9_ac_uv_quant(q, pc->uvac_delta_q); } } } -static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) { - int i; - int QIndex; - VP9_COMMON *const pc = &pbi->common; - int segment_id = xd->mode_info_context->mbmi.segment_id; - +static int get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex) { // Set the Q baseline allowing for any segment level adjustment - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) { - /* Abs Value */ - if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) - QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q); - - /* Delta Value */ - else { - QIndex = pc->base_qindex + - vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q); - QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? 
QIndex : MAXQ) : 0; /* Clamp to valid range */ - } - } else - QIndex = pc->base_qindex; - xd->q_index = QIndex; - - /* Set up the block level dequant pointers */ - for (i = 0; i < 16; i++) { - xd->block[i].dequant = pc->Y1dequant[QIndex]; - } - -#if CONFIG_LOSSLESS - if (!QIndex) { - pbi->mb.inv_xform4x4_1_x8 = vp9_short_inv_walsh4x4_1_x8; - pbi->mb.inv_xform4x4_x8 = vp9_short_inv_walsh4x4_x8; - pbi->mb.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1_lossless; - pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless; - pbi->idct_add = vp9_dequant_idct_add_lossless_c; - pbi->dc_idct_add = vp9_dequant_dc_idct_add_lossless_c; - pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c; - pbi->idct_add_y_block = vp9_dequant_idct_add_y_block_lossless_c; - pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c; + if (vp9_segfeature_active(mb, segment_id, SEG_LVL_ALT_Q)) { + if (mb->mb_segment_abs_delta == SEGMENT_ABSDATA) + return vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q); // Abs Value + else + return clamp(base_qindex + vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q), + 0, MAXQ); // Delta Value } else { - pbi->mb.inv_xform4x4_1_x8 = vp9_short_idct4x4llm_1; - pbi->mb.inv_xform4x4_x8 = vp9_short_idct4x4llm; - pbi->mb.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1; - pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4; - pbi->idct_add = vp9_dequant_idct_add; - pbi->dc_idct_add = vp9_dequant_dc_idct_add; - pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block; - pbi->idct_add_y_block = vp9_dequant_idct_add_y_block; - pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block; - } -#else - pbi->mb.inv_xform4x4_1_x8 = vp9_short_idct4x4llm_1; - pbi->mb.inv_xform4x4_x8 = vp9_short_idct4x4llm; - pbi->mb.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1; - pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4; - pbi->idct_add = vp9_dequant_idct_add; - pbi->dc_idct_add = vp9_dequant_dc_idct_add; - pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block; - pbi->idct_add_y_block = vp9_dequant_idct_add_y_block; - pbi->idct_add_uv_block = vp9_dequant_idct_add_uv_block; -#endif - - for (i = 16; i < 24; i++) { - xd->block[i].dequant = pc->UVdequant[QIndex]; + return base_qindex; } +} - xd->block[24].dequant = pc->Y2dequant[QIndex]; +static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) { + int i; + VP9_COMMON *const pc = &pbi->common; + const int segment_id = mb->mode_info_context->mbmi.segment_id; + const int qindex = get_qindex(mb, segment_id, pc->base_qindex); + mb->q_index = qindex; + + for (i = 0; i < 16; i++) + mb->block[i].dequant = pc->Y1dequant[qindex]; + + for (i = 16; i < 24; i++) + mb->block[i].dequant = pc->UVdequant[qindex]; + + if (mb->lossless) { + assert(qindex == 0); + mb->inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + mb->inv_txm4x4 = vp9_short_iwalsh4x4; + mb->itxm_add = vp9_dequant_idct_add_lossless_c; + mb->itxm_add_y_block = vp9_dequant_idct_add_y_block_lossless_c; + mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c; + } else { + mb->inv_txm4x4_1 = vp9_short_idct4x4_1; + mb->inv_txm4x4 = vp9_short_idct4x4; + mb->itxm_add = vp9_dequant_idct_add; + mb->itxm_add_y_block = vp9_dequant_idct_add_y_block; + mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block; + } } /* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it * to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy. 
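// Editorial aside: get_qindex() above swaps the removed ternary chain
//   QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
// for clamp(q, 0, MAXQ), assumed to behave as
//   q < 0 ? 0 : (q > MAXQ ? MAXQ : q);
// the same valid-range pin, just named.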
*/ -static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) { +static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd, + int mb_row, int mb_col) { + BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { - if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + if (sb_type == BLOCK_SIZE_SB64X64) { vp9_build_intra_predictors_sb64uv_s(xd); vp9_build_intra_predictors_sb64y_s(xd); - } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) { + } else if (sb_type == BLOCK_SIZE_SB32X32) { vp9_build_intra_predictors_sbuv_s(xd); vp9_build_intra_predictors_sby_s(xd); } else { @@ -183,56 +170,38 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) { vp9_build_intra_predictors_mby_s(xd); } } else { - if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + if (sb_type == BLOCK_SIZE_SB64X64) { vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, - xd->dst.uv_stride); - } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) { + xd->dst.uv_stride, + mb_row, mb_col); + } else if (sb_type == BLOCK_SIZE_SB32X32) { vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, - xd->dst.uv_stride); + xd->dst.uv_stride, + mb_row, mb_col); } else { - vp9_build_1st_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - } -#if CONFIG_COMP_INTERINTRA_PRED - else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { - vp9_build_interintra_16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - } -#endif + vp9_build_inter16x16_predictors_mb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); } } } static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd, BOOL_DECODER* const bc) { - BLOCKD *bd = &xd->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, bd); - assert(get_2nd_order_usage(xd) == 0); -#ifdef DEC_DEBUG + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); +#if 0 // def DEC_DEBUG if (dec_debug) { int i; printf("\n"); @@ -262,20 +231,20 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_dequant_idct_add_uv_block_8x8( xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs + 16, xd); + xd->dst.uv_stride, xd); } static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, BOOL_DECODER* const bc) { // First do Y // if the first one is DCT_DCT assume all the rest are as well - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]); -#ifdef DEC_DEBUG + TX_TYPE tx_type = get_tx_type_8x8(xd, 0); +#if 0 // def DEC_DEBUG if (dec_debug) { int i; printf("\n"); printf("qcoeff 8x8\n"); - for (i = 0; i < 400; i++) { + for (i = 0; i < 384; i++) { printf("%3d ", xd->qcoeff[i]); if (i % 16 == 15) printf("\n"); } @@ -283,7 +252,6 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, #endif if (tx_type != DCT_DCT || xd->mode_info_context->mbmi.mode == I8X8_PRED) { int i; - assert(get_2nd_order_usage(xd) == 0); for (i = 0; i < 4; i++) { int ib = vp9_i8x8_block[i]; int idx = 
(ib & 0x02) ? (ib + 2) : ib; @@ -295,46 +263,24 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, BLOCKD *b = &xd->block[ib]; if (xd->mode_info_context->mbmi.mode == I8X8_PRED) { int i8x8mode = b->bmi.as_mode.first; - vp9_intra8x8_predict(b, i8x8mode, b->predictor); + vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor); } - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); + tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride, xd->eobs[idx]); } else { vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, - 0, xd->eobs[idx]); + xd->eobs[idx]); } } - } else if (xd->mode_info_context->mbmi.mode == SPLITMV) { - assert(get_2nd_order_usage(xd) == 0); + } else { vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, xd->dst.y_stride, - xd->eobs, xd); - } else { - BLOCKD *b = &xd->block[24]; - assert(get_2nd_order_usage(xd) == 1); - vp9_dequantize_b_2x2(b); - vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8); - ((int *)b->qcoeff)[0] = 0; // 2nd order block are set to 0 after idct - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff, - xd->block[0].dequant, - xd->predictor, - xd->dst.y_buffer, - xd->dst.y_stride, - xd->eobs, - xd->block[24].diff, - xd); + xd); } // Now do UV @@ -344,26 +290,28 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, int ib = vp9_i8x8_block[i]; BLOCKD *b = &xd->block[ib]; int i8x8mode = b->bmi.as_mode.first; + b = &xd->block[16 + i]; - vp9_intra_uv4x4_predict(&xd->block[16 + i], i8x8mode, b->predictor); - pbi->idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 8, b->dst_stride); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]); + b = &xd->block[20 + i]; - vp9_intra_uv4x4_predict(&xd->block[20 + i], i8x8mode, b->predictor); - pbi->idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 8, b->dst_stride); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]); } } else if (xd->mode_info_context->mbmi.mode == SPLITMV) { - pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant, + xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs + 16); + xd->dst.uv_stride, xd); } else { vp9_dequant_idct_add_uv_block_8x8 (xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs + 16, xd); + xd->dst.uv_stride, xd); } -#ifdef DEC_DEBUG +#if 0 // def DEC_DEBUG if (dec_debug) { int i; printf("\n"); @@ -381,94 +329,98 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, TX_TYPE tx_type; int i, eobtotal = 0; MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode; +#if 0 // def DEC_DEBUG + if (dec_debug) { + int i; + printf("\n"); + printf("predictor\n"); + for (i = 0; i < 384; i++) { + printf("%3d ", xd->predictor[i]); + if (i % 16 == 15) printf("\n"); + } + } +#endif if (mode == I8X8_PRED) { - assert(get_2nd_order_usage(xd) == 0); for (i = 0; i < 4; i++) { int ib = vp9_i8x8_block[i]; const 
int iblock[4] = {0, 1, 4, 5}; int j; - int i8x8mode; - BLOCKD *b; - b = &xd->block[ib]; - i8x8mode = b->bmi.as_mode.first; - vp9_intra8x8_predict(b, i8x8mode, b->predictor); + BLOCKD *b = &xd->block[ib]; + int i8x8mode = b->bmi.as_mode.first; + vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor); for (j = 0; j < 4; j++) { b = &xd->block[ib + iblock[j]]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, ib + iblock[j]); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, - b->dst_stride, b->eob); + b->dst_stride, xd->eobs[ib + iblock[j]]); } else { - vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride, + xd->eobs[ib + iblock[j]]); } } b = &xd->block[16 + i]; - vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor); - pbi->idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 8, b->dst_stride); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]); b = &xd->block[20 + i]; - vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor); - pbi->idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 8, b->dst_stride); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]); } } else if (mode == B_PRED) { - assert(get_2nd_order_usage(xd) == 0); for (i = 0; i < 16; i++) { - int b_mode; BLOCKD *b = &xd->block[i]; - b_mode = xd->mode_info_context->bmi[i].as_mode.first; + int b_mode = xd->mode_info_context->bmi[i].as_mode.first; #if CONFIG_NEWBINTRAMODES xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context = - vp9_find_bpred_context(b); + vp9_find_bpred_context(xd, b); #endif if (!xd->mode_info_context->mbmi.mb_skip_coeff) eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i); - vp9_intra4x4_predict(b, b_mode, b->predictor); - tx_type = get_tx_type_4x4(xd, b); + vp9_intra4x4_predict(xd, b, b_mode, b->predictor); + tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride, - b->eob); + xd->eobs[i]); } else { - vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]); } } if (!xd->mode_info_context->mbmi.mb_skip_coeff) { vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc); } - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; vp9_build_intra_predictors_mbuv(xd); - pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, + xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, - xd->eobs + 16); - } else if (mode == SPLITMV) { - assert(get_2nd_order_usage(xd) == 0); - pbi->idct_add_y_block(xd->qcoeff, + xd); + } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) { + xd->itxm_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, xd->dst.y_stride, - xd->eobs); - pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, + xd); + xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 
* 16, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, - xd->eobs + 16); + xd); } else { -#ifdef DEC_DEBUG +#if 0 // def DEC_DEBUG if (dec_debug) { int i; printf("\n"); @@ -485,211 +437,35 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, } } #endif - tx_type = get_tx_type_4x4(xd, &xd->block[0]); - if (tx_type != DCT_DCT) { - assert(get_2nd_order_usage(xd) == 0); - for (i = 0; i < 16; i++) { - BLOCKD *b = &xd->block[i]; - tx_type = get_tx_type_4x4(xd, b); - if (tx_type != DCT_DCT) { - vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, - b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, - b->dst_stride, b->eob); - } else { - vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); - } - } - } else { - BLOCKD *b = &xd->block[24]; - assert(get_2nd_order_usage(xd) == 1); - vp9_dequantize_b(b); - if (xd->eobs[24] > 1) { - vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; + for (i = 0; i < 16; i++) { + BLOCKD *b = &xd->block[i]; + tx_type = get_tx_type_4x4(xd, i); + if (tx_type != DCT_DCT) { + vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, + b->dequant, b->predictor, + *(b->base_dst) + b->dst, 16, + b->dst_stride, xd->eobs[i]); } else { - xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; + xd->itxm_add(b->qcoeff, b->dequant, b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]); } - vp9_dequantize_b(b); - pbi->dc_idct_add_y_block(xd->qcoeff, - xd->block[0].dequant, - xd->predictor, - xd->dst.y_buffer, - xd->dst.y_stride, - xd->eobs, - xd->block[24].diff); } - pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, + xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant, xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, - xd->eobs + 16); + xd); } } -static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, - BOOL_DECODER* const bc, int n, - int maska, int shiftb) { - int x_idx = n & maska, y_idx = n >> shiftb; - TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]); - if (tx_type != DCT_DCT) { - vp9_ht_dequant_idct_add_16x16_c( - tx_type, xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob); - } else { - vp9_dequant_idct_add_16x16( - xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[0]); - } - vp9_dequant_idct_add_uv_block_8x8_inplace_c( - xd->qcoeff + 16 * 16, - xd->block[16].dequant, - xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.uv_stride, xd->eobs + 16, xd); -}; - -static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, - BOOL_DECODER* const bc, int n, - int maska, int shiftb) { - int x_idx = n & maska, y_idx = n >> shiftb; - BLOCKD *b = &xd->block[24]; - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]); - if (tx_type != DCT_DCT) { - int i; - for (i = 0; i < 4; i++) { - int ib = vp9_i8x8_block[i]; - int idx = (ib & 0x02) ? 
(ib + 2) : ib; - int16_t *q = xd->block[idx].qcoeff; - int16_t *dq = xd->block[0].dequant; - int stride = xd->dst.y_stride; - BLOCKD *b = &xd->block[ib]; - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); - if (tx_type != DCT_DCT) { - vp9_ht_dequant_idct_add_8x8_c( - tx_type, q, dq, - xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride - + x_idx * 16 + (i & 1) * 8, - xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride - + x_idx * 16 + (i & 1) * 8, - stride, stride, b->eob); - } else { - vp9_dequant_idct_add_8x8_c( - q, dq, - xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride - + x_idx * 16 + (i & 1) * 8, - xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride - + x_idx * 16 + (i & 1) * 8, - stride, stride, 0, b->eob); - } - vp9_dequant_idct_add_uv_block_8x8_inplace_c( - xd->qcoeff + 16 * 16, xd->block[16].dequant, - xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.uv_stride, xd->eobs + 16, xd); - } - } else { - vp9_dequantize_b_2x2(b); - vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8); - ((int *)b->qcoeff)[0] = 0; // 2nd order block are set to 0 after idct - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - vp9_dequant_dc_idct_add_y_block_8x8_inplace_c( - xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd); - vp9_dequant_idct_add_uv_block_8x8_inplace_c( - xd->qcoeff + 16 * 16, xd->block[16].dequant, - xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.uv_stride, xd->eobs + 16, xd); - } -}; - -static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, - BOOL_DECODER* const bc, int n, - int maska, int shiftb) { - int x_idx = n & maska, y_idx = n >> shiftb; - BLOCKD *b = &xd->block[24]; - TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]); - if (tx_type != DCT_DCT) { - int i; - for (i = 0; i < 16; i++) { - BLOCKD *b = &xd->block[i]; - tx_type = get_tx_type_4x4(xd, b); - if (tx_type != DCT_DCT) { - vp9_ht_dequant_idct_add_c( - tx_type, b->qcoeff, b->dequant, - xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride - + x_idx * 16 + (i & 3) * 4, - xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride - + x_idx * 16 + (i & 3) * 4, - xd->dst.y_stride, xd->dst.y_stride, b->eob); - } else { - vp9_dequant_idct_add_c( - b->qcoeff, b->dequant, - xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride - + x_idx * 16 + (i & 3) * 4, - xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride - + x_idx * 16 + (i & 3) * 4, - xd->dst.y_stride, xd->dst.y_stride); - } - } - } else { - vp9_dequantize_b(b); - if (xd->eobs[24] > 1) { - vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } else { - xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - } - vp9_dequant_dc_idct_add_y_block_4x4_inplace_c( - xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd); - } - 
vp9_dequant_idct_add_uv_block_4x4_inplace_c( - xd->qcoeff + 16 * 16, xd->block[16].dequant, - xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8, - xd->dst.uv_stride, xd->eobs + 16, xd); -}; - static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col, BOOL_DECODER* const bc) { - int i, n, eobtotal; - TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; + int n, eobtotal; VP9_COMMON *const pc = &pbi->common; - MODE_INFO *orig_mi = xd->mode_info_context; + MODE_INFO *mi = xd->mode_info_context; const int mis = pc->mode_info_stride; assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64); @@ -702,25 +478,12 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, mb_init_dequantizer(pbi, xd); if (xd->mode_info_context->mbmi.mb_skip_coeff) { - int n; - - vp9_reset_mb_tokens_context(xd); - for (n = 1; n <= 3; n++) { - if (mb_col < pc->mb_cols - n) - xd->above_context += n; - if (mb_row < pc->mb_rows - n) - xd->left_context += n; - vp9_reset_mb_tokens_context(xd); - if (mb_col < pc->mb_cols - n) - xd->above_context -= n; - if (mb_row < pc->mb_rows - n) - xd->left_context -= n; - } + vp9_reset_sb64_tokens_context(xd); /* Special case: Force the loopfilter to skip when eobtotal and * mb_skip_coeff are zero. */ - skip_recon_mb(pbi, xd); + skip_recon_mb(pbi, xd, mb_row, mb_col); return; } @@ -731,91 +494,151 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, } else { vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } /* dequantization and idct */ - if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { - for (n = 0; n < 4; n++) { - const int x_idx = n & 1, y_idx = n >> 1; - - if (mb_col + x_idx * 2 >= pc->mb_cols || - mb_row + y_idx * 2 >= pc->mb_rows) - continue; + eobtotal = vp9_decode_sb64_tokens(pbi, xd, bc); + if (eobtotal == 0) { // skip loopfilter + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; - xd->left_context = pc->left_context + (y_idx << 1); - xd->above_context = pc->above_context + mb_col + (x_idx << 1); - xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis; - eobtotal = vp9_decode_sb_tokens(pbi, xd, bc); - if (eobtotal == 0) { // skip loopfilter - xd->mode_info_context->mbmi.mb_skip_coeff = 1; - if (mb_col + 1 < pc->mb_cols) - xd->mode_info_context[1].mbmi.mb_skip_coeff = 1; - if (mb_row + 1 < pc->mb_rows) { - xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1; - if (mb_col + 1 < pc->mb_cols) - xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1; - } - } else { - vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant, - xd->dst.y_buffer + x_idx * 32 + - xd->dst.y_stride * y_idx * 32, - xd->dst.y_buffer + x_idx * 32 + - xd->dst.y_stride * y_idx * 32, - xd->dst.y_stride, xd->dst.y_stride, - xd->eobs[0]); - vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024, - xd->block[16].dequant, - xd->dst.u_buffer + x_idx * 16 + - xd->dst.uv_stride * y_idx * 16, - xd->dst.v_buffer + x_idx * 16 + - xd->dst.uv_stride * y_idx * 16, - xd->dst.uv_stride, xd->eobs + 16); - } + if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows) + mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; } } else { - for (n = 0; n < 16; n++) { - int x_idx = n & 3, y_idx = n >> 2; - - if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= 
pc->mb_rows) - continue; - - xd->above_context = pc->above_context + mb_col + x_idx; - xd->left_context = pc->left_context + y_idx; - xd->mode_info_context = orig_mi + x_idx + y_idx * mis; - for (i = 0; i < 25; i++) { - xd->block[i].eob = 0; - xd->eobs[i] = 0; - } - - eobtotal = vp9_decode_mb_tokens(pbi, xd, bc); - if (eobtotal == 0) { // skip loopfilter - xd->mode_info_context->mbmi.mb_skip_coeff = 1; - continue; - } - - if (tx_size == TX_16X16) { - decode_16x16_sb(pbi, xd, bc, n, 3, 2); - } else if (tx_size == TX_8X8) { - decode_8x8_sb(pbi, xd, bc, n, 3, 2); - } else { - decode_4x4_sb(pbi, xd, bc, n, 3, 2); - } + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const int y_offset = x_idx * 32 + y_idx * xd->dst.y_stride * 32; + vp9_dequant_idct_add_32x32(xd->qcoeff + n * 1024, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 64]); + } + vp9_dequant_idct_add_32x32(xd->qcoeff + 4096, + xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256]); + vp9_dequant_idct_add_32x32(xd->qcoeff + 4096 + 1024, + xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]); + break; + case TX_16X16: + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16; + const TX_TYPE tx_type = get_tx_type_16x16(xd, + (y_idx * 16 + x_idx) * 4); + + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } else { + vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } + } + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const int uv_offset = y_idx * 16 * xd->dst.uv_stride + x_idx * 16; + vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + n * 256, + xd->block[16].dequant, + xd->dst.u_buffer + uv_offset, + xd->dst.u_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 16]); + vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + 1024 + n * 256, + xd->block[20].dequant, + xd->dst.v_buffer + uv_offset, + xd->dst.v_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]); + } + break; + case TX_8X8: + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } else { + vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } + } + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8; + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096, + xd->block[16].dequant, + xd->dst.u_buffer + 
uv_offset, + xd->dst.u_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 4]); + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096 + 1024, + xd->block[20].dequant, + xd->dst.v_buffer + uv_offset, + xd->dst.v_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]); + } + break; + case TX_4X4: + for (n = 0; n < 256; n++) { + const int x_idx = n & 15, y_idx = n >> 4; + const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); + if (tx_type == DCT_DCT) { + xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } else { + vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } + } + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4; + xd->itxm_add(xd->qcoeff + 4096 + n * 16, + xd->block[16].dequant, + xd->dst.u_buffer + uv_offset, + xd->dst.u_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n]); + xd->itxm_add(xd->qcoeff + 4096 + 1024 + n * 16, + xd->block[20].dequant, + xd->dst.v_buffer + uv_offset, + xd->dst.v_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n]); + } + break; + default: assert(0); } } - - xd->above_context = pc->above_context + mb_col; - xd->left_context = pc->left_context; - xd->mode_info_context = orig_mi; } static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col, BOOL_DECODER* const bc) { - int i, n, eobtotal; - TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; + int n, eobtotal; VP9_COMMON *const pc = &pbi->common; - MODE_INFO *orig_mi = xd->mode_info_context; const int mis = pc->mode_info_stride; assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32); @@ -828,21 +651,12 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, mb_init_dequantizer(pbi, xd); if (xd->mode_info_context->mbmi.mb_skip_coeff) { - vp9_reset_mb_tokens_context(xd); - if (mb_col < pc->mb_cols - 1) - xd->above_context++; - if (mb_row < pc->mb_rows - 1) - xd->left_context++; - vp9_reset_mb_tokens_context(xd); - if (mb_col < pc->mb_cols - 1) - xd->above_context--; - if (mb_row < pc->mb_rows - 1) - xd->left_context--; + vp9_reset_sb_tokens_context(xd); /* Special case: Force the loopfilter to skip when eobtotal and * mb_skip_coeff are zero. 
*/ - skip_recon_mb(pbi, xd); + skip_recon_mb(pbi, xd, mb_row, mb_col); return; } @@ -853,64 +667,131 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, } else { vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } /* dequantization and idct */ - if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { - eobtotal = vp9_decode_sb_tokens(pbi, xd, bc); - if (eobtotal == 0) { // skip loopfilter - xd->mode_info_context->mbmi.mb_skip_coeff = 1; + eobtotal = vp9_decode_sb_tokens(pbi, xd, bc); + if (eobtotal == 0) { // skip loopfilter + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + if (mb_col + 1 < pc->mb_cols) + xd->mode_info_context[1].mbmi.mb_skip_coeff = 1; + if (mb_row + 1 < pc->mb_rows) { + xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1; if (mb_col + 1 < pc->mb_cols) - xd->mode_info_context[1].mbmi.mb_skip_coeff = 1; - if (mb_row + 1 < pc->mb_rows) { - xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1; - if (mb_col + 1 < pc->mb_cols) - xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1; - } - } else { - vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, xd->dst.y_buffer, - xd->dst.y_stride, xd->dst.y_stride, - xd->eobs[0]); - vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024, - xd->block[16].dequant, - xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs + 16); + xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1; } } else { - for (n = 0; n < 4; n++) { - int x_idx = n & 1, y_idx = n >> 1; - - if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows) - continue; - - xd->above_context = pc->above_context + mb_col + x_idx; - xd->left_context = pc->left_context + y_idx + (mb_row & 2); - xd->mode_info_context = orig_mi + x_idx + y_idx * mis; - for (i = 0; i < 25; i++) { - xd->block[i].eob = 0; - xd->eobs[i] = 0; - } - - eobtotal = vp9_decode_mb_tokens(pbi, xd, bc); - if (eobtotal == 0) { // skip loopfilter - xd->mode_info_context->mbmi.mb_skip_coeff = 1; - continue; - } + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + vp9_dequant_idct_add_32x32(xd->qcoeff, xd->block[0].dequant, + xd->dst.y_buffer, xd->dst.y_buffer, + xd->dst.y_stride, xd->dst.y_stride, + xd->eobs[0]); + vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024, + xd->block[16].dequant, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.uv_stride, xd); + break; + case TX_16X16: + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16; + const TX_TYPE tx_type = get_tx_type_16x16(xd, + (y_idx * 8 + x_idx) * 4); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_16x16( + xd->qcoeff + n * 256, xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } else { + vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } + } + vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024, + xd->block[16].dequant, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.uv_stride, xd); + break; + case TX_8X8: + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8; + const TX_TYPE tx_type = 
get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } else { + vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } + } + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8; + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1024, + xd->block[16].dequant, + xd->dst.u_buffer + uv_offset, + xd->dst.u_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n * 4]); + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1280, + xd->block[20].dequant, + xd->dst.v_buffer + uv_offset, + xd->dst.v_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]); + } + break; + case TX_4X4: + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4; + + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); + if (tx_type == DCT_DCT) { + xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } else { + vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16, + xd->block[0].dequant, + xd->dst.y_buffer + y_offset, + xd->dst.y_buffer + y_offset, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } + } - if (tx_size == TX_16X16) { - decode_16x16_sb(pbi, xd, bc, n, 1, 1); - } else if (tx_size == TX_8X8) { - decode_8x8_sb(pbi, xd, bc, n, 1, 1); - } else { - decode_4x4_sb(pbi, xd, bc, n, 1, 1); - } + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4; + xd->itxm_add(xd->qcoeff + 1024 + n * 16, + xd->block[16].dequant, + xd->dst.u_buffer + uv_offset, + xd->dst.u_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n]); + xd->itxm_add(xd->qcoeff + 1280 + n * 16, + xd->block[20].dequant, + xd->dst.v_buffer + uv_offset, + xd->dst.v_buffer + uv_offset, + xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n]); + } + break; + default: assert(0); } - - xd->above_context = pc->above_context + mb_col; - xd->left_context = pc->left_context + (mb_row & 2); - xd->mode_info_context = orig_mi; } } @@ -919,7 +800,6 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, BOOL_DECODER* const bc) { int eobtotal = 0; MB_PREDICTION_MODE mode; - int i; int tx_size; assert(!xd->mode_info_context->mbmi.sb_type); @@ -934,13 +814,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp9_reset_mb_tokens_context(xd); } else if (!bool_error(bc)) { - for (i = 0; i < 25; i++) { - xd->block[i].eob = 0; - xd->eobs[i] = 0; - } - if (mode != B_PRED) { + if (mode != B_PRED) eobtotal = vp9_decode_mb_tokens(pbi, xd, bc); - } } //mode = xd->mode_info_context->mbmi.mode; @@ -948,24 +823,25 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, &pbi->common); - if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV - && mode != I8X8_PRED - && !bool_error(bc)) { + if (eobtotal == 0 && + mode != B_PRED && + mode != 
SPLITMV && + mode != I8X8_PRED && + !bool_error(bc)) { /* Special case: Force the loopfilter to skip when eobtotal and - * mb_skip_coeff are zero. - * */ + mb_skip_coeff are zero. */ xd->mode_info_context->mbmi.mb_skip_coeff = 1; - skip_recon_mb(pbi, xd); + skip_recon_mb(pbi, xd, mb_row, mb_col); return; } -#ifdef DEC_DEBUG +#if 0 // def DEC_DEBUG if (dec_debug) printf("Decoding mb: %d %d\n", xd->mode_info_context->mbmi.mode, tx_size); #endif // moved to be performed before detokenization -// if (xd->segmentation_enabled) -// mb_init_dequantizer(pbi, xd); + // if (xd->segmentation_enabled) + // mb_init_dequantizer(pbi, xd); /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { @@ -976,13 +852,13 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, } } } else { -#ifdef DEC_DEBUG +#if 0 // def DEC_DEBUG if (dec_debug) printf("Decoding mb: %d %d interp %d\n", xd->mode_info_context->mbmi.mode, tx_size, xd->mode_info_context->mbmi.interp_filter); #endif - vp9_build_inter_predictors_mb(xd); + vp9_build_inter_predictors_mb(xd, mb_row, mb_col); } if (tx_size == TX_16X16) { @@ -996,6 +872,13 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, if (dec_debug) { int i, j; printf("\n"); + printf("predictor y\n"); + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) + printf("%3d ", xd->predictor[i * 16 + j]); + printf("\n"); + } + printf("\n"); printf("final y\n"); for (i = 0; i < 16; i++) { for (j = 0; j < 16; j++) @@ -1062,87 +945,45 @@ static void set_offsets(VP9D_COMP *pbi, int block_size, xd->above_context = cm->above_context + mb_col; xd->left_context = cm->left_context + (mb_row & 3); - /* Distance of Mb to the various image edges. - * These are specified to 8th pel as they are always compared to - * values that are in 1/8th pel units - */ + // Distance of Mb to the various image edges. 
+ // These are specified to 8th pel as they are always compared to + // values that are in 1/8th pel units block_size >>= 4; // in mb units - xd->mb_to_top_edge = -((mb_row * 16)) << 3; - xd->mb_to_left_edge = -((mb_col * 16) << 3); - xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3; - xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3; - xd->up_available = (mb_row != 0); - xd->left_available = (mb_col != 0); + set_mb_row(cm, xd, mb_row, block_size); + set_mb_col(cm, xd, mb_col, block_size); xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; } -static void set_refs(VP9D_COMP *pbi, int block_size, - int mb_row, int mb_col) { +static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; if (mbmi->ref_frame > INTRA_FRAME) { - int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride; - - /* Select the appropriate reference frame for this MB */ - if (mbmi->ref_frame == LAST_FRAME) - ref_fb_idx = cm->lst_fb_idx; - else if (mbmi->ref_frame == GOLDEN_FRAME) - ref_fb_idx = cm->gld_fb_idx; - else - ref_fb_idx = cm->alt_fb_idx; - - ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; - ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col; - xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset; - ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; - ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col; - xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset; - xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset; - - /* propagate errors from reference frames */ + // Select the appropriate reference frame for this MB + int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1]; + xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1]; + xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1]; + setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); + + // propagate errors from reference frames xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted; if (mbmi->second_ref_frame > INTRA_FRAME) { - int second_ref_fb_idx; - - /* Select the appropriate reference frame for this MB */ - if (mbmi->second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cm->lst_fb_idx; - else if (mbmi->second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cm->gld_fb_idx; - else - second_ref_fb_idx = cm->alt_fb_idx; - - xd->second_pre.y_buffer = - cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset; - xd->second_pre.u_buffer = - cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset; - xd->second_pre.v_buffer = - cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset; - - /* propagate errors from reference frames */ - xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted; - } - } + // Select the appropriate reference frame for this MB + int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1]; - if (mbmi->sb_type) { - const int n_mbs = 1 << mbmi->sb_type; - const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row); - const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col); - const int mis = cm->mode_info_stride; - int x, y; + setup_pred_block(&xd->second_pre, 
&cm->yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); - for (y = 0; y < y_mbs; y++) { - for (x = !y; x < x_mbs; x++) { - mi[y * mis + x] = *mi; - } + // propagate errors from reference frames + xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted; } } } @@ -1156,8 +997,15 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, // For a SB there are 2 left contexts, each pertaining to a MB row within vpx_memset(pc->left_context, 0, sizeof(pc->left_context)); - for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) { + for (mb_col = pc->cur_tile_mb_col_start; + mb_col < pc->cur_tile_mb_col_end; mb_col += 4) { if (vp9_read(bc, pc->sb64_coded)) { +#ifdef DEC_DEBUG + dec_debug = (pc->current_video_frame == 11 && pc->show_frame && + mb_row == 8 && mb_col == 0); + if (dec_debug) + printf("Debug Decode SB64\n"); +#endif set_offsets(pbi, 64, mb_row, mb_col); vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc); set_refs(pbi, 64, mb_row, mb_col); @@ -1178,6 +1026,12 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, xd->sb_index = j; if (vp9_read(bc, pc->sb32_coded)) { +#ifdef DEC_DEBUG + dec_debug = (pc->current_video_frame == 11 && pc->show_frame && + mb_row + y_idx_sb == 8 && mb_col + x_idx_sb == 0); + if (dec_debug) + printf("Debug Decode SB32\n"); +#endif set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb); vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc); @@ -1198,14 +1052,18 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, // MB lies outside frame, skip on to next continue; } +#ifdef DEC_DEBUG + dec_debug = (pc->current_video_frame == 11 && pc->show_frame && + mb_row + y_idx == 8 && mb_col + x_idx == 0); + if (dec_debug) + printf("Debug Decode MB\n"); +#endif set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx); xd->mb_index = i; vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc); - update_blockd_bmi(xd); set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx); - vp9_intra_prediction_down_copy(xd); - decode_macroblock(pbi, xd, mb_row, mb_col, bc); + decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc); /* check if the boolean decoder has suffered an error */ xd->corrupted |= bool_error(bc); @@ -1216,38 +1074,19 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, } } -static unsigned int read_partition_size(const unsigned char *cx_size) { - const unsigned int size = - cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16); - return size; -} - -static int read_is_valid(const unsigned char *start, - size_t len, - const unsigned char *end) { - return (start + len > start && start + len <= end); -} - static void setup_token_decoder(VP9D_COMP *pbi, const unsigned char *cx_data, BOOL_DECODER* const bool_decoder) { - VP9_COMMON *pc = &pbi->common; + VP9_COMMON *pc = &pbi->common; const unsigned char *user_data_end = pbi->Source + pbi->source_sz; - const unsigned char *partition; - - ptrdiff_t partition_size; - ptrdiff_t bytes_left; - - // Set up pointers to token partition - partition = cx_data; - bytes_left = user_data_end - partition; - partition_size = bytes_left; + const unsigned char *partition = cx_data; + ptrdiff_t bytes_left = user_data_end - partition; + ptrdiff_t partition_size = bytes_left; - /* Validate the calculated partition length. If the buffer - * described by the partition can't be fully read, then restrict - * it to the portion that can be (for EC mode) or throw an error. - */ + // Validate the calculated partition length. 
If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. if (!read_is_valid(partition, partition_size, user_data_end)) { vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt partition " @@ -1262,64 +1101,20 @@ static void setup_token_decoder(VP9D_COMP *pbi, static void init_frame(VP9D_COMP *pbi) { VP9_COMMON *const pc = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; if (pc->frame_type == KEY_FRAME) { + vp9_setup_past_independence(pc, xd); + // All buffers are implicitly updated on key frames. + pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1; + } else if (pc->error_resilient_mode) { + vp9_setup_past_independence(pc, xd); + } - if (pc->last_frame_seg_map) - vpx_memset(pc->last_frame_seg_map, 0, (pc->mb_rows * pc->mb_cols)); - - vp9_init_mv_probs(pc); - - vp9_init_mbmode_probs(pc); - vp9_default_bmode_probs(pc->fc.bmode_prob); - - vp9_default_coef_probs(pc); - vp9_kf_default_bmode_probs(pc->kf_bmode_prob); - - // Reset the segment feature data to the default stats: - // Features disabled, 0, with delta coding (Default state). - vp9_clearall_segfeatures(xd); - - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; - - /* reset the mode ref deltasa for loop filter */ - vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); - vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); - - /* All buffers are implicitly updated on key frames. */ - pc->refresh_golden_frame = 1; - pc->refresh_alt_ref_frame = 1; - pc->copy_buffer_to_gf = 0; - pc->copy_buffer_to_arf = 0; - - /* Note that Golden and Altref modes cannot be used on a key frame so - * ref_frame_sign_bias[] is undefined and meaningless - */ - pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0; - pc->ref_frame_sign_bias[ALTREF_FRAME] = 0; - - vp9_init_mode_contexts(&pbi->common); - vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); - vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc)); - - vpx_memset(pc->prev_mip, 0, - (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO)); - vpx_memset(pc->mip, 0, - (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO)); - - vp9_update_mode_info_border(pc, pc->mip); - vp9_update_mode_info_in_image(pc, pc->mi); - - - } else { - - if (!pc->use_bilinear_mc_filter) - pc->mcomp_filter_type = EIGHTTAP; - else - pc->mcomp_filter_type = BILINEAR; + if (pc->frame_type != KEY_FRAME) { + pc->mcomp_filter_type = pc->use_bilinear_mc_filter ? BILINEAR : EIGHTTAP; - /* To enable choice of different interpolation filters */ + // To enable choice of different interpolation filters vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); } @@ -1328,32 +1123,117 @@ static void init_frame(VP9D_COMP *pbi) { xd->frame_type = pc->frame_type; xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_stride = pc->mode_info_stride; - xd->corrupted = 0; /* init without corruption */ + xd->corrupted = 0; + xd->fullpixel_mask = pc->full_pixel ? 
0xfffffff8 : 0xffffffff; +} - xd->fullpixel_mask = 0xffffffff; - if (pc->full_pixel) - xd->fullpixel_mask = 0xfffffff8; +#if CONFIG_CODE_NONZEROCOUNT +static void read_nzc_probs_common(VP9_COMMON *cm, + BOOL_DECODER* const bc, + int block_size) { + int c, r, b, t; + int tokens, nodes; + vp9_prob *nzc_probs; + vp9_prob upd; + + if (!vp9_read_bit(bc)) return; + + if (block_size == 32) { + tokens = NZC32X32_TOKENS; + nzc_probs = cm->fc.nzc_probs_32x32[0][0][0]; + upd = NZC_UPDATE_PROB_32X32; + } else if (block_size == 16) { + tokens = NZC16X16_TOKENS; + nzc_probs = cm->fc.nzc_probs_16x16[0][0][0]; + upd = NZC_UPDATE_PROB_16X16; + } else if (block_size == 8) { + tokens = NZC8X8_TOKENS; + nzc_probs = cm->fc.nzc_probs_8x8[0][0][0]; + upd = NZC_UPDATE_PROB_8X8; + } else { + tokens = NZC4X4_TOKENS; + nzc_probs = cm->fc.nzc_probs_4x4[0][0][0]; + upd = NZC_UPDATE_PROB_4X4; + } + nodes = tokens - 1; + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b; + int offset_nodes = offset * nodes; + for (t = 0; t < nodes; ++t) { + vp9_prob *p = &nzc_probs[offset_nodes + t]; + if (vp9_read(bc, upd)) { + *p = read_prob_diff_update(bc, *p); + } + } + } + } + } +} +static void read_nzc_pcat_probs(VP9_COMMON *cm, BOOL_DECODER* const bc) { + int c, t, b; + vp9_prob upd = NZC_UPDATE_PROB_PCAT; + if (!vp9_read_bit(bc)) { + return; + } + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA]; + for (b = 0; b < bits; ++b) { + vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b]; + if (vp9_read(bc, upd)) { + *p = read_prob_diff_update(bc, *p); + } + } + } + } } +static void read_nzc_probs(VP9_COMMON *cm, + BOOL_DECODER* const bc) { + read_nzc_probs_common(cm, bc, 4); + if (cm->txfm_mode != ONLY_4X4) + read_nzc_probs_common(cm, bc, 8); + if (cm->txfm_mode > ALLOW_8X8) + read_nzc_probs_common(cm, bc, 16); + if (cm->txfm_mode > ALLOW_16X16) + read_nzc_probs_common(cm, bc, 32); +#ifdef NZC_PCAT_UPDATE + read_nzc_pcat_probs(cm, bc); +#endif +} +#endif // CONFIG_CODE_NONZEROCOUNT + static void read_coef_probs_common(BOOL_DECODER* const bc, vp9_coeff_probs *coef_probs, int block_types) { - int i, j, k, l; +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES; +#else + const int entropy_nodes_update = ENTROPY_NODES; +#endif + + int i, j, k, l, m; if (vp9_read_bit(bc)) { for (i = 0; i < block_types; i++) { - for (j = !i; j < COEF_BANDS; j++) { - /* NB: This j loop starts from 1 on block type i == 0 */ - for (k = 0; k < PREV_COEF_CONTEXTS; k++) { - if (k >= 3 && ((i == 0 && j == 1) || - (i > 0 && j == 0))) - continue; - for (l = 0; l < ENTROPY_NODES; l++) { - vp9_prob *const p = coef_probs[i][j][k] + l; - - if (vp9_read(bc, COEF_UPDATE_PROB)) { - *p = read_prob_diff_update(bc, *p); + for (j = 0; j < REF_TYPES; j++) { + for (k = 0; k < COEF_BANDS; k++) { + for (l = 0; l < PREV_COEF_CONTEXTS; l++) { + if (l >= 3 && k == 0) + continue; + for (m = CONFIG_CODE_NONZEROCOUNT; m < entropy_nodes_update; m++) { + vp9_prob *const p = coef_probs[i][j][k][l] + m; + + if (vp9_read(bc, vp9_coef_update_prob[m])) { + *p = read_prob_diff_update(bc, *p); +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + if (m == UNCONSTRAINED_NODES - 1) + vp9_get_model_distribution(*p, coef_probs[i][j][k][l], i, j); +#endif + } } } } @@ -1365,161 +1245,83 @@ static void read_coef_probs_common(BOOL_DECODER* 
const bc, static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) { VP9_COMMON *const pc = &pbi->common; - read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES_4X4); - read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4); + read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES); - if (pbi->common.txfm_mode != ONLY_4X4) { - read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES_8X8); - read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8); - } - if (pbi->common.txfm_mode > ALLOW_8X8) { - read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES_16X16); - read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16, - BLOCK_TYPES_16X16); - } - if (pbi->common.txfm_mode > ALLOW_16X16) { - read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32); - } -} - -int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { - BOOL_DECODER header_bc, residual_bc; - VP9_COMMON *const pc = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - const unsigned char *data = (const unsigned char *)pbi->Source; - const unsigned char *data_end = data + pbi->source_sz; - ptrdiff_t first_partition_length_in_bytes = 0; - - int mb_row; - int i, j; - int corrupt_tokens = 0; - - /* start with no corruption of current frame */ - xd->corrupted = 0; - pc->yv12_fb[pc->new_fb_idx].corrupted = 0; - - if (data_end - data < 3) { - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet"); - } else { - pc->last_frame_type = pc->frame_type; - pc->frame_type = (FRAME_TYPE)(data[0] & 1); - pc->version = (data[0] >> 1) & 7; - pc->show_frame = (data[0] >> 4) & 1; - first_partition_length_in_bytes = - (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5; - - if ((data + first_partition_length_in_bytes > data_end - || data + first_partition_length_in_bytes < data)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt partition 0 length"); - - data += 3; - - vp9_setup_version(pc); - - if (pc->frame_type == KEY_FRAME) { - const int Width = pc->Width; - const int Height = pc->Height; + if (pbi->common.txfm_mode != ONLY_4X4) + read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES); - /* vet via sync code */ - /* When error concealment is enabled we should only check the sync - * code if we have enough bits available - */ - if (data + 3 < data_end) { - if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a) - vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid frame sync code"); - } + if (pbi->common.txfm_mode > ALLOW_8X8) + read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES); - /* If error concealment is enabled we should only parse the new size - * if we have enough data. Otherwise we will end up with the wrong - * size. 
- */ - if (data + 6 < data_end) { - pc->Width = (data[3] | (data[4] << 8)) & 0x3fff; - pc->horiz_scale = data[4] >> 6; - pc->Height = (data[5] | (data[6] << 8)) & 0x3fff; - pc->vert_scale = data[6] >> 6; - } - data += 7; + if (pbi->common.txfm_mode > ALLOW_16X16) + read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES); +} - if (Width != pc->Width || Height != pc->Height) { - if (pc->Width <= 0) { - pc->Width = Width; - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid frame width"); - } +static void update_frame_size(VP9D_COMP *pbi) { + VP9_COMMON *cm = &pbi->common; - if (pc->Height <= 0) { - pc->Height = Height; - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid frame height"); - } + /* our internal buffers are always multiples of 16 */ + const int width = (cm->width + 15) & ~15; + const int height = (cm->height + 15) & ~15; - if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffers"); - } - } - } -#ifdef DEC_DEBUG - printf("Decode frame %d\n", pc->current_video_frame); -#endif + cm->mb_rows = height >> 4; + cm->mb_cols = width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + memset(cm->mip, 0, + (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO)); + vp9_update_mode_info_border(cm, cm->mip); - if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) || - pc->Width == 0 || pc->Height == 0) { - return -1; - } - - init_frame(pbi); - - if (vp9_start_decode(&header_bc, data, - (unsigned int)first_partition_length_in_bytes)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate bool decoder 0"); - if (pc->frame_type == KEY_FRAME) { - pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc); - pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc); - } + cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + vp9_update_mode_info_in_image(cm, cm->mi); +} - /* Is segmentation enabled */ - xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc); +static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) { + int i, j; + xd->segmentation_enabled = vp9_read_bit(r); if (xd->segmentation_enabled) { - // Read whether or not the segmentation map is being explicitly - // updated this frame. - xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc); + // Read whether or not the segmentation map is being explicitly updated + // this frame. + xd->update_mb_segmentation_map = vp9_read_bit(r); // If so what method will be used. if (xd->update_mb_segmentation_map) { - // Which macro block level features are enabled - - // Read the probs used to decode the segment id for each macro - // block. + // Which macro block level features are enabled. Read the probs used to + // decode the segment id for each macro block. for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) { - xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ? - (vp9_prob)vp9_read_literal(&header_bc, 8) : 255; + xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255; } // Read the prediction probs needed to decode the segment id - pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc); + pc->temporal_update = vp9_read_bit(r); for (i = 0; i < PREDICTION_PROBS; i++) { - if (pc->temporal_update) { - pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ? 
- (vp9_prob)vp9_read_literal(&header_bc, 8) : 255; - } else { - pc->segment_pred_probs[i] = 255; - } + pc->segment_pred_probs[i] = pc->temporal_update + ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255) + : 255; + } + + if (pc->temporal_update) { + const vp9_prob *p = xd->mb_segment_tree_probs; + vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs; + const int c0 = p[0] * p[1]; + const int c1 = p[0] * (256 - p[1]); + const int c2 = (256 - p[0]) * p[2]; + const int c3 = (256 - p[0]) * (256 - p[2]); + + p_mod[0] = get_binary_prob(c1, c2 + c3); + p_mod[1] = get_binary_prob(c0, c2 + c3); + p_mod[2] = get_binary_prob(c0 + c1, c3); + p_mod[3] = get_binary_prob(c0 + c1, c2); } } - // Is the segment data being updated - xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc); + xd->update_mb_segmentation_data = vp9_read_bit(r); if (xd->update_mb_segmentation_data) { int data; - xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc); + xd->mb_segment_abs_delta = vp9_read_bit(r); vp9_clearall_segfeatures(xd); @@ -1528,109 +1330,405 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { // For each of the segments features... for (j = 0; j < SEG_LVL_MAX; j++) { // Is the feature enabled - if (vp9_read_bit(&header_bc)) { + if (vp9_read_bit(r)) { // Update the feature data and mask vp9_enable_segfeature(xd, i, j); - data = vp9_decode_unsigned_max(&header_bc, - vp9_seg_feature_data_max(j)); + data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j)); // Is the segment data signed.. if (vp9_is_segfeature_signed(j)) { - if (vp9_read_bit(&header_bc)) + if (vp9_read_bit(r)) data = -data; } - } else + } else { data = 0; + } vp9_set_segdata(xd, i, j, data); } } } } +} - // Read common prediction model status flag probability updates for the - // reference frame - if (pc->frame_type == KEY_FRAME) { - // Set the prediction probabilities to defaults - pc->ref_pred_probs[0] = 120; - pc->ref_pred_probs[1] = 80; - pc->ref_pred_probs[2] = 40; - - } else { - for (i = 0; i < PREDICTION_PROBS; i++) { - if (vp9_read_bit(&header_bc)) - pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8); - } - } +static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) { + int i; - pc->sb64_coded = vp9_read_literal(&header_bc, 8); - pc->sb32_coded = vp9_read_literal(&header_bc, 8); - - /* Read the loop filter level and type */ - pc->txfm_mode = vp9_read_literal(&header_bc, 2); - if (pc->txfm_mode == 3) - pc->txfm_mode += vp9_read_bit(&header_bc); - if (pc->txfm_mode == TX_MODE_SELECT) { - pc->prob_tx[0] = vp9_read_literal(&header_bc, 8); - pc->prob_tx[1] = vp9_read_literal(&header_bc, 8); - pc->prob_tx[2] = vp9_read_literal(&header_bc, 8); - } + pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r); + pc->filter_level = vp9_read_literal(r, 6); + pc->sharpness_level = vp9_read_literal(r, 3); - pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc); - pc->filter_level = vp9_read_literal(&header_bc, 6); - pc->sharpness_level = vp9_read_literal(&header_bc, 3); +#if CONFIG_LOOP_DERING + if (vp9_read_bit(r)) + pc->dering_enabled = 1 + vp9_read_literal(r, 4); + else + pc->dering_enabled = 0; +#endif - /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */ + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. 
xd->mode_ref_lf_delta_update = 0; - xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc); + xd->mode_ref_lf_delta_enabled = vp9_read_bit(r); if (xd->mode_ref_lf_delta_enabled) { - /* Do the deltas need to be updated */ - xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc); + // Do the deltas need to be updated + xd->mode_ref_lf_delta_update = vp9_read_bit(r); if (xd->mode_ref_lf_delta_update) { - /* Send update */ + // Send update for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - if (vp9_read_bit(&header_bc)) { - /*sign = vp9_read_bit( &header_bc );*/ - xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6); + if (vp9_read_bit(r)) { + // sign = vp9_read_bit(r); + xd->ref_lf_deltas[i] = vp9_read_literal(r, 6); - if (vp9_read_bit(&header_bc)) /* Apply sign */ - xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1; + if (vp9_read_bit(r)) + xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i]; // Apply sign } } - /* Send update */ + // Send update for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - if (vp9_read_bit(&header_bc)) { - /*sign = vp9_read_bit( &header_bc );*/ - xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6); + if (vp9_read_bit(r)) { + // sign = vp9_read_bit(r); + xd->mode_lf_deltas[i] = vp9_read_literal(r, 6); - if (vp9_read_bit(&header_bc)) /* Apply sign */ - xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1; + if (vp9_read_bit(r)) + xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i]; // Apply sign } } } } +} + +static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const pc = &pbi->common; + const int width = pc->width; + const int height = pc->height; + + // If error concealment is enabled we should only parse the new size + // if we have enough data. Otherwise we will end up with the wrong size. 
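[Note on the hunk below: setup_frame_size() reads the frame and display dimensions as 16-bit little-endian values via read_le16(), and decode_tiles() later consumes 32-bit tile sizes via read_le32(). Neither helper is shown in this diff; they are presumed to be defined earlier in vp9_decodframe.c along these lines:

    /* Presumed little-endian readers used by setup_frame_size() and
     * decode_tiles(); these definitions are illustrative only. */
    static unsigned int read_le16(const unsigned char *p) {
      return p[0] | (p[1] << 8);
    }

    static unsigned int read_le32(const unsigned char *p) {
      return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
    }
]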
+ if (scaling_active && data + 4 < data_end) { + pc->display_width = read_le16(data + 0); + pc->display_height = read_le16(data + 2); + data += 4; + } + + if (data + 4 < data_end) { + pc->width = read_le16(data + 0); + pc->height = read_le16(data + 2); + data += 4; + } + + if (!scaling_active) { + pc->display_width = pc->width; + pc->display_height = pc->height; + } + + if (width != pc->width || height != pc->height) { + if (pc->width <= 0) { + pc->width = width; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame width"); + } + + if (pc->height <= 0) { + pc->height = height; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame height"); + } + + if (!pbi->initial_width || !pbi->initial_height) { + if (vp9_alloc_frame_buffers(pc, pc->width, pc->height)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + pbi->initial_width = pc->width; + pbi->initial_height = pc->height; + } + + if (pc->width > pbi->initial_width) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Frame width too large"); + } + + if (pc->height > pbi->initial_height) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Frame height too large"); + } + + update_frame_size(pbi); + } + + return data; +} + +static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) { + FRAME_CONTEXT *const fc = &pbi->common.fc; + + vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4); + vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8); + vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16); + vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32); + vp9_copy(fc->pre_ymode_prob, fc->ymode_prob); + vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob); + vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob); + vp9_copy(fc->pre_bmode_prob, fc->bmode_prob); + vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob); + vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob); + vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob); + fc->pre_nmvc = fc->nmvc; + + vp9_zero(fc->coef_counts_4x4); + vp9_zero(fc->coef_counts_8x8); + vp9_zero(fc->coef_counts_16x16); + vp9_zero(fc->coef_counts_32x32); + vp9_zero(fc->eob_branch_counts); + vp9_zero(fc->ymode_counts); + vp9_zero(fc->sb_ymode_counts); + vp9_zero(fc->uv_mode_counts); + vp9_zero(fc->bmode_counts); + vp9_zero(fc->i8x8_mode_counts); + vp9_zero(fc->sub_mv_ref_counts); + vp9_zero(fc->mbsplit_counts); + vp9_zero(fc->NMVcount); + vp9_zero(fc->mv_ref_ct); + +#if CONFIG_COMP_INTERINTRA_PRED + fc->pre_interintra_prob = fc->interintra_prob; + vp9_zero(fc->interintra_counts); +#endif + +#if CONFIG_CODE_NONZEROCOUNT + vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4); + vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8); + vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16); + vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32); + vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs); + + vp9_zero(fc->nzc_counts_4x4); + vp9_zero(fc->nzc_counts_8x8); + vp9_zero(fc->nzc_counts_16x16); + vp9_zero(fc->nzc_counts_32x32); + vp9_zero(fc->nzc_pcat_counts); +#endif + + read_coef_probs(pbi, r); +#if CONFIG_CODE_NONZEROCOUNT + read_nzc_probs(&pbi->common, r); +#endif +} + +static void decode_tiles(VP9D_COMP *pbi, + const uint8_t *data, int first_partition_size, + BOOL_DECODER *header_bc, BOOL_DECODER *residual_bc) { + VP9_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + const uint8_t *data_ptr = data + first_partition_size; + int tile_row, tile_col, delta_log2_tiles; + int mb_row; + + 
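[Note on the tile parsing at the top of decode_tiles() below: the column count is coded as a minimum log2 value (obtained from vp9_get_tile_n_bits()) plus a unary-coded increment of at most delta_log2_tiles steps, and the row count uses at most two bits. An equivalent standalone restatement of the column parse (the helper name is hypothetical):

    /* Unary-coded increment: each 1 bit adds one column doubling; a 0 bit or
     * reaching the cap stops. Mirrors the loop at the top of decode_tiles(). */
    static int parse_log2_tile_cols(BOOL_DECODER *r, int min_log2, int delta) {
      int log2_cols = min_log2;
      while (delta--) {
        if (!vp9_read_bit(r))
          break;
        log2_cols++;
      }
      return log2_cols;
    }

For example, with a minimum of 0 and delta_log2_tiles = 2, the bit string "1 0" yields log2_cols = 1, i.e. two tile columns.]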
vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles); + while (delta_log2_tiles--) { + if (vp9_read_bit(header_bc)) { + pc->log2_tile_columns++; + } else { + break; + } + } + pc->log2_tile_rows = vp9_read_bit(header_bc); + if (pc->log2_tile_rows) + pc->log2_tile_rows += vp9_read_bit(header_bc); + pc->tile_columns = 1 << pc->log2_tile_columns; + pc->tile_rows = 1 << pc->log2_tile_rows; + + vpx_memset(pc->above_context, 0, + sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols); + + if (pbi->oxcf.inv_tile_order) { + const int n_cols = pc->tile_columns; + const uint8_t *data_ptr2[4][1 << 6]; + BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak); + + // pre-initialize the offsets, we're going to read in inverse order + data_ptr2[0][0] = data_ptr; + for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + if (tile_row) { + const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]); + data_ptr2[tile_row - 1][n_cols - 1] += 4; + data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size; + } + + for (tile_col = 1; tile_col < n_cols; tile_col++) { + const int size = read_le32(data_ptr2[tile_row][tile_col - 1]); + data_ptr2[tile_row][tile_col - 1] += 4; + data_ptr2[tile_row][tile_col] = + data_ptr2[tile_row][tile_col - 1] + size; + } + } + + for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + vp9_get_tile_row_offsets(pc, tile_row); + for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) { + vp9_get_tile_col_offsets(pc, tile_col); + setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], residual_bc); + + // Decode a row of superblocks + for (mb_row = pc->cur_tile_mb_row_start; + mb_row < pc->cur_tile_mb_row_end; mb_row += 4) { + decode_sb_row(pbi, pc, mb_row, xd, residual_bc); + } + + if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1) + bc_bak = *residual_bc; + } + } + *residual_bc = bc_bak; + } else { + int has_more; + + for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + vp9_get_tile_row_offsets(pc, tile_row); + for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) { + vp9_get_tile_col_offsets(pc, tile_col); + + has_more = tile_col < pc->tile_columns - 1 || + tile_row < pc->tile_rows - 1; + + // Setup decoder + setup_token_decoder(pbi, data_ptr + (has_more ? 
4 : 0), residual_bc); + + // Decode a row of superblocks + for (mb_row = pc->cur_tile_mb_row_start; + mb_row < pc->cur_tile_mb_row_end; mb_row += 4) { + decode_sb_row(pbi, pc, mb_row, xd, residual_bc); + } + + if (has_more) { + const int size = read_le32(data_ptr); + data_ptr += 4 + size; + } + } + } + } +} + +int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { + BOOL_DECODER header_bc, residual_bc; + VP9_COMMON *const pc = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const uint8_t *data = (const uint8_t *)pbi->Source; + const uint8_t *data_end = data + pbi->source_sz; + ptrdiff_t first_partition_length_in_bytes = 0; + int i, corrupt_tokens = 0; + + // printf("Decoding frame %d\n", pc->current_video_frame); + + xd->corrupted = 0; // start with no corruption of current frame + pc->yv12_fb[pc->new_fb_idx].corrupted = 0; + + if (data_end - data < 3) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); + } else { + int scaling_active; + pc->last_frame_type = pc->frame_type; + pc->frame_type = (FRAME_TYPE)(data[0] & 1); + pc->version = (data[0] >> 1) & 7; + pc->show_frame = (data[0] >> 4) & 1; + scaling_active = (data[0] >> 5) & 1; + first_partition_length_in_bytes = read_le16(data + 1); + + if (!read_is_valid(data, first_partition_length_in_bytes, data_end)) + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition 0 length"); + + data += 3; + + vp9_setup_version(pc); + + if (pc->frame_type == KEY_FRAME) { + // When error concealment is enabled we should only check the sync + // code if we have enough bits available + if (data + 3 < data_end) { + if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a) + vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + } + data += 3; + } + + data = setup_frame_size(pbi, scaling_active, data, data_end); + } + + if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) || + pc->width == 0 || pc->height == 0) { + return -1; + } + + init_frame(pbi); + + // Reset the frame pointers to the current frame size + vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx], + pc->width, pc->height, + VP9BORDERINPIXELS); + + if (vp9_start_decode(&header_bc, data, + (unsigned int)first_partition_length_in_bytes)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc); + pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc); + pc->error_resilient_mode = vp9_read_bit(&header_bc); + + setup_segmentation(pc, xd, &header_bc); + + // Read common prediction model status flag probability updates for the + // reference frame + if (pc->frame_type == KEY_FRAME) { + // Set the prediction probabilities to defaults + pc->ref_pred_probs[0] = 120; + pc->ref_pred_probs[1] = 80; + pc->ref_pred_probs[2] = 40; + } else { + for (i = 0; i < PREDICTION_PROBS; i++) { + if (vp9_read_bit(&header_bc)) + pc->ref_pred_probs[i] = vp9_read_prob(&header_bc); + } + } + + pc->sb64_coded = vp9_read_prob(&header_bc); + pc->sb32_coded = vp9_read_prob(&header_bc); + xd->lossless = vp9_read_bit(&header_bc); + if (xd->lossless) { + pc->txfm_mode = ONLY_4X4; + } else { + // Read the loop filter level and type + pc->txfm_mode = vp9_read_literal(&header_bc, 2); + if (pc->txfm_mode == ALLOW_32X32) + pc->txfm_mode += vp9_read_bit(&header_bc); + + if (pc->txfm_mode == TX_MODE_SELECT) { + pc->prob_tx[0] = vp9_read_prob(&header_bc); + pc->prob_tx[1] = vp9_read_prob(&header_bc); + 
pc->prob_tx[2] = vp9_read_prob(&header_bc); + } + } + + setup_loopfilter(pc, xd, &header_bc); // Dummy read for now vp9_read_literal(&header_bc, 2); - setup_token_decoder(pbi, data + first_partition_length_in_bytes, - &residual_bc); - /* Read the default quantizers. */ { - int Q, q_update; + int q_update = 0; + pc->base_qindex = vp9_read_literal(&header_bc, QINDEX_BITS); - Q = vp9_read_literal(&header_bc, QINDEX_BITS); - pc->base_qindex = Q; - q_update = 0; /* AC 1st order Q = default */ pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update); - pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update); - pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update); pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update); pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update); @@ -1641,57 +1739,51 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { mb_init_dequantizer(pbi, &pbi->mb); } - /* Determine if the golden frame or ARF buffer should be updated and how. - * For all non key frames the GF and ARF refresh flags and sign bias - * flags must be set explicitly. - */ - if (pc->frame_type != KEY_FRAME) { - /* Should the GF or ARF be updated from the current frame */ - pc->refresh_golden_frame = vp9_read_bit(&header_bc); - pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc); + // Determine if the golden frame or ARF buffer should be updated and how. + // For all non key frames the GF and ARF refresh flags and sign bias + // flags must be set explicitly. + if (pc->frame_type == KEY_FRAME) { + pc->active_ref_idx[0] = pc->new_fb_idx; + pc->active_ref_idx[1] = pc->new_fb_idx; + pc->active_ref_idx[2] = pc->new_fb_idx; + } else { + // Should the GF or ARF be updated from the current frame + pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES); - if (pc->refresh_alt_ref_frame) { - vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc)); - } else { - vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc)); + // Select active reference frames + for (i = 0; i < 3; i++) { + int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2); + pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num]; } - /* Buffer to buffer copy flags. */ - pc->copy_buffer_to_gf = 0; - - if (!pc->refresh_golden_frame) - pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2); - - pc->copy_buffer_to_arf = 0; - - if (!pc->refresh_alt_ref_frame) - pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2); - pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc); pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc); - /* Is high precision mv allowed */ - xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc); + // Is high precision mv allowed + xd->allow_high_precision_mv = vp9_read_bit(&header_bc); + // Read the type of subpel filter to use - if (vp9_read_bit(&header_bc)) { - pc->mcomp_filter_type = SWITCHABLE; - } else { - pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2); - } + pc->mcomp_filter_type = vp9_read_bit(&header_bc) + ? 
SWITCHABLE
+ : vp9_read_literal(&header_bc, 2);
+
 #if CONFIG_COMP_INTERINTRA_PRED
 pc->use_interintra = vp9_read_bit(&header_bc);
 #endif
- /* To enable choice of different interploation filters */
+ // To enable choice of different interpolation filters
 vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
 }
- pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
- if (pc->refresh_entropy_probs == 0) {
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ if (!pc->error_resilient_mode) {
+ pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
+ pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);
+ } else {
+ pc->refresh_entropy_probs = 0;
+ pc->frame_parallel_decoding_mode = 1;
 }
-
- pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
- || vp9_read_bit(&header_bc);
+ pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
+ vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],
+ sizeof(pc->fc));
 // Read inter mode probability context updates
 if (pc->frame_type != KEY_FRAME) {
@@ -1699,20 +1791,19 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
 for (j = 0; j < 4; j++) {
 if (vp9_read(&header_bc, 252)) {
- pc->fc.vp9_mode_contexts[i][j] =
- (vp9_prob)vp9_read_literal(&header_bc, 8);
+ pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
 }
 }
 }
 }
+#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS
+ if (pc->frame_type == KEY_FRAME)
+ vp9_adjust_default_coef_probs(pc);
+#endif
 #if CONFIG_NEW_MVREF
 // If Key frame reset mv ref id probabilities to defaults
- if (pc->frame_type == KEY_FRAME) {
- // Defaults probabilities for encoding the MV ref id signal
- vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
- sizeof(xd->mb_mv_ref_probs));
- } else {
+ if (pc->frame_type != KEY_FRAME) {
 // Read any mv_ref index probability updates
 int i, j;
@@ -1725,8 +1816,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 // Read any updates to probabilities
 for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
 if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
- xd->mb_mv_ref_probs[i][j] =
- (vp9_prob)vp9_read_literal(&header_bc, 8);
+ xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc);
 }
 }
 }
@@ -1735,65 +1825,21 @@ if (0) {
 FILE *z = fopen("decodestats.stt", "a");
- fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ fprintf(z, "%6d F:%d,R:%d,Q:%d\n",
 pc->current_video_frame,
 pc->frame_type,
- pc->refresh_golden_frame,
- pc->refresh_alt_ref_frame,
- pc->refresh_last_frame,
+ pbi->refresh_frame_flags,
 pc->base_qindex);
 fclose(z);
 }
- vp9_copy(pbi->common.fc.pre_coef_probs_4x4,
- pbi->common.fc.coef_probs_4x4);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_4x4,
- pbi->common.fc.hybrid_coef_probs_4x4);
- vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
- pbi->common.fc.coef_probs_8x8);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
- pbi->common.fc.hybrid_coef_probs_8x8);
- vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
- pbi->common.fc.coef_probs_16x16);
- vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
- pbi->common.fc.hybrid_coef_probs_16x16);
- vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
- pbi->common.fc.coef_probs_32x32);
- vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
- vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
- vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
- vp9_copy(pbi->common.fc.pre_bmode_prob, 
pbi->common.fc.bmode_prob); - vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob); - vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob); - vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob); -#if CONFIG_COMP_INTERINTRA_PRED - pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob; -#endif - pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc; - vp9_zero(pbi->common.fc.coef_counts_4x4); - vp9_zero(pbi->common.fc.hybrid_coef_counts_4x4); - vp9_zero(pbi->common.fc.coef_counts_8x8); - vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8); - vp9_zero(pbi->common.fc.coef_counts_16x16); - vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16); - vp9_zero(pbi->common.fc.coef_counts_32x32); - vp9_zero(pbi->common.fc.ymode_counts); - vp9_zero(pbi->common.fc.sb_ymode_counts); - vp9_zero(pbi->common.fc.uv_mode_counts); - vp9_zero(pbi->common.fc.bmode_counts); - vp9_zero(pbi->common.fc.i8x8_mode_counts); - vp9_zero(pbi->common.fc.sub_mv_ref_counts); - vp9_zero(pbi->common.fc.mbsplit_counts); - vp9_zero(pbi->common.fc.NMVcount); - vp9_zero(pbi->common.fc.mv_ref_ct); -#if CONFIG_COMP_INTERINTRA_PRED - vp9_zero(pbi->common.fc.interintra_counts); -#endif - - read_coef_probs(pbi, &header_bc); + update_frame_context(pbi, &header_bc); - vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG)); - vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG)); + // Initialize xd pointers. Any reference should do for xd->pre, so use 0. + vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]], + sizeof(YV12_BUFFER_CONFIG)); + vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], + sizeof(YV12_BUFFER_CONFIG)); // Create the segmentation map structure and set to 0 if (!pc->last_frame_seg_map) @@ -1815,46 +1861,46 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { vp9_decode_mode_mvs_init(pbi, &header_bc); - vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols); - - /* Decode a row of superblocks */ - for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) { - decode_sb_row(pbi, pc, mb_row, xd, &residual_bc); - } + decode_tiles(pbi, data, first_partition_length_in_bytes, + &header_bc, &residual_bc); corrupt_tokens |= xd->corrupted; - /* Collect information about decoder corruption. */ - /* 1. Check first boolean decoder for errors. */ - pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc); - /* 2. Check the macroblock information */ - pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens; + // keep track of the last coded dimensions + pc->last_width = pc->width; + pc->last_height = pc->height; + + // Collect information about decoder corruption. + // 1. Check first boolean decoder for errors. + // 2. 
Check the macroblock information + pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) | + corrupt_tokens; if (!pbi->decoded_key_frame) { - if (pc->frame_type == KEY_FRAME && - !pc->yv12_fb[pc->new_fb_idx].corrupted) + if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted) pbi->decoded_key_frame = 1; else vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, "A stream must start with a complete key frame"); } - vp9_adapt_coef_probs(pc); - if (pc->frame_type != KEY_FRAME) { - vp9_adapt_mode_probs(pc); - vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); - vp9_update_mode_context(&pbi->common); + if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(pc); +#if CONFIG_CODE_NONZEROCOUNT + vp9_adapt_nzc_probs(pc); +#endif } - /* If this was a kf or Gf note the Q used */ - if ((pc->frame_type == KEY_FRAME) || - pc->refresh_golden_frame || pc->refresh_alt_ref_frame) { - pc->last_kf_gf_q = pc->base_qindex; + if (pc->frame_type != KEY_FRAME) { + if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(pc); + vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); + vp9_adapt_mode_context(&pbi->common); + } } + if (pc->refresh_entropy_probs) { - if (pc->refresh_alt_ref_frame) - vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc)); - else - vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); + vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc, + sizeof(pc->fc)); } #ifdef PACKET_TESTING @@ -1866,11 +1912,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { fclose(f); } #endif - // printf("Frame %d Done\n", frame_count++); /* Find the end of the coded buffer */ - while (residual_bc.count > CHAR_BIT - && residual_bc.count < VP9_BD_VALUE_SIZE) { + while (residual_bc.count > CHAR_BIT && + residual_bc.count < VP9_BD_VALUE_SIZE) { residual_bc.count -= CHAR_BIT; residual_bc.user_buffer--; } diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h index ae25428c4fb655f53f873182bdd13c127fd303f9..391a265191d26620d28cdc85af85e0d8e06b2f30 100644 --- a/vp9/decoder/vp9_decodframe.h +++ b/vp9/decoder/vp9_decodframe.h @@ -14,6 +14,6 @@ struct VP9Decompressor; -extern void vp9_init_de_quantizer(struct VP9Decompressor *pbi); +void vp9_init_de_quantizer(struct VP9Decompressor *pbi); #endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 354d2bd3657a3aa084c6371f19c9d18f1986e010..9aebcdcfccaf3cd8ee65c2b82cae931639740239 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -14,14 +14,15 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/common/vp9_common.h" + + static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride, int width, int height) { int r, c; for (r = 0; r < height; r++) { - for (c = 0; c < width; c++) { + for (c = 0; c < width; c++) dest[c] = clip_pixel(diff[c] + pred[c]); - } dest += stride; diff += width; @@ -29,132 +30,148 @@ static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch, } } +void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + add_residual(diff, pred, pitch, dest, stride, 4, 4); +} + +void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + add_residual(diff, pred, pitch, dest, stride, 8, 8); +} + +void vp9_add_residual_16x16_c(const 
int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_residual(diff, pred, pitch, dest, stride, 16, 16); +} + +void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_residual(diff, pred, pitch, dest, stride, 32, 32); +} + static void add_constant_residual(const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride, int width, int height) { int r, c; for (r = 0; r < height; r++) { - for (c = 0; c < width; c++) { + for (c = 0; c < width; c++) dest[c] = clip_pixel(diff + pred[c]); - } dest += stride; pred += pitch; } } -void vp9_dequantize_b_c(BLOCKD *d) { - - int i; - int16_t *DQ = d->dqcoeff; - const int16_t *Q = d->qcoeff; - const int16_t *DQC = d->dequant; +void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 8, 8); +} - for (i = 0; i < 16; i++) { - DQ[i] = Q[i] * DQC[i]; - } +void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 16, 16); } +void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 32, 32); +} void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, - int pitch, int stride, uint16_t eobs) { - int16_t output[16]; - int16_t *diff_ptr = output; + int pitch, int stride, int eob) { int i; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - for (i = 0; i < 16; i++) { - input[i] = dq[i] * input[i]; - } - - vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs); + for (i = 0; i < 16; i++) + input[i] *= dq[i]; + vp9_short_iht4x4(input, output, 4, tx_type); vpx_memset(input, 0, 32); - - add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4); + vp9_add_residual_4x4(output, pred, pitch, dest, stride); } void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, - int pitch, int stride, uint16_t eobs) { - int16_t output[64]; - int16_t *diff_ptr = output; - int i; - if (eobs == 0) { - /* All 0 DCT coefficient */ + int pitch, int stride, int eob) { + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64); + + if (eob == 0) { + // All 0 DCT coefficients vp9_copy_mem8x8(pred, pitch, dest, stride); - } else if (eobs > 0) { - input[0] = dq[0] * input[0]; - for (i = 1; i < 64; i++) { - input[i] = dq[1] * input[i]; - } + } else if (eob > 0) { + int i; - vp9_ihtllm(input, output, 16, tx_type, 8, eobs); + input[0] *= dq[0]; + for (i = 1; i < 64; i++) + input[i] *= dq[1]; + vp9_short_iht8x8(input, output, 8, tx_type); vpx_memset(input, 0, 128); - - add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); + vp9_add_residual_8x8(output, pred, pitch, dest, stride); } } void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, - uint8_t *dest, int pitch, int stride) { - int16_t output[16]; - int16_t *diff_ptr = output; + uint8_t *dest, int pitch, int stride, int eob) { int i; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); - for (i = 0; i < 16; i++) { - input[i] = dq[i] * input[i]; - } + if (eob > 1) { + for (i = 0; i < 16; i++) + input[i] *= dq[i]; - /* the idct halves ( >> 1) the pitch */ - vp9_short_idct4x4llm_c(input, output, 4 << 1); + // the idct halves ( >> 1) the pitch + 
vp9_short_idct4x4(input, output, 4 << 1);
- vpx_memset(input, 0, 32);
+ vpx_memset(input, 0, 32);
- add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+ vp9_add_residual_4x4(output, pred, pitch, dest, stride);
+ } else {
+ vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);
+ ((int *)input)[0] = 0;
+ }
 }
 void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
- uint8_t *dest, int pitch, int stride, int Dc) {
+ uint8_t *dest, int pitch, int stride, int dc) {
 int i;
- int16_t output[16];
- int16_t *diff_ptr = output;
-
- input[0] = (int16_t)Dc;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
- for (i = 1; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
+ input[0] = dc;
- /* the idct halves ( >> 1) the pitch */
- vp9_short_idct4x4llm_c(input, output, 4 << 1);
+ for (i = 1; i < 16; i++)
+ input[i] *= dq[i];
+ // the idct halves ( >> 1) the pitch
+ vp9_short_idct4x4(input, output, 4 << 1);
 vpx_memset(input, 0, 32);
-
- add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+ vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
-#if CONFIG_LOSSLESS
 void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
 uint8_t *pred, uint8_t *dest,
- int pitch, int stride) {
- int16_t output[16];
- int16_t *diff_ptr = output;
+ int pitch, int stride, int eob) {
 int i;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
- for (i = 0; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
+ if (eob > 1) {
+ for (i = 0; i < 16; i++)
+ input[i] *= dq[i];
- vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_c(input, output, 4 << 1);
- vpx_memset(input, 0, 32);
+ vpx_memset(input, 0, 32);
- add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+ vp9_add_residual_4x4(output, pred, pitch, dest, stride);
+ } else {
+ vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);
+ ((int *)input)[0] = 0;
+ }
 }
 void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
@@ -162,136 +179,114 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
 uint8_t *dest,
 int pitch, int stride, int dc) {
 int i;
- int16_t output[16];
- int16_t *diff_ptr = output;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
- input[0] = (int16_t)dc;
+ input[0] = dc;
- for (i = 1; i < 16; i++) {
- input[i] = dq[i] * input[i];
- }
+ for (i = 1; i < 16; i++)
+ input[i] *= dq[i];
- vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_c(input, output, 4 << 1);
 vpx_memset(input, 0, 32);
-
- add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
+ vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
 uint8_t *pred, uint8_t *dest, int pitch,
- int stride, int dc, int eob) {
- int16_t output[64];
- int16_t *diff_ptr = output;
- int i;
+ int stride, int eob) {
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
- /* If dc is 1, then input[0] is the reconstructed value, do not need
- * dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
- */
- if (!dc)
- input[0] *= dq[0];
+ // The separate dc flag is gone: input[0] is always dequantized here, and a
+ // decoded DC coefficient is accounted for in eob. 
+ input[0] *= dq[0]; - /* The calculation can be simplified if there are not many non-zero dct - * coefficients. Use eobs to decide what to do. - * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. - * Combine that with code here. - */ + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. if (eob == 0) { - /* All 0 DCT coefficient */ + // All 0 DCT coefficients vp9_copy_mem8x8(pred, pitch, dest, stride); } else if (eob == 1) { - /* DC only DCT coefficient. */ + // DC only DCT coefficient + int16_t in = input[0]; int16_t out; - /* Note: the idct1 will need to be modified accordingly whenever - * vp9_short_idct8x8_c() is modified. */ - out = (input[0] + 1 + (input[0] < 0)) >> 2; - out = out << 3; - out = (out + 32) >> 7; - + // Note: the idct1 will need to be modified accordingly whenever + // vp9_short_idct8x8_c() is modified. + vp9_short_idct1_8x8_c(&in, &out); input[0] = 0; - add_constant_residual(out, pred, pitch, dest, stride, 8, 8); + vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride); +#if !CONFIG_SCATTERSCAN } else if (eob <= 10) { - input[1] = input[1] * dq[1]; - input[2] = input[2] * dq[1]; - input[3] = input[3] * dq[1]; - input[8] = input[8] * dq[1]; - input[9] = input[9] * dq[1]; - input[10] = input[10] * dq[1]; - input[16] = input[16] * dq[1]; - input[17] = input[17] * dq[1]; - input[24] = input[24] * dq[1]; - - vp9_short_idct10_8x8_c(input, output, 16); + input[1] *= dq[1]; + input[2] *= dq[1]; + input[3] *= dq[1]; + input[8] *= dq[1]; + input[9] *= dq[1]; + input[10] *= dq[1]; + input[16] *= dq[1]; + input[17] *= dq[1]; + input[24] *= dq[1]; + + vp9_short_idct10_8x8(input, output, 16); input[0] = input[1] = input[2] = input[3] = 0; input[8] = input[9] = input[10] = 0; input[16] = input[17] = 0; input[24] = 0; - add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); + vp9_add_residual_8x8(output, pred, pitch, dest, stride); +#endif } else { + int i; + // recover quantizer for 4 4x4 blocks - for (i = 1; i < 64; i++) { - input[i] = input[i] * dq[1]; - } - // the idct halves ( >> 1) the pitch - vp9_short_idct8x8_c(input, output, 16); + for (i = 1; i < 64; i++) + input[i] *= dq[1]; + // the idct halves ( >> 1) the pitch + vp9_short_idct8x8(input, output, 8 << 1); vpx_memset(input, 0, 128); - - add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); - + vp9_add_residual_8x8(output, pred, pitch, dest, stride); } } void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, - uint16_t eobs) { - int16_t output[256]; - int16_t *diff_ptr = output; - int i; - if (eobs == 0) { - /* All 0 DCT coefficient */ + int eob) { + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256); + + if (eob == 0) { + // All 0 DCT coefficients vp9_copy_mem16x16(pred, pitch, dest, stride); - } else if (eobs > 0) { - input[0]= input[0] * dq[0]; + } else if (eob > 0) { + int i; + + input[0] *= dq[0]; // recover quantizer for 4 4x4 blocks for (i = 1; i < 256; i++) - input[i] = input[i] * dq[1]; + input[i] *= dq[1]; // inverse hybrid transform - vp9_ihtllm(input, output, 32, tx_type, 16, eobs); + vp9_short_iht16x16(input, output, 16, tx_type); // the idct halves ( >> 1) the pitch - // vp9_short_idct16x16_c(input, output, 32); + // vp9_short_idct16x16(input, output, 32); vpx_memset(input, 0, 512); - add_residual(diff_ptr, 
pred, pitch, dest, stride, 16, 16); + vp9_add_residual_16x16(output, pred, pitch, dest, stride); } } void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[256]; - int16_t *diff_ptr = output; - int i; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ @@ -300,75 +295,107 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, vp9_copy_mem16x16(pred, pitch, dest, stride); } else if (eob == 1) { /* DC only DCT coefficient. */ + int16_t in = input[0] * dq[0]; int16_t out; - /* Note: the idct1 will need to be modified accordingly whenever - * vp9_short_idct16x16_c() is modified. */ - out = (input[0] * dq[0] + 2) >> 2; - out = (out + 2) >> 2; - out = (out + 4) >> 3; - + * vp9_short_idct16x16() is modified. */ + vp9_short_idct1_16x16_c(&in, &out); input[0] = 0; - add_constant_residual(out, pred, pitch, dest, stride, 16, 16); + vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride); +#if !CONFIG_SCATTERSCAN } else if (eob <= 10) { - input[0]= input[0] * dq[0]; - input[1] = input[1] * dq[1]; - input[2] = input[2] * dq[1]; - input[3] = input[3] * dq[1]; - input[16] = input[16] * dq[1]; - input[17] = input[17] * dq[1]; - input[18] = input[18] * dq[1]; - input[32] = input[32] * dq[1]; - input[33] = input[33] * dq[1]; - input[48] = input[48] * dq[1]; + input[0] *= dq[0]; + + input[1] *= dq[1]; + input[2] *= dq[1]; + input[3] *= dq[1]; + input[16] *= dq[1]; + input[17] *= dq[1]; + input[18] *= dq[1]; + input[32] *= dq[1]; + input[33] *= dq[1]; + input[48] *= dq[1]; // the idct halves ( >> 1) the pitch - vp9_short_idct10_16x16_c(input, output, 32); + vp9_short_idct10_16x16(input, output, 32); input[0] = input[1] = input[2] = input[3] = 0; input[16] = input[17] = input[18] = 0; input[32] = input[33] = 0; input[48] = 0; - add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16); + vp9_add_residual_16x16(output, pred, pitch, dest, stride); +#endif } else { - input[0]= input[0] * dq[0]; + int i; + + input[0] *= dq[0]; // recover quantizer for 4 4x4 blocks for (i = 1; i < 256; i++) - input[i] = input[i] * dq[1]; + input[i] *= dq[1]; // the idct halves ( >> 1) the pitch - vp9_short_idct16x16_c(input, output, 32); + vp9_short_idct16x16(input, output, 16 << 1); vpx_memset(input, 0, 512); - add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16); + vp9_add_residual_16x16(output, pred, pitch, dest, stride); } } void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[1024]; - int i; - - input[0]= input[0] * dq[0] / 2; - for (i = 1; i < 1024; i++) - input[i] = input[i] * dq[1] / 2; - vp9_short_idct32x32_c(input, output, 64); - vpx_memset(input, 0, 2048); - - add_residual(output, pred, pitch, dest, stride, 32, 32); + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024); + + if (eob) { + input[0] = input[0] * dq[0] / 2; + if (eob == 1) { + vp9_short_idct1_32x32(input, output); + vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride); + input[0] = 0; +#if !CONFIG_SCATTERSCAN + } else if (eob <= 10) { + input[1] = input[1] * dq[1] / 2; + input[2] = input[2] * dq[1] / 2; + input[3] = input[3] * dq[1] / 2; + input[32] = input[32] * dq[1] / 2; + input[33] = input[33] * dq[1] / 2; + input[34] = input[34] * dq[1] / 2; + input[64] = input[64] * dq[1] / 2; + input[65] = 
input[65] * dq[1] / 2; + input[96] = input[96] * dq[1] / 2; + + // the idct halves ( >> 1) the pitch + vp9_short_idct10_32x32(input, output, 64); + + input[0] = input[1] = input[2] = input[3] = 0; + input[32] = input[33] = input[34] = 0; + input[64] = input[65] = 0; + input[96] = 0; + + vp9_add_residual_32x32(output, pred, pitch, dest, stride); +#endif + } else { + int i; + for (i = 1; i < 1024; i++) + input[i] = input[i] * dq[1] / 2; + vp9_short_idct32x32(input, output, 64); + vpx_memset(input, 0, 2048); + vp9_add_residual_32x32(output, pred, pitch, dest, stride); + } + } } void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, - uint16_t *eobs) { - vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]); - vp9_dequant_idct_add_16x16_c(q + 256, dq, - dstv, dstv, stride, stride, eobs[4]); + MACROBLOCKD *xd) { + vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, + xd->eobs[64]); + vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride, + xd->eobs[80]); } diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h index 2a0ae80e88dd95f57f5a0520da06667bae537da8..bde27bb7aca1ebd627a08227190a3553491e55c5 100644 --- a/vp9/decoder/vp9_dequantize.h +++ b/vp9/decoder/vp9_dequantize.h @@ -11,91 +11,86 @@ #ifndef VP9_DECODER_VP9_DEQUANTIZE_H_ #define VP9_DECODER_VP9_DEQUANTIZE_H_ + #include "vp9/common/vp9_blockd.h" -#if CONFIG_LOSSLESS -extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq, - unsigned char *pred, - unsigned char *output, - int pitch, int stride); -extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq, - unsigned char *pred, - unsigned char *output, - int pitch, int stride, int dc); -extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, - const int16_t *dq, - unsigned char *pre, - unsigned char *dst, - int stride, - uint16_t *eobs, - const int16_t *dc); -extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq, - unsigned char *pre, - unsigned char *dst, - int stride, - uint16_t *eobs); -extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq, - unsigned char *pre, - unsigned char *dst_u, - unsigned char *dst_v, - int stride, - uint16_t *eobs); -#endif - -typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq, - unsigned char *pred, unsigned char *output, int pitch, int stride); -typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq, - unsigned char *pred, unsigned char *output, int pitch, int stride, int dc); - -typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq, - unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs, - const int16_t *dc); -typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq, - unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs); -typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq, - unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride, - uint16_t *eobs); + +void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq, + unsigned char *pred, + unsigned char *output, + int pitch, int stride, int eob); + +void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq, + unsigned char *pred, + unsigned char *output, + int pitch, int stride, int dc); + +void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, + const int16_t *dq, + 
unsigned char *pre, + unsigned char *dst, + int stride, + const int16_t *dc); + +void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq, + unsigned char *pre, + unsigned char *dst, + int stride, + struct macroblockd *xd); + +void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq, + unsigned char *pre, + unsigned char *dst_u, + unsigned char *dst_v, + int stride, + struct macroblockd *xd); void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, unsigned char *pred, unsigned char *dest, - int pitch, int stride, uint16_t eobs); + int pitch, int stride, int eob); void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, - uint16_t eobs); + int eob); void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, unsigned char *pred, unsigned char *dest, - int pitch, int stride, uint16_t eobs); + int pitch, int stride, int eob); void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq, unsigned char *dst, int stride, - uint16_t *eobs, const int16_t *dc, MACROBLOCKD *xd); +void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq, + unsigned char *dst, + int stride, + MACROBLOCKD *xd); + void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq, unsigned char *dst, int stride, - uint16_t *eobs, const int16_t *dc, MACROBLOCKD *xd); +void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq, + unsigned char *dst, + int stride, + MACROBLOCKD *xd); + void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq, unsigned char *dstu, unsigned char *dstv, int stride, - uint16_t *eobs, MACROBLOCKD *xd); void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq, unsigned char *dstu, unsigned char *dstv, int stride, - uint16_t *eobs, MACROBLOCKD *xd); -#endif +#endif // VP9_DECODER_VP9_DEQUANTIZE_H_ diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 335c335ca33604776456c7a35ccbe1d9f2c89577..cb3038e534f181fd2ab1a529f4149e5807f6f61a 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -59,115 +59,215 @@ static const vp9_prob cat6_prob[15] = { DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); -static int get_signed(BOOL_DECODER *br, int value_to_sign) { +static int16_t get_signed(BOOL_DECODER *br, int16_t value_to_sign) { return decode_bool(br, 128) ? 
-value_to_sign : value_to_sign; } -#if CONFIG_NEWCOEFCONTEXT -#define PT pn -#define INCREMENT_COUNT(token) \ - do { \ - coef_counts[type][coef_bands[c]][pn][token]++; \ - pn = pt = vp9_prev_token_class[token]; \ - if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1])) \ - pn = vp9_get_coef_neighbor_context( \ - qcoeff_ptr, nodc, neighbors, scan[c + 1]); \ - } while (0) -#else -#define PT pt + #define INCREMENT_COUNT(token) \ do { \ - coef_counts[type][coef_bands[c]][pt][token]++; \ - pt = vp9_prev_token_class[token]; \ + coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \ + [pt][token]++; \ + token_cache[c] = token; \ + pt = vp9_get_coef_context(scan, nb, pad, token_cache, \ + c + 1, default_eob); \ } while (0) -#endif /* CONFIG_NEWCOEFCONTEXT */ +#if CONFIG_CODE_NONZEROCOUNT #define WRITE_COEF_CONTINUE(val, token) \ { \ - qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val); \ + qcoeff_ptr[scan[c]] = get_signed(br, val); \ INCREMENT_COUNT(token); \ c++; \ + nzc++; \ continue; \ } +#else +#define WRITE_COEF_CONTINUE(val, token) \ + { \ + qcoeff_ptr[scan[c]] = get_signed(br, val); \ + INCREMENT_COUNT(token); \ + c++; \ + continue; \ + } +#endif // CONFIG_CODE_NONZEROCOUNT #define ADJUST_COEF(prob, bits_count) \ do { \ if (vp9_read(br, prob)) \ - val += (uint16_t)(1 << bits_count);\ + val += 1 << bits_count; \ } while (0); static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, - BOOL_DECODER* const br, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - PLANE_TYPE type, - TX_TYPE tx_type, - int seg_eob, int16_t *qcoeff_ptr, - const int *const scan, TX_SIZE txfm_size, - const int *coef_bands) { + BOOL_DECODER* const br, int block_idx, + PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, + TX_SIZE txfm_size) { + ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context; + ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context; + int aidx, lidx; + ENTROPY_CONTEXT above_ec, left_ec; FRAME_CONTEXT *const fc = &dx->common.fc; -#if CONFIG_NEWCOEFCONTEXT - const int *neighbors; - int pn; -#endif - int nodc = (type == PLANE_TYPE_Y_NO_DC); - int pt, c = nodc; + int pt, c = 0, pad, default_eob; vp9_coeff_probs *coef_probs; vp9_prob *prob; vp9_coeff_count *coef_counts; + const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME; +#if CONFIG_CODE_NONZEROCOUNT + uint16_t nzc = 0; + uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx]; +#endif + const int *scan, *nb; + uint8_t token_cache[1024]; + + if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + aidx = vp9_block2above_sb64[txfm_size][block_idx]; + lidx = vp9_block2left_sb64[txfm_size][block_idx]; + } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) { + aidx = vp9_block2above_sb[txfm_size][block_idx]; + lidx = vp9_block2left_sb[txfm_size][block_idx]; + } else { + aidx = vp9_block2above[txfm_size][block_idx]; + lidx = vp9_block2left[txfm_size][block_idx]; + } switch (txfm_size) { default: - case TX_4X4: - if (tx_type == DCT_DCT) { - coef_probs = fc->coef_probs_4x4; - coef_counts = fc->coef_counts_4x4; - } else { - coef_probs = fc->hybrid_coef_probs_4x4; - coef_counts = fc->hybrid_coef_counts_4x4; + case TX_4X4: { + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
+ get_tx_type_4x4(xd, block_idx) : DCT_DCT; + switch (tx_type) { + default: + scan = vp9_default_zig_zag1d_4x4; + break; + case ADST_DCT: + scan = vp9_row_scan_4x4; + break; + case DCT_ADST: + scan = vp9_col_scan_4x4; + break; } + above_ec = A0[aidx] != 0; + left_ec = L0[lidx] != 0; + coef_probs = fc->coef_probs_4x4; + coef_counts = fc->coef_counts_4x4; + default_eob = 16; break; - case TX_8X8: - if (tx_type == DCT_DCT) { - coef_probs = fc->coef_probs_8x8; - coef_counts = fc->coef_counts_8x8; - } else { - coef_probs = fc->hybrid_coef_probs_8x8; - coef_counts = fc->hybrid_coef_counts_8x8; + } + case TX_8X8: { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + const int sz = 3 + sb_type, x = block_idx & ((1 << sz) - 1); + const int y = block_idx - x; + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; + switch (tx_type) { + default: + scan = vp9_default_zig_zag1d_8x8; + break; + case ADST_DCT: + scan = vp9_row_scan_8x8; + break; + case DCT_ADST: + scan = vp9_col_scan_8x8; + break; } + coef_probs = fc->coef_probs_8x8; + coef_counts = fc->coef_counts_8x8; + above_ec = (A0[aidx] + A0[aidx + 1]) != 0; + left_ec = (L0[lidx] + L0[lidx + 1]) != 0; + default_eob = 64; break; - case TX_16X16: - if (tx_type == DCT_DCT) { - coef_probs = fc->coef_probs_16x16; - coef_counts = fc->coef_counts_16x16; + } + case TX_16X16: { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + const int sz = 4 + sb_type, x = block_idx & ((1 << sz) - 1); + const int y = block_idx - x; + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; + switch (tx_type) { + default: + scan = vp9_default_zig_zag1d_16x16; + break; + case ADST_DCT: + scan = vp9_row_scan_16x16; + break; + case DCT_ADST: + scan = vp9_col_scan_16x16; + break; + } + coef_probs = fc->coef_probs_16x16; + coef_counts = fc->coef_counts_16x16; + if (type == PLANE_TYPE_UV) { + ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1); + ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1); + above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1]) != 0; + left_ec = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1]) != 0; } else { - coef_probs = fc->hybrid_coef_probs_16x16; - coef_counts = fc->hybrid_coef_counts_16x16; + above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0; + left_ec = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0; } + default_eob = 256; break; + } case TX_32X32: + scan = vp9_default_zig_zag1d_32x32; coef_probs = fc->coef_probs_32x32; coef_counts = fc->coef_counts_32x32; + if (type == PLANE_TYPE_UV) { + ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1); + ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1); + ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2); + ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2); + ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3); + ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3); + above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1] + + A2[aidx] + A2[aidx + 1] + A3[aidx] + A3[aidx + 1]) != 0; + left_ec = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1] + + L2[lidx] + L2[lidx + 1] + L3[lidx] + L3[lidx + 1]) != 0; + } else { + ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1); + ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1); + above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] 
+ A0[aidx + 3] + + A1[aidx] + A1[aidx + 1] + A1[aidx + 2] + A1[aidx + 3]) != 0; + left_ec = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] + + L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0; + } + default_eob = 1024; break; } - VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l); -#if CONFIG_NEWCOEFCONTEXT - pn = pt; - neighbors = vp9_get_coef_neighbors_handle(scan); -#endif + VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec); + nb = vp9_get_coef_neighbors_handle(scan, &pad); + while (1) { int val; const uint8_t *cat6 = cat6_prob; - if (c >= seg_eob) break; - prob = coef_probs[type][coef_bands[c]][PT]; + + if (c >= seg_eob) + break; +#if CONFIG_CODE_NONZEROCOUNT + if (nzc == nzc_expected) + break; +#endif + prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt]; +#if CONFIG_CODE_NONZEROCOUNT == 0 + fc->eob_branch_counts[txfm_size][type][ref] + [get_coef_band(scan, txfm_size, c)][pt]++; if (!vp9_read(br, prob[EOB_CONTEXT_NODE])) break; +#endif SKIP_START: - if (c >= seg_eob) break; + if (c >= seg_eob) + break; +#if CONFIG_CODE_NONZEROCOUNT + if (nzc == nzc_expected) + break; + // decode zero node only if there are zeros left + if (seg_eob - nzc_expected - c + nzc > 0) +#endif if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); ++c; - prob = coef_probs[type][coef_bands[c]][PT]; + prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt]; goto SKIP_START; } // ONE_CONTEXT_NODE_0_ @@ -230,194 +330,162 @@ SKIP_START: WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6); } +#if CONFIG_CODE_NONZEROCOUNT == 0 if (c < seg_eob) - coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++; - - a[0] = l[0] = (c > !type); + coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] + [pt][DCT_EOB_TOKEN]++; +#endif + A0[aidx] = L0[lidx] = c > 0; + if (txfm_size >= TX_8X8) { + A0[aidx + 1] = L0[lidx + 1] = A0[aidx]; + if (txfm_size >= TX_16X16) { + if (type == PLANE_TYPE_UV) { + ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1); + ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1); + A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx]; + if (txfm_size >= TX_32X32) { + ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2); + ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2); + ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3); + ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3); + A2[aidx] = A2[aidx + 1] = A3[aidx] = A3[aidx + 1] = A0[aidx]; + L2[lidx] = L2[lidx + 1] = L3[lidx] = L3[lidx + 1] = A0[aidx]; + } + } else { + A0[aidx + 2] = A0[aidx + 3] = L0[lidx + 2] = L0[lidx + 3] = A0[aidx]; + if (txfm_size >= TX_32X32) { + ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1); + ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1); + A1[aidx] = A1[aidx + 1] = A1[aidx + 2] = A1[aidx + 3] = A0[aidx]; + L1[lidx] = L1[lidx + 1] = L1[lidx + 2] = L1[lidx + 3] = A0[aidx]; + } + } + } + } return c; } static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) { - int active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB); - int eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - - if (!active || eob > eob_max) - eob = eob_max; - return eob; + return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; } -int vp9_decode_sb_tokens(VP9D_COMP* const pbi, - MACROBLOCKD* const xd, - BOOL_DECODER* const bc) { - ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context; - ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context; - ENTROPY_CONTEXT* const A1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]); - ENTROPY_CONTEXT* const L1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]); - uint16_t *const eobs = xd->eobs; +static INLINE int decode_sb(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc, + int offset, int count, int inc, + int eob_max, TX_SIZE tx_size) { const int segment_id = xd->mode_info_context->mbmi.segment_id; - int c, i, eobtotal = 0, seg_eob; + const int seg_eob = get_eob(xd, segment_id, eob_max); + int i, eobtotal = 0; - // Luma block -#if CONFIG_CNVCONTEXT - ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3] + - A1[0] + A1[1] + A1[2] + A1[3]) != 0; - ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3] + - L1[0] + L1[1] + L1[2] + L1[3]) != 0; -#else - ENTROPY_CONTEXT above_ec = A[0]; - ENTROPY_CONTEXT left_ec = L[0]; -#endif - eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, - PLANE_TYPE_Y_WITH_DC, - DCT_DCT, get_eob(xd, segment_id, 1024), - xd->sb_coeff_data.qcoeff, - vp9_default_zig_zag1d_32x32, - TX_32X32, vp9_coef_bands_32x32); - A[1] = A[2] = A[3] = A[0] = above_ec; - L[1] = L[2] = L[3] = L[0] = left_ec; - A1[1] = A1[2] = A1[3] = A1[0] = above_ec; - L1[1] = L1[2] = L1[3] = L1[0] = left_ec; + // luma blocks + for (i = 0; i < offset; i += inc) { + const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob, + xd->qcoeff + i * 16, tx_size); + xd->eobs[i] = c; + eobtotal += c; + } - eobtotal += c; + // chroma blocks + for (i = offset; i < count; i += inc) { + const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, tx_size); + xd->eobs[i] = c; + eobtotal += c; + } - // 16x16 chroma blocks - seg_eob = get_eob(xd, segment_id, 256); + return eobtotal; +} - for (i = 16; i < 24; i += 4) { - ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_16X16][i]; - ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_16X16][i]; - ENTROPY_CONTEXT* const a1 = A1 + vp9_block2above[TX_16X16][i]; - ENTROPY_CONTEXT* const l1 = L1 + vp9_block2left[TX_16X16][i]; -#if CONFIG_CNVCONTEXT - above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; - left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; -#else - above_ec = a[0]; - left_ec = l[0]; -#endif +int vp9_decode_sb_tokens(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc) { + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: { + // 32x32 luma block + const int segment_id = xd->mode_info_context->mbmi.segment_id; + int i, eobtotal = 0, seg_eob; + int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC, + get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32); + xd->eobs[0] = c; + eobtotal += c; - eobs[i] = c = decode_coefs(pbi, xd, bc, - &above_ec, &left_ec, - PLANE_TYPE_UV, - DCT_DCT, seg_eob, - xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, - vp9_default_zig_zag1d_16x16, - TX_16X16, vp9_coef_bands_16x16); + // 16x16 chroma blocks + seg_eob = get_eob(xd, segment_id, 256); + for (i = 64; i < 96; i += 16) { + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_16X16); + xd->eobs[i] = c; + eobtotal += c; + } + return eobtotal; + } + case TX_16X16: + return decode_sb(pbi, xd, bc, 64, 96, 16, 16 * 16, TX_16X16); + case TX_8X8: + return decode_sb(pbi, xd, bc, 64, 96, 4, 8 * 8, TX_8X8); + case TX_4X4: + return 
decode_sb(pbi, xd, bc, 64, 96, 1, 4 * 4, TX_4X4); + default: + assert(0); + return 0; + } +} - a1[1] = a1[0] = a[1] = a[0] = above_ec; - l1[1] = l1[0] = l[1] = l[0] = left_ec; - eobtotal += c; +int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc) { + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + return decode_sb(pbi, xd, bc, 256, 384, 64, 32 * 32, TX_32X32); + case TX_16X16: + return decode_sb(pbi, xd, bc, 256, 384, 16, 16 * 16, TX_16X16); + case TX_8X8: + return decode_sb(pbi, xd, bc, 256, 384, 4, 8 * 8, TX_8X8); + case TX_4X4: + return decode_sb(pbi, xd, bc, 256, 384, 1, 4 * 4, TX_4X4); + default: + assert(0); + return 0; } - // no Y2 block - A[8] = L[8] = A1[8] = L1[8] = 0; - return eobtotal; } static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi, MACROBLOCKD* const xd, BOOL_DECODER* const bc) { - ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context; - ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context; - uint16_t *const eobs = xd->eobs; const int segment_id = xd->mode_info_context->mbmi.segment_id; - int c, i, eobtotal = 0, seg_eob; - // Luma block + int i, eobtotal = 0, seg_eob; -#if CONFIG_CNVCONTEXT - ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; - ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; -#else - ENTROPY_CONTEXT above_ec = A[0]; - ENTROPY_CONTEXT left_ec = L[0]; -#endif - eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, - PLANE_TYPE_Y_WITH_DC, - get_tx_type(xd, &xd->block[0]), - get_eob(xd, segment_id, 256), - xd->qcoeff, vp9_default_zig_zag1d_16x16, - TX_16X16, vp9_coef_bands_16x16); - A[1] = A[2] = A[3] = A[0] = above_ec; - L[1] = L[2] = L[3] = L[0] = left_ec; + // Luma block + int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC, + get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16); + xd->eobs[0] = c; eobtotal += c; // 8x8 chroma blocks seg_eob = get_eob(xd, segment_id, 64); for (i = 16; i < 24; i += 4) { - ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_8X8][i]; - ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_8X8][i]; -#if CONFIG_CNVCONTEXT - above_ec = (a[0] + a[1]) != 0; - left_ec = (l[0] + l[1]) != 0; -#else - above_ec = a[0]; - left_ec = l[0]; -#endif - eobs[i] = c = decode_coefs(pbi, xd, bc, - &above_ec, &left_ec, - PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, - TX_8X8, vp9_coef_bands_8x8); - a[1] = a[0] = above_ec; - l[1] = l[0] = left_ec; + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, + seg_eob, xd->block[i].qcoeff, TX_8X8); + xd->eobs[i] = c; eobtotal += c; } - A[8] = 0; - L[8] = 0; return eobtotal; } static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, MACROBLOCKD* const xd, BOOL_DECODER* const bc) { - ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context; - ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context; - uint16_t *const eobs = xd->eobs; - PLANE_TYPE type; - int c, i, eobtotal = 0, seg_eob; + int i, eobtotal = 0; const int segment_id = xd->mode_info_context->mbmi.segment_id; - int has_2nd_order = get_2nd_order_usage(xd); - // 2nd order DC block - if (has_2nd_order) { - ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][24]; - ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][24]; - - eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2, - DCT_DCT, get_eob(xd, segment_id, 4), - xd->block[24].qcoeff, - vp9_default_zig_zag1d_4x4, TX_8X8, - vp9_coef_bands_4x4); - eobtotal += c - 4; - type = PLANE_TYPE_Y_NO_DC; - } else { - 
xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - eobs[24] = 0; - type = PLANE_TYPE_Y_WITH_DC; - } - // luma blocks - seg_eob = get_eob(xd, segment_id, 64); + int seg_eob = get_eob(xd, segment_id, 64); for (i = 0; i < 16; i += 4) { - ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i]; - ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i]; -#if CONFIG_CNVCONTEXT - ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; - ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif - eobs[i] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, type, - type == PLANE_TYPE_Y_WITH_DC ? - get_tx_type(xd, xd->block + i) : DCT_DCT, - seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, - TX_8X8, vp9_coef_bands_8x8); - a[1] = a[0] = above_ec; - l[1] = l[0] = left_ec; + const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, + seg_eob, xd->block[i].qcoeff, TX_8X8); + xd->eobs[i] = c; eobtotal += c; } @@ -427,34 +495,16 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, // use 4x4 transform for U, V components in I8X8/splitmv prediction mode seg_eob = get_eob(xd, segment_id, 16); for (i = 16; i < 24; i++) { - ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i]; - ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i]; - - eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_4x4, TX_4X4, - vp9_coef_bands_4x4); + const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, + seg_eob, xd->block[i].qcoeff, TX_4X4); + xd->eobs[i] = c; eobtotal += c; } } else { for (i = 16; i < 24; i += 4) { - ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i]; - ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i]; -#if CONFIG_CNVCONTEXT - ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; - ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif - eobs[i] = c = decode_coefs(pbi, xd, bc, - &above_ec, &left_ec, - PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, - TX_8X8, vp9_coef_bands_8x8); - a[1] = a[0] = above_ec; - l[1] = l[0] = left_ec; + const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, + seg_eob, xd->block[i].qcoeff, TX_8X8); + xd->eobs[i] = c; eobtotal += c; } } @@ -464,64 +514,31 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, BOOL_DECODER* const bc, - PLANE_TYPE type, int i, int seg_eob, - TX_TYPE tx_type, const int *scan) { - ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context; - ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context; - ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i]; - ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i]; - uint16_t *const eobs = xd->eobs; - int c; - - c = decode_coefs(dx, xd, bc, a, l, type, tx_type, seg_eob, - xd->block[i].qcoeff, scan, TX_4X4, vp9_coef_bands_4x4); - eobs[i] = c; - + PLANE_TYPE type, int i, int seg_eob) { + const int c = decode_coefs(dx, xd, bc, i, type, seg_eob, + xd->block[i].qcoeff, TX_4X4); + xd->eobs[i] = c; return c; } -static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd, - BOOL_DECODER* const bc, - PLANE_TYPE type, int i, int seg_eob) { - const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type(xd, &xd->block[i]) : DCT_DCT; - const int *scan; - - switch (tx_type) { - case ADST_DCT: - scan = vp9_row_scan_4x4; - break; - case DCT_ADST: - scan = vp9_col_scan_4x4; - break; - default: - scan = vp9_default_zig_zag1d_4x4; - break; - } - - return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan); -} - int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, BOOL_DECODER* const bc, PLANE_TYPE type, int i) { const int segment_id = xd->mode_info_context->mbmi.segment_id; const int seg_eob = get_eob(xd, segment_id, 16); - return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob); + return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob); } static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd, BOOL_DECODER* const bc, int seg_eob) { - int eobtotal = 0, i; + int i, eobtotal = 0; // chroma blocks - for (i = 16; i < 24; i++) { - eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob, - DCT_DCT, vp9_default_zig_zag1d_4x4); - } + for (i = 16; i < 24; i++) + eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob); return eobtotal; } @@ -539,27 +556,12 @@ static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx, MACROBLOCKD* const xd, BOOL_DECODER* const bc) { int i, eobtotal = 0; - PLANE_TYPE type; const int segment_id = xd->mode_info_context->mbmi.segment_id; const int seg_eob = get_eob(xd, segment_id, 16); - const int has_2nd_order = get_2nd_order_usage(xd); - - // 2nd order DC block - if (has_2nd_order) { - eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24, seg_eob, - DCT_DCT, vp9_default_zig_zag1d_4x4) - 16; - type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - xd->eobs[24] = 0; - type = PLANE_TYPE_Y_WITH_DC; - } // luma blocks - for (i = 0; i < 16; ++i) { - eobtotal += decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob); - } + for (i = 0; i < 16; ++i) + eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob); // chroma blocks eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob); @@ -571,16 +573,13 @@ int vp9_decode_mb_tokens(VP9D_COMP* const dx, MACROBLOCKD* const xd, BOOL_DECODER* const bc) { const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; - int eobtotal; - - if (tx_size == TX_16X16) { - eobtotal = vp9_decode_mb_tokens_16x16(dx, xd, bc); - } else if (tx_size == TX_8X8) { - eobtotal = vp9_decode_mb_tokens_8x8(dx, xd, bc); - } else { - assert(tx_size == TX_4X4); - eobtotal = vp9_decode_mb_tokens_4x4(dx, xd, bc); + switch (tx_size) { + case TX_16X16: + return vp9_decode_mb_tokens_16x16(dx, xd, bc); + case TX_8X8: + return vp9_decode_mb_tokens_8x8(dx, xd, bc); + default: + assert(tx_size == TX_4X4); + return vp9_decode_mb_tokens_4x4(dx, xd, bc); } - - return eobtotal; } diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 926a0661f145b24a0983b03cd969d19a1dc95d16..33a34aeae004bda273e4e26aed7f591b8228f6d3 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -14,8 +14,6 @@ #include "vp9/decoder/vp9_onyxd_int.h" -void vp9_reset_mb_tokens_context(MACROBLOCKD* const); - int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, BOOL_DECODER* const bc, PLANE_TYPE type, int i); @@ -27,6 +25,10 @@ int vp9_decode_sb_tokens(VP9D_COMP* const pbi, MACROBLOCKD* const xd, BOOL_DECODER* const bc); +int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, + MACROBLOCKD* const xd, + BOOL_DECODER* const bc); + int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd, BOOL_DECODER* const bc); diff 
--git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 152527cfff513b7667bf3d091af9102b6a20b820..b17955b1caa8d57cb861561693d43accfb3336a2 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -10,54 +10,20 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" -#if CONFIG_LOSSLESS #include "vp9/decoder/vp9_dequantize.h" -#endif -void vp9_dequant_dc_idct_add_y_block_c(int16_t *q, const int16_t *dq, - uint8_t *pre, - uint8_t *dst, - int stride, uint16_t *eobs, - const int16_t *dc) { - int i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - if (*eobs++ > 1) - vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]); - else - vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride); - - q += 16; - pre += 4; - dst += 4; - dc++; - } - - pre += 64 - 16; - dst += 4 * stride - 16; - } -} - -void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, - const int16_t *dq, - uint8_t *dst, - int stride, - uint16_t *eobs, - const int16_t *dc, - MACROBLOCKD *xd) { +void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, + const int16_t *dq, + uint8_t *dst, + int stride, + MACROBLOCKD *xd) { int i, j; for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - if (*eobs++ > 1) - vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]); - else - vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride); - + xd->itxm_add(q, dq, dst, dst, stride, stride, xd->eobs[i * 4 + j]); q += 16; dst += 4; - dc++; } dst += 4 * stride - 16; @@ -67,18 +33,12 @@ void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, - int stride, uint16_t *eobs) { + int stride, MACROBLOCKD *xd) { int i, j; for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride); - else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add(q, dq, pre, dst, 16, stride, xd->eobs[i * 4 + j]); q += 16; pre += 4; dst += 4; @@ -92,18 +52,13 @@ void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq, void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, - uint16_t *eobs) { + MACROBLOCKD *xd) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride); - else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add(q, dq, pre, dstu, 8, stride, + xd->eobs[16 + i * 2 + j]); q += 16; pre += 4; dstu += 4; @@ -115,13 +70,8 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq, for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride); - else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add(q, dq, pre, dstv, 8, stride, + xd->eobs[20 + i * 2 + j]); q += 16; pre += 4; dstv += 4; @@ -136,19 +86,12 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, - uint16_t *eobs, MACROBLOCKD *xd) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) { - vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride); - } else { - vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride); - ((int *)q)[0] = 0; - } - + xd->itxm_add(q, dq, dstu, dstu, 
stride, stride, xd->eobs[16 + i * 2 + j]); q += 16; dstu += 4; } @@ -158,13 +101,7 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq, for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) { - vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride); - } else { - vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride); - ((int *)q)[0] = 0; - } - + xd->itxm_add(q, dq, dstv, dstv, stride, stride, xd->eobs[20 + i * 2 + j]); q += 16; dstv += 4; } @@ -173,69 +110,40 @@ void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq, } } -void vp9_dequant_dc_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq, - uint8_t *pre, - uint8_t *dst, - int stride, uint16_t *eobs, - const int16_t *dc, - MACROBLOCKD *xd) { - q[0] = dc[0]; - vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]); - - q[64] = dc[1]; - vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1, - xd->eobs[4]); - - q[128] = dc[4]; - vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16, - dst + 8 * stride, 16, stride, 1, xd->eobs[8]); - - q[192] = dc[8]; - vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8, - dst + 8 * stride + 8, 16, stride, 1, - xd->eobs[12]); -} +void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, + const int16_t *dq, + uint8_t *dst, + int stride, + MACROBLOCKD *xd) { + vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, xd->eobs[0]); -void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, - const int16_t *dq, - uint8_t *dst, - int stride, - uint16_t *eobs, - const int16_t *dc, - MACROBLOCKD *xd) { - q[0] = dc[0]; - vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]); - - q[64] = dc[1]; vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8, - dst + 8, stride, stride, 1, xd->eobs[4]); + dst + 8, stride, stride, xd->eobs[4]); - q[128] = dc[4]; vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride, - dst + 8 * stride, stride, stride, 1, - xd->eobs[8]); + dst + 8 * stride, stride, stride, + xd->eobs[8]); - q[192] = dc[8]; vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8, - dst + 8 * stride + 8, stride, stride, 1, - xd->eobs[12]); + dst + 8 * stride + 8, stride, stride, + xd->eobs[12]); } void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, - int stride, uint16_t *eobs, - MACROBLOCKD *xd) { + int stride, MACROBLOCKD *xd) { uint8_t *origdest = dst; uint8_t *origpred = pre; - vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]); + vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, xd->eobs[0]); vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8, - origdest + 8, 16, stride, 0, xd->eobs[4]); + origdest + 8, 16, stride, xd->eobs[4]); vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16, - origdest + 8 * stride, 16, stride, 0, xd->eobs[8]); + origdest + 8 * stride, 16, stride, + xd->eobs[8]); vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8, - origdest + 8 * stride + 8, 16, stride, 0, + origdest + 8 * stride + 8, 16, stride, xd->eobs[12]); } @@ -243,72 +151,39 @@ void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, - int stride, uint16_t *eobs, - MACROBLOCKD *xd) { - vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]); + int stride, MACROBLOCKD *xd) { + vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, xd->eobs[16]); q += 64; pre += 64; - vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 
8, stride, 0, xd->eobs[20]); + vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, xd->eobs[20]); } void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, - uint16_t *eobs, MACROBLOCKD *xd) { - vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0, + vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, xd->eobs[16]); q += 64; - vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0, + vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, xd->eobs[20]); } -#if CONFIG_LOSSLESS -void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq, - uint8_t *pre, - uint8_t *dst, - int stride, - uint16_t *eobs, - const int16_t *dc) { - int i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - if (*eobs++ > 1) - vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]); - else - vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride); - - q += 16; - pre += 4; - dst += 4; - dc++; - } - - pre += 64 - 16; - dst += 4 * stride - 16; - } -} void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, - int stride, uint16_t *eobs) { + int stride, MACROBLOCKD *xd) { int i, j; for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride); - else { - vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride, + xd->eobs[i * 4 + j]); q += 16; pre += 4; dst += 4; @@ -324,18 +199,13 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, - uint16_t *eobs) { + MACROBLOCKD *xd) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride); - else { - vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride, + xd->eobs[16 + i * 2 + j]); q += 16; pre += 4; dstu += 4; @@ -347,13 +217,8 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq, for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (*eobs++ > 1) - vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride); - else { - vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride); - ((int *)q)[0] = 0; - } - + vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride, + xd->eobs[20 + i * 2 + j]); q += 16; pre += 4; dstv += 4; @@ -363,5 +228,4 @@ void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq, dstv += 4 * stride - 8; } } -#endif diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h index 93321ef347a48844cffa6bc84fc1e96fc6143cd2..cd71166e45ceb81517a6418f0024950ba24ae09f 100644 --- a/vp9/decoder/vp9_onyxd.h +++ b/vp9/decoder/vp9_onyxd.h @@ -27,6 +27,7 @@ extern "C" { int Version; int postprocess; int max_threads; + int inv_tile_order; int input_partition; } VP9D_CONFIG; typedef enum { @@ -45,14 +46,16 @@ extern "C" { int64_t *time_stamp, int64_t *time_end_stamp, vp9_ppflags_t *flags); - vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp, - VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp, VP9_REFFRAME ref_frame_flag, 
YV12_BUFFER_CONFIG *sd); + int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); + VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf); void vp9_remove_decompressor(VP9D_PTR comp); diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index b3b75af70794fac4c0a5c4e25c7ae87f7ecb75c3..2b61f0affbc2bdc72067ab684002b09c369f74a7 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -9,6 +9,9 @@ */ +#include <stdio.h> +#include <assert.h> + #include "vp9/common/vp9_onyxc_int.h" #if CONFIG_POSTPROC #include "vp9/common/vp9_postproc.h" @@ -19,8 +22,6 @@ #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_swapyv12buffer.h" -#include <stdio.h> -#include <assert.h> #include "vp9/common/vp9_quant_common.h" #include "vpx_scale/vpx_scale.h" @@ -30,34 +31,34 @@ #include "vp9/decoder/vp9_detokenize.h" #include "./vpx_scale_rtcd.h" -static int get_free_fb(VP9_COMMON *cm); -static void ref_cnt_fb(int *buf, int *idx, int new_idx); - #define WRITE_RECON_BUFFER 0 #if WRITE_RECON_BUFFER == 1 -static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) { +static void recon_write_yuv_frame(const char *name, + const YV12_BUFFER_CONFIG *s, + int w, int _h) { FILE *yuv_file = fopen((char *)name, "ab"); - uint8_t *src = s->y_buffer; - int h = s->y_height; + const uint8_t *src = s->y_buffer; + int h = _h; do { - fwrite(src, s->y_width, 1, yuv_file); + fwrite(src, w, 1, yuv_file); src += s->y_stride; } while (--h); src = s->u_buffer; - h = s->uv_height; + h = (_h + 1) >> 1; + w = (w + 1) >> 1; do { - fwrite(src, s->uv_width, 1, yuv_file); + fwrite(src, w, 1, yuv_file); src += s->uv_stride; } while (--h); src = s->v_buffer; - h = s->uv_height; + h = (_h + 1) >> 1; do { - fwrite(src, s->uv_width, 1, yuv_file); + fwrite(src, w, 1, yuv_file); src += s->uv_stride; } while (--h); @@ -99,7 +100,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { } #endif -void vp9_initialize_dec(void) { +void vp9_initialize_dec() { static int init_done = 0; if (!init_done) { @@ -127,6 +128,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { vp9_initialize_dec(); vp9_create_common(&pbi->common); + pbi->oxcf = *oxcf; pbi->common.current_video_frame = 0; pbi->ready_for_new_data = 1; @@ -152,8 +154,8 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { if (!pbi) return; - // Delete sementation map - if (pbi->common.last_frame_seg_map != 0) + // Delete segmentation map + if (pbi->common.last_frame_seg_map) vpx_free(pbi->common.last_frame_seg_map); vp9_remove_common(&pbi->common); @@ -161,33 +163,37 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vpx_free(pbi); } +static int equal_dimensions(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width; +} -vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { +vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; int ref_fb_idx; - if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->lst_fb_idx; - else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->gld_fb_idx; - else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->alt_fb_idx; - else { + /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + * encoder is using the frame 
buffers for. This is just a stub to keep the + * vpxenc --test-decode functionality working, and will be replaced in a + * later commit that adds VP9-specific controls for this functionality. + */ + if (ref_frame_flag == VP9_LAST_FLAG) { + ref_fb_idx = pbi->common.ref_frame_map[0]; + } else { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Invalid reference frame"); return pbi->common.error.error_code; } - if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height || - cm->yv12_fb[ref_fb_idx].y_width != sd->y_width || - cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height || - cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) { + if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); - } else + } else { vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); + } return pbi->common.error.error_code; } @@ -198,34 +204,35 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; int *ref_fb_ptr = NULL; - int free_fb; + /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + * encoder is using the frame buffers for. This is just a stub to keep the + * vpxenc --test-decode functionality working, and will be replaced in a + * later commit that adds VP9-specific controls for this functionality. + */ if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_ptr = &cm->lst_fb_idx; + ref_fb_ptr = &pbi->common.active_ref_idx[0]; else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_ptr = &cm->gld_fb_idx; + ref_fb_ptr = &pbi->common.active_ref_idx[1]; else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_ptr = &cm->alt_fb_idx; + ref_fb_ptr = &pbi->common.active_ref_idx[2]; else { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Invalid reference frame"); return pbi->common.error.error_code; } - if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height || - cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width || - cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height || - cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) { + if (!equal_dimensions(&cm->yv12_fb[*ref_fb_ptr], sd)) { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); } else { - /* Find an empty frame buffer. */ - free_fb = get_free_fb(cm); - /* Decrease fb_idx_ref_cnt since it will be increased again in - * ref_cnt_fb() below. */ + // Find an empty frame buffer. + const int free_fb = get_free_fb(cm); + // Decrease fb_idx_ref_cnt since it will be increased again in + // ref_cnt_fb() below. cm->fb_idx_ref_cnt[free_fb]--; - /* Manage the reference counters and copy image. */ + // Manage the reference counters and copy image. 
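+ // (ref_cnt_fb() releases the reference held on the old index, points
+ // *ref_fb_ptr at free_fb, and bumps the new buffer's count; the static
+ // definition being dropped just below shows the exact three steps.)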
ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb); vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]); } @@ -234,77 +241,36 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, } -static int get_free_fb(VP9_COMMON *cm) { - int i; - for (i = 0; i < NUM_YV12_BUFFERS; i++) - if (cm->fb_idx_ref_cnt[i] == 0) - break; - - assert(i < NUM_YV12_BUFFERS); - cm->fb_idx_ref_cnt[i] = 1; - return i; -} - -static void ref_cnt_fb(int *buf, int *idx, int new_idx) { - if (buf[*idx] > 0) - buf[*idx]--; +int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { + VP9D_COMP *pbi = (VP9D_COMP *) ptr; + VP9_COMMON *cm = &pbi->common; - *idx = new_idx; + if (index < 0 || index >= NUM_REF_FRAMES) + return -1; - buf[new_idx]++; + *fb = &cm->yv12_fb[cm->ref_frame_map[index]]; + return 0; } -/* If any buffer copy / swapping is signalled it should be done here. */ -static int swap_frame_buffers(VP9_COMMON *cm) { - int err = 0; - - /* The alternate reference frame or golden frame can be updated - * using the new, last, or golden/alt ref frame. If it - * is updated using the newly decoded frame it is a refresh. - * An update using the last or golden/alt ref frame is a copy. - */ - if (cm->copy_buffer_to_arf) { - int new_fb = 0; - - if (cm->copy_buffer_to_arf == 1) - new_fb = cm->lst_fb_idx; - else if (cm->copy_buffer_to_arf == 2) - new_fb = cm->gld_fb_idx; - else - err = -1; - - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb); - } +/* If any buffer updating is signalled it should be done here. */ +static void swap_frame_buffers(VP9D_COMP *pbi) { + int ref_index = 0, mask; - if (cm->copy_buffer_to_gf) { - int new_fb = 0; - - if (cm->copy_buffer_to_gf == 1) - new_fb = cm->lst_fb_idx; - else if (cm->copy_buffer_to_gf == 2) - new_fb = cm->alt_fb_idx; - else - err = -1; - - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb); + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + ref_cnt_fb(pbi->common.fb_idx_ref_cnt, + &pbi->common.ref_frame_map[ref_index], + pbi->common.new_fb_idx); + } + ++ref_index; } - if (cm->refresh_golden_frame) - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx); - - if (cm->refresh_alt_ref_frame) - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx); + pbi->common.frame_to_show = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + pbi->common.fb_idx_ref_cnt[pbi->common.new_fb_idx]--; - if (cm->refresh_last_frame) { - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx); - - cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx]; - } else - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; - - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - - return err; + /* Invalidate these references until the next frame starts. */ + for (ref_index = 0; ref_index < 3; ref_index++) + pbi->common.active_ref_idx[ref_index] = INT_MAX; } int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, @@ -332,8 +298,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, * We do not know if the missing frame(s) was supposed to update * any of the reference buffers, but we act conservative and * mark only the last buffer as corrupted. + * + * TODO(jkoleszar): Error concealment is undefined and non-normative + * at this point, but if it becomes so, [0] may not always be the correct + * thing to do here. 
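+ * (active_ref_idx[] entries are reset to INT_MAX by swap_frame_buffers()
+ * once a frame completes, which is what the INT_MAX guards on the
+ * corruption marking below test for.)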
*/ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; + if (cm->active_ref_idx[0] != INT_MAX) + cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; } cm->new_fb_idx = get_free_fb(cm); @@ -344,8 +315,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, /* We do not know if the missing frame(s) was supposed to update * any of the reference buffers, but we act conservative and * mark only the last buffer as corrupted. + * + * TODO(jkoleszar): Error concealment is undefined and non-normative + * at this point, but if it becomes so, [0] may not always be the correct + * thing to do here. */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; + if (cm->active_ref_idx[0] != INT_MAX) + cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; @@ -365,11 +341,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, } { - if (swap_frame_buffers(cm)) { - pbi->common.error.error_code = VPX_CODEC_ERROR; - pbi->common.error.setjmp = 0; - return -1; - } + swap_frame_buffers(pbi); #if WRITE_RECON_BUFFER == 2 if (cm->show_frame) @@ -382,14 +354,16 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, if (cm->filter_level) { /* Apply the loop filter if appropriate. */ - vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0); + vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0, + cm->dering_enabled); } vp8_yv12_extend_frame_borders(cm->frame_to_show); } #if WRITE_RECON_BUFFER == 1 if (cm->show_frame) - recon_write_yuv_frame("recon.yuv", cm->frame_to_show); + recon_write_yuv_frame("recon.yuv", cm->frame_to_show, + cm->width, cm->height); #endif vp9_clear_system_state(); @@ -440,9 +414,9 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, if (pbi->common.frame_to_show) { *sd = *pbi->common.frame_to_show; - sd->y_width = pbi->common.Width; - sd->y_height = pbi->common.Height; - sd->uv_height = pbi->common.Height / 2; + sd->y_width = pbi->common.width; + sd->y_height = pbi->common.height; + sd->uv_height = pbi->common.height / 2; ret = 0; } else { ret = -1; diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 64975468ddcfa056609f42de2bd95d0e58d2db2f..0e6d059af707a1530d732c10529ee8d2248c6bd9 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -18,41 +18,6 @@ // #define DEC_DEBUG -typedef struct { - int ithread; - void *ptr1; - void *ptr2; -} DECODETHREAD_DATA; - -typedef struct { - MACROBLOCKD mbd; - int mb_row; - int current_mb_col; - short *coef_ptr; -} MB_ROW_DEC; - -typedef struct { - int const *scan; - int const *scan_8x8; - uint8_t const *ptr_block2leftabove; - vp9_tree_index const *vp9_coef_tree_ptr; - unsigned char *norm_ptr; - uint8_t *ptr_coef_bands_x; - uint8_t *ptr_coef_bands_x_8x8; - - ENTROPY_CONTEXT_PLANES *A; - ENTROPY_CONTEXT_PLANES *L; - - int16_t *qcoeff_start_ptr; - - vp9_prob const *coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16]; - - uint8_t eob[25]; - -} DETOK; - typedef struct VP9Decompressor { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -68,18 +33,13 @@ typedef struct VP9Decompressor { int64_t last_time_stamp; int ready_for_new_data; - DETOK detoken; - - vp9_dequant_idct_add_fn_t idct_add; - vp9_dequant_dc_idct_add_fn_t dc_idct_add; - vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block; - vp9_dequant_idct_add_y_block_fn_t idct_add_y_block; - vp9_dequant_idct_add_uv_block_fn_t idct_add_uv_block; - + int refresh_frame_flags; 
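+ // one bit per ref_frame_map slot; each set bit marks a reference that
+ // the newly decoded frame refreshes (consumed by swap_frame_buffers())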
vp9_prob prob_skip_false; int decoded_key_frame; + int initial_width; + int initial_height; } VP9D_COMP; int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end); diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h index cca017de420c8e5ab49d16cac0ff8b1247504498..4ec6de99de609c495aa001456fe4089d3652b11e 100644 --- a/vp9/decoder/vp9_treereader.h +++ b/vp9/decoder/vp9_treereader.h @@ -13,17 +13,16 @@ #define VP9_DECODER_VP9_TREEREADER_H_ #include "vp9/common/vp9_treecoder.h" - #include "vp9/decoder/vp9_dboolhuff.h" typedef BOOL_DECODER vp9_reader; #define vp9_read decode_bool #define vp9_read_literal decode_value -#define vp9_read_bit(R) vp9_read(R, vp9_prob_half) - -/* Intent of tree data structure is to make decoding trivial. */ +#define vp9_read_bit(r) vp9_read(r, vp9_prob_half) +#define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8)) +// Intent of tree data structure is to make decoding trivial. static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */ vp9_tree t, const vp9_prob *const p) { diff --git a/vp9/decoder/x86/vp9_dequantize_mmx.asm b/vp9/decoder/x86/vp9_dequantize_mmx.asm deleted file mode 100644 index 23080bfee88d2d5c70a425e9c4f9f447858281f3..0000000000000000000000000000000000000000 --- a/vp9/decoder/x86/vp9_dequantize_mmx.asm +++ /dev/null @@ -1,406 +0,0 @@ -; -; Copyright (c) 2012 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -x_s1sqr2: times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: times 4 dw 0x4E7B -align 16 -pw_16: times 4 dw 16 - -SECTION .text - -INIT_MMX - - -;void dequantize_b_impl_mmx(short *sq, short *dq, short *q) -cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3 - mova m1, [sqq] - pmullw m1, [arg3q+0] ; mm4 *= kernel 0 modifiers. - mova [dqq+ 0], m1 - - mova m1, [sqq+8] - pmullw m1, [arg3q+8] ; mm4 *= kernel 0 modifiers. - mova [dqq+ 8], m1 - - mova m1, [sqq+16] - pmullw m1, [arg3q+16] ; mm4 *= kernel 0 modifiers. - mova [dqq+16], m1 - - mova m1, [sqq+24] - pmullw m1, [arg3q+24] ; mm4 *= kernel 0 modifiers. 
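- ; (dequantize_b_impl_mmx is a plain elementwise multiply, dq[i] = q[i] * factor[i];
- ; the "mm4" in the legacy comments is stale, the code actually uses m1)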
- mova [dqq+24], m1 - RET - - -;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride) -cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride - -%if ARCH_X86_64 - movsxd strideq, dword stridem - movsxd pitq, dword pitm -%else - mov strideq, stridem - mov pitq, pitm -%endif - - mova m0, [inpq+ 0] - pmullw m0, [dqq] - - mova m1, [inpq+ 8] - pmullw m1, [dqq+ 8] - - mova m2, [inpq+16] - pmullw m2, [dqq+16] - - mova m3, [inpq+24] - pmullw m3, [dqq+24] - - pxor m7, m7 - mova [inpq], m7 - mova [inpq+8], m7 - mova [inpq+16], m7 - mova [inpq+24], m7 - - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2]; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1]; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2]; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1]; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - pxor m7, m7 - - movh m4, [predq] - punpcklbw m4, m7 - paddsw m0, m4 - packuswb m0, m7 - movh [destq], m0 - - movh m4, [predq+pitq] - punpcklbw m4, m7 - paddsw m1, m4 - packuswb m1, m7 - movh [destq+strideq], m1 - - movh m4, [predq+2*pitq] - punpcklbw m4, m7 - paddsw m2, m4 - packuswb m2, m7 - movh [destq+strideq*2], m2 - - add destq, strideq - add predq, pitq - - movh m4, [predq+2*pitq] - punpcklbw m4, m7 - paddsw m5, m4 - packuswb m5, m7 - movh [destq+strideq*2], m5 - RET - - -;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc) -cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc - -%if ARCH_X86_64 - movsxd strideq, dword stridem - movsxd pitq, dword pitm -%else - mov strideq, stridem - mov pitq, pitm -%endif - - mov Dcq, Dcm - mova m0, [inpq+ 0] - pmullw m0, [dqq+ 0] - - mova m1, [inpq+ 8] - pmullw m1, [dqq+ 8] - - mova m2, [inpq+16] - pmullw m2, [dqq+16] - - mova m3, 
[inpq+24] - pmullw m3, [dqq+24] - - pxor m7, m7 - mova [inpq+ 0], m7 - mova [inpq+ 8], m7 - mova [inpq+16], m7 - mova [inpq+24], m7 - - ; move lower word of Dc to lower word of m0 - psrlq m0, 16 - psllq m0, 16 - and Dcq, 0xFFFF ; If Dc < 0, we don't want the full dword precision. - movh m7, Dcq - por m0, m7 - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2]; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1]; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2]; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1]; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - pxor m7, m7 - - movh m4, [predq] - punpcklbw m4, m7 - paddsw m0, m4 - packuswb m0, m7 - movh [destq], m0 - - movh m4, [predq+pitq] - punpcklbw m4, m7 - paddsw m1, m4 - packuswb m1, m7 - movh [destq+strideq], m1 - - movh m4, [predq+2*pitq] - punpcklbw m4, m7 - paddsw m2, m4 - packuswb m2, m7 - movh [destq+strideq*2], m2 - - add destq, strideq - add predq, pitq - - movh m4, [predq+2*pitq] - punpcklbw m4, m7 - paddsw m5, m4 - packuswb m5, m7 - movh [destq+strideq*2], m5 - RET - diff --git a/vp9/decoder/x86/vp9_dequantize_x86.c b/vp9/decoder/x86/vp9_dequantize_x86.c new file mode 100644 index 0000000000000000000000000000000000000000..acfae2a27144d0ac66c2bd11f1a838a332f50655 --- /dev/null +++ b/vp9/decoder/x86/vp9_dequantize_x86.c @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +#if HAVE_SSE2 + +void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + const int width = 4; + const __m128i zero = _mm_setzero_si128(); + + // Diff data + const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width)); + const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width)); + const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width)); + const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width)); + + // Prediction data. + __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch)); + __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch)); + __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch)); + __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch)); + + p0 = _mm_unpacklo_epi8(p0, zero); + p1 = _mm_unpacklo_epi8(p1, zero); + p2 = _mm_unpacklo_epi8(p2, zero); + p3 = _mm_unpacklo_epi8(p3, zero); + + p0 = _mm_add_epi16(p0, d0); + p1 = _mm_add_epi16(p1, d1); + p2 = _mm_add_epi16(p2, d2); + p3 = _mm_add_epi16(p3, d3); + + p0 = _mm_packus_epi16(p0, p1); + p2 = _mm_packus_epi16(p2, p3); + + *(int *)dest = _mm_cvtsi128_si32(p0); + dest += stride; + + p0 = _mm_srli_si128(p0, 8); + *(int *)dest = _mm_cvtsi128_si32(p0); + dest += stride; + + *(int *)dest = _mm_cvtsi128_si32(p2); + dest += stride; + + p2 = _mm_srli_si128(p2, 8); + *(int *)dest = _mm_cvtsi128_si32(p2); +} + +void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + const int width = 8; + const __m128i zero = _mm_setzero_si128(); + + // Diff data + const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width)); + const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width)); + const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width)); + const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width)); + const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width)); + const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width)); + + // Prediction data. 
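+ // (Each loadl_epi64 below pulls one 8-pixel prediction row; unpacking
+ // against zero widens the bytes to 16-bit lanes, so the residual add
+ // cannot wrap before packus_epi16 saturates back to 8 bits.)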
+ __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch)); + __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch)); + __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch)); + __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch)); + __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch)); + __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch)); + __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch)); + + p0 = _mm_unpacklo_epi8(p0, zero); + p1 = _mm_unpacklo_epi8(p1, zero); + p2 = _mm_unpacklo_epi8(p2, zero); + p3 = _mm_unpacklo_epi8(p3, zero); + p4 = _mm_unpacklo_epi8(p4, zero); + p5 = _mm_unpacklo_epi8(p5, zero); + p6 = _mm_unpacklo_epi8(p6, zero); + p7 = _mm_unpacklo_epi8(p7, zero); + + p0 = _mm_add_epi16(p0, d0); + p1 = _mm_add_epi16(p1, d1); + p2 = _mm_add_epi16(p2, d2); + p3 = _mm_add_epi16(p3, d3); + p4 = _mm_add_epi16(p4, d4); + p5 = _mm_add_epi16(p5, d5); + p6 = _mm_add_epi16(p6, d6); + p7 = _mm_add_epi16(p7, d7); + + p0 = _mm_packus_epi16(p0, p1); + p2 = _mm_packus_epi16(p2, p3); + p4 = _mm_packus_epi16(p4, p5); + p6 = _mm_packus_epi16(p6, p7); + + _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); + + _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); + p2 = _mm_srli_si128(p2, 8); + _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); + + _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); + p4 = _mm_srli_si128(p4, 8); + _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); + + _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); + p6 = _mm_srli_si128(p6, 8); + _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); +} + +void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + const int width = 16; + int i = 4; + const __m128i zero = _mm_setzero_si128(); + + // Diff data + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + + do { + d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8)); + d2 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8)); + d4 = _mm_load_si128((const __m128i *)(diff + 2 * width)); + d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8)); + d6 = _mm_load_si128((const __m128i *)(diff + 3 * width)); + d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8)); + + // Prediction data. 
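+ // (Only p1/p3/p5/p7 are loaded here: each 16-byte row is then split into
+ // low and high halves with unpacklo/unpackhi_epi8, filling p0..p7 with
+ // widened 16-bit pixels for the adds.)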
+ p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch)); + p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch)); + + p0 = _mm_unpacklo_epi8(p1, zero); + p1 = _mm_unpackhi_epi8(p1, zero); + p2 = _mm_unpacklo_epi8(p3, zero); + p3 = _mm_unpackhi_epi8(p3, zero); + p4 = _mm_unpacklo_epi8(p5, zero); + p5 = _mm_unpackhi_epi8(p5, zero); + p6 = _mm_unpacklo_epi8(p7, zero); + p7 = _mm_unpackhi_epi8(p7, zero); + + p0 = _mm_add_epi16(p0, d0); + p1 = _mm_add_epi16(p1, d1); + p2 = _mm_add_epi16(p2, d2); + p3 = _mm_add_epi16(p3, d3); + p4 = _mm_add_epi16(p4, d4); + p5 = _mm_add_epi16(p5, d5); + p6 = _mm_add_epi16(p6, d6); + p7 = _mm_add_epi16(p7, d7); + + p0 = _mm_packus_epi16(p0, p1); + p1 = _mm_packus_epi16(p2, p3); + p2 = _mm_packus_epi16(p4, p5); + p3 = _mm_packus_epi16(p6, p7); + + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 1 * stride), p1); + _mm_store_si128((__m128i *)(dest + 2 * stride), p2); + _mm_store_si128((__m128i *)(dest + 3 * stride), p3); + + diff += 4 * width; + pred += 4 * pitch; + dest += 4 * stride; + } while (--i); +} + +void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + const int width = 32; + int i = 16; + const __m128i zero = _mm_setzero_si128(); + + // Diff data + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + + do { + d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8)); + d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16)); + d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24)); + d4 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8)); + d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16)); + d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24)); + + // Prediction data. + p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16)); + p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16)); + + p0 = _mm_unpacklo_epi8(p1, zero); + p1 = _mm_unpackhi_epi8(p1, zero); + p2 = _mm_unpacklo_epi8(p3, zero); + p3 = _mm_unpackhi_epi8(p3, zero); + p4 = _mm_unpacklo_epi8(p5, zero); + p5 = _mm_unpackhi_epi8(p5, zero); + p6 = _mm_unpacklo_epi8(p7, zero); + p7 = _mm_unpackhi_epi8(p7, zero); + + p0 = _mm_add_epi16(p0, d0); + p1 = _mm_add_epi16(p1, d1); + p2 = _mm_add_epi16(p2, d2); + p3 = _mm_add_epi16(p3, d3); + p4 = _mm_add_epi16(p4, d4); + p5 = _mm_add_epi16(p5, d5); + p6 = _mm_add_epi16(p6, d6); + p7 = _mm_add_epi16(p7, d7); + + p0 = _mm_packus_epi16(p0, p1); + p1 = _mm_packus_epi16(p2, p3); + p2 = _mm_packus_epi16(p4, p5); + p3 = _mm_packus_epi16(p6, p7); + + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1); + _mm_store_si128((__m128i *)(dest + 1 * stride), p2); + _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3); + + diff += 2 * width; + pred += 2 * pitch; + dest += 2 * stride; + } while (--i); +} + +void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + + // Prediction data. 
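+ // (Rows are paired two-per-register via unpacklo_epi64 below, so all
+ // eight rows are updated with just four saturating byte operations; the
+ // abs_diff * 0x01010101u plus shuffle_epi32 idiom broadcasts the clamped
+ // byte to all sixteen lanes.)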
+ __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch)); + __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch)); + __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch)); + __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch)); + __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch)); + __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch)); + __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch)); + + p0 = _mm_unpacklo_epi64(p0, p1); + p2 = _mm_unpacklo_epi64(p2, p3); + p4 = _mm_unpacklo_epi64(p4, p5); + p6 = _mm_unpacklo_epi64(p6, p7); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (diff >= 0) { + abs_diff = (diff > 255) ? 255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_adds_epu8(p0, d); + p2 = _mm_adds_epu8(p2, d); + p4 = _mm_adds_epu8(p4, d); + p6 = _mm_adds_epu8(p6, d); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_subs_epu8(p0, d); + p2 = _mm_subs_epu8(p2, d); + p4 = _mm_subs_epu8(p4, d); + p6 = _mm_subs_epu8(p6, d); + } + + _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); + + _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); + p2 = _mm_srli_si128(p2, 8); + _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); + + _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); + p4 = _mm_srli_si128(p4, 8); + _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); + + _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); + p6 = _mm_srli_si128(p6, 8); + _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); +} + +void vp9_add_constant_residual_16x16_sse2(const int16_t diff, + const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + + // Prediction data. + __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch)); + __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch)); + __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch)); + __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch)); + __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch)); + __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch)); + __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch)); + __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch)); + __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch)); + __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch)); + __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch)); + __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch)); + __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch)); + __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch)); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (diff >= 0) { + abs_diff = (diff > 255) ? 
255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_adds_epu8(p0, d); + p1 = _mm_adds_epu8(p1, d); + p2 = _mm_adds_epu8(p2, d); + p3 = _mm_adds_epu8(p3, d); + p4 = _mm_adds_epu8(p4, d); + p5 = _mm_adds_epu8(p5, d); + p6 = _mm_adds_epu8(p6, d); + p7 = _mm_adds_epu8(p7, d); + p8 = _mm_adds_epu8(p8, d); + p9 = _mm_adds_epu8(p9, d); + p10 = _mm_adds_epu8(p10, d); + p11 = _mm_adds_epu8(p11, d); + p12 = _mm_adds_epu8(p12, d); + p13 = _mm_adds_epu8(p13, d); + p14 = _mm_adds_epu8(p14, d); + p15 = _mm_adds_epu8(p15, d); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_subs_epu8(p0, d); + p1 = _mm_subs_epu8(p1, d); + p2 = _mm_subs_epu8(p2, d); + p3 = _mm_subs_epu8(p3, d); + p4 = _mm_subs_epu8(p4, d); + p5 = _mm_subs_epu8(p5, d); + p6 = _mm_subs_epu8(p6, d); + p7 = _mm_subs_epu8(p7, d); + p8 = _mm_subs_epu8(p8, d); + p9 = _mm_subs_epu8(p9, d); + p10 = _mm_subs_epu8(p10, d); + p11 = _mm_subs_epu8(p11, d); + p12 = _mm_subs_epu8(p12, d); + p13 = _mm_subs_epu8(p13, d); + p14 = _mm_subs_epu8(p14, d); + p15 = _mm_subs_epu8(p15, d); + } + + // Store results + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 1 * stride), p1); + _mm_store_si128((__m128i *)(dest + 2 * stride), p2); + _mm_store_si128((__m128i *)(dest + 3 * stride), p3); + _mm_store_si128((__m128i *)(dest + 4 * stride), p4); + _mm_store_si128((__m128i *)(dest + 5 * stride), p5); + _mm_store_si128((__m128i *)(dest + 6 * stride), p6); + _mm_store_si128((__m128i *)(dest + 7 * stride), p7); + _mm_store_si128((__m128i *)(dest + 8 * stride), p8); + _mm_store_si128((__m128i *)(dest + 9 * stride), p9); + _mm_store_si128((__m128i *)(dest + 10 * stride), p10); + _mm_store_si128((__m128i *)(dest + 11 * stride), p11); + _mm_store_si128((__m128i *)(dest + 12 * stride), p12); + _mm_store_si128((__m128i *)(dest + 13 * stride), p13); + _mm_store_si128((__m128i *)(dest + 14 * stride), p14); + _mm_store_si128((__m128i *)(dest + 15 * stride), p15); +} + +void vp9_add_constant_residual_32x32_sse2(const int16_t diff, + const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + int i = 8; + + if (diff >= 0) { + abs_diff = (diff > 255) ? 255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + } + + do { + // Prediction data. + __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16)); + __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16)); + __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch)); + __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16)); + __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch)); + __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16)); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. 
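+ // (d was already broadcast outside the loop; only the sign of diff
+ // decides between saturating add and subtract here, and adds/subs_epu8
+ // pin the results at 255/0 without any per-pixel clamp.)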
+ if (diff >= 0) { + p0 = _mm_adds_epu8(p0, d); + p1 = _mm_adds_epu8(p1, d); + p2 = _mm_adds_epu8(p2, d); + p3 = _mm_adds_epu8(p3, d); + p4 = _mm_adds_epu8(p4, d); + p5 = _mm_adds_epu8(p5, d); + p6 = _mm_adds_epu8(p6, d); + p7 = _mm_adds_epu8(p7, d); + } else { + p0 = _mm_subs_epu8(p0, d); + p1 = _mm_subs_epu8(p1, d); + p2 = _mm_subs_epu8(p2, d); + p3 = _mm_subs_epu8(p3, d); + p4 = _mm_subs_epu8(p4, d); + p5 = _mm_subs_epu8(p5, d); + p6 = _mm_subs_epu8(p6, d); + p7 = _mm_subs_epu8(p7, d); + } + + // Store results + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1); + _mm_store_si128((__m128i *)(dest + 1 * stride), p2); + _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3); + _mm_store_si128((__m128i *)(dest + 2 * stride), p4); + _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5); + _mm_store_si128((__m128i *)(dest + 3 * stride), p6); + _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7); + + pred += 4 * pitch; + dest += 4 * stride; + } while (--i); +} +#endif diff --git a/vp9/decoder/x86/vp9_idct_blk_mmx.c b/vp9/decoder/x86/vp9_idct_blk_mmx.c deleted file mode 100644 index 8279eaa4ad193f3b4bb7b3b16be4e32a05dfdcaf..0000000000000000000000000000000000000000 --- a/vp9/decoder/x86/vp9_idct_blk_mmx.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/decoder/vp9_dequantize.h" -#include "vp9/decoder/x86/vp9_idct_mmx.h" - -void vp9_dequant_dc_idct_add_y_block_mmx(short *q, const short *dq, - unsigned char *pre, - unsigned char *dst, - int stride, unsigned short *eobs, - const short *dc) { - int i; - - for (i = 0; i < 4; i++) { - if (eobs[0] > 1) - vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]); - else - vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride); - - if (eobs[1] > 1) - vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4, - dst + 4, 16, stride, dc[1]); - else - vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride); - - if (eobs[2] > 1) - vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8, - dst + 8, 16, stride, dc[2]); - else - vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride); - - if (eobs[3] > 1) - vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12, - dst + 12, 16, stride, dc[3]); - else - vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride); - - q += 64; - dc += 4; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp9_dequant_idct_add_y_block_mmx(short *q, const short *dq, - unsigned char *pre, - unsigned char *dst, - int stride, unsigned short *eobs) { - int i; - - for (i = 0; i < 4; i++) { - if (eobs[0] > 1) - vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride); - else { - vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride); - else { - vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride); - ((int *)(q + 16))[0] = 0; - } - - if (eobs[2] > 1) - vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride); - else { - 
vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride); - ((int *)(q + 32))[0] = 0; - } - - if (eobs[3] > 1) - vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride); - else { - vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride); - ((int *)(q + 48))[0] = 0; - } - - q += 64; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp9_dequant_idct_add_uv_block_mmx(short *q, const short *dq, - unsigned char *pre, - unsigned char *dstu, - unsigned char *dstv, - int stride, unsigned short *eobs) { - int i; - - for (i = 0; i < 2; i++) { - if (eobs[0] > 1) - vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride); - else { - vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride); - else { - vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - pre += 32; - dstu += 4 * stride; - eobs += 2; - } - - for (i = 0; i < 2; i++) { - if (eobs[0] > 1) - vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride); - else { - vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride); - else { - vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - pre += 32; - dstv += 4 * stride; - eobs += 2; - } -} diff --git a/vp9/decoder/x86/vp9_idct_mmx.h b/vp9/decoder/x86/vp9_idct_mmx.h index c0e9bfd0629600ec153cee9f0c4b0332a27c4e11..7d9829175ddda4441a94df9306e36a0c1d80cbde 100644 --- a/vp9/decoder/x86/vp9_idct_mmx.h +++ b/vp9/decoder/x86/vp9_idct_mmx.h @@ -16,9 +16,6 @@ void vp9_dequant_dc_idct_add_mmx(short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc); -void vp9_dc_only_idct_add_mmx(short input_dc, const unsigned char *pred_ptr, - unsigned char *dst_ptr, int pitch, int stride); - void vp9_dequant_idct_add_mmx(short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride); diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c index 71fad2e0753b07ffd75a33f15708df3e6014680d..e174a894a06c9c3b15eeb55d97e35598959c960f 100644 --- a/vp9/encoder/vp9_asm_enc_offsets.c +++ b/vp9/encoder/vp9_asm_enc_offsets.c @@ -32,7 +32,6 @@ DEFINE(vp9_block_quant_shift, offsetof(BLOCK, quant_shift)); DEFINE(vp9_blockd_qcoeff, offsetof(BLOCKD, qcoeff)); DEFINE(vp9_blockd_dequant, offsetof(BLOCKD, dequant)); DEFINE(vp9_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff)); -DEFINE(vp9_blockd_eob, offsetof(BLOCKD, eob)); END diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 61aac5cd1c37c98308dea667054f403700d14234..7128b70c8612e115aa33dd7e2703b3dc9b3e9ea8 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -14,6 +14,7 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/common/vp9_systemdependent.h" #include <assert.h> @@ -41,17 +42,32 @@ unsigned __int64 Sectionbits[500]; int intra_mode_stats[VP9_KF_BINTRAMODES] [VP9_KF_BINTRAMODES] [VP9_KF_BINTRAMODES]; -vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8]; 
-vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8]; -vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16]; -vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16]; -vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32]; +vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES]; +vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES]; +vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES]; +vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES]; extern unsigned int active_section; #endif +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_STATS +unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC4X4_TOKENS]; +unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC8X8_TOKENS]; +unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC16X16_TOKENS]; +unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC32X32_TOKENS]; +unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA] + [NZC_BITS_EXTRA][2]; +void init_nzcstats(); +void update_nzcstats(VP9_COMMON *const cm); +void print_nzcstats(); +#endif +#endif + #ifdef MODE_STATS int count_mb_seg[4] = { 0, 0, 0, 0 }; #endif @@ -112,8 +128,8 @@ static void update_mode( unsigned int new_b = 0, old_b = 0; int i = 0; - vp9_tree_probs_from_distribution(n--, tok, tree, - Pnew, bct, num_events); + vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0); + n--; do { new_b += cost_branch(bct[i], Pnew[i]); @@ -169,10 +185,9 @@ static void update_switchable_interp_probs(VP9_COMP *cpi, int i, j; for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { vp9_tree_probs_from_distribution( - VP9_SWITCHABLE_FILTERS, - vp9_switchable_interp_encodings, vp9_switchable_interp_tree, + vp9_switchable_interp_tree, pc->fc.switchable_interp_prob[j], branch_ct, - cpi->switchable_interp_count[j]); + cpi->switchable_interp_count[j], 0); for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { if (pc->fc.switchable_interp_prob[j][i] < 1) pc->fc.switchable_interp_prob[j][i] = 1; @@ -189,15 +204,7 @@ static void update_refpred_stats(VP9_COMP *cpi) { int old_cost, new_cost; // Set the prediction probability structures to defaults - if (cm->frame_type == KEY_FRAME) { - // Set the prediction probabilities to defaults - cm->ref_pred_probs[0] = 120; - cm->ref_pred_probs[1] = 80; - cm->ref_pred_probs[2] = 40; - - vpx_memset(cpi->ref_pred_probs_update, 0, - sizeof(cpi->ref_pred_probs_update)); - } else { + if (cm->frame_type != KEY_FRAME) { // From the prediction counts set the probabilities for each context for (i = 0; i < PREDICTION_PROBS; i++) { new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0], @@ -219,7 +226,6 @@ static void update_refpred_stats(VP9_COMP *cpi) { cm->ref_pred_probs[i] = new_pred_probs[i]; } else cpi->ref_pred_probs_update[i] = 0; - } } } @@ -230,8 +236,8 @@ static void update_refpred_stats(VP9_COMP *cpi) { // // The branch counts table is re-populated during the actual pack stage and in // the decoder to facilitate backwards update of the context. 
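Editor's note (not part of the patch): the rename below, update_mode_probs() to update_inter_mode_probs(), and the savings-search helpers that follow all apply one recurring decision: derive a candidate probability from this frame's branch counts, then adopt it only if the bits saved on the coded branches exceed the bits spent signalling the update. A minimal sketch of that decision for a single binary branch, with cost_bit() as a hypothetical stand-in for the codec's vp9_cost_zero()/vp9_cost_one() tables (costs in 1/256-bit units):

static int prob_update_gain(const unsigned int ct[2],
                            vp9_prob oldp, vp9_prob newp, vp9_prob upd) {
  /* Bits spent coding the observed 0/1 counts with each probability. */
  const int old_cost = ct[0] * cost_bit(oldp, 0) + ct[1] * cost_bit(oldp, 1);
  const int new_cost = ct[0] * cost_bit(newp, 0) + ct[1] * cost_bit(newp, 1);
  /* Cost of signalling the change: the "update?" flag plus roughly 8 bits
   * of payload for the new probability itself. */
  const int upd_cost = cost_bit(upd, 1) + 8 * 256;
  return old_cost - new_cost - upd_cost;  /* > 0: the update pays for itself */
}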
-static void update_mode_probs(VP9_COMMON *cm, - int mode_context[INTER_MODE_CONTEXTS][4]) { +static void update_inter_mode_probs(VP9_COMMON *cm, + int mode_context[INTER_MODE_CONTEXTS][4]) { int i, j; unsigned int (*mv_ref_ct)[4][2]; @@ -393,6 +399,43 @@ static int prob_diff_update_savings_search(const unsigned int *ct, return bestsavings; } +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE +static int prob_diff_update_savings_search_model(const unsigned int *ct, + const vp9_prob *oldp, + vp9_prob *bestp, + const vp9_prob upd, + int b, int r) { + int i, old_b, new_b, update_b, savings, bestsavings, step; + int newp; + vp9_prob bestnewp, newplist[ENTROPY_NODES]; + for (i = UNCONSTRAINED_NODES - 1, old_b = 0; i < ENTROPY_NODES; ++i) + old_b += cost_branch256(ct + 2 * i, oldp[i]); + + bestsavings = 0; + bestnewp = oldp[UNCONSTRAINED_NODES - 1]; + + step = (*bestp > oldp[UNCONSTRAINED_NODES - 1] ? -1 : 1); + newp = *bestp; + // newp = *bestp - step * (abs(*bestp - oldp[UNCONSTRAINED_NODES - 1]) >> 1); + for (; newp != oldp[UNCONSTRAINED_NODES - 1]; newp += step) { + if (newp < 1 || newp > 255) continue; + newplist[UNCONSTRAINED_NODES - 1] = newp; + vp9_get_model_distribution(newp, newplist, b, r); + for (i = UNCONSTRAINED_NODES - 1, new_b = 0; i < ENTROPY_NODES; ++i) + new_b += cost_branch256(ct + 2 * i, newplist[i]); + update_b = prob_diff_update_cost(newp, oldp[UNCONSTRAINED_NODES - 1]) + + vp9_cost_upd256; + savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} +#endif + static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd, unsigned int *ct) { vp9_prob newp; @@ -508,7 +551,8 @@ static void write_sub_mv_ref vp9_sub_mv_ref_encoding_array - LEFT4X4 + m); } -static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref, +static void write_nmv(VP9_COMP *cpi, vp9_writer *bc, + const MV *mv, const int_mv *ref, const nmv_context *nmvc, int usehp) { MV e; e.row = mv->row - ref->as_mv.row; @@ -585,6 +629,28 @@ static void write_mb_segid(vp9_writer *bc, } } +static void write_mb_segid_except(VP9_COMMON *cm, + vp9_writer *bc, + const MB_MODE_INFO *mi, + const MACROBLOCKD *xd, + int mb_row, int mb_col) { + // Encode the MB segment id. + int seg_id = mi->segment_id; + int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, + mb_row * cm->mb_cols + mb_col); + const vp9_prob *p = xd->mb_segment_tree_probs; + const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id]; + + if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { + vp9_write(bc, seg_id >= 2, p1); + if (pred_seg_id >= 2 && seg_id < 2) { + vp9_write(bc, seg_id == 1, p[1]); + } else if (pred_seg_id < 2 && seg_id >= 2) { + vp9_write(bc, seg_id == 3, p[2]); + } + } +} + // This function encodes the reference frame static void encode_ref_frame(vp9_writer *const bc, VP9_COMMON *const cm, @@ -708,10 +774,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, // Distance of Mb to the various image edges. 
// These specified to 8th pel as they are always compared to MV // values that are in 1/8th pel units - xd->mb_to_left_edge = -((mb_col * 16) << 3); - xd->mb_to_top_edge = -((mb_row * 16)) << 3; - xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3; - xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3; + + set_mb_row(pc, xd, mb_row, mb_size); + set_mb_col(pc, xd, mb_col, mb_size); #ifdef ENTROPY_STATS active_section = 9; @@ -728,7 +793,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, // If the mb segment id wasn't predicted code explicitly if (!prediction_flag) - write_mb_segid(bc, mi, &cpi->mb.e_mbd); + write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col); } else { // Normal unpredicted coding write_mb_segid(bc, mi, &cpi->mb.e_mbd); @@ -737,45 +802,27 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, if (!pc->mb_no_coeff_skip) { skip_coeff = 0; - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { skip_coeff = 1; } else { - const int nmbs = mb_size; - const int xmbs = MIN(nmbs, mb_cols_left); - const int ymbs = MIN(nmbs, mb_rows_left); - int x, y; - - skip_coeff = 1; - for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) { - skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff; - } - } - + skip_coeff = m->mbmi.mb_skip_coeff; vp9_write(bc, skip_coeff, vp9_get_pred_prob(pc, xd, PRED_MBSKIP)); } // Encode the reference frame. - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) - || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) { - encode_ref_frame(bc, pc, xd, segment_id, rf); - } else { - assert(rf == INTRA_FRAME); - } + encode_ref_frame(bc, pc, xd, segment_id, rf); if (rf == INTRA_FRAME) { #ifdef ENTROPY_STATS active_section = 6; #endif - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { - if (m->mbmi.sb_type) - write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob); - else - write_ymode(bc, mode, pc->fc.ymode_prob); - } + if (m->mbmi.sb_type) + write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob); + else + write_ymode(bc, mode, pc->fc.ymode_prob); + if (mode == B_PRED) { int j = 0; do { @@ -801,14 +848,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]); - // #ifdef ENTROPY_STATS #ifdef ENTROPY_STATS - accum_mv_refs(mode, ct); active_section = 3; #endif - // Is the segment coding of mode enabled - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + // If segment skip is not enabled code the mode. 
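      // Editor's note (not part of the patch): this hunk is one instance of
      // a change applied throughout the patch: the old two-part test
      //   vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
      //       vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0
      // collapses into a single SEG_LVL_SKIP feature check. With it, the
      // skip-flag coding earlier in this function reduces, schematically, to:
      //   if (!pc->mb_no_coeff_skip)         skip_coeff = 0;  // skip disabled
      //   else if (segment has SEG_LVL_SKIP) skip_coeff = 1;  // implied, never coded
      //   else { skip_coeff = m->mbmi.mb_skip_coeff;          // coded per MB
      //          vp9_write(bc, skip_coeff,
      //                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP)); }
      // Note the old AND-reduction over every MB in the superblock is gone;
      // a single per-MB flag is coded instead.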
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { if (mi->sb_type) { write_sb_mv_ref(bc, mode, mv_ref_p); } else { @@ -878,12 +923,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #ifdef ENTROPY_STATS active_section = 5; #endif - write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv, + write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); if (mi->second_ref_frame > 0) { - write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv, + write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); } @@ -915,7 +960,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #else while (j != L[++k]); #endif - leftmv.as_int = left_block_mv(m, k); + leftmv.as_int = left_block_mv(xd, m, k); abovemv.as_int = above_block_mv(m, k, mis); mv_contz = vp9_mv_cont(&leftmv, &abovemv); @@ -926,12 +971,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #ifdef ENTROPY_STATS active_section = 11; #endif - write_nmv(bc, &blockmv.as_mv, &mi->best_mv, + write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv, (const nmv_context*) nmvc, xd->allow_high_precision_mv); if (mi->second_ref_frame > 0) { - write_nmv(bc, + write_nmv(cpi, bc, &cpi->mb.partition_info->bmi[j].second_mv.as_mv, &mi->best_second_mv, (const nmv_context*) nmvc, @@ -951,8 +996,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, mi->partitioning == PARTITIONING_4X4))) && pc->txfm_mode == TX_MODE_SELECT && !((pc->mb_no_coeff_skip && skip_coeff) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { TX_SIZE sz = mi->txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]); @@ -965,7 +1009,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, } static void write_mb_modes_kf(const VP9_COMP *cpi, - const MODE_INFO *m, + MODE_INFO *m, vp9_writer *bc, int mb_rows_left, int mb_cols_left) { const VP9_COMMON *const c = &cpi->common; @@ -981,22 +1025,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, if (!c->mb_no_coeff_skip) { skip_coeff = 0; - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { skip_coeff = 1; } else { - const int nmbs = 1 << m->mbmi.sb_type; - const int xmbs = MIN(nmbs, mb_cols_left); - const int ymbs = MIN(nmbs, mb_rows_left); - int x, y; - - skip_coeff = 1; - for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) { - skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff; - } - } - + skip_coeff = m->mbmi.mb_skip_coeff; vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP)); } @@ -1013,7 +1045,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, int i = 0; do { const B_PREDICTION_MODE A = above_block_mode(m, i, mis); - const B_PREDICTION_MODE L = left_block_mode(m, i); + const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ? 
+ left_block_mode(m, i) : B_DC_PRED; const int bm = m->bmi[i].as_mode.first; #ifdef ENTROPY_STATS @@ -1041,8 +1074,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT && !((c->mb_no_coeff_skip && skip_coeff) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { TX_SIZE sz = m->mbmi.txfm_size; // FIXME(rbultje) code ternary symbol once all experiments are merged vp9_write(bc, sz != TX_4X4, c->prob_tx[0]); @@ -1054,45 +1086,609 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, } } +#if CONFIG_CODE_NONZEROCOUNT +static void write_nzc(VP9_COMMON *const cm, + uint16_t nzc, + int nzc_context, + TX_SIZE tx_size, + int ref, + int type, + vp9_writer* const bc) { + int c, e; + c = codenzc(nzc); + if (tx_size == TX_32X32) { + write_token(bc, vp9_nzc32x32_tree, + cm->fc.nzc_probs_32x32[nzc_context][ref][type], + vp9_nzc32x32_encodings + c); + // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++; + } else if (tx_size == TX_16X16) { + write_token(bc, vp9_nzc16x16_tree, + cm->fc.nzc_probs_16x16[nzc_context][ref][type], + vp9_nzc16x16_encodings + c); + // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++; + } else if (tx_size == TX_8X8) { + write_token(bc, vp9_nzc8x8_tree, + cm->fc.nzc_probs_8x8[nzc_context][ref][type], + vp9_nzc8x8_encodings + c); + // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++; + } else if (tx_size == TX_4X4) { + write_token(bc, vp9_nzc4x4_tree, + cm->fc.nzc_probs_4x4[nzc_context][ref][type], + vp9_nzc4x4_encodings + c); + // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++; + } else { + assert(0); + } + + if ((e = vp9_extranzcbits[c])) { + int x = nzc - vp9_basenzcvalue[c]; + while (e--) { + int b = (x >> e) & 1; + vp9_write(bc, b, + cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]); + // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++; + } + } +} + +static void write_nzcs_sb64(VP9_COMP *cpi, + MACROBLOCKD *xd, + int mb_row, + int mb_col, + vp9_writer* const bc) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 256; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc); + } + for (j = 256; j < 384; j += 64) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc); + } + break; + + case TX_16X16: + for (j = 0; j < 256; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 256; j < 384; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 256; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc); + } + for (j = 256; j < 384; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, 
m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_4X4: + for (j = 0; j < 256; ++j) { + nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc); + } + for (j = 256; j < 384; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} + +static void write_nzcs_sb32(VP9_COMP *cpi, + MACROBLOCKD *xd, + int mb_row, + int mb_col, + vp9_writer* const bc) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_32X32: + for (j = 0; j < 64; j += 64) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_16X16: + for (j = 0; j < 64; j += 16) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 64; j < 96; j += 16) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 64; j += 4) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc); + } + for (j = 64; j < 96; j += 4) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_4X4: + for (j = 0; j < 64; ++j) { + nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc); + } + for (j = 64; j < 96; ++j) { + nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} + +static void write_nzcs_mb16(VP9_COMP *cpi, + MACROBLOCKD *xd, + int mb_row, + int mb_col, + vp9_writer* const bc) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO *m = xd->mode_info_context; + MB_MODE_INFO *const mi = &m->mbmi; + int j, nzc_context; + const int ref = m->mbmi.ref_frame != INTRA_FRAME; + + assert(mb_col == get_mb_col(xd)); + assert(mb_row == get_mb_row(xd)); + + if (mi->mb_skip_coeff) + return; + + switch (mi->txfm_size) { + case TX_16X16: + for (j = 0; j < 16; j += 16) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc); + } + for (j = 16; j < 24; j += 4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc); + } + break; + + case TX_8X8: + for (j = 0; j < 16; j += 4) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc); + } + if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) { + for (j = 16; j < 24; ++j) { + 
nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc); + } + } else { + for (j = 16; j < 24; j += 4) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc); + } + } + break; + + case TX_4X4: + for (j = 0; j < 16; ++j) { + nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc); + } + for (j = 16; j < 24; ++j) { + nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j); + write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc); + } + break; + + default: + break; + } +} + +#ifdef NZC_STATS +void init_nzcstats() { + vp9_zero(nzc_stats_4x4); + vp9_zero(nzc_stats_8x8); + vp9_zero(nzc_stats_16x16); + vp9_zero(nzc_stats_32x32); + vp9_zero(nzc_pcat_stats); +} + +void update_nzcstats(VP9_COMMON *const cm) { + int c, r, b, t; + + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + for (t = 0; t < NZC4X4_TOKENS; ++t) { + nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t]; + } + } + } + } + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + for (t = 0; t < NZC8X8_TOKENS; ++t) { + nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t]; + } + } + } + } + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + for (t = 0; t < NZC16X16_TOKENS; ++t) { + nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t]; + } + } + } + } + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + for (t = 0; t < NZC32X32_TOKENS; ++t) { + nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t]; + } + } + } + } + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA]; + for (b = 0; b < bits; ++b) { + nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0]; + nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1]; + } + } + } +} + +void print_nzcstats() { + int c, r, b, t; + FILE *f; + + printf( + "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC4X4_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + printf(" {"); + for (t = 0; t < NZC4X4_TOKENS; ++t) { + printf(" %-3d,", nzc_stats_4x4[c][r][b][t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC8X8_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + printf(" {"); + for (t = 0; t < NZC8X8_TOKENS; ++t) { + printf(" %-3d,", nzc_stats_8x8[c][r][b][t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC16X16_TOKENS] = {" + "\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + 
printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + printf(" {"); + for (t = 0; t < NZC16X16_TOKENS; ++t) { + printf(" %-3d,", nzc_stats_16x16[c][r][b][t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC32X32_TOKENS] = {" + "\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + printf(" {"); + for (t = 0; t < NZC32X32_TOKENS; ++t) { + printf(" %-3d,", nzc_stats_32x32[c][r][b][t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n" + " [NZC_TOKENS_EXTRA]\n" + " [NZC_BITS_EXTRA] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + printf(" {"); + for (b = 0; b < NZC_BITS_EXTRA; ++b) { + printf(" %d/%d,", + nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC4X4_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + vp9_prob probs[NZC4X4_NODES]; + unsigned int branch_ct[NZC4X4_NODES][2]; + vp9_tree_probs_from_distribution(vp9_nzc4x4_tree, + probs, branch_ct, + nzc_stats_4x4[c][r][b], 0); + printf(" {"); + for (t = 0; t < NZC4X4_NODES; ++t) { + printf(" %-3d,", probs[t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC8X8_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + vp9_prob probs[NZC8X8_NODES]; + unsigned int branch_ct[NZC8X8_NODES][2]; + vp9_tree_probs_from_distribution(vp9_nzc8x8_tree, + probs, branch_ct, + nzc_stats_8x8[c][r][b], 0); + printf(" {"); + for (t = 0; t < NZC8X8_NODES; ++t) { + printf(" %-3d,", probs[t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC16X16_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + vp9_prob probs[NZC16X16_NODES]; + unsigned int branch_ct[NZC16X16_NODES][2]; + vp9_tree_probs_from_distribution(vp9_nzc16x16_tree, + probs, branch_ct, + nzc_stats_16x16[c][r][b], 0); + printf(" {"); + for (t = 0; t < NZC16X16_NODES; ++t) { + printf(" %-3d,", probs[t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n" + " [REF_TYPES]\n" + " [BLOCK_TYPES]\n" + " [NZC32X32_TOKENS] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (r = 0; r < REF_TYPES; ++r) { + printf(" {\n"); + for (b = 0; b < BLOCK_TYPES; ++b) { + 
vp9_prob probs[NZC32X32_NODES]; + unsigned int branch_ct[NZC32X32_NODES][2]; + vp9_tree_probs_from_distribution(vp9_nzc32x32_tree, + probs, branch_ct, + nzc_stats_32x32[c][r][b], 0); + printf(" {"); + for (t = 0; t < NZC32X32_NODES; ++t) { + printf(" %-3d,", probs[t]); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + printf( + "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n" + " [NZC_TOKENS_EXTRA]\n" + " [NZC_BITS_EXTRA] = {\n"); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + printf(" {\n"); + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + printf(" {"); + for (b = 0; b < NZC_BITS_EXTRA; ++b) { + vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0], + nzc_pcat_stats[c][t][b][1]); + printf(" %-3d,", prob); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + + f = fopen("nzcstats.bin", "wb"); + fwrite(nzc_stats_4x4, sizeof(nzc_stats_4x4), 1, f); + fwrite(nzc_stats_8x8, sizeof(nzc_stats_8x8), 1, f); + fwrite(nzc_stats_16x16, sizeof(nzc_stats_16x16), 1, f); + fwrite(nzc_stats_32x32, sizeof(nzc_stats_32x32), 1, f); + fwrite(nzc_pcat_stats, sizeof(nzc_pcat_stats), 1, f); + fclose(f); +} +#endif + +#endif // CONFIG_CODE_NONZEROCOUNT + static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mb_row, int mb_col) { - VP9_COMMON *const c = &cpi->common; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; xd->mode_info_context = m; - if (c->frame_type == KEY_FRAME) { + set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type)); + set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type)); + if (cm->frame_type == KEY_FRAME) { write_mb_modes_kf(cpi, m, bc, - c->mb_rows - mb_row, c->mb_cols - mb_col); + cm->mb_rows - mb_row, cm->mb_cols - mb_col); #ifdef ENTROPY_STATS active_section = 8; #endif } else { pack_inter_mode_mvs(cpi, m, bc, - c->mb_rows - mb_row, c->mb_cols - mb_col); + cm->mb_rows - mb_row, cm->mb_cols - mb_col); #ifdef ENTROPY_STATS active_section = 1; #endif } +#if CONFIG_CODE_NONZEROCOUNT + if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) + write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc); + else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32) + write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc); + else + write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc); +#endif assert(*tok < tok_end); pack_mb_tokens(bc, tok, tok_end); } -static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) { +static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, + TOKENEXTRA **tok, TOKENEXTRA *tok_end) { VP9_COMMON *const c = &cpi->common; const int mis = c->mode_info_stride; MODE_INFO *m, *m_ptr = c->mi; int i, mb_row, mb_col; - TOKENEXTRA *tok = cpi->tok; - TOKENEXTRA *tok_end = tok + cpi->tok_count; - for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) { + m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis; + for (mb_row = c->cur_tile_mb_row_start; + mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) { m = m_ptr; - for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) { + for (mb_col = c->cur_tile_mb_col_start; + mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) { vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded); if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) { - write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col); + write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col); } else { int j; @@ -1107,7 +1703,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) { vp9_write(bc, 
sb_m->mbmi.sb_type, c->sb32_coded); if (sb_m->mbmi.sb_type) { assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32); - write_modes_b(cpi, sb_m, bc, &tok, tok_end, + write_modes_b(cpi, sb_m, bc, tok, tok_end, mb_row + y_idx_sb, mb_col + x_idx_sb); } else { // Process the 4 MBs in the order: @@ -1123,7 +1719,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) { } assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16); - write_modes_b(cpi, mb_m, bc, &tok, tok_end, + write_modes_b(cpi, mb_m, bc, tok, tok_end, mb_row + y_idx, mb_col + x_idx); } } @@ -1135,20 +1731,23 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) { /* This function is used for debugging probability trees. */ -static void print_prob_tree(vp9_coeff_probs *coef_probs) { +static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) { /* print coef probability tree */ - int i, j, k, l; + int i, j, k, l, m; FILE *f = fopen("enc_tree_probs.txt", "a"); fprintf(f, "{\n"); - for (i = 0; i < BLOCK_TYPES_4X4; i++) { + for (i = 0; i < block_types; i++) { fprintf(f, " {\n"); - for (j = 0; j < COEF_BANDS; j++) { - fprintf(f, " {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; k++) { - fprintf(f, " {"); - for (l = 0; l < ENTROPY_NODES; l++) { - fprintf(f, "%3u, ", - (unsigned int)(coef_probs [i][j][k][l])); + for (j = 0; j < REF_TYPES; ++j) { + fprintf(f, " {\n"); + for (k = 0; k < COEF_BANDS; k++) { + fprintf(f, " {\n"); + for (l = 0; l < PREV_COEF_CONTEXTS; l++) { + fprintf(f, " {"); + for (m = 0; m < ENTROPY_NODES; m++) { + fprintf(f, "%3u, ", + (unsigned int)(coef_probs[i][j][k][l][m])); + } } fprintf(f, " }\n"); } @@ -1162,32 +1761,44 @@ static void print_prob_tree(vp9_coeff_probs *coef_probs) { static void build_tree_distribution(vp9_coeff_probs *coef_probs, vp9_coeff_count *coef_counts, + unsigned int (*eob_branch_ct)[REF_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS], #ifdef ENTROPY_STATS VP9_COMP *cpi, vp9_coeff_accum *context_counters, #endif vp9_coeff_stats *coef_branch_ct, int block_types) { - int i = 0, j, k; + int i, j, k, l; #ifdef ENTROPY_STATS int t = 0; #endif for (i = 0; i < block_types; ++i) { - for (j = 0; j < COEF_BANDS; ++j) { - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS, - vp9_coef_encodings, vp9_coef_tree, - coef_probs[i][j][k], - coef_branch_ct[i][j][k], - coef_counts[i][j][k]); + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + vp9_tree_probs_from_distribution(vp9_coef_tree, + coef_probs[i][j][k][l], + coef_branch_ct[i][j][k][l], + coef_counts[i][j][k][l], 0); + coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - + coef_branch_ct[i][j][k][l][0][0]; + coef_probs[i][j][k][l][0] = + get_binary_prob(coef_branch_ct[i][j][k][l][0][0], + coef_branch_ct[i][j][k][l][0][1]); #ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - context_counters[i][j][k][t] += coef_counts[i][j][k][t]; + if (!cpi->dummy_packing) { + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t]; + context_counters[i][j][k][l][MAX_ENTROPY_TOKENS] += + eob_branch_ct[i][j][k][l]; + } #endif + } } } } @@ -1196,48 +1807,256 @@ static void build_tree_distribution(vp9_coeff_probs *coef_probs, static void build_coeff_contexts(VP9_COMP *cpi) { build_tree_distribution(cpi->frame_coef_probs_4x4, 
cpi->coef_counts_4x4, + cpi->common.fc.eob_branch_counts[TX_4X4], #ifdef ENTROPY_STATS cpi, context_counters_4x4, #endif - cpi->frame_branch_ct_4x4, BLOCK_TYPES_4X4); - build_tree_distribution(cpi->frame_hybrid_coef_probs_4x4, - cpi->hybrid_coef_counts_4x4, -#ifdef ENTROPY_STATS - cpi, hybrid_context_counters_4x4, -#endif - cpi->frame_hybrid_branch_ct_4x4, BLOCK_TYPES_4X4); + cpi->frame_branch_ct_4x4, BLOCK_TYPES); build_tree_distribution(cpi->frame_coef_probs_8x8, cpi->coef_counts_8x8, + cpi->common.fc.eob_branch_counts[TX_8X8], #ifdef ENTROPY_STATS cpi, context_counters_8x8, #endif - cpi->frame_branch_ct_8x8, BLOCK_TYPES_8X8); - build_tree_distribution(cpi->frame_hybrid_coef_probs_8x8, - cpi->hybrid_coef_counts_8x8, -#ifdef ENTROPY_STATS - cpi, hybrid_context_counters_8x8, -#endif - cpi->frame_hybrid_branch_ct_8x8, BLOCK_TYPES_8X8); + cpi->frame_branch_ct_8x8, BLOCK_TYPES); build_tree_distribution(cpi->frame_coef_probs_16x16, cpi->coef_counts_16x16, + cpi->common.fc.eob_branch_counts[TX_16X16], #ifdef ENTROPY_STATS cpi, context_counters_16x16, #endif - cpi->frame_branch_ct_16x16, BLOCK_TYPES_16X16); - build_tree_distribution(cpi->frame_hybrid_coef_probs_16x16, - cpi->hybrid_coef_counts_16x16, -#ifdef ENTROPY_STATS - cpi, hybrid_context_counters_16x16, -#endif - cpi->frame_hybrid_branch_ct_16x16, BLOCK_TYPES_16X16); + cpi->frame_branch_ct_16x16, BLOCK_TYPES); build_tree_distribution(cpi->frame_coef_probs_32x32, cpi->coef_counts_32x32, + cpi->common.fc.eob_branch_counts[TX_32X32], #ifdef ENTROPY_STATS cpi, context_counters_32x32, #endif - cpi->frame_branch_ct_32x32, BLOCK_TYPES_32X32); + cpi->frame_branch_ct_32x32, BLOCK_TYPES); } +#if CONFIG_CODE_NONZEROCOUNT +static void update_nzc_probs_common(VP9_COMP* cpi, + vp9_writer* const bc, + int block_size) { + VP9_COMMON *cm = &cpi->common; + int c, r, b, t; + int update[2] = {0, 0}; + int savings = 0; + int tokens, nodes; + const vp9_tree_index *nzc_tree; + vp9_prob *new_nzc_probs; + vp9_prob *old_nzc_probs; + unsigned int *nzc_counts; + unsigned int (*nzc_branch_ct)[2]; + vp9_prob upd; + + if (block_size == 32) { + tokens = NZC32X32_TOKENS; + nzc_tree = vp9_nzc32x32_tree; + old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0]; + new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0]; + nzc_counts = cm->fc.nzc_counts_32x32[0][0][0]; + nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0]; + upd = NZC_UPDATE_PROB_32X32; + } else if (block_size == 16) { + tokens = NZC16X16_TOKENS; + nzc_tree = vp9_nzc16x16_tree; + old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0]; + new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0]; + nzc_counts = cm->fc.nzc_counts_16x16[0][0][0]; + nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0]; + upd = NZC_UPDATE_PROB_16X16; + } else if (block_size == 8) { + tokens = NZC8X8_TOKENS; + nzc_tree = vp9_nzc8x8_tree; + old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0]; + new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0]; + nzc_counts = cm->fc.nzc_counts_8x8[0][0][0]; + nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0]; + upd = NZC_UPDATE_PROB_8X8; + } else { + nzc_tree = vp9_nzc4x4_tree; + tokens = NZC4X4_TOKENS; + old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0]; + new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0]; + nzc_counts = cm->fc.nzc_counts_4x4[0][0][0]; + nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0]; + upd = NZC_UPDATE_PROB_4X4; + } + nodes = tokens - 1; + // Get the new probabilities and the branch counts + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; 
++b) { + int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b; + int offset_nodes = offset * nodes; + int offset_tokens = offset * tokens; + vp9_tree_probs_from_distribution(nzc_tree, + new_nzc_probs + offset_nodes, + nzc_branch_ct + offset_nodes, + nzc_counts + offset_tokens, 0); + } + } + } + + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b; + int offset_nodes = offset * nodes; + for (t = 0; t < nodes; ++t) { + vp9_prob newp = new_nzc_probs[offset_nodes + t]; + vp9_prob oldp = old_nzc_probs[offset_nodes + t]; + int s, u = 0; +#if defined(SEARCH_NEWP) + s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes], + oldp, &newp, upd); + if (s > 0 && newp != oldp) + u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); +#else + s = prob_update_savings(nzc_branch_ct[offset_nodes], + oldp, newp, upd); + if (s > 0) + u = 1; + if (u) + savings += s; +#endif + update[u]++; + } + } + } + } + if (update[1] == 0 || savings < 0) { + vp9_write_bit(bc, 0); + } else { + vp9_write_bit(bc, 1); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (r = 0; r < REF_TYPES; ++r) { + for (b = 0; b < BLOCK_TYPES; ++b) { + int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b; + int offset_nodes = offset * nodes; + for (t = 0; t < nodes; ++t) { + vp9_prob newp = new_nzc_probs[offset_nodes + t]; + vp9_prob *oldp = &old_nzc_probs[offset_nodes + t]; + int s, u = 0; +#if defined(SEARCH_NEWP) + s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes], + *oldp, &newp, upd); + if (s > 0 && newp != *oldp) + u = 1; +#else + s = prob_update_savings(nzc_branch_ct[offset_nodes], + *oldp, newp, upd); + if (s > 0) + u = 1; +#endif + vp9_write(bc, u, upd); + if (u) { + /* send/use new probability */ + write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } + } +} + +static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) { + VP9_COMMON *cm = &cpi->common; + int c, t, b; + int update[2] = {0, 0}; + int savings = 0; + vp9_prob upd = NZC_UPDATE_PROB_PCAT; + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA]; + for (b = 0; b < bits; ++b) { + vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0], + cm->fc.nzc_pcat_counts[c][t][b][1]); + vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b]; + int s, u = 0; +#if defined(SEARCH_NEWP) + s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b], + oldp, &newp, upd); + if (s > 0 && newp != oldp) + u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); +#else + s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b], + oldp, newp, upd); + if (s > 0) + u = 1; + if (u) + savings += s; +#endif + update[u]++; + } + } + } + if (update[1] == 0 || savings < 0) { + vp9_write_bit(bc, 0); + } else { + vp9_write_bit(bc, 1); + for (c = 0; c < MAX_NZC_CONTEXTS; ++c) { + for (t = 0; t < NZC_TOKENS_EXTRA; ++t) { + int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA]; + for (b = 0; b < bits; ++b) { + vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0], + cm->fc.nzc_pcat_counts[c][t][b][1]); + vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b]; + int s, u = 0; +#if defined(SEARCH_NEWP) + s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b], + *oldp, &newp, upd); + if (s > 0 && newp 
!= *oldp) + u = 1; +#else + s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b], + *oldp, newp, upd); + if (s > 0) + u = 1; +#endif + vp9_write(bc, u, upd); + if (u) { + /* send/use new probability */ + write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } + } + } +} + +static void update_nzc_probs(VP9_COMP* cpi, + vp9_writer* const bc) { + update_nzc_probs_common(cpi, bc, 4); + if (cpi->common.txfm_mode != ONLY_4X4) + update_nzc_probs_common(cpi, bc, 8); + if (cpi->common.txfm_mode > ALLOW_8X8) + update_nzc_probs_common(cpi, bc, 16); + if (cpi->common.txfm_mode > ALLOW_16X16) + update_nzc_probs_common(cpi, bc, 32); +#ifdef NZC_PCAT_UPDATE + update_nzc_pcat_probs(cpi, bc); +#endif +#ifdef NZC_STATS + if (!cpi->dummy_packing) + update_nzcstats(&cpi->common); +#endif +} +#endif // CONFIG_CODE_NONZEROCOUNT + static void update_coef_probs_common(vp9_writer* const bc, #ifdef ENTROPY_STATS VP9_COMP *cpi, @@ -1247,46 +2066,59 @@ static void update_coef_probs_common(vp9_writer* const bc, vp9_coeff_probs *old_frame_coef_probs, vp9_coeff_stats *frame_branch_ct, int block_types) { - int i, j, k, t; + int i, j, k, l, t; int update[2] = {0, 0}; int savings; +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES; +#else + const int entropy_nodes_update = ENTROPY_NODES; +#endif // vp9_prob bestupd = find_coef_update_prob(cpi); /* dry run to see if there is any udpate at all needed */ savings = 0; for (i = 0; i < block_types; ++i) { - for (j = !i; j < COEF_BANDS; ++j) { - int prev_coef_savings[ENTROPY_NODES] = {0}; - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - for (t = 0; t < ENTROPY_NODES; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][t]; - const vp9_prob oldp = old_frame_coef_probs[i][j][k][t]; - const vp9_prob upd = COEF_UPDATE_PROB; - int s = prev_coef_savings[t]; - int u = 0; - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + // int prev_coef_savings[ENTROPY_NODES] = {0}; + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; + const vp9_prob upd = vp9_coef_update_prob[t]; + int s; // = prev_coef_savings[t]; + int u = 0; + + if (l >= 3 && k == 0) + continue; #if defined(SEARCH_NEWP) - s = prob_diff_update_savings_search( - frame_branch_ct[i][j][k][t], - oldp, &newp, upd); - if (s > 0 && newp != oldp) - u = 1; - if (u) - savings += s - (int)(vp9_cost_zero(upd)); - else - savings -= (int)(vp9_cost_zero(upd)); +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + if (t == UNCONSTRAINED_NODES - 1) + s = prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else +#endif + s = prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); + if (s > 0 && newp != oldp) + u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); #else - s = prob_update_savings( - frame_branch_ct[i][j][k][t], - oldp, newp, upd); - if (s > 0) - u = 1; - if (u) - savings += s; + s = prob_update_savings(frame_branch_ct[i][j][k][l][t], + oldp, newp, upd); + if (s > 0) + u = 1; + if (u) + savings += s; #endif - update[u]++; + update[u]++; + } } } } @@ -1296,44 +2128,57 @@ static void update_coef_probs_common(vp9_writer* const bc, /* Is coef 
updated at all */ if (update[1] == 0 || savings < 0) { vp9_write_bit(bc, 0); - } else { - vp9_write_bit(bc, 1); - for (i = 0; i < block_types; ++i) { - for (j = !i; j < COEF_BANDS; ++j) { - int prev_coef_savings[ENTROPY_NODES] = {0}; - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + return; + } + vp9_write_bit(bc, 1); + for (i = 0; i < block_types; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + // int prev_coef_savings[ENTROPY_NODES] = {0}; + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { // calc probs and branch cts for this frame only - for (t = 0; t < ENTROPY_NODES; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t; - const vp9_prob upd = COEF_UPDATE_PROB; - int s = prev_coef_savings[t]; + for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + const vp9_prob upd = vp9_coef_update_prob[t]; + int s; // = prev_coef_savings[t]; int u = 0; - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + if (l >= 3 && k == 0) continue; #if defined(SEARCH_NEWP) - s = prob_diff_update_savings_search( - frame_branch_ct[i][j][k][t], +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + if (t == UNCONSTRAINED_NODES - 1) + s = prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else +#endif + s = prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); if (s > 0 && newp != *oldp) u = 1; #else - s = prob_update_savings( - frame_branch_ct[i][j][k][t], - *oldp, newp, upd); + s = prob_update_savings(frame_branch_ct[i][j][k][l][t], + *oldp, newp, upd); if (s > 0) u = 1; #endif vp9_write(bc, u, upd); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) - ++tree_update_hist[i][j][k][t][u]; + ++tree_update_hist[i][j][k][l][t][u]; #endif if (u) { /* send/use new probability */ write_prob_diff_update(bc, newp, *oldp); *oldp = newp; +#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE + if (t == UNCONSTRAINED_NODES - 1) + vp9_get_model_distribution( + newp, old_frame_coef_probs[i][j][k][l], i, j); +#endif } } } @@ -1356,17 +2201,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { cpi->frame_coef_probs_4x4, cpi->common.fc.coef_probs_4x4, cpi->frame_branch_ct_4x4, - BLOCK_TYPES_4X4); - - update_coef_probs_common(bc, -#ifdef ENTROPY_STATS - cpi, - hybrid_tree_update_hist_4x4, -#endif - cpi->frame_hybrid_coef_probs_4x4, - cpi->common.fc.hybrid_coef_probs_4x4, - cpi->frame_hybrid_branch_ct_4x4, - BLOCK_TYPES_4X4); + BLOCK_TYPES); /* do not do this if not even allowed */ if (cpi->common.txfm_mode != ONLY_4X4) { @@ -1378,17 +2213,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { cpi->frame_coef_probs_8x8, cpi->common.fc.coef_probs_8x8, cpi->frame_branch_ct_8x8, - BLOCK_TYPES_8X8); - - update_coef_probs_common(bc, -#ifdef ENTROPY_STATS - cpi, - hybrid_tree_update_hist_8x8, -#endif - cpi->frame_hybrid_coef_probs_8x8, - cpi->common.fc.hybrid_coef_probs_8x8, - cpi->frame_hybrid_branch_ct_8x8, - BLOCK_TYPES_8X8); + BLOCK_TYPES); } if (cpi->common.txfm_mode > ALLOW_8X8) { @@ -1400,16 +2225,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { cpi->frame_coef_probs_16x16, cpi->common.fc.coef_probs_16x16, cpi->frame_branch_ct_16x16, - BLOCK_TYPES_16X16); - update_coef_probs_common(bc, -#ifdef ENTROPY_STATS - cpi, - 
hybrid_tree_update_hist_16x16, -#endif - cpi->frame_hybrid_coef_probs_16x16, - cpi->common.fc.hybrid_coef_probs_16x16, - cpi->frame_hybrid_branch_ct_16x16, - BLOCK_TYPES_16X16); + BLOCK_TYPES); } if (cpi->common.txfm_mode > ALLOW_16X16) { @@ -1421,7 +2237,7 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { cpi->frame_coef_probs_32x32, cpi->common.fc.coef_probs_32x32, cpi->frame_branch_ct_32x32, - BLOCK_TYPES_32X32); + BLOCK_TYPES); } } @@ -1523,34 +2339,49 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, * and color type. */ if (oh.type == KEY_FRAME) { - int v; - // Start / synch code cx_data[0] = 0x9D; cx_data[1] = 0x01; cx_data[2] = 0x2a; + extra_bytes_packed = 3; + cx_data += extra_bytes_packed; + } + { + int v; - v = (pc->horiz_scale << 14) | pc->Width; - cx_data[3] = v; - cx_data[4] = v >> 8; - - v = (pc->vert_scale << 14) | pc->Height; - cx_data[5] = v; - cx_data[6] = v >> 8; + if (pc->width != pc->display_width || pc->height != pc->display_height) { + v = pc->display_width; + cx_data[0] = v; + cx_data[1] = v >> 8; - extra_bytes_packed = 7; - cx_data += extra_bytes_packed; + v = pc->display_height; + cx_data[2] = v; + cx_data[3] = v >> 8; + cx_data += 4; + extra_bytes_packed += 4; + } - vp9_start_encode(&header_bc, cx_data); + v = pc->width; + cx_data[0] = v; + cx_data[1] = v >> 8; - // signal clr type - vp9_write_bit(&header_bc, pc->clr_type); - vp9_write_bit(&header_bc, pc->clamp_type); + v = pc->height; + cx_data[2] = v; + cx_data[3] = v >> 8; - } else { - vp9_start_encode(&header_bc, cx_data); + extra_bytes_packed += 4; + cx_data += 4; } + vp9_start_encode(&header_bc, cx_data); + + // TODO(jkoleszar): remove these two unused bits? + vp9_write_bit(&header_bc, pc->clr_type); + vp9_write_bit(&header_bc, pc->clamp_type); + + // error resilient mode + vp9_write_bit(&header_bc, pc->error_resilient_mode); + // Signal whether or not Segmentation is enabled vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0); @@ -1655,7 +2486,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]); vp9_write_literal(&header_bc, pc->sb32_coded, 8); - { + vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless); + if (cpi->mb.e_mbd.lossless) { + pc->txfm_mode = ONLY_4X4; + } else { if (pc->txfm_mode == TX_MODE_SELECT) { pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] + cpi->txfm_count_16x16p[TX_4X4] + @@ -1699,6 +2533,14 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, vp9_write_bit(&header_bc, pc->filter_type); vp9_write_literal(&header_bc, pc->filter_level, 6); vp9_write_literal(&header_bc, pc->sharpness_level, 3); +#if CONFIG_LOOP_DERING + if (pc->dering_enabled) { + vp9_write_bit(&header_bc, 1); + vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4); + } else { + vp9_write_bit(&header_bc, 0); + } +#endif // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled). vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 
1 : 0); @@ -1765,29 +2607,35 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, // Transmit Dc, Second order and Uv quantizer delta information put_delta_q(&header_bc, pc->y1dc_delta_q); - put_delta_q(&header_bc, pc->y2dc_delta_q); - put_delta_q(&header_bc, pc->y2ac_delta_q); put_delta_q(&header_bc, pc->uvdc_delta_q); put_delta_q(&header_bc, pc->uvac_delta_q); // When there is a key frame all reference buffers are updated using the new key frame if (pc->frame_type != KEY_FRAME) { - // Should the GF or ARF be updated using the transmitted frame or buffer - vp9_write_bit(&header_bc, pc->refresh_golden_frame); - vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame); + int refresh_mask; - // For inter frames the current default behavior is that when - // cm->refresh_golden_frame is set we copy the old GF over to - // the ARF buffer. This is purely an encoder decision at present. - if (pc->refresh_golden_frame) - pc->copy_buffer_to_arf = 2; - - // If not being updated from current frame should either GF or ARF be updated from another buffer - if (!pc->refresh_golden_frame) - vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2); - - if (!pc->refresh_alt_ref_frame) - vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2); + // Should the GF or ARF be updated using the transmitted frame or buffer + if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + /* Preserve the previously existing golden frame and update the frame in + * the alt ref slot instead. This is highly specific to the use of + * alt-ref as a forward reference, and this needs to be generalized as + * other uses are implemented (like RTC/temporal scaling) + * + * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but + * that happens in vp9_onyx_if.c:update_reference_frames() so that it can + * be done outside of the recode loop. + */ + refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx); + } + vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES); + vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2); + vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2); + vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2); // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer) vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]); @@ -1831,10 +2679,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, #endif } - vp9_write_bit(&header_bc, pc->refresh_entropy_probs); + if (!pc->error_resilient_mode) { + vp9_write_bit(&header_bc, pc->refresh_entropy_probs); + vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode); + } - if (pc->frame_type != KEY_FRAME) - vp9_write_bit(&header_bc, pc->refresh_last_frame); + vp9_write_literal(&header_bc, pc->frame_context_idx, + NUM_FRAME_CONTEXTS_LG2); #ifdef ENTROPY_STATS if (pc->frame_type == INTER_FRAME) @@ -1848,7 +2699,13 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, if (pc->frame_type != KEY_FRAME) { int i, j; int new_context[INTER_MODE_CONTEXTS][4]; - update_mode_probs(pc, new_context); + if (!cpi->dummy_packing) { + update_inter_mode_probs(pc, new_context); + } else { + // In dummy pack assume context unchanged. 
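/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * cpi->dummy_packing checks in this function support a statistics-only
 * pass: the packer can be run once with dummy_packing set so the symbol
 * counts match the final pass, then run again to emit real bits. A
 * hypothetical driver, with pack_frame() standing in for this function:
 */
static void pack_frame(VP9_COMP *cpi, unsigned char *dest,
                       unsigned long *size);  /* hypothetical stand-in */

static void pack_twice_sketch(VP9_COMP *cpi, unsigned char *dest,
                              unsigned long *size) {
  cpi->dummy_packing = 1;   /* first pass: gather counts only */
  pack_frame(cpi, dest, size);
  cpi->dummy_packing = 0;   /* second pass: emit the real bitstream */
  pack_frame(cpi, dest, size);
}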
+ vpx_memcpy(new_context, pc->fc.vp9_mode_contexts, + sizeof(pc->fc.vp9_mode_contexts)); + } for (i = 0; i < INTER_MODE_CONTEXTS; i++) { for (j = 0; j < 4; j++) { @@ -1902,18 +2759,33 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, vp9_copy(cpi->common.fc.pre_coef_probs_4x4, cpi->common.fc.coef_probs_4x4); - vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_4x4, - cpi->common.fc.hybrid_coef_probs_4x4); vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8); - vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, - cpi->common.fc.hybrid_coef_probs_8x8); vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16); - vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, - cpi->common.fc.hybrid_coef_probs_16x16); vp9_copy(cpi->common.fc.pre_coef_probs_32x32, cpi->common.fc.coef_probs_32x32); +#if CONFIG_CODE_NONZEROCOUNT + vp9_copy(cpi->common.fc.pre_nzc_probs_4x4, + cpi->common.fc.nzc_probs_4x4); + vp9_copy(cpi->common.fc.pre_nzc_probs_8x8, + cpi->common.fc.nzc_probs_8x8); + vp9_copy(cpi->common.fc.pre_nzc_probs_16x16, + cpi->common.fc.nzc_probs_16x16); + vp9_copy(cpi->common.fc.pre_nzc_probs_32x32, + cpi->common.fc.nzc_probs_32x32); + vp9_copy(cpi->common.fc.pre_nzc_pcat_probs, + cpi->common.fc.nzc_pcat_probs); + // NOTE that if the counts are reset, we also need to uncomment + // the count updates in the write_nzc function + /* + vp9_zero(cpi->common.fc.nzc_counts_4x4); + vp9_zero(cpi->common.fc.nzc_counts_8x8); + vp9_zero(cpi->common.fc.nzc_counts_16x16); + vp9_zero(cpi->common.fc.nzc_counts_32x32); + vp9_zero(cpi->common.fc.nzc_pcat_counts); + */ +#endif vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob); vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob); vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob); @@ -1930,6 +2802,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, vp9_zero(cpi->common.fc.mv_ref_ct) update_coef_probs(cpi, &header_bc); +#if CONFIG_CODE_NONZEROCOUNT + update_nzc_probs(cpi, &header_bc); +#endif #ifdef ENTROPY_STATS active_section = 2; @@ -1941,8 +2816,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, int k; vp9_update_skip_probs(cpi); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) + for (k = 0; k < MBSKIP_CONTEXTS; ++k) { vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8); + } } if (pc->frame_type == KEY_FRAME) { @@ -1960,7 +2836,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, if (pc->mcomp_filter_type == SWITCHABLE) update_switchable_interp_probs(cpi, &header_bc); - #if CONFIG_COMP_INTERINTRA_PRED +#if CONFIG_COMP_INTERINTRA_PRED if (pc->use_interintra) { vp9_cond_prob_update(&header_bc, &pc->fc.interintra_prob, @@ -1995,59 +2871,120 @@ void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc); } + /* tiling */ + { + int min_log2_tiles, delta_log2_tiles, n_tile_bits, n; + + vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles); + n_tile_bits = pc->log2_tile_columns - min_log2_tiles; + for (n = 0; n < delta_log2_tiles; n++) { + if (n_tile_bits--) { + vp9_write_bit(&header_bc, 1); + } else { + vp9_write_bit(&header_bc, 0); + break; + } + } + vp9_write_bit(&header_bc, pc->log2_tile_rows != 0); + if (pc->log2_tile_rows != 0) + vp9_write_bit(&header_bc, pc->log2_tile_rows != 1); + } + vp9_stop_encode(&header_bc); oh.first_partition_length_in_bytes = header_bc.pos; /* update frame tag */ { - int v = (oh.first_partition_length_in_bytes << 5) | + int 
scaling = (pc->width != pc->display_width || + pc->height != pc->display_height); + int v = (oh.first_partition_length_in_bytes << 8) | + (scaling << 5) | (oh.show_frame << 4) | (oh.version << 1) | oh.type; + assert(oh.first_partition_length_in_bytes <= 0xffff); dest[0] = v; dest[1] = v >> 8; dest[2] = v >> 16; } *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos; - vp9_start_encode(&residual_bc, cx_data + header_bc.pos); if (pc->frame_type == KEY_FRAME) { decide_kf_ymode_entropy(cpi); - write_modes(cpi, &residual_bc); } else { /* This is not required if the counts in cpi are consistent with the * final packing pass */ // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount); - write_modes(cpi, &residual_bc); - - vp9_update_mode_context(&cpi->common); } - vp9_stop_encode(&residual_bc); + { + int tile_row, tile_col, total_size = 0; + unsigned char *data_ptr = cx_data + header_bc.pos; + TOKENEXTRA *tok[1 << 6], *tok_end; + + tok[0] = cpi->tok; + for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) + tok[tile_col] = tok[tile_col - 1] + cpi->tok_count[tile_col - 1]; + + for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + vp9_get_tile_row_offsets(pc, tile_row); + tok_end = cpi->tok + cpi->tok_count[0]; + for (tile_col = 0; tile_col < pc->tile_columns; + tile_col++, tok_end += cpi->tok_count[tile_col]) { + vp9_get_tile_col_offsets(pc, tile_col); + + if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) + vp9_start_encode(&residual_bc, data_ptr + total_size + 4); + else + vp9_start_encode(&residual_bc, data_ptr + total_size); + write_modes(cpi, &residual_bc, &tok[tile_col], tok_end); + vp9_stop_encode(&residual_bc); + if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) { + /* size of this tile */ + data_ptr[total_size + 0] = residual_bc.pos; + data_ptr[total_size + 1] = residual_bc.pos >> 8; + data_ptr[total_size + 2] = residual_bc.pos >> 16; + data_ptr[total_size + 3] = residual_bc.pos >> 24; + total_size += 4; + } - *size += residual_bc.pos; + total_size += residual_bc.pos; + } + } + + assert((unsigned int)(tok[0] - cpi->tok) == cpi->tok_count[0]); + for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) + assert((unsigned int)(tok[tile_col] - tok[tile_col - 1]) == + cpi->tok_count[tile_col]); + + *size += total_size; + } } #ifdef ENTROPY_STATS static void print_tree_update_for_type(FILE *f, vp9_coeff_stats *tree_update_hist, int block_types, const char *header) { - int i, j, k, l; + int i, j, k, l, m; fprintf(f, "const vp9_coeff_prob %s = {\n", header); for (i = 0; i < block_types; i++) { fprintf(f, " { \n"); - for (j = 0; j < COEF_BANDS; j++) { - fprintf(f, " {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; k++) { - fprintf(f, " {"); - for (l = 0; l < ENTROPY_NODES; l++) { - fprintf(f, "%3d, ", - get_binary_prob(tree_update_hist[i][j][k][l][0], - tree_update_hist[i][j][k][l][1])); + for (j = 0; j < REF_TYPES; j++) { + fprintf(f, " { \n"); + for (k = 0; k < COEF_BANDS; k++) { + fprintf(f, " {\n"); + for (l = 0; l < PREV_COEF_CONTEXTS; l++) { + fprintf(f, " {"); + for (m = 0; m < ENTROPY_NODES; m++) { + fprintf(f, "%3d, ", + get_binary_prob(tree_update_hist[i][j][k][l][m][0], + tree_update_hist[i][j][k][l][m][1])); + } + fprintf(f, "},\n"); } fprintf(f, "},\n"); } @@ -2062,27 +2999,21 @@ void print_tree_update_probs() { FILE *f = fopen("coefupdprob.h", "w"); fprintf(f, "\n/* Update probabilities for token entropy tree. 
*/\n\n"); - print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES_4X4, - "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]"); - print_tree_update_for_type(f, hybrid_tree_update_hist_4x4, BLOCK_TYPES_4X4, - "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]"); - print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES_8X8, - "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]"); - print_tree_update_for_type(f, hybrid_tree_update_hist_8x8, BLOCK_TYPES_8X8, - "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]"); - print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES_16X16, - "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]"); - print_tree_update_for_type(f, hybrid_tree_update_hist_16x16, - BLOCK_TYPES_16X16, - "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]"); - print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32, - "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]"); + print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES, + "vp9_coef_update_probs_4x4[BLOCK_TYPES]"); + print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES, + "vp9_coef_update_probs_8x8[BLOCK_TYPES]"); + print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES, + "vp9_coef_update_probs_16x16[BLOCK_TYPES]"); + print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES, + "vp9_coef_update_probs_32x32[BLOCK_TYPES]"); fclose(f); f = fopen("treeupdate.bin", "wb"); fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f); fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f); fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f); + fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f); fclose(f); } #endif diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1960b9162e34d67eeca491d73acb95c6a0356211..491ea62b5e183dae08a3fb1179f888cbb42d2678 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -50,10 +50,7 @@ typedef struct block { int src; int src_stride; - int eob_max_offset; - int eob_max_offset_8x8; - int eob_max_offset_16x16; - int eob_max_offset_32x32; + int skip_block; } BLOCK; typedef struct { @@ -86,19 +83,12 @@ typedef struct { int64_t txfm_rd_diff[NB_TXFM_MODES]; } PICK_MODE_CONTEXT; -typedef struct superblock { - DECLARE_ALIGNED(16, int16_t, src_diff[32*32+16*16*2]); - DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]); -} SUPERBLOCK; - -typedef struct macroblock { - DECLARE_ALIGNED(16, int16_t, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y - DECLARE_ALIGNED(16, int16_t, coeff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y +typedef struct macroblock MACROBLOCK; +struct macroblock { + DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]); + DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]); // 16 Y blocks, 4 U blocks, 4 V blocks, - // 1 DC 2nd order block each with 16 entries - BLOCK block[25]; - - SUPERBLOCK sb_coeff_data; + BLOCK block[24]; YV12_BUFFER_CONFIG src; @@ -160,8 +150,13 @@ typedef struct macroblock { unsigned char *active_ptr; - vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4]; - vp9_coeff_count hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4]; + vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES]; +#if CONFIG_CODE_NONZEROCOUNT + unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17]; + unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65]; + unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257]; + unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025]; +#endif int optimize; @@ 
-172,17 +167,14 @@ typedef struct macroblock { PICK_MODE_CONTEXT sb32_context[4]; PICK_MODE_CONTEXT sb64_context; - void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch); - void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch); - void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch); - void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d); - void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1); - void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch); - void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch); - void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch); - void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d); - void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d); - void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d); -} MACROBLOCK; + void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch); + void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch); + void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx); + void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2); + void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type); + void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type); +}; #endif // VP9_ENCODER_VP9_BLOCK_H_ diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c index d1b1e0e89dc9fb08bf5cbf616709afd264ac719c..2137421827562fd991237270785d258d96c0f6ab 100644 --- a/vp9/encoder/vp9_boolhuff.c +++ b/vp9/encoder/vp9_boolhuff.c @@ -40,7 +40,6 @@ const unsigned int vp9_prob_cost[256] = { }; void vp9_start_encode(BOOL_CODER *br, unsigned char *source) { - br->lowvalue = 0; br->range = 255; br->value = 0; @@ -54,6 +53,10 @@ void vp9_stop_encode(BOOL_CODER *br) { for (i = 0; i < 32; i++) encode_bool(br, 0, 128); + + // Ensure there's no ambiguous collision with any index marker bytes + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) + br->buffer[br->pos++] = 0; } diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index bfde02ccbe841d4efa591965c4fcebe302c141f6..aeef9c6dfd05307cf55fa592b199e6b5ffa59e78 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -15,846 +15,545 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +static void fdct4_1d(int16_t *input, int16_t *output) { + int16_t step[4]; + int temp1, temp2; + + step[0] = input[0] + input[3]; + step[1] = input[1] + input[2]; + step[2] = input[1] - input[2]; + step[3] = input[0] - input[3]; + + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + output[0] = dct_const_round_shift(temp1); + output[2] = dct_const_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + output[1] = dct_const_round_shift(temp1); + output[3] = dct_const_round_shift(temp2); +} -// TODO: these transforms can be converted into integer forms to reduce -// the complexity -static const float dct_4[16] = { - 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000, - 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188, - 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000, - 0.270598050073099, -0.653281482438188, 0.653281482438188, -0.270598050073099 -}; - -static const float adst_4[16] = { - 0.228013428883779, 0.428525073124360,
0.577350269189626, 0.656538502008139, - 0.577350269189626, 0.577350269189626, 0.000000000000000, -0.577350269189626, - 0.656538502008139, -0.228013428883779, -0.577350269189626, 0.428525073124359, - 0.428525073124360, -0.656538502008139, 0.577350269189626, -0.228013428883779 -}; - -static const float dct_8[64] = { - 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274, - 0.353553390593274, 0.353553390593274, 0.353553390593274, 0.353553390593274, - 0.490392640201615, 0.415734806151273, 0.277785116509801, 0.097545161008064, - -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615, - 0.461939766255643, 0.191341716182545, -0.191341716182545, -0.461939766255643, - -0.461939766255643, -0.191341716182545, 0.191341716182545, 0.461939766255643, - 0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801, - 0.277785116509801, 0.490392640201615, 0.097545161008064, -0.415734806151273, - 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274, - 0.353553390593274, -0.353553390593274, -0.353553390593274, 0.353553390593274, - 0.277785116509801, -0.490392640201615, 0.097545161008064, 0.415734806151273, - -0.415734806151273, -0.097545161008064, 0.490392640201615, -0.277785116509801, - 0.191341716182545, -0.461939766255643, 0.461939766255643, -0.191341716182545, - -0.191341716182545, 0.461939766255643, -0.461939766255643, 0.191341716182545, - 0.097545161008064, -0.277785116509801, 0.415734806151273, -0.490392640201615, - 0.490392640201615, -0.415734806151273, 0.277785116509801, -0.097545161008064 -}; - -static const float adst_8[64] = { - 0.089131608307533, 0.175227946595735, 0.255357107325376, 0.326790388032145, - 0.387095214016349, 0.434217976756762, 0.466553967085785, 0.483002021635509, - 0.255357107325376, 0.434217976756762, 0.483002021635509, 0.387095214016349, - 0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785, - 0.387095214016349, 0.466553967085785, 0.175227946595735, -0.255357107325376, - -0.483002021635509, -0.326790388032145, 0.089131608307533, 0.434217976756762, - 0.466553967085785, 0.255357107325376, -0.326790388032145, -0.434217976756762, - 0.089131608307533, 0.483002021635509, 0.175227946595735, -0.387095214016348, - 0.483002021635509, -0.089131608307533, -0.466553967085785, 0.175227946595735, - 0.434217976756762, -0.255357107325376, -0.387095214016348, 0.326790388032145, - 0.434217976756762, -0.387095214016348, -0.089131608307533, 0.466553967085786, - -0.326790388032145, -0.175227946595735, 0.483002021635509, -0.255357107325375, - 0.326790388032145, -0.483002021635509, 0.387095214016349, -0.089131608307534, - -0.255357107325377, 0.466553967085785, -0.434217976756762, 0.175227946595736, - 0.175227946595735, -0.326790388032145, 0.434217976756762, -0.483002021635509, - 0.466553967085785, -0.387095214016348, 0.255357107325376, -0.089131608307532 -}; - -/* Converted the transforms to integers. 
*/ -static const int16_t dct_i4[16] = { - 16384, 16384, 16384, 16384, - 21407, 8867, -8867, -21407, - 16384, -16384, -16384, 16384, - 8867, -21407, 21407, -8867 -}; - -static const int16_t adst_i4[16] = { - 7472, 14042, 18919, 21513, - 18919, 18919, 0, -18919, - 21513, -7472, -18919, 14042, - 14042, -21513, 18919, -7472 -}; +void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[4 * 4]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[4], temp_out[4]; + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j * short_pitch + i] << 4; + if (i == 0 && temp_in[0]) + temp_in[0] += 1; + fdct4_1d(temp_in, temp_out); + for (j = 0; j < 4; ++j) + outptr[j * 4 + i] = temp_out[j]; + } -static const int16_t dct_i8[64] = { - 11585, 11585, 11585, 11585, - 11585, 11585, 11585, 11585, - 16069, 13623, 9102, 3196, - -3196, -9102, -13623, -16069, - 15137, 6270, -6270, -15137, - -15137, -6270, 6270, 15137, - 13623, -3196, -16069, -9102, - 9102, 16069, 3196, -13623, - 11585, -11585, -11585, 11585, - 11585, -11585, -11585, 11585, - 9102, -16069, 3196, 13623, - -13623, -3196, 16069, -9102, - 6270, -15137, 15137, -6270, - -6270, 15137, -15137, 6270, - 3196, -9102, 13623, -16069, - 16069, -13623, 9102, -3196 -}; + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j + i * 4]; + fdct4_1d(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j + i * 4] = (temp_out[j] + 1) >> 2; + } +} -static const int16_t adst_i8[64] = { - 2921, 5742, 8368, 10708, - 12684, 14228, 15288, 15827, - 8368, 14228, 15827, 12684, - 5742, -2921, -10708, -15288, - 12684, 15288, 5742, -8368, - -15827, -10708, 2921, 14228, - 15288, 8368, -10708, -14228, - 2921, 15827, 5742, -12684, - 15827, -2921, -15288, 5742, - 14228, -8368, -12684, 10708, - 14228, -12684, -2921, 15288, - -10708, -5742, 15827, -8368, - 10708, -15827, 12684, -2921, - -8368, 15288, -14228, 5742, - 5742, -10708, 14228, -15827, - 15288, -12684, 8368, -2921 -}; +static void fadst4_1d(int16_t *input, int16_t *output) { + int x0, x1, x2, x3; + int s0, s1, s2, s3, s4, s5, s6, s7; -static const float dct_16[256] = { - 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, - 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, - 0.351851, 0.338330, 0.311806, 0.273300, 0.224292, 0.166664, 0.102631, 0.034654, - -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851, - 0.346760, 0.293969, 0.196424, 0.068975, -0.068975, -0.196424, -0.293969, -0.346760, - -0.346760, -0.293969, -0.196424, -0.068975, 0.068975, 0.196424, 0.293969, 0.346760, - 0.338330, 0.224292, 0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631, - 0.102631, 0.273300, 0.351851, 0.311806, 0.166664, -0.034654, -0.224292, -0.338330, - 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641, - 0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641, - 0.311806, 0.034654, -0.273300, -0.338330, -0.102631, 0.224292, 0.351851, 0.166664, - -0.166664, -0.351851, -0.224292, 0.102631, 0.338330, 0.273300, -0.034654, -0.311806, - 0.293969, -0.068975, -0.346760, -0.196424, 0.196424, 0.346760, 0.068975, -0.293969, - -0.293969, 0.068975, 0.346760, 0.196424, -0.196424, -0.346760, -0.068975, 0.293969, - 0.273300, -0.166664, -0.338330, 0.034654, 0.351851, 0.102631, -0.311806, -0.224292, - 0.224292, 0.311806, -0.102631, -0.351851, -0.034654, 0.338330, 
0.166664, -0.273300, - 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000, - 0.250000, -0.250000, -0.250000, 0.250000, 0.250000, -0.250000, -0.250000, 0.250000, - 0.224292, -0.311806, -0.102631, 0.351851, -0.034654, -0.338330, 0.166664, 0.273300, - -0.273300, -0.166664, 0.338330, 0.034654, -0.351851, 0.102631, 0.311806, -0.224292, - 0.196424, -0.346760, 0.068975, 0.293969, -0.293969, -0.068975, 0.346760, -0.196424, - -0.196424, 0.346760, -0.068975, -0.293969, 0.293969, 0.068975, -0.346760, 0.196424, - 0.166664, -0.351851, 0.224292, 0.102631, -0.338330, 0.273300, 0.034654, -0.311806, - 0.311806, -0.034654, -0.273300, 0.338330, -0.102631, -0.224292, 0.351851, -0.166664, - 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299, - 0.135299, -0.326641, 0.326641, -0.135299, -0.135299, 0.326641, -0.326641, 0.135299, - 0.102631, -0.273300, 0.351851, -0.311806, 0.166664, 0.034654, -0.224292, 0.338330, - -0.338330, 0.224292, -0.034654, -0.166664, 0.311806, -0.351851, 0.273300, -0.102631, - 0.068975, -0.196424, 0.293969, -0.346760, 0.346760, -0.293969, 0.196424, -0.068975, - -0.068975, 0.196424, -0.293969, 0.346760, -0.346760, 0.293969, -0.196424, 0.068975, - 0.034654, -0.102631, 0.166664, -0.224292, 0.273300, -0.311806, 0.338330, -0.351851, - 0.351851, -0.338330, 0.311806, -0.273300, 0.224292, -0.166664, 0.102631, -0.034654 -}; + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; -static const float adst_16[256] = { - 0.033094, 0.065889, 0.098087, 0.129396, 0.159534, 0.188227, 0.215215, 0.240255, - 0.263118, 0.283599, 0.301511, 0.316693, 0.329007, 0.338341, 0.344612, 0.347761, - 0.098087, 0.188227, 0.263118, 0.316693, 0.344612, 0.344612, 0.316693, 0.263118, - 0.188227, 0.098087, 0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612, - 0.159534, 0.283599, 0.344612, 0.329007, 0.240255, 0.098087, -0.065889, -0.215215, - -0.316693, -0.347761, -0.301511, -0.188227, -0.033094, 0.129396, 0.263118, 0.338341, - 0.215215, 0.338341, 0.316693, 0.159534, -0.065889, -0.263118, -0.347761, -0.283599, - -0.098087, 0.129396, 0.301511, 0.344612, 0.240255, 0.033094, -0.188227, -0.329007, - 0.263118, 0.344612, 0.188227, -0.098087, -0.316693, -0.316693, -0.098087, 0.188227, - 0.344612, 0.263118, 0.000000, -0.263118, -0.344612, -0.188227, 0.098087, 0.316693, - 0.301511, 0.301511, 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511, - 0.000000, -0.301511, -0.301511, -0.000000, 0.301511, 0.301511, 0.000000, -0.301511, - 0.329007, 0.215215, -0.188227, -0.338341, -0.033094, 0.316693, 0.240255, -0.159534, - -0.344612, -0.065889, 0.301511, 0.263118, -0.129396, -0.347761, -0.098087, 0.283599, - 0.344612, 0.098087, -0.316693, -0.188227, 0.263118, 0.263118, -0.188227, -0.316693, - 0.098087, 0.344612, 0.000000, -0.344612, -0.098087, 0.316693, 0.188227, -0.263118, - 0.347761, -0.033094, -0.344612, 0.065889, 0.338341, -0.098087, -0.329007, 0.129396, - 0.316693, -0.159534, -0.301511, 0.188227, 0.283599, -0.215215, -0.263118, 0.240255, - 0.338341, -0.159534, -0.263118, 0.283599, 0.129396, -0.344612, 0.033094, 0.329007, - -0.188227, -0.240255, 0.301511, 0.098087, -0.347761, 0.065889, 0.316693, -0.215215, - 0.316693, -0.263118, -0.098087, 0.344612, -0.188227, -0.188227, 0.344612, -0.098087, - -0.263118, 0.316693, 0.000000, -0.316693, 0.263118, 0.098087, -0.344612, 0.188227, - 0.283599, -0.329007, 0.098087, 0.215215, -0.347761, 0.188227, 0.129396, -0.338341, - 0.263118, 0.033094, -0.301511, 0.316693, -0.065889, -0.240255, 0.344612, -0.159534, - 
0.240255, -0.347761, 0.263118, -0.033094, -0.215215, 0.344612, -0.283599, 0.065889, - 0.188227, -0.338341, 0.301511, -0.098087, -0.159534, 0.329007, -0.316693, 0.129396, - 0.188227, -0.316693, 0.344612, -0.263118, 0.098087, 0.098087, -0.263118, 0.344612, - -0.316693, 0.188227, 0.000000, -0.188227, 0.316693, -0.344612, 0.263118, -0.098087, - 0.129396, -0.240255, 0.316693, -0.347761, 0.329007, -0.263118, 0.159534, -0.033094, - -0.098087, 0.215215, -0.301511, 0.344612, -0.338341, 0.283599, -0.188227, 0.065889, - 0.065889, -0.129396, 0.188227, -0.240255, 0.283599, -0.316693, 0.338341, -0.347761, - 0.344612, -0.329007, 0.301511, -0.263118, 0.215215, -0.159534, 0.098087, -0.033094 -}; + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } -/* Converted the transforms to integers. */ -static const int16_t dct_i16[256] = { - 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192, - 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192, - 11529, 11086, 10217, 8955, 7350, 5461, 3363, 1136, - -1136, -3363, -5461, -7350, -8955, -10217, -11086, -11529, - 11363, 9633, 6436, 2260, -2260, -6436, -9633, -11363, - -11363, -9633, -6436, -2260, 2260, 6436, 9633, 11363, - 11086, 7350, 1136, -5461, -10217, -11529, -8955, -3363, - 3363, 8955, 11529, 10217, 5461, -1136, -7350, -11086, - 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703, - 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703, - 10217, 1136, -8955, -11086, -3363, 7350, 11529, 5461, - -5461, -11529, -7350, 3363, 11086, 8955, -1136, -10217, - 9633, -2260, -11363, -6436, 6436, 11363, 2260, -9633, - -9633, 2260, 11363, 6436, -6436, -11363, -2260, 9633, - 8955, -5461, -11086, 1136, 11529, 3363, -10217, -7350, - 7350, 10217, -3363, -11529, -1136, 11086, 5461, -8955, - 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192, - 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192, - 7350, -10217, -3363, 11529, -1136, -11086, 5461, 8955, - -8955, -5461, 11086, 1136, -11529, 3363, 10217, -7350, - 6436, -11363, 2260, 9633, -9633, -2260, 11363, -6436, - -6436, 11363, -2260, -9633, 9633, 2260, -11363, 6436, - 5461, -11529, 7350, 3363, -11086, 8955, 1136, -10217, - 10217, -1136, -8955, 11086, -3363, -7350, 11529, -5461, - 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433, - 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433, - 3363, -8955, 11529, -10217, 5461, 1136, -7350, 11086, - -11086, 7350, -1136, -5461, 10217, -11529, 8955, -3363, - 2260, -6436, 9633, -11363, 11363, -9633, 6436, -2260, - -2260, 6436, -9633, 11363, -11363, 9633, -6436, 2260, - 1136, -3363, 5461, -7350, 8955, -10217, 11086, -11529, - 11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136 -}; + s0 = sinpi_1_9 * x0; + s1 = sinpi_4_9 * x0; + s2 = sinpi_2_9 * x1; + s3 = sinpi_1_9 * x1; + s4 = sinpi_3_9 * x2; + s5 = sinpi_4_9 * x3; + s6 = sinpi_2_9 * x3; + s7 = x0 + x1 - x3; + + x0 = s0 + s2 + s5; + x1 = sinpi_3_9 * s7; + x2 = s1 - s3 + s6; + x3 = s4; + + s0 = x0 + x3; + s1 = x1; + s2 = x2 - x3; + s3 = x2 - x0 + x3; + + // 1-D transform scaling factor is sqrt(2). 
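/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * dct_const_round_shift() calls below strip the fixed-point scale carried
 * by the cospi/sinpi constants. Assuming DCT_CONST_BITS == 14, as defined
 * in vp9/common/vp9_idct.h, the helper behaves like:
 */
static int dct_const_round_shift_sketch(int input) {
  /* add half of 2^14 so the arithmetic shift rounds to nearest */
  return (input + (1 << 13)) >> 14;
}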
+ output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} -static const int16_t adst_i16[256] = { - 1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873, - 8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395, - 3214, 6168, 8622, 10377, 11292, 11292, 10377, 8622, - 6168, 3214, 0, -3214, -6168, -8622, -10377, -11292, - 5228, 9293, 11292, 10781, 7873, 3214, -2159, -7052, - -10377, -11395, -9880, -6168, -1084, 4240, 8622, 11087, - 7052, 11087, 10377, 5228, -2159, -8622, -11395, -9293, - -3214, 4240, 9880, 11292, 7873, 1084, -6168, -10781, - 8622, 11292, 6168, -3214, -10377, -10377, -3214, 6168, - 11292, 8622, 0, -8622, -11292, -6168, 3214, 10377, - 9880, 9880, 0, -9880, -9880, 0, 9880, 9880, - 0, -9880, -9880, 0, 9880, 9880, 0, -9880, - 10781, 7052, -6168, -11087, -1084, 10377, 7873, -5228, - -11292, -2159, 9880, 8622, -4240, -11395, -3214, 9293, - 11292, 3214, -10377, -6168, 8622, 8622, -6168, -10377, - 3214, 11292, 0, -11292, -3214, 10377, 6168, -8622, - 11395, -1084, -11292, 2159, 11087, -3214, -10781, 4240, - 10377, -5228, -9880, 6168, 9293, -7052, -8622, 7873, - 11087, -5228, -8622, 9293, 4240, -11292, 1084, 10781, - -6168, -7873, 9880, 3214, -11395, 2159, 10377, -7052, - 10377, -8622, -3214, 11292, -6168, -6168, 11292, -3214, - -8622, 10377, 0, -10377, 8622, 3214, -11292, 6168, - 9293, -10781, 3214, 7052, -11395, 6168, 4240, -11087, - 8622, 1084, -9880, 10377, -2159, -7873, 11292, -5228, - 7873, -11395, 8622, -1084, -7052, 11292, -9293, 2159, - 6168, -11087, 9880, -3214, -5228, 10781, -10377, 4240, - 6168, -10377, 11292, -8622, 3214, 3214, -8622, 11292, - -10377, 6168, 0, -6168, 10377, -11292, 8622, -3214, - 4240, -7873, 10377, -11395, 10781, -8622, 5228, -1084, - -3214, 7052, -9880, 11292, -11087, 9293, -6168, 2159, - 2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395, - 11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084 +static const transform_2d FHT_4[] = { + { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 + { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 + { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 + { fadst4_1d, fadst4_1d } // ADST_ADST = 3 }; -static const int xC1S7 = 16069; -static const int xC2S6 = 15137; -static const int xC3S5 = 13623; -static const int xC4S4 = 11585; -static const int xC5S3 = 9102; -static const int xC6S2 = 6270; -static const int xC7S1 = 3196; - -#define SHIFT_BITS 14 -#define DOROUND(X) X += (1<<(SHIFT_BITS-1)); - -#define FINAL_SHIFT 3 -#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1)) -#define IN_SHIFT (FINAL_SHIFT+1) - - -void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) { - int loop; - int short_pitch = pitch >> 1; - int is07, is12, is34, is56; - int is0734, is1256; - int id07, id12, id34, id56; - int irot_input_x, irot_input_y; - int icommon_product1; // Re-used product (c4s4 * (s12 - s56)) - int icommon_product2; // Re-used product (c4s4 * (d12 + d56)) - int temp1, temp2; // intermediate variable for computation - - int InterData[64]; - int *ip = InterData; - short *op = OutputData; - - for (loop = 0; loop < 8; loop++) { - // Pre calculate some common sums and differences. 
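/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * "common sums and differences" computed below are the stage-1 DCT
 * butterfly; note the deleted code pairs samples as (0,7), (1,2), (3,4),
 * (5,6) and pre-scales each term by << IN_SHIFT:
 */
static void fdct8_stage1_pairs_sketch(const short *in,
                                      int *is07, int *is12,
                                      int *is34, int *is56,
                                      int *id07, int *id12,
                                      int *id34, int *id56) {
  *is07 = in[0] + in[7];  *id07 = in[0] - in[7];
  *is12 = in[1] + in[2];  *id12 = in[1] - in[2];
  *is34 = in[3] + in[4];  *id34 = in[3] - in[4];
  *is56 = in[5] + in[6];  *id56 = in[5] - in[6];
}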
- is07 = (InputData[0] + InputData[7]) << IN_SHIFT; - is12 = (InputData[1] + InputData[2]) << IN_SHIFT; - is34 = (InputData[3] + InputData[4]) << IN_SHIFT; - is56 = (InputData[5] + InputData[6]) << IN_SHIFT; - id07 = (InputData[0] - InputData[7]) << IN_SHIFT; - id12 = (InputData[1] - InputData[2]) << IN_SHIFT; - id34 = (InputData[3] - InputData[4]) << IN_SHIFT; - id56 = (InputData[5] - InputData[6]) << IN_SHIFT; - - is0734 = is07 + is34; - is1256 = is12 + is56; - - // Pre-Calculate some common product terms. - icommon_product1 = xC4S4 * (is12 - is56); - DOROUND(icommon_product1) - icommon_product1 >>= SHIFT_BITS; - - icommon_product2 = xC4S4 * (id12 + id56); - DOROUND(icommon_product2) - icommon_product2 >>= SHIFT_BITS; - - - ip[0] = (xC4S4 * (is0734 + is1256)); - DOROUND(ip[0]); - ip[0] >>= SHIFT_BITS; - - ip[4] = (xC4S4 * (is0734 - is1256)); - DOROUND(ip[4]); - ip[4] >>= SHIFT_BITS; - - // Define inputs to rotation for outputs 2 and 6 - irot_input_x = id12 - id56; - irot_input_y = is07 - is34; - - // Apply rotation for outputs 2 and 6. - temp1 = xC6S2 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC2S6 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[2] = temp1 + temp2; - - temp1 = xC6S2 * irot_input_y; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC2S6 * irot_input_x; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[6] = temp1 - temp2; - - // Define inputs to rotation for outputs 1 and 7 - irot_input_x = icommon_product1 + id07; - irot_input_y = -(id34 + icommon_product2); - - // Apply rotation for outputs 1 and 7. - temp1 = xC1S7 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC7S1 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[1] = temp1 - temp2; - - temp1 = xC7S1 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC1S7 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[7] = temp1 + temp2; - - // Define inputs to rotation for outputs 3 and 5 - irot_input_x = id07 - icommon_product1; - irot_input_y = id34 - icommon_product2; - - // Apply rotation for outputs 3 and 5. - temp1 = xC3S5 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC5S3 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[3] = temp1 - temp2; - - - temp1 = xC5S3 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC3S5 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - ip[5] = temp1 + temp2; - - // Increment data pointer for next row - InputData += short_pitch; - ip += 8; +void vp9_short_fht4x4_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[4 * 4]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[4], temp_out[4]; + const transform_2d ht = FHT_4[tx_type]; + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j * pitch + i] << 4; + if (i == 0 && temp_in[0]) + temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + outptr[j * 4 + i] = temp_out[j]; } - // Performed DCT on rows, now transform the columns - ip = InterData; - for (loop = 0; loop < 8; loop++) { - // Pre calculate some common sums and differences. 
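/* [Editor's note -- illustrative sketch, not part of the patch.] The new
 * vp9_short_fht4x4_c() above selects its two 1-D transforms from the
 * FHT_4 table. Assuming the typedefs used by vp9/common/vp9_idct.h:
 */
typedef void (*transform_1d_sketch)(int16_t *input, int16_t *output);

typedef struct {
  transform_1d_sketch cols;  /* applied down each column first */
  transform_1d_sketch rows;  /* then across each row */
} transform_2d_sketch;

/* tx_type indexes the table directly, so ADST_DCT (= 1) runs the ADST
 * vertically and the DCT horizontally. */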
- is07 = ip[0 * 8] + ip[7 * 8]; - is12 = ip[1 * 8] + ip[2 * 8]; - is34 = ip[3 * 8] + ip[4 * 8]; - is56 = ip[5 * 8] + ip[6 * 8]; - - id07 = ip[0 * 8] - ip[7 * 8]; - id12 = ip[1 * 8] - ip[2 * 8]; - id34 = ip[3 * 8] - ip[4 * 8]; - id56 = ip[5 * 8] - ip[6 * 8]; - - is0734 = is07 + is34; - is1256 = is12 + is56; - - // Pre-Calculate some common product terms - icommon_product1 = xC4S4 * (is12 - is56); - icommon_product2 = xC4S4 * (id12 + id56); - DOROUND(icommon_product1) - DOROUND(icommon_product2) - icommon_product1 >>= SHIFT_BITS; - icommon_product2 >>= SHIFT_BITS; - - - temp1 = xC4S4 * (is0734 + is1256); - temp2 = xC4S4 * (is0734 - is1256); - DOROUND(temp1); - DOROUND(temp2); - temp1 >>= SHIFT_BITS; - - temp2 >>= SHIFT_BITS; - op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT; - op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - // Define inputs to rotation for outputs 2 and 6 - irot_input_x = id12 - id56; - irot_input_y = is07 - is34; - - // Apply rotation for outputs 2 and 6. - temp1 = xC6S2 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC2S6 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - temp1 = xC6S2 * irot_input_y; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC2S6 * irot_input_x; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - // Define inputs to rotation for outputs 1 and 7 - irot_input_x = icommon_product1 + id07; - irot_input_y = -(id34 + icommon_product2); - - // Apply rotation for outputs 1 and 7. - temp1 = xC1S7 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC7S1 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - temp1 = xC7S1 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC1S7 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - // Define inputs to rotation for outputs 3 and 5 - irot_input_x = id07 - icommon_product1; - irot_input_y = id34 - icommon_product2; - - // Apply rotation for outputs 3 and 5. - temp1 = xC3S5 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC5S3 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - - temp1 = xC5S3 * irot_input_x; - DOROUND(temp1); - temp1 >>= SHIFT_BITS; - temp2 = xC3S5 * irot_input_y; - DOROUND(temp2); - temp2 >>= SHIFT_BITS; - op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT; - - // Increment data pointer for next column. 
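/* [Editor's note -- illustrative sketch, not part of the patch.] Every
 * "Apply rotation" step in the deleted 8x8 code above is one fixed-point
 * planar rotation; with SHIFT_BITS == 14, DOROUND() adds 1 << 13:
 */
static void fdct_rotate_sketch(int x, int y, int c1, int c2,
                               int *pa, int *pb) {
  int t1 = (c1 * x + (1 << 13)) >> 14;
  int t2 = (c2 * y + (1 << 13)) >> 14;
  *pa = t1 + t2;   /* e.g. op[2] with x = id12 - id56, y = is07 - is34 */
  t1 = (c1 * y + (1 << 13)) >> 14;
  t2 = (c2 * x + (1 << 13)) >> 14;
  *pb = t1 - t2;   /* e.g. op[6], using the same constants swapped */
}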
- ip++; - op++; + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j + i * 4] = (temp_out[j] + 1) >> 2; } } -void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) { - /* [1 1; 1 -1] orthogonal transform */ - /* use position: 0,1, 4, 8 */ - int i; - short *ip1 = input; - short *op1 = output; - for (i = 0; i < 16; i++) { - op1[i] = 0; - } - - op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1; - op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1; - op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1; - op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1; +void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { + vp9_short_fdct4x4_c(input, output, pitch); + vp9_short_fdct4x4_c(input + 4, output + 16, pitch); } -/* For test */ -#define TEST_INT 1 -#if TEST_INT -#define vp9_fht_int_c vp9_fht_c -#else -#define vp9_fht_float_c vp9_fht_c -#endif - -void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output, - TX_TYPE tx_type, int tx_dim) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int i, j, k; - float bufa[256], bufb[256]; // buffers are for floating-point test purpose - // the implementation could be simplified in - // conjunction with integer transform - const int16_t *ip = input; - int16_t *op = output; - - float *pfa = &bufa[0]; - float *pfb = &bufb[0]; - - // pointers to vertical and horizontal transforms - const float *ptv, *pth; - - assert(tx_type != DCT_DCT); - // load and convert residual array into floating-point - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - pfa[i] = (float)ip[i]; - } - pfa += tx_dim; - ip += pitch / 2; - } - - // vertical transformation - pfa = &bufa[0]; - pfb = &bufb[0]; - - switch (tx_type) { - case ADST_ADST : - case ADST_DCT : - ptv = (tx_dim == 4) ? &adst_4[0] : - ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]); - break; - - default : - ptv = (tx_dim == 4) ? &dct_4[0] : - ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]); - break; - } - - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - pfb[i] = 0; - for (k = 0; k < tx_dim; k++) { - pfb[i] += ptv[k] * pfa[(k * tx_dim)]; - } - pfa += 1; - } - pfb += tx_dim; - ptv += tx_dim; - pfa = &bufa[0]; - } +static void fdct8_1d(int16_t *input, int16_t *output) { + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = dct_const_round_shift(t0); + output[2] = dct_const_round_shift(t2); + output[4] = dct_const_round_shift(t1); + output[6] = dct_const_round_shift(t3); - // horizontal transformation - pfa = &bufa[0]; - pfb = &bufb[0]; - - switch (tx_type) { - case ADST_ADST : - case DCT_ADST : - pth = (tx_dim == 4) ? &adst_4[0] : - ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]); - break; - - default : - pth = (tx_dim == 4) ? &dct_4[0] : - ((tx_dim == 8) ? 
&dct_8[0] : &dct_16[0]); - break; - } + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - pfa[i] = 0; - for (k = 0; k < tx_dim; k++) { - pfa[i] += pfb[k] * pth[k]; - } - pth += tx_dim; - } + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; - pfa += tx_dim; - pfb += tx_dim; - // pth -= tx_dim * tx_dim; - - switch (tx_type) { - case ADST_ADST : - case DCT_ADST : - pth = (tx_dim == 4) ? &adst_4[0] : - ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]); - break; - - default : - pth = (tx_dim == 4) ? &dct_4[0] : - ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]); - break; - } - } + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = dct_const_round_shift(t0); + output[3] = dct_const_round_shift(t2); + output[5] = dct_const_round_shift(t1); + output[7] = dct_const_round_shift(t3); +} - // convert to short integer format and load BLOCKD buffer - op = output; - pfa = &bufa[0]; +void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { + const int stride = pitch >> 1; + int i, j; + int16_t intermediate[64]; - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) : - -(int16_t)(- 8 * pfa[i] + 0.49); - } - op += tx_dim; - pfa += tx_dim; + // Transform columns + { + int16_t *output = intermediate; + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + int i; + for (i = 0; i < 8; i++) { + // stage 1 + s0 = (input[0 * stride] + input[7 * stride]) << 2; + s1 = (input[1 * stride] + input[6 * stride]) << 2; + s2 = (input[2 * stride] + input[5 * stride]) << 2; + s3 = (input[3 * stride] + input[4 * stride]) << 2; + s4 = (input[3 * stride] - input[4 * stride]) << 2; + s5 = (input[2 * stride] - input[5 * stride]) << 2; + s6 = (input[1 * stride] - input[6 * stride]) << 2; + s7 = (input[0 * stride] - input[7 * stride]) << 2; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0 * 8] = dct_const_round_shift(t0); + output[2 * 8] = dct_const_round_shift(t2); + output[4 * 8] = dct_const_round_shift(t1); + output[6 * 8] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1 * 8] = dct_const_round_shift(t0); + output[3 * 8] = dct_const_round_shift(t2); + output[5 * 8] = dct_const_round_shift(t1); + output[7 * 8] = dct_const_round_shift(t3); + input++; + output++; } } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -/* Converted the transforms to integer form. 
*/ -#define VERTICAL_SHIFT 11 -#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) -#define HORIZONTAL_SHIFT 16 -#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) -void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output, - TX_TYPE tx_type, int tx_dim) { - int i, j, k; - int16_t imbuf[256]; - - const int16_t *ip = input; - int16_t *op = output; - int16_t *im = &imbuf[0]; - - /* pointers to vertical and horizontal transforms. */ - const int16_t *ptv = NULL, *pth = NULL; - - switch (tx_type) { - case ADST_ADST : - ptv = pth = (tx_dim == 4) ? &adst_i4[0] - : ((tx_dim == 8) ? &adst_i8[0] - : &adst_i16[0]); - break; - case ADST_DCT : - ptv = (tx_dim == 4) ? &adst_i4[0] - : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]); - pth = (tx_dim == 4) ? &dct_i4[0] - : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]); - break; - case DCT_ADST : - ptv = (tx_dim == 4) ? &dct_i4[0] - : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]); - pth = (tx_dim == 4) ? &adst_i4[0] - : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]); - break; - case DCT_DCT : - ptv = pth = (tx_dim == 4) ? &dct_i4[0] - : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]); - break; - default: - assert(0); - break; + // Rows + for (i = 0; i < 8; ++i) { + fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); + for (j = 0; j < 8; ++j) + final_output[j + i * 8] /= 2; } +} - /* vertical transformation */ - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - int temp = 0; - - for (k = 0; k < tx_dim; k++) { - temp += ptv[k] * ip[(k * (pitch >> 1))]; +void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we tranpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + const int stride = pitch >> 1; + int pass; + // We need an intermediate buffer between passes. + int16_t intermediate[256]; + int16_t *in = input; + int16_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + /*canbe16*/ int step1[8]; + /*canbe16*/ int step2[8]; + /*canbe16*/ int step3[8]; + /*canbe16*/ int input[8]; + /*needs32*/ int temp1, temp2; + int i; + for (i = 0; i < 16; i++) { + if (0 == pass) { + // Calculate input for the first 8 results. + input[0] = (in[0 * stride] + in[15 * stride]) << 2; + input[1] = (in[1 * stride] + in[14 * stride]) << 2; + input[2] = (in[2 * stride] + in[13 * stride]) << 2; + input[3] = (in[3 * stride] + in[12 * stride]) << 2; + input[4] = (in[4 * stride] + in[11 * stride]) << 2; + input[5] = (in[5 * stride] + in[10 * stride]) << 2; + input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; + input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; + // Calculate input for the next 8 results. + step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; + step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; + step1[2] = (in[5 * stride] - in[10 * stride]) << 2; + step1[3] = (in[4 * stride] - in[11 * stride]) << 2; + step1[4] = (in[3 * stride] - in[12 * stride]) << 2; + step1[5] = (in[2 * stride] - in[13 * stride]) << 2; + step1[6] = (in[1 * stride] - in[14 * stride]) << 2; + step1[7] = (in[0 * stride] - in[15 * stride]) << 2; + } else { + // Calculate input for the first 8 results. 
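/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * (x + 1) >> 2 terms below undo the << 2 pre-scaling applied to the
 * first-pass inputs, keeping the overall transform gain constant across
 * the two passes:
 */
static int16_t second_pass_scale_sketch(int16_t x) {
  /* scale down by 4 with a +1 bias before shifting */
  return (int16_t)((x + 1) >> 2);
}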
+ input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); + input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); + input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); + input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); + input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); + input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); + input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); + input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); + // Calculate input for the next 8 results. + step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); + step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); + step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); + step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); + step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); + step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); + step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); + step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); } - - im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); - ip++; - } - im += tx_dim; // 16 - ptv += tx_dim; - ip = input; - } - - /* horizontal transformation */ - im = &imbuf[0]; - - for (j = 0; j < tx_dim; j++) { - const int16_t *pthc = pth; - - for (i = 0; i < tx_dim; i++) { - int temp = 0; - - for (k = 0; k < tx_dim; k++) { - temp += im[k] * pthc[k]; + // Work on the first eight values; fdct8_1d(input, even_results); + { + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = dct_const_round_shift(t0); + out[4] = dct_const_round_shift(t2); + out[8] = dct_const_round_shift(t1); + out[12] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = dct_const_round_shift(t0); + out[6] = dct_const_round_shift(t2); + out[10] = dct_const_round_shift(t1); + out[14] = dct_const_round_shift(t3); } - - op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); - pthc += tx_dim; + // Work on the next eight values; step1 -> odd_results + { + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = dct_const_round_shift(temp1); + step2[5] = dct_const_round_shift(temp2); + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - 
step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + step2[1] = dct_const_round_shift(temp1); + step2[2] = dct_const_round_shift(temp2); + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = dct_const_round_shift(temp1); + step2[6] = dct_const_round_shift(temp2); + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] - step2[2]; + step1[3] = step3[3] + step2[2]; + step1[4] = step3[4] + step2[5]; + step1[5] = step3[4] - step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = dct_const_round_shift(temp1); + out[9] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = dct_const_round_shift(temp1); + out[13] = dct_const_round_shift(temp2); + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = dct_const_round_shift(temp1); + out[11] = dct_const_round_shift(temp2); + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = dct_const_round_shift(temp1); + out[15] = dct_const_round_shift(temp2); + } + // Do next column (which is a transposed row in second/horizontal pass) + in++; + out += 16; } - - im += tx_dim; // 16 - op += tx_dim; + // Setup in/out for next pass. 
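/* [Editor's note -- illustrative sketch, not part of the patch.] The
 * transpose described in the function comment is implicit in the stores
 * above: each transformed column is written contiguously and out advances
 * by 16, so coefficient k of column i lands at intermediate[i * 16 + k],
 * i.e. row i of the transposed matrix:
 */
static void store_col_transposed_sketch(int16_t *intermediate, int col,
                                        const int16_t coeffs[16]) {
  int k;
  for (k = 0; k < 16; ++k)
    intermediate[col * 16 + k] = coeffs[k];
}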
+ in = intermediate; + out = output; } } -void vp9_short_fdct4x4_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3]) << 5); - b1 = ((ip[1] + ip[2]) << 5); - c1 = ((ip[1] - ip[2]) << 5); - d1 = ((ip[0] - ip[3]) << 5); - - op[0] = a1 + b1; - op[2] = a1 - b1; - - op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12; - op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12; - - ip += pitch / 2; - op += 4; - - } - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; - b1 = ip[4] + ip[8]; - c1 = ip[4] - ip[8]; - d1 = ip[0] - ip[12]; - - op[0] = (a1 + b1 + 7) >> 4; - op[8] = (a1 - b1 + 7) >> 4; - - op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); - op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16; - - ip++; - op++; - } +static void fadst8_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[7]; + int x1 = input[0]; + int x2 = input[5]; + int x3 = input[2]; + int x4 = input[3]; + int x5 = input[4]; + int x6 = input[1]; + int x7 = input[6]; + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = dct_const_round_shift(s0 + s4); + x1 = dct_const_round_shift(s1 + s5); + x2 = dct_const_round_shift(s2 + s6); + x3 = dct_const_round_shift(s3 + s7); + x4 = dct_const_round_shift(s0 - s4); + x5 = dct_const_round_shift(s1 - s5); + x6 = dct_const_round_shift(s2 - s6); + x7 = dct_const_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + + output[0] = x0; + output[1] = - x4; + output[2] = x6; + output[3] = - x2; + output[4] = x3; + output[5] = - x7; + output[6] = x5; + output[7] = - x1; } -void vp9_short_fdct8x4_c(short *input, short *output, int pitch) -{ - vp9_short_fdct4x4_c(input, output, pitch); - vp9_short_fdct4x4_c(input + 4, output + 16, pitch); -} - -void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - int pitch_short = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = ip[0 * pitch_short] + ip[3 * pitch_short]; - b1 = ip[1 * pitch_short] + ip[2 * pitch_short]; - c1 = ip[1 * pitch_short] - ip[2 * pitch_short]; - d1 = ip[0 * pitch_short] - ip[3 * pitch_short]; - - op[0] = (a1 + b1 + 1) >> 1; - op[4] = (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; - - ip++; - op++; - } - ip = output; - op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[3]; - b1 = ip[1] + ip[2]; - c1 = ip[1] - ip[2]; - d1 = ip[0] - ip[3]; 
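/* [Editor's note -- illustrative sketch, not part of the patch.] Both
 * passes of the 4x4 Walsh-Hadamard code here reduce to this 1-D step --
 * adds, subtracts and one halving shift, which is what makes the WHT
 * suitable for the lossless path:
 */
static void wht4_1d_sketch(const short in[4], short out[4]) {
  const int a1 = in[0] + in[3];
  const int b1 = in[1] + in[2];
  const int c1 = in[1] - in[2];
  const int d1 = in[0] - in[3];
  out[0] = (short)((a1 + b1 + 1) >> 1);
  out[1] = (short)((c1 + d1) >> 1);
  out[2] = (short)((a1 - b1) >> 1);
  out[3] = (short)((d1 - c1) >> 1);
}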
- - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += 4; - } -} - -#if CONFIG_LOSSLESS -void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - int pitch_short = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR; - - op[0] = (a1 + b1 + 1) >> 1; - op[4] = (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; +static const transform_2d FHT_8[] = { + { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 + { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 + { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 + { fadst8_1d, fadst8_1d } // ADST_ADST = 3 +}; - ip++; - op++; +void vp9_short_fht8x8_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[64]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; + + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = input[j * pitch + i] << 2; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + outptr[j * 8 + i] = temp_out[j]; } - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[3]; - b1 = ip[1] + ip[2]; - c1 = ip[1] - ip[2]; - d1 = ip[0] - ip[3]; - - op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - - ip += 4; - op += 4; + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = temp_out[j] >> 1; } } -void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) { +void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { int i; int a1, b1, c1, d1; short *ip = input; @@ -894,1495 +593,658 @@ void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) { } } -void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) { - vp9_short_walsh4x4_x8_c(input, output, pitch); - vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch); -} -#endif - -#define TEST_INT_16x16_DCT 1 -#if !TEST_INT_16x16_DCT - -static void dct16x16_1d(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + 
input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; - output[ 8] = temp1 + temp2; - - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; - output[ 9] = temp1 - temp2; - - temp1 = step[10]*C3; - temp2 = step[13]*C13; - output[10] = temp1 + temp2; - - temp1 = step[11]*C15; - temp2 = step[12]*C1; - output[11] = temp1 - temp2; - - temp1 = step[11]*C1; - temp2 = step[12]*C15; - output[12] = temp2 + temp1; - - temp1 = step[10]*C13; - temp2 = step[13]*C3; - output[13] = temp2 - temp1; - - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; - output[14] = temp2 + temp1; - - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; - output[15] = temp2 - temp1; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4]*C14; - temp2 = output[7]*C2; - step[ 4] = temp1 + temp2; - - temp1 = output[5]*C10; - temp2 = output[6]*C6; - step[ 5] = temp1 + temp2; - - temp1 = output[5]*C6; - temp2 = output[6]*C10; - step[ 6] = temp2 - temp1; - - temp1 = output[4]*C2; - temp2 = output[7]*C14; - step[ 7] = temp2 - temp1; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1]); - output[ 8] = (step[ 0] - step[ 1]); - - temp1 = step[2]*C12; - temp2 = step[3]*C4; - temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); - - temp1 = step[2]*C4; - temp2 = step[3]*C12; - temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); - - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2); - output[10] = (temp1 - temp2); - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; - temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); - - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; - temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); - - output[ 9] = 2*((step[10] + step[11])*C8); - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12]); - output[ 1] = -(intermediate[11] - intermediate[12]); - - output[ 7] = 2*(intermediate[13]*C8); - - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; - temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); - - temp1 
= intermediate[14]*C4; - temp2 = intermediate[15]*C12; - temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; +void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { + vp9_short_walsh4x4_c(input, output, pitch); + vp9_short_walsh4x4_c(input + 4, output + 16, pitch); } -void vp9_short_fdct16x16_c(short *input, short *out, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; + +// Rewrote to use same algorithm as others. +static void fdct16_1d(int16_t in[16], int16_t out[16]) { + /*canbe16*/ int step1[8]; + /*canbe16*/ int step2[8]; + /*canbe16*/ int step3[8]; + /*canbe16*/ int input[8]; + /*needs32*/ int temp1, temp2; + + // step 1 + input[0] = in[0] + in[15]; + input[1] = in[1] + in[14]; + input[2] = in[2] + in[13]; + input[3] = in[3] + in[12]; + input[4] = in[4] + in[11]; + input[5] = in[5] + in[10]; + input[6] = in[6] + in[ 9]; + input[7] = in[7] + in[ 8]; + + step1[0] = in[7] - in[ 8]; + step1[1] = in[6] - in[ 9]; + step1[2] = in[5] - in[10]; + step1[3] = in[4] - in[11]; + step1[4] = in[3] - in[12]; + step1[5] = in[2] - in[13]; + step1[6] = in[1] - in[14]; + step1[7] = in[0] - in[15]; + + // fdct8_1d(step, step); { - int shortpitch = pitch >> 1; - int i, j; - double output[256]; - // First transform columns - for (i = 0; i < 16; i++) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j*shortpitch + i]; - dct16x16_1d(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j*16 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; - dct16x16_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 256; i++) - out[i] = (short)round(output[i]/2); + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = dct_const_round_shift(t0); + out[4] = dct_const_round_shift(t2); + out[8] = dct_const_round_shift(t1); + out[12] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = dct_const_round_shift(t0); + out[6] = dct_const_round_shift(t2); + out[10] = dct_const_round_shift(t1); + out[14] = dct_const_round_shift(t3); } - vp9_clear_system_state(); // Make it simd safe : __asm emms; + + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = 
(step1[5] + step1[2]) * cospi_16_64; + step2[4] = dct_const_round_shift(temp1); + step2[5] = dct_const_round_shift(temp2); + + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + step2[1] = dct_const_round_shift(temp1); + step2[2] = dct_const_round_shift(temp2); + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = dct_const_round_shift(temp1); + step2[6] = dct_const_round_shift(temp2); + + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] - step2[2]; + step1[3] = step3[3] + step2[2]; + step1[4] = step3[4] + step2[5]; + step1[5] = step3[4] - step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = dct_const_round_shift(temp1); + out[9] = dct_const_round_shift(temp2); + + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = dct_const_round_shift(temp1); + out[13] = dct_const_round_shift(temp2); + + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = dct_const_round_shift(temp1); + out[11] = dct_const_round_shift(temp2); + + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = dct_const_round_shift(temp1); + out[15] = dct_const_round_shift(temp2); } -#else -static const int16_t C1 = 16305; -static const int16_t C2 = 16069; -static const int16_t C3 = 15679; -static const int16_t C4 = 15137; -static const int16_t C5 = 14449; -static const int16_t C6 = 13623; -static const int16_t C7 = 12665; -static const int16_t C8 = 11585; -static const int16_t C9 = 10394; -static const int16_t C10 = 9102; -static const int16_t C11 = 7723; -static const int16_t C12 = 6270; -static const int16_t C13 = 4756; -static const int16_t C14 = 3196; -static const int16_t C15 = 1606; - -#define RIGHT_SHIFT 14 -#define ROUNDING (1 << (RIGHT_SHIFT - 1)) - -static void dct16x16_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16]; - int intermediate[16]; - int temp1, temp2; - int final_shift = RIGHT_SHIFT; - int final_rounding = ROUNDING; - int output_shift = 0; - int output_rounding = 0; - - final_shift += last_shift_bits; - if (final_shift > 0) - final_rounding = 1 << (final_shift - 1); - - output_shift += last_shift_bits; - if (output_shift > 0) - output_rounding = 1 << (output_shift - 1); - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - 
step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8] * C7; - temp2 = step[15] * C9; - output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 9] * C11; - temp2 = step[14] * C5; - output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[10] * C3; - temp2 = step[13] * C13; - output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[11] * C15; - temp2 = step[12] * C1; - output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[11] * C1; - temp2 = step[12] * C15; - output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[10] * C13; - temp2 = step[13] * C3; - output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 9] * C5; - temp2 = step[14] * C11; - output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 8] * C9; - temp2 = step[15] * C7; - output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4] * C14; - temp2 = output[7] * C2; - step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[5] * C10; - temp2 = output[6] * C6; - step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[5] * C6; - temp2 = output[6] * C10; - step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[4] * C2; - temp2 = output[7] * C14; - step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift; - output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift; - - temp1 = step[2] * C12; - temp2 = step[3] * C4; - temp1 = (temp1 + temp2 + final_rounding) >> final_shift; - output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[2] * C4; - temp2 = step[3] * C12; - temp1 = (temp2 - temp1 + final_rounding) >> final_shift; - output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding) - >> final_shift; - output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding) - >> final_shift; - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift; - output[10] = (temp1 - temp2 + output_rounding) >> output_shift; - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8] * C12; - temp2 = intermediate[9] * C4; - temp1 = (temp1 - temp2 + final_rounding) >> final_shift; - output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = intermediate[8] * C4; - temp2 = intermediate[9] * C12; - temp1 = (temp2 + temp1 + final_rounding) >> final_shift; - output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding) - >> final_shift; - - intermediate[11] = step[10] - step[11]; 
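The fdct16_1d above and the fadst16_1d/vp9_short_fht16x16_c added below are dispatched exactly like the 8x8 case: a transform_2d entry pairs one 1-D kernel for the columns with one for the rows. A hypothetical generic driver makes that shared pattern explicit; the real vp9_short_fht8x8_c and vp9_short_fht16x16_c hard-code the size and differ in where they apply the post-transform rounding/shift, so this is an illustration under those assumptions, not a drop-in replacement:

    #include <stdint.h>

    typedef void (*transform_1d)(int16_t *input, int16_t *output);
    typedef struct { transform_1d cols, rows; } transform_2d;

    /* Hypothetical n x n driver (n <= 32): run the column kernel first,
     * writing the half-transformed block into output[], then run the row
     * kernel over that intermediate. The << 2 pre-scale matches the
     * 8x8/16x16 code in this patch; the per-size rounding of temp_out
     * done there is omitted here for clarity. */
    static void fht_nxn(const transform_2d *ht, int n,
                        const int16_t *input, int pitch, int16_t *output) {
      int16_t temp_in[32], temp_out[32];
      int i, j;
      for (i = 0; i < n; ++i) {  /* columns */
        for (j = 0; j < n; ++j)
          temp_in[j] = input[j * pitch + i] << 2;
        ht->cols(temp_in, temp_out);
        for (j = 0; j < n; ++j)
          output[j * n + i] = temp_out[j];
      }
      for (i = 0; i < n; ++i) {  /* rows */
        for (j = 0; j < n; ++j)
          temp_in[j] = output[j + i * n];
        ht->rows(temp_in, temp_out);
        for (j = 0; j < n; ++j)
          output[j + i * n] = temp_out[j];
      }
    }

Indexing FHT_16[tx_type] and applying the two kernels this way is all the new 16x16 entry point does, apart from its extra (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2 rounding on the column pass.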
- intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12] + output_rounding) - >> output_shift; - output[ 1] = -(intermediate[11] - intermediate[12] + output_rounding) - >> output_shift; - - output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift; - - temp1 = intermediate[14] * C12; - temp2 = intermediate[15] * C4; - temp1 = (temp1 - temp2 + final_rounding) >> final_shift; - output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; - - temp1 = intermediate[14] * C4; - temp2 = intermediate[15] * C12; - temp1 = (temp2 + temp1 + final_rounding) >> final_shift; - output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT; +void fadst16_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + 
x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = - x8; + output[2] = x12; + output[3] = - x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = - x13; + output[14] = x9; + output[15] = - x1; } -void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; - int i, j; - int16_t output[256]; - int16_t *outptr = &output[0]; +static const transform_2d FHT_16[] = { + { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 + { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 + { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 + { fadst16_1d, fadst16_1d } // ADST_ADST = 3 +}; - // First transform columns - for (i = 0; i < 16; i++) { - int16_t temp_in[16]; - int16_t temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j * shortpitch + i]; - dct16x16_1d(temp_in, temp_out, 0); - for (j = 0; j < 16; j++) - output[j * 16 + i] = temp_out[j]; - } +void vp9_short_fht16x16_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[256]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; - // Then transform rows - for (i = 0; i < 16; ++i) { - dct16x16_1d(outptr, out, 1); - outptr += 16; - out += 16; - } + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = input[j * pitch + i] << 2; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j + i * 16] = temp_out[j]; + } } -#undef RIGHT_SHIFT -#undef ROUNDING -#endif - -#if !CONFIG_DWTDCTHYBRID -static void dct32_1d(double *input, double *output, int stride) { - static const double C1 = 0.998795456205; // cos(pi * 1 / 64) - static const double C2 = 
0.995184726672; // cos(pi * 2 / 64) - static const double C3 = 0.989176509965; // cos(pi * 3 / 64) - static const double C4 = 0.980785280403; // cos(pi * 4 / 64) - static const double C5 = 0.970031253195; // cos(pi * 5 / 64) - static const double C6 = 0.956940335732; // cos(pi * 6 / 64) - static const double C7 = 0.941544065183; // cos(pi * 7 / 64) - static const double C8 = 0.923879532511; // cos(pi * 8 / 64) - static const double C9 = 0.903989293123; // cos(pi * 9 / 64) - static const double C10 = 0.881921264348; // cos(pi * 10 / 64) - static const double C11 = 0.857728610000; // cos(pi * 11 / 64) - static const double C12 = 0.831469612303; // cos(pi * 12 / 64) - static const double C13 = 0.803207531481; // cos(pi * 13 / 64) - static const double C14 = 0.773010453363; // cos(pi * 14 / 64) - static const double C15 = 0.740951125355; // cos(pi * 15 / 64) - static const double C16 = 0.707106781187; // cos(pi * 16 / 64) - static const double C17 = 0.671558954847; // cos(pi * 17 / 64) - static const double C18 = 0.634393284164; // cos(pi * 18 / 64) - static const double C19 = 0.595699304492; // cos(pi * 19 / 64) - static const double C20 = 0.555570233020; // cos(pi * 20 / 64) - static const double C21 = 0.514102744193; // cos(pi * 21 / 64) - static const double C22 = 0.471396736826; // cos(pi * 22 / 64) - static const double C23 = 0.427555093430; // cos(pi * 23 / 64) - static const double C24 = 0.382683432365; // cos(pi * 24 / 64) - static const double C25 = 0.336889853392; // cos(pi * 25 / 64) - static const double C26 = 0.290284677254; // cos(pi * 26 / 64) - static const double C27 = 0.242980179903; // cos(pi * 27 / 64) - static const double C28 = 0.195090322016; // cos(pi * 28 / 64) - static const double C29 = 0.146730474455; // cos(pi * 29 / 64) - static const double C30 = 0.098017140330; // cos(pi * 30 / 64) - static const double C31 = 0.049067674327; // cos(pi * 31 / 64) - - double step[32]; + +static void dct32_1d(int *input, int *output) { + int step[32]; // Stage 1 - step[0] = input[stride*0] + input[stride*(32 - 1)]; - step[1] = input[stride*1] + input[stride*(32 - 2)]; - step[2] = input[stride*2] + input[stride*(32 - 3)]; - step[3] = input[stride*3] + input[stride*(32 - 4)]; - step[4] = input[stride*4] + input[stride*(32 - 5)]; - step[5] = input[stride*5] + input[stride*(32 - 6)]; - step[6] = input[stride*6] + input[stride*(32 - 7)]; - step[7] = input[stride*7] + input[stride*(32 - 8)]; - step[8] = input[stride*8] + input[stride*(32 - 9)]; - step[9] = input[stride*9] + input[stride*(32 - 10)]; - step[10] = input[stride*10] + input[stride*(32 - 11)]; - step[11] = input[stride*11] + input[stride*(32 - 12)]; - step[12] = input[stride*12] + input[stride*(32 - 13)]; - step[13] = input[stride*13] + input[stride*(32 - 14)]; - step[14] = input[stride*14] + input[stride*(32 - 15)]; - step[15] = input[stride*15] + input[stride*(32 - 16)]; - step[16] = -input[stride*16] + input[stride*(32 - 17)]; - step[17] = -input[stride*17] + input[stride*(32 - 18)]; - step[18] = -input[stride*18] + input[stride*(32 - 19)]; - step[19] = -input[stride*19] + input[stride*(32 - 20)]; - step[20] = -input[stride*20] + input[stride*(32 - 21)]; - step[21] = -input[stride*21] + input[stride*(32 - 22)]; - step[22] = -input[stride*22] + input[stride*(32 - 23)]; - step[23] = -input[stride*23] + input[stride*(32 - 24)]; - step[24] = -input[stride*24] + input[stride*(32 - 25)]; - step[25] = -input[stride*25] + input[stride*(32 - 26)]; - step[26] = -input[stride*26] + input[stride*(32 - 27)]; - step[27] = 
-input[stride*27] + input[stride*(32 - 28)]; - step[28] = -input[stride*28] + input[stride*(32 - 29)]; - step[29] = -input[stride*29] + input[stride*(32 - 30)]; - step[30] = -input[stride*30] + input[stride*(32 - 31)]; - step[31] = -input[stride*31] + input[stride*(32 - 32)]; + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; // Stage 2 - output[stride*0] = step[0] + step[16 - 1]; - output[stride*1] = step[1] + step[16 - 2]; - output[stride*2] = step[2] + step[16 - 3]; - output[stride*3] = step[3] + step[16 - 4]; - output[stride*4] = step[4] + step[16 - 5]; - output[stride*5] = step[5] + step[16 - 6]; - output[stride*6] = step[6] + step[16 - 7]; - output[stride*7] = step[7] + step[16 - 8]; - output[stride*8] = -step[8] + step[16 - 9]; - output[stride*9] = -step[9] + step[16 - 10]; - output[stride*10] = -step[10] + step[16 - 11]; - output[stride*11] = -step[11] + step[16 - 12]; - output[stride*12] = -step[12] + step[16 - 13]; - output[stride*13] = -step[13] + step[16 - 14]; - output[stride*14] = -step[14] + step[16 - 15]; - output[stride*15] = -step[15] + step[16 - 16]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]; - output[stride*18] = step[18]; - output[stride*19] = step[19]; - - output[stride*20] = (-step[20] + step[27])*C16; - output[stride*21] = (-step[21] + step[26])*C16; - output[stride*22] = (-step[22] + step[25])*C16; - output[stride*23] = (-step[23] + step[24])*C16; - - output[stride*24] = (step[24] + step[23])*C16; - output[stride*25] = (step[25] + step[22])*C16; - output[stride*26] = (step[26] + step[21])*C16; - output[stride*27] = (step[27] + step[20])*C16; - - output[stride*28] = step[28]; - output[stride*29] = step[29]; - output[stride*30] = step[30]; - output[stride*31] = step[31]; + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + 
step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; // Stage 3 - step[0] = output[stride*0] + output[stride*(8 - 1)]; - step[1] = output[stride*1] + output[stride*(8 - 2)]; - step[2] = output[stride*2] + output[stride*(8 - 3)]; - step[3] = output[stride*3] + output[stride*(8 - 4)]; - step[4] = -output[stride*4] + output[stride*(8 - 5)]; - step[5] = -output[stride*5] + output[stride*(8 - 6)]; - step[6] = -output[stride*6] + output[stride*(8 - 7)]; - step[7] = -output[stride*7] + output[stride*(8 - 8)]; - step[8] = output[stride*8]; - step[9] = output[stride*9]; - step[10] = (-output[stride*10] + output[stride*13])*C16; - step[11] = (-output[stride*11] + output[stride*12])*C16; - step[12] = (output[stride*12] + output[stride*11])*C16; - step[13] = (output[stride*13] + output[stride*10])*C16; - step[14] = output[stride*14]; - step[15] = output[stride*15]; - - step[16] = output[stride*16] + output[stride*23]; - step[17] = output[stride*17] + output[stride*22]; - step[18] = output[stride*18] + output[stride*21]; - step[19] = output[stride*19] + output[stride*20]; - step[20] = -output[stride*20] + output[stride*19]; - step[21] = -output[stride*21] + output[stride*18]; - step[22] = -output[stride*22] + output[stride*17]; - step[23] = -output[stride*23] + output[stride*16]; - step[24] = -output[stride*24] + output[stride*31]; - step[25] = -output[stride*25] + output[stride*30]; - step[26] = -output[stride*26] + output[stride*29]; - step[27] = -output[stride*27] + output[stride*28]; - step[28] = output[stride*28] + output[stride*27]; - step[29] = output[stride*29] + output[stride*26]; - step[30] = output[stride*30] + output[stride*25]; - step[31] = output[stride*31] + output[stride*24]; + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = -output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + 
step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; // Stage 4 - output[stride*0] = step[0] + step[3]; - output[stride*1] = step[1] + step[2]; - output[stride*2] = -step[2] + step[1]; - output[stride*3] = -step[3] + step[0]; - output[stride*4] = step[4]; - output[stride*5] = (-step[5] + step[6])*C16; - output[stride*6] = (step[6] + step[5])*C16; - output[stride*7] = step[7]; - output[stride*8] = step[8] + step[11]; - output[stride*9] = step[9] + step[10]; - output[stride*10] = -step[10] + step[9]; - output[stride*11] = -step[11] + step[8]; - output[stride*12] = -step[12] + step[15]; - output[stride*13] = -step[13] + step[14]; - output[stride*14] = step[14] + step[13]; - output[stride*15] = step[15] + step[12]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]; - output[stride*18] = step[18]*-C8 + step[29]*C24; - output[stride*19] = step[19]*-C8 + step[28]*C24; - output[stride*20] = step[20]*-C24 + step[27]*-C8; - output[stride*21] = step[21]*-C24 + step[26]*-C8; - output[stride*22] = step[22]; - output[stride*23] = step[23]; - output[stride*24] = step[24]; - output[stride*25] = step[25]; - output[stride*26] = step[26]*C24 + step[21]*-C8; - output[stride*27] = step[27]*C24 + step[20]*-C8; - output[stride*28] = step[28]*C8 + step[19]*C24; - output[stride*29] = step[29]*C8 + step[18]*C24; - output[stride*30] = step[30]; - output[stride*31] = step[31]; + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; // Stage 5 - step[0] = (output[stride*0] + output[stride*1]) * C16; - step[1] = (-output[stride*1] + output[stride*0]) * C16; - step[2] = output[stride*2]*C24 + output[stride*3] * C8; - step[3] = output[stride*3]*C24 - output[stride*2] * C8; - step[4] = output[stride*4] + output[stride*5]; - step[5] = -output[stride*5] + output[stride*4]; - step[6] = -output[stride*6] + output[stride*7]; - step[7] = output[stride*7] + output[stride*6]; - step[8] = output[stride*8]; - step[9] = 
output[stride*9]*-C8 + output[stride*14]*C24; - step[10] = output[stride*10]*-C24 + output[stride*13]*-C8; - step[11] = output[stride*11]; - step[12] = output[stride*12]; - step[13] = output[stride*13]*C24 + output[stride*10]*-C8; - step[14] = output[stride*14]*C8 + output[stride*9]*C24; - step[15] = output[stride*15]; - - step[16] = output[stride*16] + output[stride*19]; - step[17] = output[stride*17] + output[stride*18]; - step[18] = -output[stride*18] + output[stride*17]; - step[19] = -output[stride*19] + output[stride*16]; - step[20] = -output[stride*20] + output[stride*23]; - step[21] = -output[stride*21] + output[stride*22]; - step[22] = output[stride*22] + output[stride*21]; - step[23] = output[stride*23] + output[stride*20]; - step[24] = output[stride*24] + output[stride*27]; - step[25] = output[stride*25] + output[stride*26]; - step[26] = -output[stride*26] + output[stride*25]; - step[27] = -output[stride*27] + output[stride*24]; - step[28] = -output[stride*28] + output[stride*31]; - step[29] = -output[stride*29] + output[stride*30]; - step[30] = output[stride*30] + output[stride*29]; - step[31] = output[stride*31] + output[stride*28]; + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; // Stage 6 - output[stride*0] = step[0]; - output[stride*1] = step[1]; - output[stride*2] = step[2]; - output[stride*3] = step[3]; - output[stride*4] = step[4]*C28 + step[7]*C4; - output[stride*5] = step[5]*C12 + step[6]*C20; - output[stride*6] = step[6]*C12 + step[5]*-C20; - output[stride*7] = step[7]*C28 + step[4]*-C4; - output[stride*8] = step[8] + step[9]; - output[stride*9] = -step[9] + step[8]; - output[stride*10] = -step[10] + step[11]; - output[stride*11] = step[11] + step[10]; - output[stride*12] = step[12] + step[13]; - output[stride*13] = -step[13] + step[12]; - output[stride*14] = -step[14] + step[15]; - output[stride*15] = step[15] + step[14]; - - output[stride*16] = step[16]; - output[stride*17] = step[17]*-C4 + step[30]*C28; - output[stride*18] = step[18]*-C28 + step[29]*-C4; - output[stride*19] = step[19]; - output[stride*20] = step[20]; - output[stride*21] = step[21]*-C20 + step[26]*C12; - 
output[stride*22] = step[22]*-C12 + step[25]*-C20; - output[stride*23] = step[23]; - output[stride*24] = step[24]; - output[stride*25] = step[25]*C12 + step[22]*-C20; - output[stride*26] = step[26]*C20 + step[21]*C12; - output[stride*27] = step[27]; - output[stride*28] = step[28]; - output[stride*29] = step[29]*C28 + step[18]*-C4; - output[stride*30] = step[30]*C4 + step[17]*C28; - output[stride*31] = step[31]; + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; // Stage 7 - step[0] = output[stride*0]; - step[1] = output[stride*1]; - step[2] = output[stride*2]; - step[3] = output[stride*3]; - step[4] = output[stride*4]; - step[5] = output[stride*5]; - step[6] = output[stride*6]; - step[7] = output[stride*7]; - step[8] = output[stride*8]*C30 + output[stride*15]*C2; - step[9] = output[stride*9]*C14 + output[stride*14]*C18; - step[10] = output[stride*10]*C22 + output[stride*13]*C10; - step[11] = output[stride*11]*C6 + output[stride*12]*C26; - step[12] = output[stride*12]*C6 + output[stride*11]*-C26; - step[13] = output[stride*13]*C22 + output[stride*10]*-C10; - step[14] = output[stride*14]*C14 + output[stride*9]*-C18; - step[15] = output[stride*15]*C30 + output[stride*8]*-C2; - - step[16] = output[stride*16] + output[stride*17]; - step[17] = -output[stride*17] + output[stride*16]; - step[18] = -output[stride*18] + output[stride*19]; - step[19] = output[stride*19] + output[stride*18]; - step[20] = output[stride*20] + output[stride*21]; - step[21] = -output[stride*21] + output[stride*20]; - step[22] = -output[stride*22] + output[stride*23]; - step[23] = output[stride*23] + output[stride*22]; - step[24] = output[stride*24] + output[stride*25]; - step[25] = -output[stride*25] + output[stride*24]; - step[26] = -output[stride*26] + output[stride*27]; - step[27] = output[stride*27] + output[stride*26]; - step[28] = output[stride*28] + output[stride*29]; - step[29] = -output[stride*29] + output[stride*28]; - step[30] = -output[stride*30] + output[stride*31]; - step[31] = output[stride*31] + output[stride*30]; + step[0] = output[0]; + step[1] = output[1]; + step[2] = 
output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); + step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); + + step[16] = output[16] + output[17]; + step[17] = -output[17] + output[16]; + step[18] = -output[18] + output[19]; + step[19] = output[19] + output[18]; + step[20] = output[20] + output[21]; + step[21] = -output[21] + output[20]; + step[22] = -output[22] + output[23]; + step[23] = output[23] + output[22]; + step[24] = output[24] + output[25]; + step[25] = -output[25] + output[24]; + step[26] = -output[26] + output[27]; + step[27] = output[27] + output[26]; + step[28] = output[28] + output[29]; + step[29] = -output[29] + output[28]; + step[30] = -output[30] + output[31]; + step[31] = output[31] + output[30]; // Final stage --- outputs indices are bit-reversed. - output[stride*0] = step[0]; - output[stride*16] = step[1]; - output[stride*8] = step[2]; - output[stride*24] = step[3]; - output[stride*4] = step[4]; - output[stride*20] = step[5]; - output[stride*12] = step[6]; - output[stride*28] = step[7]; - output[stride*2] = step[8]; - output[stride*18] = step[9]; - output[stride*10] = step[10]; - output[stride*26] = step[11]; - output[stride*6] = step[12]; - output[stride*22] = step[13]; - output[stride*14] = step[14]; - output[stride*30] = step[15]; - - output[stride*1] = step[16]*C31 + step[31]*C1; - output[stride*17] = step[17]*C15 + step[30]*C17; - output[stride*9] = step[18]*C23 + step[29]*C9; - output[stride*25] = step[19]*C7 + step[28]*C25; - output[stride*5] = step[20]*C27 + step[27]*C5; - output[stride*21] = step[21]*C11 + step[26]*C21; - output[stride*13] = step[22]*C19 + step[25]*C13; - output[stride*29] = step[23]*C3 + step[24]*C29; - output[stride*3] = step[24]*C3 + step[23]*-C29; - output[stride*19] = step[25]*C19 + step[22]*-C13; - output[stride*11] = step[26]*C11 + step[21]*-C21; - output[stride*27] = step[27]*C27 + step[20]*-C5; - output[stride*7] = step[28]*C7 + step[19]*-C25; - output[stride*23] = step[29]*C23 + step[18]*-C9; - output[stride*15] = step[30]*C15 + step[17]*-C17; - output[stride*31] = step[31]*C31 + step[16]*-C1; + output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * 
cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[1024]; - // First transform columns - for (i = 0; i < 32; i++) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; j++) - temp_in[j] = input[j*shortpitch + i]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; j++) - output[j*32 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 32; ++i) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i*32]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - output[j + i*32] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 1024; i++) { - out[i] = (short)round(output[i]/4); - } - } - - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#else // CONFIG_DWTDCTHYBRID - -#if DWT_TYPE == 53 - -// Note: block length must be even for this implementation -static void analysis_53_row(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++) << 1; - *b++ = *x - ((r + x[1] + 1) >> 1); - x++; - } - *a = (r = *x++) << 1; - *b = *x - r; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; - } -} - -static void analysis_53_col(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++); - *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2; - x++; - } - *a = (r = *x++); - *b = (*x - r + 1) >> 1; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; - } -} - -static void dyadic_analyze_53(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_53_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - 
} - } -} - -#elif DWT_TYPE == 26 - -static void analysis_26_row(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = r + s; - *b++ = r - s; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } -} - -static void analysis_26_col(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = (r + s + 1) >> 1; - *b++ = (r - s + 1) >> 1; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } -} - -static void dyadic_analyze_26(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_26_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - } - } -} - -#elif DWT_TYPE == 97 - -static void analysis_97(int length, double *x, - double *lowpass, double *highpass) { - static const double a_predict1 = -1.586134342; - static const double a_update1 = -0.05298011854; - static const double a_predict2 = 0.8829110762; - static const double a_update2 = 0.4435068522; - static const double s_low = 1.149604398; - static const double s_high = 1/1.149604398; - int i; - double y[DWT_MAX_LENGTH]; - // Predict 1 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict1 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict1 * x[length - 2]; - // Update 1 - for (i = 2; i < length; i += 2) { - x[i] += a_update1 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update1 * x[1]; - // Predict 2 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict2 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict2 * x[length - 2]; - // Update 2 - for (i = 2; i < length; i += 2) { - x[i] += a_update2 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update2 * x[1]; - memcpy(y, x, sizeof(*y) * length); - // Scale and pack - for (i = 0; i < length / 2; i++) { - lowpass[i] = y[2 * i] * s_low; - highpass[i] = y[2 * i + 1] * s_high; - } -} - -static void dyadic_analyze_97(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - double buffer[2 * DWT_MAX_LENGTH]; - double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &y[i * 
DWT_MAX_LENGTH], nw * sizeof(*buffer)); - analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH], - &y[i * DWT_MAX_LENGTH] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = y[i * DWT_MAX_LENGTH + j]; - analysis_97(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = round(buffer[i]); - } - } -} - -#endif // DWT_TYPE - -// TODO(debargha): Implement the scaling differently so as not to have to -// use the floating point dct -static void dct16x16_1d_f(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; - output[ 8] = temp1 + temp2; - - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; - output[ 9] = temp1 - temp2; - - temp1 = step[10]*C3; - temp2 = step[13]*C13; - output[10] = temp1 + temp2; - - temp1 = step[11]*C15; - temp2 = step[12]*C1; - output[11] = temp1 - temp2; - - temp1 = step[11]*C1; - temp2 = step[12]*C15; - output[12] = temp2 + temp1; - - temp1 = step[10]*C13; - temp2 = step[13]*C3; - output[13] = temp2 - temp1; - - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; - output[14] = temp2 + temp1; - - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; - output[15] = temp2 - temp1; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4]*C14; - temp2 = output[7]*C2; - step[ 4] = temp1 + temp2; - - temp1 = output[5]*C10; - temp2 = output[6]*C6; - step[ 5] = temp1 + temp2; - - temp1 = output[5]*C6; - temp2 = output[6]*C10; - step[ 6] = temp2 - temp1; - - temp1 = output[4]*C2; - temp2 = output[7]*C14; - step[ 7] = temp2 - temp1; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = 
output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1]); - output[ 8] = (step[ 0] - step[ 1]); - - temp1 = step[2]*C12; - temp2 = step[3]*C4; - temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); - - temp1 = step[2]*C4; - temp2 = step[3]*C12; - temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); - - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2); - output[10] = (temp1 - temp2); - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; - temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); - - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; - temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); - - output[ 9] = 2*((step[10] + step[11])*C8); - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12]); - output[ 1] = -(intermediate[11] - intermediate[12]); - - output[ 7] = 2*(intermediate[13]*C8); - - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; - temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); - - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; - temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch, - int scale) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[256]; - // First transform columns - for (i = 0; i < 16; i++) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j*shortpitch + i]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j*16 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 256; i++) - out[i] = (short)round(output[i] / (2 << scale)); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) { - int j1, i, j, k; - float b[8]; - float b1[8]; - float d[8][8]; - float f0 = (float) .7071068; - float f1 = (float) .4903926; - float f2 = (float) .4619398; - float f3 = (float) .4157348; - float f4 = (float) .3535534; - float f5 = (float) .2777851; - float f6 = (float) .1913417; - float f7 = (float) .0975452; - pitch = pitch / 2; - for (i = 0, k = 0; i < 8; i++, k += pitch) { - for (j = 0; j < 8; j++) { - b[j] = (float)(block[k + j] << (3 - scale)); - } - /* Horizontal transform */ - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = b[j] + b[j1]; - b1[j1] = b[j] - b[j1]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[i][0] = (b[0] + b[1]) * f4; - d[i][4] = (b[0] - b[1]) * f4; - d[i][2] = b[2] * f6 + b[3] * f2; - d[i][6] = b[3] * f6 - b[2] * f2; - 
b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[i][1] = b1[4] * f7 + b1[7] * f1; - d[i][5] = b1[5] * f3 + b1[6] * f5; - d[i][7] = b1[7] * f7 - b1[4] * f1; - d[i][3] = b1[6] * f3 - b1[5] * f5; - } - /* Vertical transform */ - for (i = 0; i < 8; i++) { - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = d[j][i] + d[j1][i]; - b1[j1] = d[j][i] - d[j1][i]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[0][i] = (b[0] + b[1]) * f4; - d[4][i] = (b[0] - b[1]) * f4; - d[2][i] = b[2] * f6 + b[3] * f2; - d[6][i] = b[3] * f6 - b[2] * f2; - b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[1][i] = b1[4] * f7 + b1[7] * f1; - d[5][i] = b1[5] * f3 + b1[6] * f5; - d[7][i] = b1[7] * f7 - b1[4] * f1; - d[3][i] = b1[6] * f3 - b1[5] * f5; - } - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5); - } - } - return; -} - -#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n)) - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - -void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[16 * 16]; + int shortpitch = pitch >> 1; int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 97 - dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 53 - dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } + int output[32 * 32]; + + // Columns + for (i = 0; i < 32; i++) { + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; j++) + temp_in[j] = input[j * shortpitch + i] << 2; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; j++) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } -} - -#elif DWTDCT_TYPE == DWTDCT16X16 -void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 97 - dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 53 - dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + 
i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16); -} - -#elif DWTDCT_TYPE == DWTDCT8X8 - -void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[8 * 8]; - int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 97 - dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32); -#elif DWT_TYPE == 53 - dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8); - - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } -} - -#endif - -#if CONFIG_TX64X64 -void vp9_short_fdct64x64_c(short *input, short *out, int pitch) { - // assume out is a 64x64 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; -#if DWT_TYPE == 26 - dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64); -#elif DWT_TYPE == 97 - dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64); -#elif DWT_TYPE == 53 - dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64); -#endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16); - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - for (i = 0; i < 16; ++i) { - for (j = 16; j < 48; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 16; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } -#elif DWTDCT_TYPE == DWTDCT16X16 - vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16); - - // There is no dct used on the highest bands for now. 
- // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS - // TODO(debargha): experiment with turning these coeffs to 0 + // Rows for (i = 0; i < 32; ++i) { - for (j = 32; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 32; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } -#endif // DWTDCT_TYPE } -#endif // CONFIG_TX64X64 -#endif // CONFIG_DWTDCTHYBRID diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3f51330624d29bb4197cddec4994016113496f7f..f2a13de99a20b32f46a6933a127474a579296bd7 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -21,7 +21,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_setupintrarecon.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/encoder/vp9_encodeintra.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_invtrans.h" @@ -29,8 +28,9 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_tokenize.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include <stdio.h> #include <math.h> #include <limits.h> @@ -45,18 +45,15 @@ int enc_debug = 0; #endif -extern void select_interp_filter_type(VP9_COMP *cpi); +void vp9_select_interp_filter_type(VP9_COMP *cpi); static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -103,7 +100,7 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { */ act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0, &sse); - act = act << 4; + act <<= 4; /* If the region is flat, lower the activity some more. 
*/ if (act < 8 << 12) @@ -201,7 +198,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { #define OUTPUT_NORM_ACT_STATS 0 #if USE_ACT_INDEX -// Calculate and activity index for each mb +// Calculate an activity index for each mb static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { VP9_COMMON *const cm = &cpi->common; int mb_row, mb_col; @@ -271,6 +268,8 @@ static void build_activity_map(VP9_COMP *cpi) { unsigned int mb_activity; int64_t activity_sum = 0; + x->mb_activity_ptr = cpi->mb_activity_map; + // for each macroblock row in image for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { #if ALT_ACT_MEASURE @@ -488,8 +487,7 @@ static void update_state(VP9_COMP *cpi, { int segment_id = mbmi->segment_id; - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) { + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { for (i = 0; i < NB_TXFM_MODES; i++) { cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i]; } @@ -598,9 +596,6 @@ static void update_state(VP9_COMP *cpi, [vp9_switchable_interp_map[mbmi->interp_filter]]; } - cpi->prediction_error += ctx->distortion; - cpi->intra_error += ctx->intra_error; - cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; @@ -625,24 +620,12 @@ static unsigned find_seg_id(uint8_t *buf, int block_size, } static void set_offsets(VP9_COMP *cpi, - int mb_row, int mb_col, int block_size, - int *ref_yoffset, int *ref_uvoffset) { + int mb_row, int mb_col, int block_size) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; - const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride; - const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride; - const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col; - const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col; - const int src_y_stride = x->src.y_stride; - const int src_uv_stride = x->src.uv_stride; - const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col; - const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col; - const int ref_fb_idx = cm->lst_fb_idx; - const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; - const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; const int idx_map = mb_row * cm->mb_cols + mb_col; const int idx_str = xd->mode_info_stride * mb_row + mb_col; @@ -664,9 +647,9 @@ static void set_offsets(VP9_COMP *cpi, xd->prev_mode_info_context = cm->prev_mi + idx_str; // Set up destination pointers - xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset; - xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset; - xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->dst, + &cm->yv12_fb[dst_fb_idx], + mb_row, mb_col, NULL, NULL); /* Set up limit values for MV components to prevent them from * extending beyond the UMV borders assuming 16x16 block size */ @@ -680,23 +663,11 @@ static void set_offsets(VP9_COMP *cpi, // Set up distance of MB to edge of frame in 1/8th pel units block_size >>= 4; // in macroblock units assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1))); - xd->mb_to_top_edge = -((mb_row * 16) << 3); - xd->mb_to_left_edge = -((mb_col * 16) << 3); - xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - 
mb_row) * 16) << 3; - xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3; - - // Are edges available for intra prediction? - xd->up_available = (mb_row != 0); - xd->left_available = (mb_col != 0); - - /* Reference buffer offsets */ - *ref_yoffset = (mb_row * ref_y_stride * 16) + (mb_col * 16); - *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col * 8); + set_mb_row(cm, xd, mb_row, block_size); + set_mb_col(cm, xd, mb_col, block_size); /* set up source buffers */ - x->src.y_buffer = cpi->Source->y_buffer + src_yoffset; - x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset; - x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset; + setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL); /* R/D setup */ x->rddiv = cpi->RDDIV; @@ -727,34 +698,36 @@ static void set_offsets(VP9_COMP *cpi, const int x = mb_col & ~3; const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); + const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows; + const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start; cpi->seg0_progress = - ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs; + ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; } } else { mbmi->segment_id = 0; } } -static void pick_mb_modes(VP9_COMP *cpi, - int mb_row, - int mb_col, - TOKENEXTRA **tp, - int *totalrate, - int *totaldist) { +static int pick_mb_modes(VP9_COMP *cpi, + int mb_row0, + int mb_col0, + TOKENEXTRA **tp, + int *totalrate, + int *totaldist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int i; - int recon_yoffset, recon_uvoffset; + int splitmodes_used = 0; ENTROPY_CONTEXT_PLANES left_context[2]; ENTROPY_CONTEXT_PLANES above_context[2]; ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context - + mb_col; + + mb_col0; /* Function should not modify L & A contexts; save and restore on exit */ vpx_memcpy(left_context, - cm->left_context + (mb_row & 2), + cm->left_context + (mb_row0 & 2), sizeof(left_context)); vpx_memcpy(above_context, initial_above_context_ptr, @@ -763,17 +736,18 @@ static void pick_mb_modes(VP9_COMP *cpi, /* Encode MBs in raster order within the SB */ for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; + const int mb_row = mb_row0 + y_idx; + const int mb_col = mb_col0 + x_idx; MB_MODE_INFO *mbmi; - if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) { + if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) { // MB lies outside frame, move on continue; } // Index of the MB in the SB 0..3 xd->mb_index = i; - set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16, - &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 16); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); @@ -781,15 +755,11 @@ static void pick_mb_modes(VP9_COMP *cpi, mbmi = &xd->mode_info_context->mbmi; mbmi->sb_type = BLOCK_SIZE_MB16X16; - cpi->update_context = 0; // TODO Do we need this now?? 
- - vp9_intra_prediction_down_copy(xd); - // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (cm->frame_type == KEY_FRAME) { int r, d; -#ifdef ENC_DEBUG +#if 0 // ENC_DEBUG if (enc_debug) printf("intra pick_mb_modes %d %d\n", mb_row, mb_col); #endif @@ -798,8 +768,8 @@ static void pick_mb_modes(VP9_COMP *cpi, *totaldist += d; // Dummy encode, do not do the tokenization - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0, - mb_row + y_idx, mb_col + x_idx); + encode_macroblock(cpi, tp, 0, mb_row, mb_col); + // Note the encoder may have changed the segment_id // Save the coding context @@ -808,18 +778,18 @@ static void pick_mb_modes(VP9_COMP *cpi, } else { int seg_id, r, d; -#ifdef ENC_DEBUG +#if 0 // ENC_DEBUG if (enc_debug) printf("inter pick_mb_modes %d %d\n", mb_row, mb_col); #endif - vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset, - recon_uvoffset, &r, &d); + vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d); *totalrate += r; *totaldist += d; + splitmodes_used += (mbmi->mode == SPLITMV); + // Dummy encode, do not do the tokenization - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0, - mb_row + y_idx, mb_col + x_idx); + encode_macroblock(cpi, tp, 0, mb_row, mb_col); seg_id = mbmi->segment_id; if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) { @@ -842,12 +812,14 @@ static void pick_mb_modes(VP9_COMP *cpi, } /* Restore L & A coding context to those in place on entry */ - vpx_memcpy(cm->left_context + (mb_row & 2), + vpx_memcpy(cm->left_context + (mb_row0 & 2), left_context, sizeof(left_context)); vpx_memcpy(initial_above_context_ptr, above_context, sizeof(above_context)); + + return splitmodes_used; } static void pick_sb_modes(VP9_COMP *cpi, @@ -859,13 +831,11 @@ static void pick_sb_modes(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; - set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 32); xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - cpi->update_context = 0; // TODO Do we need this now?? /* Find best coding mode & reconstruct the MB so it is available * as a predictor for MBs that follow in the SB */ @@ -878,11 +848,7 @@ static void pick_sb_modes(VP9_COMP *cpi, vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context, sizeof(MODE_INFO)); } else { - vp9_rd_pick_inter_mode_sb32(cpi, x, - recon_yoffset, - recon_uvoffset, - totalrate, - totaldist); + vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist); } } @@ -895,34 +861,25 @@ static void pick_sb64_modes(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; - set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 64); xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - cpi->update_context = 0; // TODO(rbultje) Do we need this now?? 
/* Find best coding mode & reconstruct the MB so it is available * as a predictor for MBs that follow in the SB */ if (cm->frame_type == KEY_FRAME) { - vp9_rd_pick_intra_mode_sb64(cpi, x, - totalrate, - totaldist); + vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist); /* Save the coding context */ - vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, - sizeof(MODE_INFO)); + vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO)); } else { - vp9_rd_pick_inter_mode_sb64(cpi, x, - recon_yoffset, - recon_uvoffset, - totalrate, - totaldist); + vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist); } } -static void update_stats(VP9_COMP *cpi) { +static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -976,6 +933,9 @@ static void update_stats(VP9_COMP *cpi) { if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME)) cpi->inter_zz_count++; } +#if CONFIG_CODE_NONZEROCOUNT + vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col); +#endif } static void encode_sb(VP9_COMP *cpi, @@ -986,17 +946,17 @@ static void encode_sb(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int recon_yoffset, recon_uvoffset; cpi->sb32_count[is_sb]++; if (is_sb) { - set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 32); update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled); - encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset, + encode_superblock32(cpi, tp, output_enabled, mb_row, mb_col); - if (output_enabled) - update_stats(cpi); + if (output_enabled) { + update_stats(cpi, mb_row, mb_col); + } if (output_enabled) { (*tp)->Token = EOSB_TOKEN; @@ -1015,24 +975,22 @@ static void encode_sb(VP9_COMP *cpi, continue; } - set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16, - &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16); xd->mb_index = i; update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - vp9_intra_prediction_down_copy(xd); - - encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, + encode_macroblock(cpi, tp, output_enabled, mb_row + y_idx, mb_col + x_idx); - if (output_enabled) - update_stats(cpi); + if (output_enabled) { + update_stats(cpi, mb_row + y_idx, mb_col + x_idx); + } if (output_enabled) { (*tp)->Token = EOSB_TOKEN; - (*tp)++; + (*tp)++; if (mb_row + y_idx < cm->mb_rows) cpi->tplist[mb_row + y_idx].stop = *tp; } @@ -1060,13 +1018,11 @@ static void encode_sb64(VP9_COMP *cpi, cpi->sb64_count[is_sb[0] == 2]++; if (is_sb[0] == 2) { - int recon_yoffset, recon_uvoffset; - - set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset); + set_offsets(cpi, mb_row, mb_col, 64); update_state(cpi, &x->sb64_context, 64, 1); - encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset, + encode_superblock64(cpi, tp, 1, mb_row, mb_col); - update_stats(cpi); + update_stats(cpi, mb_row, mb_col); (*tp)->Token = EOSB_TOKEN; (*tp)++; @@ -1098,17 +1054,18 @@ static void encode_sb_row(VP9_COMP *cpi, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int mb_col; - int mb_cols = cm->mb_cols; // Initialize the left context for the new SB row vpx_memset(cm->left_context, 0, sizeof(cm->left_context)); // Code each SB in the row - for (mb_col = 0; mb_col < mb_cols; 
mb_col += 4) { + for (mb_col = cm->cur_tile_mb_col_start; + mb_col < cm->cur_tile_mb_col_end; mb_col += 4) { int i; int sb32_rate = 0, sb32_dist = 0; int is_sb[4]; int sb64_rate = INT_MAX, sb64_dist; + int sb64_skip = 0; ENTROPY_CONTEXT_PLANES l[4], a[4]; TOKENEXTRA *tp_orig = *tp; @@ -1118,18 +1075,27 @@ static void encode_sb_row(VP9_COMP *cpi, const int x_idx = (i & 1) << 1, y_idx = i & 2; int mb_rate = 0, mb_dist = 0; int sb_rate = INT_MAX, sb_dist; + int splitmodes_used = 0; + int sb32_skip = 0; if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols) continue; xd->sb_index = i; - pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx, - tp, &mb_rate, &mb_dist); + splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx, + tp, &mb_rate, &mb_dist); + mb_rate += vp9_cost_bit(cm->sb32_coded, 0); - if (!((( mb_cols & 1) && mb_col + x_idx == mb_cols - 1) || - ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) { + if (cpi->sf.splitmode_breakout) { + sb32_skip = splitmodes_used; + sb64_skip += splitmodes_used; + } + + if ( !sb32_skip && + !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) || + ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) { /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */ pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx, tp, &sb_rate, &sb_dist); @@ -1147,6 +1113,11 @@ static void encode_sb_row(VP9_COMP *cpi, is_sb[i] = 0; sb32_rate += mb_rate; sb32_dist += mb_dist; + + // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled). + if (cpi->sf.mb16_breakout) { + ++sb64_skip; + } } /* Encode SB using best computed mode(s) */ @@ -1162,7 +1133,8 @@ static void encode_sb_row(VP9_COMP *cpi, memcpy(cm->left_context, &l, sizeof(l)); sb32_rate += vp9_cost_bit(cm->sb64_coded, 0); - if (!((( mb_cols & 3) && mb_col + 3 >= mb_cols) || + if (!sb64_skip && + !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) || ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) { pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist); sb64_rate += vp9_cost_bit(cm->sb64_coded, 1); @@ -1205,7 +1177,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Copy data over into macro block data structures. x->src = *cpi->Source; - xd->pre = cm->yv12_fb[cm->lst_fb_idx]; + xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]]; xd->dst = cm->yv12_fb[cm->new_fb_idx]; // set up frame for intra coded blocks @@ -1239,22 +1211,38 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols); - xd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) - xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = cm->full_pixel ? 
0xfffffff8 : 0xffffffff; } +static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { + if (lossless) { + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; + cpi->mb.optimize = 0; + cpi->common.filter_level = 0; + cpi->zbin_mode_boost_enabled = FALSE; + cpi->common.txfm_mode = ONLY_4X4; + } else { + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; + } +} + + static void encode_frame_internal(VP9_COMP *cpi) { int mb_row; MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - - TOKENEXTRA *tp = cpi->tok; int totalrate; - // printf("encode_frame_internal frame %d (%d)\n", - // cpi->common.current_video_frame, cpi->common.show_frame); +// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", +// cpi->common.current_video_frame, cpi->common.show_frame, +// cm->frame_type); // Compute a modified set of reference frame probabilities to use when // prediction fails. These are based on the current general estimates for @@ -1273,14 +1261,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { totalrate = 0; - // Functions setup for all frame types so we can use MC in AltRef - vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm); - // Reset frame count of inter 0,0 motion vector usage. cpi->inter_zz_count = 0; - cpi->prediction_error = 0; - cpi->intra_error = 0; cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0; cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0; @@ -1292,16 +1275,27 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts_4x4); - vp9_zero(cpi->hybrid_coef_counts_4x4); vp9_zero(cpi->coef_counts_8x8); - vp9_zero(cpi->hybrid_coef_counts_8x8); vp9_zero(cpi->coef_counts_16x16); - vp9_zero(cpi->hybrid_coef_counts_16x16); vp9_zero(cpi->coef_counts_32x32); + vp9_zero(cm->fc.eob_branch_counts); +#if CONFIG_CODE_NONZEROCOUNT + vp9_zero(cm->fc.nzc_counts_4x4); + vp9_zero(cm->fc.nzc_counts_8x8); + vp9_zero(cm->fc.nzc_counts_16x16); + vp9_zero(cm->fc.nzc_counts_32x32); + vp9_zero(cm->fc.nzc_pcat_counts); +#endif #if CONFIG_NEW_MVREF vp9_zero(cpi->mb_mv_ref_count); #endif + cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 && + cm->y1dc_delta_q == 0 && + cm->uvdc_delta_q == 0 && + cm->uvac_delta_q == 0); + switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); + vp9_frame_init_quantizer(cpi); vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q); @@ -1330,24 +1324,35 @@ static void encode_frame_internal(VP9_COMP *cpi) { vpx_usec_timer_start(&emr_timer); { - // For each row of SBs in the frame - for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) { - encode_sb_row(cpi, mb_row, &tp, &totalrate); - } + // Take tiles into account and give start/end MB + int tile_col, tile_row; + TOKENEXTRA *tp = cpi->tok; + + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + vp9_get_tile_row_offsets(cm, tile_row); + + for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + TOKENEXTRA *tp_old = tp; - cpi->tok_count = (unsigned int)(tp - cpi->tok); + // For each row of SBs in the frame + vp9_get_tile_col_offsets(cm, tile_col); + for (mb_row = cm->cur_tile_mb_row_start; + mb_row < cm->cur_tile_mb_row_end; mb_row += 4) { + encode_sb_row(cpi, mb_row, &tp, 
&totalrate); + } + cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old); + } + } } vpx_usec_timer_mark(&emr_timer); cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); - } // 256 rate units to the bit, // projected_frame_size in units of BYTES cpi->projected_frame_size = totalrate >> 8; - #if 0 // Keep record of the total distortion this time around for future use cpi->last_frame_distortion = cpi->frame_distortion; @@ -1388,8 +1393,7 @@ static void reset_skip_txfm_size_mb(VP9_COMP *cpi, const int segment_id = mbmi->segment_id; xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff)); mbmi->txfm_size = txfm_max; } @@ -1413,9 +1417,8 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, int x, y; for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) { + for (x = 0; x < xmbs; x++) mi[y * mis + x].mbmi.txfm_size = txfm_size; - } } } @@ -1433,8 +1436,7 @@ static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi, const int xmbs = MIN(2, mb_cols_left); xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs))); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } @@ -1454,8 +1456,7 @@ static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi, const int xmbs = MIN(4, mb_cols_left); xd->mode_info_context = mi; - assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) || + assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) || (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs))); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } @@ -1526,9 +1527,9 @@ void vp9_encode_frame(VP9_COMP *cpi) { */ if (cpi->common.frame_type == KEY_FRAME) frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame) + else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) frame_type = 3; - else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame) + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) frame_type = 1; else frame_type = 2; @@ -1549,35 +1550,21 @@ void vp9_encode_frame(VP9_COMP *cpi) { pred_type = HYBRID_PREDICTION; /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ -#if CONFIG_LOSSLESS + + cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { txfm_type = ONLY_4X4; - } else -#endif - /* FIXME (rbultje) - * this is a hack (no really), basically to work around the complete - * nonsense coefficient cost prediction for keyframes. The probabilities - * are reset to defaults, and thus we basically have no idea how expensive - * a 4x4 vs. 8x8 will really be. The result is that any estimate at which - * of the two is better is utterly bogus. - * I'd like to eventually remove this hack, but in order to do that, we - * need to move the frame reset code from the frame encode init to the - * bitstream write code, or alternatively keep a backup of the previous - * keyframe's probabilities as an estimate of what the current keyframe's - * coefficient cost distributions may look like. 
*/ - if (frame_type == 0) { - txfm_type = ALLOW_32X32; + cpi->mb.e_mbd.lossless = 1; } else #if 0 - /* FIXME (rbultje) - * this code is disabled for a similar reason as the code above; the - * problem is that each time we "revert" to 4x4 only (or even 8x8 only), - * the coefficient probabilities for 16x16 (and 8x8) start lagging behind, - * thus leading to them lagging further behind and not being chosen for - * subsequent frames either. This is essentially a local minimum problem - * that we can probably fix by estimating real costs more closely within - * a frame, perhaps by re-calculating costs on-the-fly as frame encoding - * progresses. */ + /* FIXME (rbultje): this code is disabled until we support cost updates + * while a frame is being encoded; the problem is that each time we + * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities + * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging + * further behind and not being chosen for subsequent frames either. This + * is essentially a local minimum problem that we can probably fix by + * estimating real costs more closely within a frame, perhaps by re- + * calculating costs on-the-fly as frame encoding progresses. */ if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > @@ -1671,7 +1658,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { // Update interpolation filter strategy for next frame. if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter)) - select_interp_filter_type(cpi); + vp9_select_interp_filter_type(cpi); } else { encode_frame_internal(cpi); } @@ -1683,30 +1670,23 @@ void vp9_setup_block_ptrs(MACROBLOCK *x) { int i; for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { + for (c = 0; c < 4; c++) x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4; - } } for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { + for (c = 0; c < 2; c++) x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4; - } } for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { + for (c = 0; c < 2; c++) x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4; - } } - x->block[24].src_diff = x->src_diff + 384; - - - for (i = 0; i < 25; i++) { + for (i = 0; i < 24; i++) x->block[i].coeff = x->coeff + i * 16; - } } void vp9_build_block_offsets(MACROBLOCK *x) { @@ -1826,63 +1806,6 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { #endif } -static void update_sb_skip_coeff_state(VP9_COMP *cpi, - ENTROPY_CONTEXT_PLANES ta[4], - ENTROPY_CONTEXT_PLANES tl[4], - TOKENEXTRA *t[4], - TOKENEXTRA **tp, - int skip[4], int output_enabled) { - MACROBLOCK *const x = &cpi->mb; - TOKENEXTRA tokens[4][16 * 25]; - int n_tokens[4], n; - - // if there were no skips, we don't need to do anything - if (!skip[0] && !skip[1] && !skip[2] && !skip[3]) - return; - - // if we don't do coeff skipping for this frame, we don't - // need to do anything here - if (!cpi->common.mb_no_coeff_skip) - return; - - // if all 4 MBs skipped coeff coding, nothing to be done - if (skip[0] && skip[1] && skip[2] && skip[3]) - return; - - // so the situation now is that we want to skip coeffs - // for some MBs, but not all, and we didn't code EOB - // coefficients for them. However, the skip flag for this - // SB will be 0 overall, so we need to insert EOBs in the - // middle of the token tree. Do so here. 
- n_tokens[0] = t[1] - t[0]; - n_tokens[1] = t[2] - t[1]; - n_tokens[2] = t[3] - t[2]; - n_tokens[3] = *tp - t[3]; - if (n_tokens[0]) - memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0])); - if (n_tokens[1]) - memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0])); - if (n_tokens[2]) - memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0])); - if (n_tokens[3]) - memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0])); - - // reset pointer, stuff EOBs where necessary - *tp = t[0]; - for (n = 0; n < 4; n++) { - if (skip[n]) { - x->e_mbd.above_context = &ta[n]; - x->e_mbd.left_context = &tl[n]; - vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled); - } else { - if (n_tokens[n]) { - memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]); - } - (*tp) += n_tokens[n]; - } - } -} - static void update_sb64_skip_coeff_state(VP9_COMP *cpi, ENTROPY_CONTEXT_PLANES ta[16], ENTROPY_CONTEXT_PLANES tl[16], @@ -1994,21 +1917,151 @@ static void update_sb64_skip_coeff_state(VP9_COMP *cpi, } } +#if CONFIG_CODE_NONZEROCOUNT +static void gather_nzcs_mb16(VP9_COMMON *const cm, + MACROBLOCKD *xd) { + int i; + vpx_memset(xd->mode_info_context->mbmi.nzcs, 0, + 384 * sizeof(xd->mode_info_context->mbmi.nzcs[0])); + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_4X4: + for (i = 0; i < 24; ++i) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_8X8: + for (i = 0; i < 16; i += 4) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + if (xd->mode_info_context->mbmi.mode == I8X8_PRED || + xd->mode_info_context->mbmi.mode == SPLITMV) { + for (i = 16; i < 24; ++i) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + } else { + for (i = 16; i < 24; i += 4) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + } + break; + + case TX_16X16: + xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0]; + for (i = 16; i < 24; i += 4) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + default: + break; + } +} + +static void gather_nzcs_sb32(VP9_COMMON *const cm, + MACROBLOCKD *xd) { + int i, j; + MODE_INFO *m = xd->mode_info_context; + int mis = cm->mode_info_stride; + vpx_memset(m->mbmi.nzcs, 0, + 384 * sizeof(xd->mode_info_context->mbmi.nzcs[0])); + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_4X4: + for (i = 0; i < 96; ++i) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_8X8: + for (i = 0; i < 96; i += 4) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_16X16: + for (i = 0; i < 96; i += 16) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_32X32: + xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0]; + for (i = 64; i < 96; i += 16) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + default: + break; + } + for (i = 0; i < 2; ++i) + for (j = 0; j < 2; ++j) { + if (i == 0 && j == 0) continue; + vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs, + 384 * sizeof(m->mbmi.nzcs[0])); + } +} + +static void gather_nzcs_sb64(VP9_COMMON *const cm, + MACROBLOCKD *xd) { + int i, j; + MODE_INFO *m = xd->mode_info_context; + int mis = cm->mode_info_stride; + vpx_memset(xd->mode_info_context->mbmi.nzcs, 0, + 384 * sizeof(xd->mode_info_context->mbmi.nzcs[0])); + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_4X4: + for (i = 0; i < 384; ++i) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_8X8: + for (i = 0; i < 384; i += 4) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_16X16: + for 
(i = 0; i < 384; i += 16) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + case TX_32X32: + for (i = 0; i < 384; i += 64) { + xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i]; + } + break; + + default: + break; + } + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) { + if (i == 0 && j == 0) continue; + vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs, + 384 * sizeof(m->mbmi.nzcs[0])); + } +} +#endif + static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MODE_INFO *mi = xd->mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; + const int mis = cm->mode_info_stride; unsigned char ref_pred_flag; assert(!xd->mode_info_context->mbmi.sb_type); #ifdef ENC_DEBUG - enc_debug = (cpi->common.current_video_frame == 46 && - mb_row == 5 && mb_col == 2); + enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame && + mb_row == 8 && mb_col == 0 && output_enabled); if (enc_debug) printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled); #endif @@ -2037,9 +2090,11 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; } else if (mbmi->mode == SPLITMV) - cpi->zbin_mode_boost = 0; + cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; else cpi->zbin_mode_boost = MV_ZBIN_BOOST; + } else { + cpi->zbin_mode_boost = INTRA_ZBIN_BOOST; } } @@ -2053,21 +2108,21 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (mbmi->ref_frame == INTRA_FRAME) { -#ifdef ENC_DEBUG +#if 0 // def ENC_DEBUG if (enc_debug) { printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip, mbmi->txfm_size); } #endif if (mbmi->mode == B_PRED) { - vp9_encode_intra16x16mbuv(x); + vp9_encode_intra16x16mbuv(cm, x); vp9_encode_intra4x4mby(x); } else if (mbmi->mode == I8X8_PRED) { vp9_encode_intra8x8mby(x); vp9_encode_intra8x8mbuv(x); } else { - vp9_encode_intra16x16mbuv(x); - vp9_encode_intra16x16mby(x); + vp9_encode_intra16x16mbuv(cm, x); + vp9_encode_intra16x16mby(cm, x); } if (output_enabled) @@ -2086,58 +2141,50 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (mbmi->ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (mbmi->ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (mbmi->second_ref_frame > 0) { int second_ref_fb_idx; if (mbmi->second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (mbmi->second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + 
second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + - recon_yoffset; - xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + - recon_uvoffset; - xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + - recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } if (!x->skip) { - vp9_encode_inter16x16(x); + vp9_encode_inter16x16(cm, x, mb_row, mb_col); // Clear mb_skip_coeff if mb_no_coeff_skip is not set if (!cpi->common.mb_no_coeff_skip) mbmi->mb_skip_coeff = 0; } else { - vp9_build_1st_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter16x16_predictors_mb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); - } + vp9_build_inter16x16_predictors_mb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); #if CONFIG_COMP_INTERINTRA_PRED - else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { + if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { vp9_build_interintra_16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, @@ -2155,7 +2202,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, int i, j; printf("\n"); printf("qcoeff\n"); - for (i = 0; i < 400; i++) { + for (i = 0; i < 384; i++) { printf("%3d ", xd->qcoeff[i]); if (i % 16 == 15) printf("\n"); } @@ -2202,15 +2249,17 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, } #endif +#if CONFIG_CODE_NONZEROCOUNT + gather_nzcs_mb16(cm, xd); +#endif vp9_tokenize_mb(cpi, xd, t, !output_enabled); } else { - int mb_skip_context = - cpi->common.mb_no_coeff_skip ? - (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff + - (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff : - 0; - if (cpi->common.mb_no_coeff_skip) { + // FIXME(rbultje): not tile-aware (mi - 1) + int mb_skip_context = cpi->common.mb_no_coeff_skip ? 
+ (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0; + + if (cm->mb_no_coeff_skip) { mbmi->mb_skip_coeff = 1; if (output_enabled) cpi->skip_true_count[mb_skip_context]++; @@ -2227,8 +2276,7 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, int segment_id = mbmi->segment_id; if (cpi->common.txfm_mode == TX_MODE_SELECT && !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) || - (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) { + (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) { assert(mbmi->txfm_size <= TX_16X16); if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV) { @@ -2253,7 +2301,6 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t, } static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -2267,14 +2314,22 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; unsigned char ref_pred_flag; - int n; - TOKENEXTRA *tp[4]; - int skip[4]; MODE_INFO *mi = x->e_mbd.mode_info_context; unsigned int segment_id = mi->mbmi.segment_id; - ENTROPY_CONTEXT_PLANES ta[4], tl[4]; const int mis = cm->mode_info_stride; +#ifdef ENC_DEBUG + enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame && + mb_row == 8 && mb_col == 0 && output_enabled); + if (enc_debug) { + printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled); + printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n", + mi->mbmi.mode, x->skip, mi->mbmi.txfm_size, + mi->mbmi.ref_frame, mi->mbmi.second_ref_frame, + mi->mbmi.mv[0].as_mv.row, mi->mbmi.mv[0].as_mv.col, + mi->mbmi.interp_filter); + } +#endif if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { adjust_act_zbin(cpi, x); @@ -2299,9 +2354,11 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; } else if (xd->mode_info_context->mbmi.mode == SPLITMV) - cpi->zbin_mode_boost = 0; + cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; else cpi->zbin_mode_boost = MV_ZBIN_BOOST; + } else { + cpi->zbin_mode_boost = INTRA_ZBIN_BOOST; } } @@ -2326,152 +2383,137 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (xd->mode_info_context->mbmi.second_ref_frame > 0) { int second_ref_fb_idx; if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) - second_ref_fb_idx = 
cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + - recon_yoffset; - xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + - recon_uvoffset; - xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + - recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } - if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { - if (!x->skip) { - vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride, - dst, dst_y_stride); - vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff, - usrc, vsrc, src_uv_stride, - udst, vdst, dst_uv_stride); - vp9_transform_sby_32x32(x); - vp9_transform_sbuv_16x16(x); - vp9_quantize_sby_32x32(x); - vp9_quantize_sbuv_16x16(x); - // TODO(rbultje): trellis optimize - vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data); - vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data); - vp9_recon_sby_s_c(&x->e_mbd, dst); - vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst); - - vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled); - } else { - int mb_skip_context = - cpi->common.mb_no_coeff_skip ? 
- (mi - 1)->mbmi.mb_skip_coeff + - (mi - mis)->mbmi.mb_skip_coeff : - 0; - mi->mbmi.mb_skip_coeff = 1; - if (cm->mb_no_coeff_skip) { - if (output_enabled) - cpi->skip_true_count[mb_skip_context]++; - vp9_fix_contexts_sb(xd); - } else { - vp9_stuff_sb(cpi, xd, t, !output_enabled); - if (output_enabled) - cpi->skip_false_count[mb_skip_context]++; - } + if (!x->skip) { + vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, + dst, dst_y_stride); + vp9_subtract_sbuv_s_c(x->src_diff, + usrc, vsrc, src_uv_stride, + udst, vdst, dst_uv_stride); + switch (mi->mbmi.txfm_size) { + case TX_32X32: + vp9_transform_sby_32x32(x); + vp9_transform_sbuv_16x16(x); + vp9_quantize_sby_32x32(x); + vp9_quantize_sbuv_16x16(x); + if (x->optimize) { + vp9_optimize_sby_32x32(cm, x); + vp9_optimize_sbuv_16x16(cm, x); + } + vp9_inverse_transform_sby_32x32(xd); + vp9_inverse_transform_sbuv_16x16(xd); + break; + case TX_16X16: + vp9_transform_sby_16x16(x); + vp9_transform_sbuv_16x16(x); + vp9_quantize_sby_16x16(x); + vp9_quantize_sbuv_16x16(x); + if (x->optimize) { + vp9_optimize_sby_16x16(cm, x); + vp9_optimize_sbuv_16x16(cm, x); + } + vp9_inverse_transform_sby_16x16(xd); + vp9_inverse_transform_sbuv_16x16(xd); + break; + case TX_8X8: + vp9_transform_sby_8x8(x); + vp9_transform_sbuv_8x8(x); + vp9_quantize_sby_8x8(x); + vp9_quantize_sbuv_8x8(x); + if (x->optimize) { + vp9_optimize_sby_8x8(cm, x); + vp9_optimize_sbuv_8x8(cm, x); + } + vp9_inverse_transform_sby_8x8(xd); + vp9_inverse_transform_sbuv_8x8(xd); + break; + case TX_4X4: + vp9_transform_sby_4x4(x); + vp9_transform_sbuv_4x4(x); + vp9_quantize_sby_4x4(x); + vp9_quantize_sbuv_4x4(x); + if (x->optimize) { + vp9_optimize_sby_4x4(cm, x); + vp9_optimize_sbuv_4x4(cm, x); + } + vp9_inverse_transform_sby_4x4(xd); + vp9_inverse_transform_sbuv_4x4(xd); + break; + default: assert(0); } + vp9_recon_sby_s_c(xd, dst); + vp9_recon_sbuv_s_c(xd, udst, vdst); +#if CONFIG_CODE_NONZEROCOUNT + gather_nzcs_sb32(cm, xd); +#endif - // copy skip flag on all mb_mode_info contexts in this SB - // if this was a skip at this txfm size - if (mb_col < cm->mb_cols - 1) - mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; - if (mb_row < cm->mb_rows - 1) { - mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; - if (mb_col < cm->mb_cols - 1) - mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; - } - skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff; + vp9_tokenize_sb(cpi, xd, t, !output_enabled); } else { - for (n = 0; n < 4; n++) { - int x_idx = n & 1, y_idx = n >> 1; - - xd->left_context = cm->left_context + y_idx + (mb_row & 2); - xd->above_context = cm->above_context + mb_col + x_idx; - memcpy(&ta[n], xd->above_context, sizeof(ta[n])); - memcpy(&tl[n], xd->left_context, sizeof(tl[n])); - tp[n] = *t; - xd->mode_info_context = mi + x_idx + y_idx * mis; - - if (!x->skip) { - vp9_subtract_mby_s_c(x->src_diff, - src + x_idx * 16 + y_idx * 16 * src_y_stride, - src_y_stride, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride, - dst_y_stride); - vp9_subtract_mbuv_s_c(x->src_diff, - usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, - vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride, - src_uv_stride, - udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, - vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, - dst_uv_stride); - vp9_fidct_mb(x); - vp9_recon_mby_s_c(&x->e_mbd, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride); - vp9_recon_mbuv_s_c(&x->e_mbd, - udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, - vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride); - - vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled); - 
skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff; - } else { - int mb_skip_context = cpi->common.mb_no_coeff_skip ? - (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff + - (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : - 0; - xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1; - if (cpi->common.mb_no_coeff_skip) { - // TODO(rbultje) this should be done per-sb instead of per-mb? - if (output_enabled) - cpi->skip_true_count[mb_skip_context]++; - vp9_reset_mb_tokens_context(xd); - } else { - vp9_stuff_mb(cpi, xd, t, !output_enabled); - // TODO(rbultje) this should be done per-sb instead of per-mb? - if (output_enabled) - cpi->skip_false_count[mb_skip_context]++; - } - } + // FIXME(rbultje): not tile-aware (mi - 1) + int mb_skip_context = cm->mb_no_coeff_skip ? + (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0; + + mi->mbmi.mb_skip_coeff = 1; + if (cm->mb_no_coeff_skip) { + if (output_enabled) + cpi->skip_true_count[mb_skip_context]++; + vp9_reset_sb_tokens_context(xd); + } else { + vp9_stuff_sb(cpi, xd, t, !output_enabled); + if (output_enabled) + cpi->skip_false_count[mb_skip_context]++; } + } - xd->mode_info_context = mi; - update_sb_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled); + // copy skip flag on all mb_mode_info contexts in this SB + // if this was a skip at this txfm size + if (mb_col < cm->mb_cols - 1) + mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + if (mb_row < cm->mb_rows - 1) { + mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + if (mb_col < cm->mb_cols - 1) + mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; } if (output_enabled) { if (cm->txfm_mode == TX_MODE_SELECT && - !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) || + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++; } else { - TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? - TX_32X32 : - cm->txfm_mode; + TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? 
TX_32X32 : cm->txfm_mode; mi->mbmi.txfm_size = sz; if (mb_col < cm->mb_cols - 1) mi[1].mbmi.txfm_size = sz; @@ -2485,7 +2527,6 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t, } static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset, int output_enabled, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -2500,13 +2541,16 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride; unsigned char ref_pred_flag; int n; - TOKENEXTRA *tp[16]; - int skip[16]; MODE_INFO *mi = x->e_mbd.mode_info_context; unsigned int segment_id = mi->mbmi.segment_id; - ENTROPY_CONTEXT_PLANES ta[16], tl[16]; const int mis = cm->mode_info_stride; +#ifdef ENC_DEBUG + enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame && + mb_row == 8 && mb_col == 0 && output_enabled); + if (enc_debug) + printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled); +#endif if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { adjust_act_zbin(cpi, x); @@ -2531,10 +2575,12 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; } else if (xd->mode_info_context->mbmi.mode == SPLITMV) { - cpi->zbin_mode_boost = 0; + cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; } else { cpi->zbin_mode_boost = MV_ZBIN_BOOST; } + } else { + cpi->zbin_mode_boost = INTRA_ZBIN_BOOST; } } @@ -2557,186 +2603,134 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) - ref_fb_idx = cpi->common.lst_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME) - ref_fb_idx = cpi->common.gld_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - ref_fb_idx = cpi->common.alt_fb_idx; + ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->pre.y_buffer = - cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset; - xd->pre.u_buffer = - cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; - xd->pre.v_buffer = - cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->pre, + &cpi->common.yv12_fb[ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[0], &xd->scale_factor_uv[0]); if (xd->mode_info_context->mbmi.second_ref_frame > 0) { int second_ref_fb_idx; if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) - second_ref_fb_idx = cpi->common.lst_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx]; else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) - second_ref_fb_idx = cpi->common.gld_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx]; else - second_ref_fb_idx = cpi->common.alt_fb_idx; + second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx]; - xd->second_pre.y_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset; - xd->second_pre.u_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset; - xd->second_pre.v_buffer = - cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset; + setup_pred_block(&xd->second_pre, + &cpi->common.yv12_fb[second_ref_fb_idx], + mb_row, mb_col, + &xd->scale_factor[1], &xd->scale_factor_uv[1]); } vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, - 
xd->dst.y_stride, xd->dst.uv_stride); + xd->dst.y_stride, xd->dst.uv_stride, + mb_row, mb_col); } - if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) { - int n; + if (!x->skip) { + vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride); + vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride, + udst, vdst, dst_uv_stride); - for (n = 0; n < 4; n++) { - int x_idx = n & 1, y_idx = n >> 1; - - xd->mode_info_context = mi + x_idx * 2 + mis * y_idx * 2; - xd->left_context = cm->left_context + (y_idx << 1); - xd->above_context = cm->above_context + mb_col + (x_idx << 1); - memcpy(&ta[n * 2], xd->above_context, sizeof(*ta) * 2); - memcpy(&tl[n * 2], xd->left_context, sizeof(*tl) * 2); - tp[n] = *t; - xd->mode_info_context = mi + x_idx * 2 + y_idx * mis * 2; - if (!x->skip) { - vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, - src + x_idx * 32 + y_idx * 32 * src_y_stride, - src_y_stride, - dst + x_idx * 32 + y_idx * 32 * dst_y_stride, - dst_y_stride); - vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff, - usrc + x_idx * 16 + y_idx * 16 * src_uv_stride, - vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride, - src_uv_stride, - udst + x_idx * 16 + y_idx * 16 * dst_uv_stride, - vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride, - dst_uv_stride); - vp9_transform_sby_32x32(x); - vp9_transform_sbuv_16x16(x); - vp9_quantize_sby_32x32(x); - vp9_quantize_sbuv_16x16(x); - // TODO(rbultje): trellis optimize - vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data); - vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data); - vp9_recon_sby_s_c(&x->e_mbd, - dst + 32 * x_idx + 32 * y_idx * dst_y_stride); - vp9_recon_sbuv_s_c(&x->e_mbd, - udst + x_idx * 16 + y_idx * 16 * dst_uv_stride, - vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride); - - vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled); - } else { - int mb_skip_context = cpi->common.mb_no_coeff_skip ? 
- (mi - 1)->mbmi.mb_skip_coeff + - (mi - mis)->mbmi.mb_skip_coeff : 0; - xd->mode_info_context->mbmi.mb_skip_coeff = 1; - if (cm->mb_no_coeff_skip) { - if (output_enabled) - cpi->skip_true_count[mb_skip_context]++; - vp9_fix_contexts_sb(xd); - } else { - vp9_stuff_sb(cpi, xd, t, !output_enabled); - if (output_enabled) - cpi->skip_false_count[mb_skip_context]++; + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + vp9_transform_sb64y_32x32(x); + vp9_transform_sb64uv_32x32(x); + vp9_quantize_sb64y_32x32(x); + vp9_quantize_sb64uv_32x32(x); + if (x->optimize) { + vp9_optimize_sb64y_32x32(cm, x); + vp9_optimize_sb64uv_32x32(cm, x); } - } - - // copy skip flag on all mb_mode_info contexts in this SB - // if this was a skip at this txfm size - if (mb_col + x_idx * 2 < cm->mb_cols - 1) - mi[mis * y_idx * 2 + x_idx * 2 + 1].mbmi.mb_skip_coeff = - mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff; - if (mb_row + y_idx * 2 < cm->mb_rows - 1) { - mi[mis * y_idx * 2 + x_idx * 2 + mis].mbmi.mb_skip_coeff = - mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff; - if (mb_col + x_idx * 2 < cm->mb_cols - 1) - mi[mis * y_idx * 2 + x_idx * 2 + mis + 1].mbmi.mb_skip_coeff = - mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff; - } - skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff; + vp9_inverse_transform_sb64y_32x32(xd); + vp9_inverse_transform_sb64uv_32x32(xd); + break; + case TX_16X16: + vp9_transform_sb64y_16x16(x); + vp9_transform_sb64uv_16x16(x); + vp9_quantize_sb64y_16x16(x); + vp9_quantize_sb64uv_16x16(x); + if (x->optimize) { + vp9_optimize_sb64y_16x16(cm, x); + vp9_optimize_sb64uv_16x16(cm, x); + } + vp9_inverse_transform_sb64y_16x16(xd); + vp9_inverse_transform_sb64uv_16x16(xd); + break; + case TX_8X8: + vp9_transform_sb64y_8x8(x); + vp9_transform_sb64uv_8x8(x); + vp9_quantize_sb64y_8x8(x); + vp9_quantize_sb64uv_8x8(x); + if (x->optimize) { + vp9_optimize_sb64y_8x8(cm, x); + vp9_optimize_sb64uv_8x8(cm, x); + } + vp9_inverse_transform_sb64y_8x8(xd); + vp9_inverse_transform_sb64uv_8x8(xd); + break; + case TX_4X4: + vp9_transform_sb64y_4x4(x); + vp9_transform_sb64uv_4x4(x); + vp9_quantize_sb64y_4x4(x); + vp9_quantize_sb64uv_4x4(x); + if (x->optimize) { + vp9_optimize_sb64y_4x4(cm, x); + vp9_optimize_sb64uv_4x4(cm, x); + } + vp9_inverse_transform_sb64y_4x4(xd); + vp9_inverse_transform_sb64uv_4x4(xd); + break; + default: assert(0); } + vp9_recon_sb64y_s_c(xd, dst); + vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst); +#if CONFIG_CODE_NONZEROCOUNT + gather_nzcs_sb64(cm, &x->e_mbd); +#endif + vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled); } else { - for (n = 0; n < 16; n++) { - const int x_idx = n & 3, y_idx = n >> 2; - - xd->left_context = cm->left_context + y_idx; - xd->above_context = cm->above_context + mb_col + x_idx; - memcpy(&ta[n], xd->above_context, sizeof(ta[n])); - memcpy(&tl[n], xd->left_context, sizeof(tl[n])); - tp[n] = *t; - xd->mode_info_context = mi + x_idx + y_idx * mis; - - if (!x->skip) { - vp9_subtract_mby_s_c(x->src_diff, - src + x_idx * 16 + y_idx * 16 * src_y_stride, - src_y_stride, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride, - dst_y_stride); - vp9_subtract_mbuv_s_c(x->src_diff, - usrc + x_idx * 8 + y_idx * 8 * src_uv_stride, - vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride, - src_uv_stride, - udst + x_idx * 8 + y_idx * 8 * dst_uv_stride, - vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride, - dst_uv_stride); - vp9_fidct_mb(x); - vp9_recon_mby_s_c(&x->e_mbd, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride); - vp9_recon_mbuv_s_c(&x->e_mbd, - udst + x_idx * 8 + y_idx 
* 8 * dst_uv_stride, - vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride); - - vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled); - skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff; - } else { - int mb_skip_context = cpi->common.mb_no_coeff_skip ? - (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff + - (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0; - xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1; - if (cpi->common.mb_no_coeff_skip) { - // TODO(rbultje) this should be done per-sb instead of per-mb? - if (output_enabled) - cpi->skip_true_count[mb_skip_context]++; - vp9_reset_mb_tokens_context(xd); - } else { - vp9_stuff_mb(cpi, xd, t, !output_enabled); - // TODO(rbultje) this should be done per-sb instead of per-mb? - if (output_enabled) - cpi->skip_false_count[mb_skip_context]++; - } - } + // FIXME(rbultje): not tile-aware (mi - 1) + int mb_skip_context = cpi->common.mb_no_coeff_skip ? + (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0; + + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + if (cm->mb_no_coeff_skip) { + if (output_enabled) + cpi->skip_true_count[mb_skip_context]++; + vp9_reset_sb64_tokens_context(xd); + } else { + vp9_stuff_sb64(cpi, xd, t, !output_enabled); + if (output_enabled) + cpi->skip_false_count[mb_skip_context]++; } } - xd->mode_info_context = mi; - update_sb64_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled); + // copy skip flag on all mb_mode_info contexts in this SB + // if this was a skip at this txfm size + for (n = 1; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows) + mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; + } if (output_enabled) { if (cm->txfm_mode == TX_MODE_SELECT && - !((cm->mb_no_coeff_skip && - ((mi->mbmi.txfm_size == TX_32X32 && - skip[0] && skip[1] && skip[2] && skip[3]) || - (mi->mbmi.txfm_size != TX_32X32 && - skip[0] && skip[1] && skip[2] && skip[3] && - skip[4] && skip[5] && skip[6] && skip[7] && - skip[8] && skip[9] && skip[10] && skip[11] && - skip[12] && skip[13] && skip[14] && skip[15]))) || - (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) && - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) { + !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) || + (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++; } else { int x, y; - TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? - TX_32X32 : - cm->txfm_mode; + TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? 
TX_32X32 : cm->txfm_mode;
       for (y = 0; y < 4; y++) {
         for (x = 0; x < 4; x++) {
           if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 1b056e163105e0689d13d0b525aedf27a5bacbf5..9f13edcec9ddfaa35e60071930fcabcd9cb53629 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -14,8 +14,8 @@
 struct macroblock;
-extern void vp9_build_block_offsets(struct macroblock *x);
+void vp9_build_block_offsets(struct macroblock *x);
-extern void vp9_setup_block_ptrs(struct macroblock *x);
+void vp9_setup_block_ptrs(struct macroblock *x);
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index ce9a38003e5f6b1efc72daf233c867d86d41b4f5..eddacb872a4378b85b6791e02fd251917f9f1346 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -12,14 +12,11 @@
 #include "vp9_rtcd.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
-  int i;
-  int intra_pred_var = 0;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
@@ -28,17 +25,17 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
-    vp9_encode_intra16x16mby(x);
+    vp9_encode_intra16x16mby(&cpi->common, x);
   } else {
+    int i;
+
     for (i = 0; i < 16; i++) {
       x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
       vp9_encode_intra4x4block(x, i);
     }
   }
-  intra_pred_var = vp9_get_mb_ss(x->src_diff);
-
-  return intra_pred_var;
+  return vp9_get_mb_ss(x->src_diff);
 }
 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
@@ -47,21 +44,22 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   TX_TYPE tx_type;
 #if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(b);
+  b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
 #endif
-  vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
+  vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
   vp9_subtract_b(be, b, 16);
-  tx_type = get_tx_type_4x4(&x->e_mbd, b);
+  tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-    vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);
+    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+    vp9_ht_quantize_b_4x4(x, ib, tx_type);
+    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
   } else {
-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(be, b) ;
-    vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);
+    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b_4x4(x, ib);
+    vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
+                                b->dqcoeff, b->diff, 32);
   }
   vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
@@ -72,10 +70,9 @@ void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
   for (i = 0; i < 16; i++)
     vp9_encode_intra4x4block(mb, i);
-  return;
 }
-void vp9_encode_intra16x16mby(MACROBLOCK *x) {
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
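/* Editorial aside, not part of the patch: the hunk below turns the tx_size
 * if/else ladder in vp9_encode_intra16x16mby() into a switch, but the
 * per-size pipeline is unchanged: forward transform, quantize, optional
 * trellis optimization, inverse transform, then reconstruct. A minimal
 * sketch of that shape with hypothetical stage callbacks (not the patch's
 * actual API):
 */
typedef struct {
  void (*fwd)(void *mb);       // forward transform at this tx size
  void (*quant)(void *mb);     // quantization
  void (*optimize)(void *mb);  // optional trellis pass
  void (*inv)(void *mb);       // inverse transform for reconstruction
} tx_pipeline;

static void run_pipeline(const tx_pipeline *p, void *mb, int do_optimize) {
  p->fwd(mb);
  p->quant(mb);
  if (do_optimize)  // mirrors the "if (x->optimize)" gates in the hunk
    p->optimize(mb);
  p->inv(mb);
}
@@ -84,30 +81,34 @@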
vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride); - if (tx_size == TX_16X16) { - vp9_transform_mby_16x16(x); - vp9_quantize_mby_16x16(x); - if (x->optimize) - vp9_optimize_mby_16x16(x); - vp9_inverse_transform_mby_16x16(xd); - } else if (tx_size == TX_8X8) { - vp9_transform_mby_8x8(x); - vp9_quantize_mby_8x8(x); - if (x->optimize) - vp9_optimize_mby_8x8(x); - vp9_inverse_transform_mby_8x8(xd); - } else { - vp9_transform_mby_4x4(x); - vp9_quantize_mby_4x4(x); - if (x->optimize) - vp9_optimize_mby_4x4(x); - vp9_inverse_transform_mby_4x4(xd); + switch (tx_size) { + case TX_16X16: + vp9_transform_mby_16x16(x); + vp9_quantize_mby_16x16(x); + if (x->optimize) + vp9_optimize_mby_16x16(cm, x); + vp9_inverse_transform_mby_16x16(xd); + break; + case TX_8X8: + vp9_transform_mby_8x8(x); + vp9_quantize_mby_8x8(x); + if (x->optimize) + vp9_optimize_mby_8x8(cm, x); + vp9_inverse_transform_mby_8x8(xd); + break; + default: + vp9_transform_mby_4x4(x); + vp9_quantize_mby_4x4(x); + if (x->optimize) + vp9_optimize_mby_4x4(cm, x); + vp9_inverse_transform_mby_4x4(xd); + break; } vp9_recon_mby(xd); } -void vp9_encode_intra16x16mbuv(MACROBLOCK *x) { +void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; @@ -116,19 +117,22 @@ void vp9_encode_intra16x16mbuv(MACROBLOCK *x) { vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, xd->predictor, x->src.uv_stride); - if (tx_size == TX_4X4) { - vp9_transform_mbuv_4x4(x); - vp9_quantize_mbuv_4x4(x); - if (x->optimize) - vp9_optimize_mbuv_4x4(x); - vp9_inverse_transform_mbuv_4x4(xd); - } else /* 16x16 or 8x8 */ { - vp9_transform_mbuv_8x8(x); - vp9_quantize_mbuv_8x8(x); - if (x->optimize) - vp9_optimize_mbuv_8x8(x); - vp9_inverse_transform_mbuv_8x8(xd); - } + switch (tx_size) { + case TX_4X4: + vp9_transform_mbuv_4x4(x); + vp9_quantize_mbuv_4x4(x); + if (x->optimize) + vp9_optimize_mbuv_4x4(cm, x); + vp9_inverse_transform_mbuv_4x4(xd); + break; + default: // 16x16 or 8x8 + vp9_transform_mbuv_8x8(x); + vp9_quantize_mbuv_8x8(x); + if (x->optimize) + vp9_optimize_mbuv_8x8(cm, x); + vp9_inverse_transform_mbuv_8x8(xd); + break; + } vp9_recon_intra_mbuv(xd); } @@ -141,38 +145,47 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { int i; TX_TYPE tx_type; - vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor); + vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor); // generate residual blocks vp9_subtract_4b_c(be, b, 16); if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { int idx = (ib & 0x02) ? 
(ib + 2) : ib; - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); + tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) { - vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, - tx_type, 8); - x->quantize_b_8x8(x->block + idx, xd->block + idx); - vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, - tx_type, 8, xd->block[idx].eob); + vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); + x->quantize_b_8x8(x, idx, tx_type); + vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, + 16, tx_type); } else { - x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); - x->quantize_b_8x8(x->block + idx, xd->block + idx); + x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32); + x->quantize_b_8x8(x, idx, DCT_DCT); vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32); } } else { for (i = 0; i < 4; i++) { b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, ib + iblock[i]); if (tx_type != DCT_DCT) { - vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4); - vp9_ht_quantize_b_4x4(be, b, tx_type); - vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); + vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); + vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); + vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type); + } else if (!(i & 1) && + get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) { + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1); + vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]], + b->dqcoeff, b->diff, 32); + vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1], + (b + 1)->dqcoeff, (b + 1)->diff, 32); + i++; } else { - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, b); - vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, ib + iblock[i]); + vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]], + b->dqcoeff, b->diff, 32); } } } @@ -186,43 +199,37 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { } void vp9_encode_intra8x8mby(MACROBLOCK *x) { - int i, ib; + int i; - for (i = 0; i < 4; i++) { - ib = vp9_i8x8_block[i]; - vp9_encode_intra8x8(x, ib); - } + for (i = 0; i < 4; i++) + vp9_encode_intra8x8(x, vp9_i8x8_block[i]); } -static void encode_intra_uv4x4(MACROBLOCK *x, int ib, - int mode) { +static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; - vp9_intra_uv4x4_predict(b, mode, b->predictor); + vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor); vp9_subtract_b(be, b, 8); - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16); - x->quantize_b_4x4(be, b); - vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16); + x->fwd_txm4x4(be->src_diff, be->coeff, 16); + x->quantize_b_4x4(x, ib); + vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib], + b->dqcoeff, b->diff, 16); vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); } void vp9_encode_intra8x8mbuv(MACROBLOCK *x) { - int i, ib, mode; - BLOCKD *b; + int i; for (i = 0; i < 4; i++) { - ib = vp9_i8x8_block[i]; - b = &x->e_mbd.block[ib]; - mode = b->bmi.as_mode.first; - - /*u */ - encode_intra_uv4x4(x, i + 16, mode); - /*v */ - encode_intra_uv4x4(x, i + 20, mode); + BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]]; + int mode = b->bmi.as_mode.first; + + encode_intra_uv4x4(x, i + 16, mode); // u + encode_intra_uv4x4(x, i + 20, mode); // 
v
   }
 }
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index b017673ee92433f7d0afe508e155d1464342e45c..0b19b5652c95bfe9890bd88d6466cc6909ad337d 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,8 +14,8 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb);
 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
 void vp9_encode_intra8x8mby(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 45278a71b04d888ad766396c7d3f49c783ac84eb..3ad429a9e543753d4820a270a0baa466d79deb75 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -29,9 +29,8 @@ void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   int r, c;
   for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
+    for (c = 0; c < 4; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
     diff_ptr += pitch;
     pred_ptr += pitch;
@@ -47,9 +46,9 @@ void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   int r, c;
   for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
+    for (c = 0; c < 8; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
+
     diff_ptr += pitch;
     pred_ptr += pitch;
     src_ptr += src_stride;
@@ -65,9 +64,8 @@ void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
   int r, c;
   for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
+    for (c = 0; c < 8; c++)
       udiff[c] = usrc[c] - upred[c];
-    }
     udiff += 8;
     upred += dst_stride;
@@ -98,9 +96,8 @@ void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
   int r, c;
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       diff[c] = src[c] - pred[c];
-    }
     diff += 16;
     pred += dst_stride;
@@ -113,9 +110,8 @@ void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
   int r, c;
   for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
+    for (c = 0; c < 32; c++)
       diff[c] = src[c] - pred[c];
-    }
     diff += 32;
     pred += dst_stride;
@@ -132,9 +128,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
   int r, c;
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       udiff[c] = usrc[c] - upred[c];
-    }
     udiff += 16;
     upred += dst_stride;
@@ -142,9 +137,8 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
   }
   for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
+    for (c = 0; c < 16; c++)
       vdiff[c] = vsrc[c] - vpred[c];
-    }
     vdiff += 16;
     vpred += dst_stride;
@@ -152,6 +146,50 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
   }
 }
+void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                            const uint8_t *pred, int dst_stride) {
+  int r, c;
+
+  for (r = 0; r < 64; r++) {
+    for (c = 0; c < 64; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 64;
+    pred += dst_stride;
+    src += src_stride;
+  }
+}
+
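/* Editorial aside, not part of the patch: every vp9_subtract_*_s_c variant
 * above is the same kernel at a different block size: walk the rows and
 * store src - pred into a packed residual buffer whose row pitch equals
 * the block width. A minimal generic sketch under those assumptions (the
 * helper name is illustrative):
 */
#include <stdint.h>

static void subtract_block(int16_t *diff, int diff_pitch,
                           const uint8_t *src, int src_stride,
                           const uint8_t *pred, int pred_stride,
                           int rows, int cols) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++)
      diff[c] = src[c] - pred[c];  // one residual sample
    diff += diff_pitch;            // packed layout: pitch == block width
    src += src_stride;
    pred += pred_stride;
  }
}
+void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
+                             const uint8_t *vsrc, int src_stride,
+                             const uint8_t *upred,
+                             const uint8_t *vpred, int dst_stride) {
+  int16_t *udiff = diff + 4096;
+  int16_t *vdiff = diff + 4096 + 1024;
+  int r, c;
+
+  for (r = 0; r <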
32; r++) { + for (c = 0; c < 32; c++) { + vdiff[c] = vsrc[c] - vpred[c]; + } + + vdiff += 32; + vpred += dst_stride; + vsrc += src_stride; + } +} + void vp9_subtract_mby_c(int16_t *diff, uint8_t *src, uint8_t *pred, int stride) { vp9_subtract_mby_s_c(diff, src, stride, pred, 16); @@ -166,52 +204,29 @@ static void subtract_mb(MACROBLOCK *x) { x->e_mbd.predictor, x->src.uv_stride); } -static void build_dcblock_4x4(MACROBLOCK *x) { - int16_t *src_diff_ptr = &x->src_diff[384]; - int i; - - for (i = 0; i < 16; i++) { - src_diff_ptr[i] = x->coeff[i * 16]; - x->coeff[i * 16] = 0; - } -} - void vp9_transform_mby_4x4(MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; - int has_2nd_order = get_2nd_order_usage(xd); for (i = 0; i < 16; i++) { BLOCK *b = &x->block[i]; - TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 4); + vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type); + } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) { + x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32); + i++; } else { - x->vp9_short_fdct4x4(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 32); + x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32); } } - - if (has_2nd_order) { - // build dc block from 16 y dc values - build_dcblock_4x4(x); - - // do 2nd order transform on the dc block - x->short_walsh4x4(&x->block[24].src_diff[0], - &x->block[24].coeff[0], 8); - } else { - vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0])); - } } void vp9_transform_mbuv_4x4(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i += 2) { - x->vp9_short_fdct8x4(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 16); - } + for (i = 16; i < 24; i += 2) + x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16); } static void transform_mb_4x4(MACROBLOCK *x) { @@ -219,71 +234,36 @@ static void transform_mb_4x4(MACROBLOCK *x) { vp9_transform_mbuv_4x4(x); } -static void build_dcblock_8x8(MACROBLOCK *x) { - int16_t *src_diff_ptr = x->block[24].src_diff; - int i; - - for (i = 0; i < 16; i++) { - src_diff_ptr[i] = 0; - } - src_diff_ptr[0] = x->coeff[0 * 16]; - src_diff_ptr[1] = x->coeff[4 * 16]; - src_diff_ptr[4] = x->coeff[8 * 16]; - src_diff_ptr[8] = x->coeff[12 * 16]; - x->coeff[0 * 16] = 0; - x->coeff[4 * 16] = 0; - x->coeff[8 * 16] = 0; - x->coeff[12 * 16] = 0; -} - void vp9_transform_mby_8x8(MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; TX_TYPE tx_type; - int has_2nd_order = get_2nd_order_usage(xd); for (i = 0; i < 9; i += 8) { BLOCK *b = &x->block[i]; - tx_type = get_tx_type_8x8(xd, &xd->block[i]); + tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 8); + vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type); } else { - x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 32); + x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32); } } for (i = 2; i < 11; i += 8) { BLOCK *b = &x->block[i]; - tx_type = get_tx_type_8x8(xd, &xd->block[i]); + tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_fht_c(b->src_diff, 32, (b + 2)->coeff, tx_type, 8); + vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type); } else { - x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i + 2].coeff[0], 32); + x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32); } } - - if (has_2nd_order) { - // 
build dc block from 2x2 y dc values - build_dcblock_8x8(x); - - // do 2nd order transform on the dc block - x->short_fhaar2x2(&x->block[24].src_diff[0], - &x->block[24].coeff[0], 8); - } else { - vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0])); - } } void vp9_transform_mbuv_8x8(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i += 4) { - x->vp9_short_fdct8x8(&x->block[i].src_diff[0], - &x->block[i].coeff[0], 16); - } + for (i = 16; i < 24; i += 4) + x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16); } void vp9_transform_mb_8x8(MACROBLOCK *x) { @@ -294,13 +274,12 @@ void vp9_transform_mb_8x8(MACROBLOCK *x) { void vp9_transform_mby_16x16(MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; BLOCK *b = &x->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]); + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); vp9_clear_system_state(); if (tx_type != DCT_DCT) { - vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 16); + vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type); } else { - x->vp9_short_fdct16x16(&x->block[0].src_diff[0], - &x->block[0].coeff[0], 32); + x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32); } } @@ -310,17 +289,210 @@ void vp9_transform_mb_16x16(MACROBLOCK *x) { } void vp9_transform_sby_32x32(MACROBLOCK *x) { - SUPERBLOCK * const x_sb = &x->sb_coeff_data; - vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64); + vp9_short_fdct32x32(x->src_diff, x->coeff, 64); +} + +void vp9_transform_sby_16x16(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + int n; + + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4); + + if (tx_type != DCT_DCT) { + vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16, + x->coeff + n * 256, 32, tx_type); + } else { + x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16, + x->coeff + n * 256, 64); + } + } +} + +void vp9_transform_sby_8x8(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + int n; + + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); + + if (tx_type != DCT_DCT) { + vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8, + x->coeff + n * 64, 32, tx_type); + } else { + x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8, + x->coeff + n * 64, 64); + } + } +} + +void vp9_transform_sby_4x4(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + int n; + + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); + + if (tx_type != DCT_DCT) { + vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4, + x->coeff + n * 16, 32, tx_type); + } else { + x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4, + x->coeff + n * 16, 64); + } + } } void vp9_transform_sbuv_16x16(MACROBLOCK *x) { - SUPERBLOCK * const x_sb = &x->sb_coeff_data; vp9_clear_system_state(); - x->vp9_short_fdct16x16(x_sb->src_diff + 1024, - x_sb->coeff + 1024, 32); - x->vp9_short_fdct16x16(x_sb->src_diff + 1280, - x_sb->coeff + 1280, 32); + x->fwd_txm16x16(x->src_diff + 1024, x->coeff + 1024, 32); + x->fwd_txm16x16(x->src_diff + 1280, x->coeff + 1280, 32); +} + +void vp9_transform_sbuv_8x8(MACROBLOCK *x) { + int n; + + vp9_clear_system_state(); + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + x->fwd_txm8x8(x->src_diff + 1024 + y_idx * 16 * 8 + x_idx * 8, + x->coeff + 1024 + n * 64, 32); + x->fwd_txm8x8(x->src_diff + 1280 + y_idx 
* 16 * 8 + x_idx * 8,
+                x->coeff + 1280 + n * 64, 32);
+  }
+}
+
+void vp9_transform_sbuv_4x4(MACROBLOCK *x) {
+  int n;
+
+  vp9_clear_system_state();
+  for (n = 0; n < 16; n++) {
+    const int x_idx = n & 3, y_idx = n >> 2;
+
+    x->fwd_txm4x4(x->src_diff + 1024 + y_idx * 16 * 4 + x_idx * 4,
+                  x->coeff + 1024 + n * 16, 32);
+    x->fwd_txm4x4(x->src_diff + 1280 + y_idx * 16 * 4 + x_idx * 4,
+                  x->coeff + 1280 + n * 16, 32);
+  }
+}
+
+void vp9_transform_sb64y_32x32(MACROBLOCK *x) {
+  int n;
+
+  for (n = 0; n < 4; n++) {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_short_fdct32x32(x->src_diff + y_idx * 64 * 32 + x_idx * 32,
+                        x->coeff + n * 1024, 128);
+  }
+}
+
+void vp9_transform_sb64y_16x16(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int n;
+
+  for (n = 0; n < 16; n++) {
+    const int x_idx = n & 3, y_idx = n >> 2;
+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
+
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
+                         x->coeff + n * 256, 64, tx_type);
+    } else {
+      x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
+                      x->coeff + n * 256, 128);
+    }
+  }
+}
+
+void vp9_transform_sb64y_8x8(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int n;
+
+  for (n = 0; n < 64; n++) {
+    const int x_idx = n & 7, y_idx = n >> 3;
+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
+
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
+                       x->coeff + n * 64, 64, tx_type);
+    } else {
+      x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
+                    x->coeff + n * 64, 128);
+    }
+  }
+}
+
+void vp9_transform_sb64y_4x4(MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int n;
+
+  for (n = 0; n < 256; n++) {
+    const int x_idx = n & 15, y_idx = n >> 4;
+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
+
+    if (tx_type != DCT_DCT) {
+      vp9_short_fht4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
+                       x->coeff + n * 16, 64, tx_type);
+    } else {
+      x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
+                    x->coeff + n * 16, 128);
+    }
+  }
+}
+
+void vp9_transform_sb64uv_32x32(MACROBLOCK *x) {
+  vp9_clear_system_state();
+  vp9_short_fdct32x32(x->src_diff + 4096,
+                      x->coeff + 4096, 64);
+  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,
+                      x->coeff + 4096 + 1024, 64);
+}
+
+void vp9_transform_sb64uv_16x16(MACROBLOCK *x) {
+  int n;
+
+  vp9_clear_system_state();
+  for (n = 0; n < 4; n++) {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->fwd_txm16x16(x->src_diff + 4096 + y_idx * 32 * 16 + x_idx * 16,
+                    x->coeff + 4096 + n * 256, 64);
+    x->fwd_txm16x16(x->src_diff + 4096 + 1024 + y_idx * 32 * 16 + x_idx * 16,
+                    x->coeff + 4096 + 1024 + n * 256, 64);
+  }
+}
+
+void vp9_transform_sb64uv_8x8(MACROBLOCK *x) {
+  int n;
+
+  vp9_clear_system_state();
+  for (n = 0; n < 16; n++) {
+    const int x_idx = n & 3, y_idx = n >> 2;
+
+    x->fwd_txm8x8(x->src_diff + 4096 + y_idx * 32 * 8 + x_idx * 8,
+                  x->coeff + 4096 + n * 64, 64);
+    x->fwd_txm8x8(x->src_diff + 4096 + 1024 + y_idx * 32 * 8 + x_idx * 8,
+                  x->coeff + 4096 + 1024 + n * 64, 64);
+  }
+}
+
+void vp9_transform_sb64uv_4x4(MACROBLOCK *x) {
+  int n;
+
+  vp9_clear_system_state();
+  for (n = 0; n < 64; n++) {
+    const int x_idx = n & 7, y_idx = n >> 3;
+
+    x->fwd_txm4x4(x->src_diff + 4096 + y_idx * 32 * 4 + x_idx * 4,
+                  x->coeff + 4096 + n * 16, 64);
+    x->fwd_txm4x4(x->src_diff + 4096 + 1024 + y_idx * 32 * 4 + x_idx * 4,
+                  x->coeff + 4096 + 1024 + n * 16, 64);
+  }
 }
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
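/* Editorial aside, not part of the patch: the constant offsets above index
 * one packed residual/coefficient buffer per superblock, Y plane first,
 * then U, then V (32x32 SB: Y = 1024 samples, U at +1024, V at +1280;
 * 64x64 SB: Y = 4096, U at +4096, V at +5120). A minimal sketch of the
 * arithmetic, with an illustrative helper name:
 */
// Offset of 4x4 block n of a plane in the packed 64x64-superblock buffer.
static int sb64_coeff_offset(int plane /* 0 = Y, 1 = U, 2 = V */, int n) {
  const int y_size = 64 * 64;   // 4096 luma samples
  const int uv_size = 32 * 32;  // 1024 chroma samples per plane
  const int base = plane == 0 ? 0 : y_size + (plane - 1) * uv_size;
  return base + n * 16;         // each 4x4 block owns 16 coefficients
}
// e.g. the first V-plane block starts at 4096 + 1024 = 5120, matching the
// "x->coeff + 4096 + 1024 + n * 16" expressions above.
@@ -338,13 +510,10 @@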
struct vp9_token_state {
 // TODO: experiments to find optimal multiple numbers
 #define Y1_RD_MULT 4
 #define UV_RD_MULT 2
-#define Y2_RD_MULT 4
 static const int plane_rd_mult[4] = {
   Y1_RD_MULT,
-  Y2_RD_MULT,
   UV_RD_MULT,
-  Y1_RD_MULT
 };
 #define UPDATE_RD_COST()\
@@ -357,72 +526,120 @@ static const int plane_rd_mult[4] = {
   }\
 }
-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
+// This function is a placeholder for now but may ultimately need
+// to scan previous tokens to work out the correct context.
+static int trellis_get_coeff_context(const int *scan,
+                                     const int *nb,
+                                     int idx, int token,
+                                     uint8_t *token_cache,
+                                     int pad, int l) {
+  int bak = token_cache[idx], pt;
+  token_cache[idx] = token;
+  pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
+  token_cache[idx] = bak;
+  return pt;
+}
+
+static void optimize_b(VP9_COMMON *const cm,
+                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
+                       const int16_t *dequant_ptr,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
-  BLOCK *b = &mb->block[i];
-  BLOCKD *d = &mb->e_mbd.block[i];
-  vp9_token_state tokens[257][2];
-  unsigned best_index[257][2];
-  const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  int16_t *qcoeff_ptr = d->qcoeff;
-  int16_t *dqcoeff_ptr = d->dqcoeff;
-  int eob = d->eob, final_eob, sz = 0;
-  int i0 = (type == PLANE_TYPE_Y_NO_DC);
-  int rc, x, next;
+  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  vp9_token_state tokens[1025][2];
+  unsigned best_index[1025][2];
+  const int16_t *coeff_ptr = mb->coeff + ib * 16;
+  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
+  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;
+  int eob = xd->eobs[ib], final_eob, sz = 0;
+  const int i0 = 0;
+  int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
   int err_mult = plane_rd_mult[type];
-  int default_eob;
-  int const *scan, *bands;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
+  int default_eob, pad;
+  int const *scan, *nb;
+  const int mul = 1 + (tx_size == TX_32X32);
+  uint8_t token_cache[1024];
+#if CONFIG_CODE_NONZEROCOUNT
+  // TODO(debargha): the dynamic programming approach used in this function
+  // is not compatible with the true rate cost when nzcs are used. Note that
+  // the total rate is the sum of the nzc rate and the individual token
+  // rates. The latter part can be optimized in this function, but because
+  // the nzc rate is a function of all the other tokens without a Markov
+  // relationship this rate cannot be considered correctly.
+  // The current implementation uses a suboptimal approach to account for
+  // the nzc rates somewhat, but in reality the optimization approach needs
+  // to change substantially.
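/* Editorial aside, not part of the patch: optimize_b() runs a two-state
 * Viterbi trellis backwards over the scan-ordered coefficients. At each
 * position, state 0 keeps the quantized level and state 1 tries the level
 * pulled one step toward zero (or ZERO/EOB); each state stores the best
 * rate and squared error of the tail plus which successor achieved it.
 * A minimal sketch of one decision step, assuming the RDCOST form
 * ((128 + R * rdmult) >> 8) + D * rddiv used elsewhere in libvpx; the
 * names below are illustrative:
 */
#include <stdint.h>

typedef struct { int64_t rate, error; int best; } trellis_node;

static int64_t rd_cost(int64_t rdmult, int64_t rddiv,
                       int64_t rate, int64_t error) {
  return ((128 + rate * rdmult) >> 8) + error * rddiv;  // Lagrangian cost
}

// Pick the cheaper successor state, then fold this coefficient's own bits
// and its squared dequantization error into the running tail totals.
static void trellis_step(trellis_node *cur, const trellis_node succ[2],
                         int64_t rdmult, int64_t rddiv,
                         int base_bits, int64_t d2) {
  const int best =
      rd_cost(rdmult, rddiv, succ[1].rate, succ[1].error) <
      rd_cost(rdmult, rddiv, succ[0].rate, succ[0].error);
  cur->rate = base_bits + succ[best].rate;
  cur->error = d2 + succ[best].error;
  cur->best = best;  // remembered for the final walk from the head node
}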
+ uint16_t nzc = xd->nzcs[ib]; + uint16_t nzc0, nzc1; + uint16_t final_nzc = 0, final_nzc_exp; + int nzc_context = vp9_get_nzc_context(cm, xd, ib); + unsigned int *nzc_cost; + nzc0 = nzc1 = nzc; #endif switch (tx_size) { default: - case TX_4X4: - scan = vp9_default_zig_zag1d_4x4; - bands = vp9_coef_bands_4x4; + case TX_4X4: { + const TX_TYPE tx_type = get_tx_type_4x4(xd, ib); default_eob = 16; - // TODO: this isn't called (for intra4x4 modes), but will be left in - // since it could be used later - { - TX_TYPE tx_type = get_tx_type_4x4(&mb->e_mbd, d); - if (tx_type != DCT_DCT) { - switch (tx_type) { - case ADST_DCT: - scan = vp9_row_scan_4x4; - break; - - case DCT_ADST: - scan = vp9_col_scan_4x4; - break; - - default: - scan = vp9_default_zig_zag1d_4x4; - break; - } - } else { - scan = vp9_default_zig_zag1d_4x4; - } +#if CONFIG_CODE_NONZEROCOUNT + nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type]; +#endif + if (tx_type == DCT_ADST) { + scan = vp9_col_scan_4x4; + } else if (tx_type == ADST_DCT) { + scan = vp9_row_scan_4x4; + } else { + scan = vp9_default_zig_zag1d_4x4; } break; - case TX_8X8: - scan = vp9_default_zig_zag1d_8x8; - bands = vp9_coef_bands_8x8; + } + case TX_8X8: { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x; + const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1)); + if (tx_type == DCT_ADST) { + scan = vp9_col_scan_8x8; + } else if (tx_type == ADST_DCT) { + scan = vp9_row_scan_8x8; + } else { + scan = vp9_default_zig_zag1d_8x8; + } default_eob = 64; +#if CONFIG_CODE_NONZEROCOUNT + nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type]; +#endif break; - case TX_16X16: - scan = vp9_default_zig_zag1d_16x16; - bands = vp9_coef_bands_16x16; + } + case TX_16X16: { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x; + const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2)); + if (tx_type == DCT_ADST) { + scan = vp9_col_scan_16x16; + } else if (tx_type == ADST_DCT) { + scan = vp9_row_scan_16x16; + } else { + scan = vp9_default_zig_zag1d_16x16; + } default_eob = 256; +#if CONFIG_CODE_NONZEROCOUNT + nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type]; +#endif break; - } -#if CONFIG_NEWCOEFCONTEXT - neighbors = vp9_get_coef_neighbors_handle(scan); + } + case TX_32X32: + scan = vp9_default_zig_zag1d_32x32; + default_eob = 1024; +#if CONFIG_CODE_NONZEROCOUNT + nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type]; #endif + break; + } /* Now set up a Viterbi trellis to evaluate alternative roundings. */ rdmult = mb->rdmult * err_mult; @@ -431,15 +648,26 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, rddiv = mb->rddiv; memset(best_index, 0, sizeof(best_index)); /* Initialize the sentinel node of the trellis. 
*/ +#if CONFIG_CODE_NONZEROCOUNT + tokens[eob][0].rate = nzc_cost[nzc]; +#else tokens[eob][0].rate = 0; +#endif tokens[eob][0].error = 0; tokens[eob][0].next = default_eob; tokens[eob][0].token = DCT_EOB_TOKEN; tokens[eob][0].qc = 0; *(tokens[eob] + 1) = *(tokens[eob] + 0); next = eob; + for (i = 0; i < eob; i++) + token_cache[i] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].Token; + nb = vp9_get_coef_neighbors_handle(scan, &pad); + for (i = eob; i-- > i0;) { int base_bits, d2, dx; +#if CONFIG_CODE_NONZEROCOUNT + int new_nzc0, new_nzc1; +#endif rc = scan[i]; x = qcoeff_ptr[rc]; @@ -454,23 +682,19 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, t0 = (vp9_dct_value_tokens_ptr + x)->Token; /* Consider both possible successor states. */ if (next < default_eob) { - band = bands[i + 1]; - pt = vp9_prev_token_class[t0]; -#if CONFIG_NEWCOEFCONTEXT - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); -#endif + band = get_coef_band(scan, tx_size, i + 1); + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, + pad, default_eob); rate0 += - mb->token_costs[tx_size][type][band][pt][tokens[next][0].token]; + mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token]; rate1 += - mb->token_costs[tx_size][type][band][pt][tokens[next][1].token]; + mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; base_bits = *(vp9_dct_value_cost_ptr + x); - dx = dqcoeff_ptr[rc] - coeff_ptr[rc]; + dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]); d2 = dx * dx; tokens[i][0].rate = base_bits + (best ? rate1 : rate0); tokens[i][0].error = d2 + (best ? error1 : error0); @@ -478,12 +702,17 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, tokens[i][0].token = t0; tokens[i][0].qc = x; best_index[i][0] = best; +#if CONFIG_CODE_NONZEROCOUNT + new_nzc0 = (best ? nzc1 : nzc0); +#endif + /* Evaluate the second possibility for this state. */ rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) && - (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0])) + if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) && + (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul + + dequant_ptr[rc != 0])) shortcut = 1; else shortcut = 0; @@ -502,41 +731,27 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, DCT_EOB_TOKEN : ZERO_TOKEN; t1 = tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN; +#if CONFIG_CODE_NONZEROCOUNT + // Account for rate drop because of the nzc change. 
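/* Editorial aside, not part of the patch: zeroing a coefficient lowers the
 * block's nonzero count from nzc to nzc - 1, so the two lines below refund
 * the count-rate delta nzc_cost[nzc] - nzc_cost[nzc - 1] from each
 * candidate path. With hypothetical table values nzc_cost[5] = 120 and
 * nzc_cost[4] = 100, dropping one coefficient credits 20 rate units back:
 *
 *   rate0 -= nzc_cost[5] - nzc_cost[4];  // rate0 -= 20
 *   rate1 -= nzc_cost[5] - nzc_cost[4];  // rate1 -= 20
 */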
+ // TODO(debargha): Find a better solution + rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1]; + rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1]; +#endif } else { t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token; } if (next < default_eob) { - band = bands[i + 1]; + band = get_coef_band(scan, tx_size, i + 1); if (t0 != DCT_EOB_TOKEN) { -#if CONFIG_NEWCOEFCONTEXT - int tmp = qcoeff_ptr[scan[i]]; - qcoeff_ptr[scan[i]] = x; - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); - else - pt = vp9_prev_token_class[t0]; - qcoeff_ptr[scan[i]] = tmp; -#else - pt = vp9_prev_token_class[t0]; -#endif - rate0 += mb->token_costs[tx_size][type][band][pt][ + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, + pad, default_eob); + rate0 += mb->token_costs[tx_size][type][ref][band][pt][ tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { -#if CONFIG_NEWCOEFCONTEXT - int tmp = qcoeff_ptr[scan[i]]; - qcoeff_ptr[scan[i]] = x; - if (NEWCOEFCONTEXT_BAND_COND(band)) - pt = vp9_get_coef_neighbor_context( - qcoeff_ptr, i0, neighbors, scan[i + 1]); - else - pt = vp9_prev_token_class[t1]; - qcoeff_ptr[scan[i]] = tmp; -#else - pt = vp9_prev_token_class[t1]; -#endif - rate1 += mb->token_costs[tx_size][type][band][pt][ + pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache, + pad, default_eob); + rate1 += mb->token_costs[tx_size][type][ref][band][pt][ tokens[next][1].token]; } } @@ -556,6 +771,11 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, tokens[i][1].token = best ? t1 : t0; tokens[i][1].qc = x; best_index[i][1] = best; +#if CONFIG_CODE_NONZEROCOUNT + new_nzc1 = (best ? nzc1 : nzc0) - (!x); + nzc0 = new_nzc0; + nzc1 = new_nzc1; +#endif /* Finally, make this the new head of the trellis. */ next = i; } @@ -563,16 +783,18 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, * add a new trellis node, but we do need to update the costs. */ else { - band = bands[i + 1]; + band = get_coef_band(scan, tx_size, i + 1); t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { - tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0]; + tokens[next][0].rate += + mb->token_costs[tx_size][type][ref][band][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { - tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1]; + tokens[next][1].rate += + mb->token_costs[tx_size][type][ref][band][0][t1]; tokens[next][1].token = ZERO_TOKEN; } /* Don't update next, because we didn't add a new node. */ @@ -580,7 +802,7 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, } /* Now pick the best path through the whole trellis. */ - band = bands[i + 1]; + band = get_coef_band(scan, tx_size, i + 1); VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; @@ -588,99 +810,41 @@ static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][band][pt][t0]; - rate1 += mb->token_costs[tx_size][type][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0]; + rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; +#if CONFIG_CODE_NONZEROCOUNT + final_nzc_exp = (best ? 
nzc1 : nzc0); +#endif final_eob = i0 - 1; for (i = next; i < eob; i = next) { x = tokens[i][best].qc; - if (x) + if (x) { final_eob = i; +#if CONFIG_CODE_NONZEROCOUNT + ++final_nzc; +#endif + } rc = scan[i]; qcoeff_ptr[rc] = x; - dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]); + dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul; next = tokens[i][best].next; best = best_index[i][best]; } final_eob++; - d->eob = final_eob; - *a = *l = (d->eob > !type); + xd->eobs[ib] = final_eob; + *a = *l = (final_eob > 0); +#if CONFIG_CODE_NONZEROCOUNT + assert(final_nzc == final_nzc_exp); + xd->nzcs[ib] = final_nzc; +#endif } -/************************************************************************** -our inverse hadamard transform effectively is weighted sum of all 16 inputs -with weight either 1 or -1. It has a last stage scaling of (sum+1)>>2. And -dc only idct is (dc+16)>>5. So if all the sums are between -65 and 63 the -output after inverse wht and idct will be all zero. A sum of absolute value -smaller than 65 guarantees all 16 different (+1/-1) weighted sums in wht -fall between -65 and +65. -**************************************************************************/ -#define SUM_2ND_COEFF_THRESH 65 - -static void check_reset_2nd_coeffs(MACROBLOCKD *xd, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - int sum = 0; - int i; - BLOCKD *bd = &xd->block[24]; - if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH - && bd->dequant[1] >= SUM_2ND_COEFF_THRESH) - return; - - for (i = 0; i < bd->eob; i++) { - int coef = bd->dqcoeff[vp9_default_zig_zag1d_4x4[i]]; - sum += (coef >= 0) ? coef : -coef; - if (sum >= SUM_2ND_COEFF_THRESH) - return; - } - - if (sum < SUM_2ND_COEFF_THRESH) { - for (i = 0; i < bd->eob; i++) { - int rc = vp9_default_zig_zag1d_4x4[i]; - bd->qcoeff[rc] = 0; - bd->dqcoeff[rc] = 0; - } - bd->eob = 0; - *a = *l = (bd->eob != 0); - } -} - -#define SUM_2ND_COEFF_THRESH_8X8 32 -static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { - int sum = 0; - BLOCKD *bd = &xd->block[24]; - int coef; - - coef = bd->dqcoeff[0]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[1]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[4]; - sum += (coef >= 0) ? coef : -coef; - coef = bd->dqcoeff[8]; - sum += (coef >= 0) ? coef : -coef; - - if (sum < SUM_2ND_COEFF_THRESH_8X8) { - bd->qcoeff[0] = 0; - bd->dqcoeff[0] = 0; - bd->qcoeff[1] = 0; - bd->dqcoeff[1] = 0; - bd->qcoeff[4] = 0; - bd->dqcoeff[4] = 0; - bd->qcoeff[8] = 0; - bd->dqcoeff[8] = 0; - bd->eob = 0; - *a = *l = (bd->eob != 0); - } -} - -void vp9_optimize_mby_4x4(MACROBLOCK *x) { +void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { int b; - PLANE_TYPE type; - int has_2nd_order; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; @@ -694,28 +858,14 @@ void vp9_optimize_mby_4x4(MACROBLOCK *x) { ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; - has_2nd_order = get_2nd_order_usage(&x->e_mbd); - - type = has_2nd_order ? 
PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; - for (b = 0; b < 16; b++) { - optimize_b(x, b, type, - ta + vp9_block2above[TX_4X4][b], - tl + vp9_block2left[TX_4X4][b], TX_4X4); - } - - if (has_2nd_order) { - b = 24; - optimize_b(x, b, PLANE_TYPE_Y2, + optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant, ta + vp9_block2above[TX_4X4][b], tl + vp9_block2left[TX_4X4][b], TX_4X4); - check_reset_2nd_coeffs(&x->e_mbd, - ta + vp9_block2above[TX_4X4][b], - tl + vp9_block2left[TX_4X4][b]); } } -void vp9_optimize_mbuv_4x4(MACROBLOCK *x) { +void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { int b; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; @@ -731,24 +881,22 @@ void vp9_optimize_mbuv_4x4(MACROBLOCK *x) { tl = (ENTROPY_CONTEXT *)&t_left; for (b = 16; b < 24; b++) { - optimize_b(x, b, PLANE_TYPE_UV, + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant, ta + vp9_block2above[TX_4X4][b], tl + vp9_block2left[TX_4X4][b], TX_4X4); } } -static void optimize_mb_4x4(MACROBLOCK *x) { - vp9_optimize_mby_4x4(x); - vp9_optimize_mbuv_4x4(x); +static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + vp9_optimize_mby_4x4(cm, x); + vp9_optimize_mbuv_4x4(cm, x); } -void vp9_optimize_mby_8x8(MACROBLOCK *x) { +void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { int b; - PLANE_TYPE type; ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); if (!x->e_mbd.above_context || !x->e_mbd.left_context) return; @@ -758,31 +906,19 @@ void vp9_optimize_mby_8x8(MACROBLOCK *x) { ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; - type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; for (b = 0; b < 16; b += 4) { ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b]; ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b]; -#if CONFIG_CNVCONTEXT ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif - optimize_b(x, b, type, &above_ec, &left_ec, TX_8X8); + optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant, + &above_ec, &left_ec, TX_8X8); a[1] = a[0] = above_ec; l[1] = l[0] = left_ec; } - - // 8x8 always have 2nd order block - if (has_2nd_order) { - check_reset_8x8_2nd_coeffs(&x->e_mbd, - ta + vp9_block2above[TX_8X8][24], - tl + vp9_block2left[TX_8X8][24]); - } } -void vp9_optimize_mbuv_8x8(MACROBLOCK *x) { +void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { int b; ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context; ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context; @@ -793,23 +929,19 @@ void vp9_optimize_mbuv_8x8(MACROBLOCK *x) { for (b = 16; b < 24; b += 4) { ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b]; ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b]; -#if CONFIG_CNVCONTEXT ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0; ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0; -#else - ENTROPY_CONTEXT above_ec = a[0]; - ENTROPY_CONTEXT left_ec = l[0]; -#endif - optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8); + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant, + &above_ec, &left_ec, TX_8X8); } } -static void optimize_mb_8x8(MACROBLOCK *x) { - vp9_optimize_mby_8x8(x); - vp9_optimize_mbuv_8x8(x); +static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + vp9_optimize_mby_8x8(cm, x); + 
vp9_optimize_mbuv_8x8(cm, x); } -void vp9_optimize_mby_16x16(MACROBLOCK *x) { +void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context; ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context; ENTROPY_CONTEXT ta, tl; @@ -817,22 +949,345 @@ void vp9_optimize_mby_16x16(MACROBLOCK *x) { if (!t_above || !t_left) return; -#if CONFIG_CNVCONTEXT ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0; tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0; -#else - ta = t_above->y1[0]; - tl = t_left->y1[0]; -#endif - optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16); + optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + &ta, &tl, TX_16X16); +} + +static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + vp9_optimize_mby_16x16(cm, x); + vp9_optimize_mbuv_8x8(cm, x); +} + +void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT ta, tl; + + ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0; + tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0; + optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + &ta, &tl, TX_32X32); +} + +void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT ta[2], tl[2]; + int n; + + ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0; + ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0; + tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0; + tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0; + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_16X16); + } +} + +void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT ta[4], tl[4]; + int n; + + ta[0] = (a[0] + a[1]) != 0; + ta[1] = (a[2] + a[3]) != 0; + ta[2] = (a1[0] + a1[1]) != 0; + ta[3] = (a1[2] + a1[3]) != 0; + tl[0] = (l[0] + l[1]) != 0; + tl[1] = (l[2] + l[3]) != 0; + tl[2] = (l1[0] + l1[1]) != 0; + tl[3] = (l1[2] + l1[3]) != 0; + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + + optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_8X8); + } +} + +void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT ta[8], tl[8]; + int n; + + vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * 
sizeof(ENTROPY_CONTEXT)); + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + + optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_4X4); + } +} + +void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec; + int b; + + for (b = 64; b < 96; b += 16) { + const int cidx = b >= 80 ? 20 : 16; + a = ta + vp9_block2above_sb[TX_16X16][b]; + l = tl + vp9_block2left_sb[TX_16X16][b]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; + left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + &above_ec, &left_ec, TX_16X16); + } +} + +void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left; + ENTROPY_CONTEXT *a, *l, above_ec, left_ec; + int b; + + vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above)); + vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left)); + for (b = 64; b < 96; b += 4) { + const int cidx = b >= 80 ? 20 : 16; + a = ta + vp9_block2above_sb[TX_8X8][b]; + l = tl + vp9_block2left_sb[TX_8X8][b]; + above_ec = (a[0] + a[1]) != 0; + left_ec = (l[0] + l[1]) != 0; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + &above_ec, &left_ec, TX_8X8); + a[0] = a[1] = above_ec; + l[0] = l[1] = left_ec; + } +} + +void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left; + ENTROPY_CONTEXT *a, *l; + int b; + + vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above)); + vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left)); + for (b = 64; b < 96; b++) { + const int cidx = b >= 80 ? 
20 : 16; + a = ta + vp9_block2above_sb[TX_4X4][b]; + l = tl + vp9_block2left_sb[TX_4X4][b]; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + a, l, TX_4X4); + } +} + +void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2); + ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2); + ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3); + ENTROPY_CONTEXT ta[2], tl[2]; + int n; + + ta[0] = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0; + ta[1] = (a2[0] + a2[1] + a2[2] + a2[3] + a3[0] + a3[1] + a3[2] + a3[3]) != 0; + tl[0] = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0; + tl[1] = (l2[0] + l2[1] + l2[2] + l2[3] + l3[0] + l3[1] + l3[2] + l3[3]) != 0; + for (n = 0; n < 4; n++) { + const int x_idx = n & 1, y_idx = n >> 1; + + optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_32X32); + } +} + +void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2); + ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2); + ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3); + ENTROPY_CONTEXT ta[4], tl[4]; + int n; + + ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0; + ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0; + ta[2] = (a2[0] + a2[1] + a2[2] + a2[3]) != 0; + ta[3] = (a3[0] + a3[1] + a3[2] + a3[3]) != 0; + tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0; + tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0; + tl[2] = (l2[0] + l2[1] + l2[2] + l2[3]) != 0; + tl[3] = (l3[0] + l3[1] + l3[2] + l3[3]) != 0; + for (n = 0; n < 16; n++) { + const int x_idx = n & 3, y_idx = n >> 2; + + optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_16X16); + } +} + +void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1); + ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2); + ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3); + ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1); + ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2); + ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3); + ENTROPY_CONTEXT ta[8], tl[8]; + int n; + + ta[0] = (a[0] + a[1]) != 0; + ta[1] = (a[2] + a[3]) != 0; + ta[2] = (a1[0] + a1[1]) != 0; + ta[3] = (a1[2] + a1[3]) != 0; + ta[4] = (a2[0] + a2[1]) != 0; + ta[5] = (a2[2] + a2[3]) != 0; + ta[6] = (a3[0] + a3[1]) != 0; + ta[7] 
= (a3[2] + a3[3]) != 0; + tl[0] = (l[0] + l[1]) != 0; + tl[1] = (l[2] + l[3]) != 0; + tl[2] = (l1[0] + l1[1]) != 0; + tl[3] = (l1[2] + l1[3]) != 0; + tl[4] = (l2[0] + l2[1]) != 0; + tl[5] = (l2[2] + l2[3]) != 0; + tl[6] = (l3[0] + l3[1]) != 0; + tl[7] = (l3[2] + l3[3]) != 0; + for (n = 0; n < 64; n++) { + const int x_idx = n & 7, y_idx = n >> 3; + + optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_8X8); + } +} + +void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT ta[16], tl[16]; + int n; + + vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(ta + 8, x->e_mbd.above_context + 2, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(ta + 12, x->e_mbd.above_context + 3, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl + 8, x->e_mbd.left_context + 2, 4 * sizeof(ENTROPY_CONTEXT)); + vpx_memcpy(tl + 12, x->e_mbd.left_context + 3, 4 * sizeof(ENTROPY_CONTEXT)); + for (n = 0; n < 256; n++) { + const int x_idx = n & 15, y_idx = n >> 4; + + optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant, + ta + x_idx, tl + y_idx, TX_4X4); + } +} + +void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context; + ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec; + int b; + + for (b = 256; b < 384; b += 64) { + const int cidx = b >= 320 ? 20 : 16; + a = ta + vp9_block2above_sb64[TX_32X32][b]; + l = tl + vp9_block2left_sb64[TX_32X32][b]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a2 = a + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l2 = l + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a3 = a + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0; + l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + &a_ec, &l_ec, TX_32X32); + } +} + +void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left; + ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec; + int b; + + vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above)); + vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left)); + for (b = 256; b < 384; b += 16) { + const int cidx = b >= 320 ? 
20 : 16; + a = ta + vp9_block2above_sb64[TX_16X16][b]; + l = tl + vp9_block2left_sb64[TX_16X16][b]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; + left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + &above_ec, &left_ec, TX_16X16); + a[0] = a[1] = a1[0] = a1[1] = above_ec; + l[0] = l[1] = l1[0] = l1[1] = left_ec; + } } -static void optimize_mb_16x16(MACROBLOCK *x) { - vp9_optimize_mby_16x16(x); - vp9_optimize_mbuv_8x8(x); +void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left; + ENTROPY_CONTEXT *a, *l, above_ec, left_ec; + int b; + + vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above)); + vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left)); + for (b = 256; b < 384; b += 4) { + const int cidx = b >= 320 ? 20 : 16; + a = ta + vp9_block2above_sb64[TX_8X8][b]; + l = tl + vp9_block2left_sb64[TX_8X8][b]; + above_ec = (a[0] + a[1]) != 0; + left_ec = (l[0] + l[1]) != 0; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + &above_ec, &left_ec, TX_8X8); + a[0] = a[1] = above_ec; + l[0] = l[1] = left_ec; + } +} + +void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left; + ENTROPY_CONTEXT *a, *l; + int b; + + vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above)); + vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left)); + for (b = 256; b < 384; b++) { + const int cidx = b >= 320 ? 
20 : 16; + a = ta + vp9_block2above_sb64[TX_4X4][b]; + l = tl + vp9_block2left_sb64[TX_4X4][b]; + optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant, + a, l, TX_4X4); + } } -void vp9_fidct_mb(MACROBLOCK *x) { +void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size; @@ -840,7 +1295,7 @@ void vp9_fidct_mb(MACROBLOCK *x) { vp9_transform_mb_16x16(x); vp9_quantize_mb_16x16(x); if (x->optimize) - optimize_mb_16x16(x); + optimize_mb_16x16(cm, x); vp9_inverse_transform_mb_16x16(xd); } else if (tx_size == TX_8X8) { if (xd->mode_info_context->mbmi.mode == SPLITMV) { @@ -850,8 +1305,8 @@ void vp9_fidct_mb(MACROBLOCK *x) { vp9_quantize_mby_8x8(x); vp9_quantize_mbuv_4x4(x); if (x->optimize) { - vp9_optimize_mby_8x8(x); - vp9_optimize_mbuv_4x4(x); + vp9_optimize_mby_8x8(cm, x); + vp9_optimize_mbuv_4x4(cm, x); } vp9_inverse_transform_mby_8x8(xd); vp9_inverse_transform_mbuv_4x4(xd); @@ -859,33 +1314,34 @@ void vp9_fidct_mb(MACROBLOCK *x) { vp9_transform_mb_8x8(x); vp9_quantize_mb_8x8(x); if (x->optimize) - optimize_mb_8x8(x); + optimize_mb_8x8(cm, x); vp9_inverse_transform_mb_8x8(xd); } } else { transform_mb_4x4(x); vp9_quantize_mb_4x4(x); if (x->optimize) - optimize_mb_4x4(x); + optimize_mb_4x4(cm, x); vp9_inverse_transform_mb_4x4(xd); } } -void vp9_encode_inter16x16(MACROBLOCK *x) { +void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x, + int mb_row, int mb_col) { MACROBLOCKD *const xd = &x->e_mbd; - vp9_build_inter_predictors_mb(xd); + vp9_build_inter_predictors_mb(xd, mb_row, mb_col); subtract_mb(x); - vp9_fidct_mb(x); + vp9_fidct_mb(cm, x); vp9_recon_mb(xd); } /* this function is used by first pass only */ -void vp9_encode_inter16x16y(MACROBLOCK *x) { +void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) { MACROBLOCKD *xd = &x->e_mbd; BLOCK *b = &x->block[0]; - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col); vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride); diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index f3c679227f7e5dc12f94025a47a2f70cdb8a230d..242afbeae9ab87e5761233eaf9513860bd1df162 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/common/vp9_onyxc_int.h" typedef struct { MB_PREDICTION_MODE mode; @@ -21,32 +23,60 @@ typedef struct { } MODE_DEFINITION; -#include "vp9/encoder/vp9_onyx_int.h" struct VP9_ENCODER_RTCD; -void vp9_encode_inter16x16(MACROBLOCK *x); +void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x, + int mb_row, int mb_col); void vp9_transform_mbuv_4x4(MACROBLOCK *x); void vp9_transform_mby_4x4(MACROBLOCK *x); -void vp9_optimize_mby_4x4(MACROBLOCK *x); -void vp9_optimize_mbuv_4x4(MACROBLOCK *x); -void vp9_encode_inter16x16y(MACROBLOCK *x); +void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col); void vp9_transform_mb_8x8(MACROBLOCK *mb); void vp9_transform_mby_8x8(MACROBLOCK *x); void vp9_transform_mbuv_8x8(MACROBLOCK *x); -void vp9_build_dcblock_8x8(MACROBLOCK *b); -void vp9_optimize_mby_8x8(MACROBLOCK *x); -void vp9_optimize_mbuv_8x8(MACROBLOCK *x); +void vp9_optimize_mby_8x8(VP9_COMMON *const cm, 
MACROBLOCK *x); +void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x); void vp9_transform_mb_16x16(MACROBLOCK *mb); void vp9_transform_mby_16x16(MACROBLOCK *x); -void vp9_optimize_mby_16x16(MACROBLOCK *x); +void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x); void vp9_transform_sby_32x32(MACROBLOCK *x); +void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sby_16x16(MACROBLOCK *x); +void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sby_8x8(MACROBLOCK *x); +void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sby_4x4(MACROBLOCK *x); +void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x); void vp9_transform_sbuv_16x16(MACROBLOCK *x); +void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sbuv_8x8(MACROBLOCK *x); +void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sbuv_4x4(MACROBLOCK *x); +void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x); + +void vp9_transform_sb64y_32x32(MACROBLOCK *x); +void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64y_16x16(MACROBLOCK *x); +void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64y_8x8(MACROBLOCK *x); +void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64y_4x4(MACROBLOCK *x); +void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64uv_32x32(MACROBLOCK *x); +void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64uv_16x16(MACROBLOCK *x); +void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64uv_8x8(MACROBLOCK *x); +void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x); +void vp9_transform_sb64uv_4x4(MACROBLOCK *x); +void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x); -void vp9_fidct_mb(MACROBLOCK *x); +void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x); void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch); @@ -63,5 +93,11 @@ void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc, const uint8_t *vsrc, int src_stride, const uint8_t *upred, const uint8_t *vpred, int dst_stride); +void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride, + const uint8_t *pred, int dst_stride); +void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc, + const uint8_t *vsrc, int src_stride, + const uint8_t *upred, + const uint8_t *vpred, int dst_stride); #endif // VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8df6c20a75f726f7ff47fa737fd128e9fc997f21..70f9e3153a2de5cbb59396624bdb201fea3442b1 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -378,6 +378,19 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; int new_mv_mode_penalty = 256; + int sr = 0; + int quart_frm = MIN(cpi->common.width, cpi->common.height); + + // refine the motion search range according to the frame dimension + // for first pass test + while ((quart_frm << sr) < MAX_FULL_PEL_VAL) + sr++; + if (sr) + sr--; + + step_param += sr; + further_steps -= sr; + // override the default variance function to use MSE v_fn_ptr.vf = vp9_mse16x16; @@ -435,9 +448,11 @@ void vp9_first_pass(VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; int recon_yoffset,
recon_uvoffset; - YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *lst_yv12 = + &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]]; YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; - YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = + &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]]; int recon_y_stride = lst_yv12->y_stride; int recon_uv_stride = lst_yv12->uv_stride; int64_t intra_error = 0; @@ -611,7 +626,7 @@ void vp9_first_pass(VP9_COMP *cpi) { this_error = motion_error; vp9_set_mbmode_and_mvs(x, NEWMV, &mv); xd->mode_info_context->mbmi.txfm_size = TX_4X4; - vp9_encode_inter16x16y(x); + vp9_encode_inter16x16y(x, mb_row, mb_col); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; @@ -843,16 +858,15 @@ static double calc_correction_factor(double err_per_mb, power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low; power_term = (power_term > pt_high) ? pt_high : power_term; - // Adjustments to error term - // TBD - // Calculate correction factor + if (power_term < 1.0) + assert(error_term >= 0.0); correction_factor = pow(error_term, power_term); // Clip range correction_factor = (correction_factor < 0.05) - ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor; + ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor; return correction_factor; } @@ -886,8 +900,7 @@ static void adjust_maxq_qrange(VP9_COMP *cpi) { static int estimate_max_q(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh, - int overhead_bits) { + int section_target_bandwitdh) { int Q; int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; @@ -898,7 +911,6 @@ static int estimate_max_q(VP9_COMP *cpi, double err_per_mb = section_err / num_mbs; double err_correction_factor; double speed_correction = 1.0; - double overhead_bits_per_mb; if (section_target_bandwitdh <= 0) return cpi->twopass.maxq_max_limit; // Highest value allowed @@ -910,15 +922,19 @@ static int estimate_max_q(VP9_COMP *cpi, // Look at the drop in prediction quality between the last frame // and the GF buffer (which contained an older frame). - sr_err_diff = - (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = (sr_err_diff / 32.0); - sr_correction = pow(sr_correction, 0.25); - if (sr_correction < 0.75) + if (fpstats->sr_coded_error > fpstats->coded_error) { + sr_err_diff = + (fpstats->sr_coded_error - fpstats->coded_error) / + (fpstats->count * cpi->common.MBs); + sr_correction = (sr_err_diff / 32.0); + sr_correction = pow(sr_correction, 0.25); + if (sr_correction < 0.75) + sr_correction = 0.75; + else if (sr_correction > 1.25) + sr_correction = 1.25; + } else { sr_correction = 0.75; - else if (sr_correction > 1.25) - sr_correction = 1.25; + } // Calculate a corrective factor based on a rolling ratio of bits spent // vs target bits @@ -950,13 +966,6 @@ static int estimate_max_q(VP9_COMP *cpi, speed_correction = 1.25; } - // Estimate of overhead bits per mb - // Correction to overhead bits for min allowed Q. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - overhead_bits_per_mb = overhead_bits / num_mbs; - overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit); - // Try and pick a max Q that will be high enough to encode the // content at the given rate. 
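The loop that follows implements this search directly: starting from the lowest allowed max Q, it raises Q until the projected bits per macroblock (the rate model scaled by the combined correction factor, which this patch folds into vp9_bits_per_mb()) drops to the per-MB budget. A minimal standalone sketch of that pattern, with an assumed monotonically decreasing stand-in for the real rate model:

  /* Sketch only: bits_per_mb_at_q() is an illustrative stand-in for
   * vp9_bits_per_mb(); its constants are not the encoder's. */
  static int bits_per_mb_at_q(int q, double correction) {
    return (int)(correction * 50000.0 / (q + 1));
  }

  static int pick_max_q(int q_min, int q_max, int target_bits_per_mb,
                        double correction) {
    int q;
    for (q = q_min; q < q_max; q++) {
      /* Stop at the first Q whose projected spend fits the budget. */
      if (bits_per_mb_at_q(q, correction) <= target_bits_per_mb)
        break;
    }
    return q;
  }

For example, pick_max_q(0, 255, 400, 1.0) walks up from Q = 0 until the model's projected per-MB spend fits a 400-bit budget, mirroring the scan over [maxq_min_limit, maxq_max_limit) below.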
for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) { @@ -967,23 +976,9 @@ static int estimate_max_q(VP9_COMP *cpi, sr_correction * speed_correction * cpi->twopass.est_max_qcorrection_factor; - if (err_correction_factor < 0.05) - err_correction_factor = 0.05; - else if (err_correction_factor > 5.0) - err_correction_factor = 5.0; bits_per_mb_at_this_q = - vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb; - - bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * - (double)bits_per_mb_at_this_q); - - // Mode and motion overhead - // As Q rises in real encode loop rd code will force overhead down - // We make a crude adjustment for this here as *.98 per Q step. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor); if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; @@ -1001,7 +996,7 @@ static int estimate_max_q(VP9_COMP *cpi, // PGW TODO.. This code is broken for the extended Q range if ((cpi->ni_frames > ((int)cpi->twopass.total_stats->count >> 8)) && - (cpi->ni_frames > 150)) { + (cpi->ni_frames > 25)) { adjust_maxq_qrange(cpi); } @@ -1012,8 +1007,7 @@ static int estimate_max_q(VP9_COMP *cpi, // complexity and data rate. static int estimate_cq(VP9_COMP *cpi, FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh, - int overhead_bits) { + int section_target_bandwitdh) { int Q; int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; @@ -1026,15 +1020,11 @@ static int estimate_cq(VP9_COMP *cpi, double speed_correction = 1.0; double clip_iiratio; double clip_iifactor; - double overhead_bits_per_mb; - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); - // Estimate of overhead bits per mb - overhead_bits_per_mb = overhead_bits / num_mbs; // Corrections for higher compression speed settings // (reduced compression expected) @@ -1047,15 +1037,19 @@ static int estimate_cq(VP9_COMP *cpi, // Look at the drop in prediction quality between the last frame // and the GF buffer (which contained an older frame). 
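Both estimate_max_q above and estimate_cq below now guard this second-reference correction so that pow() is never applied to a negative base: when the second-reference error is not worse than the coded error, the factor falls straight to its 0.75 floor. Factored out (constants as in the patch; the function boundary itself is illustrative):

  #include <math.h>

  /* Guarded second-reference correction: pow((diff / 32), 0.25) clamped to
   * [0.75, 1.25], defaulting to 0.75 for a non-positive error difference. */
  static double sr_correction_factor(double sr_coded_error,
                                     double coded_error,
                                     double count, double mbs) {
    double sr_correction = 0.75;
    if (sr_coded_error > coded_error) {
      const double sr_err_diff =
          (sr_coded_error - coded_error) / (count * mbs);
      sr_correction = pow(sr_err_diff / 32.0, 0.25);
      if (sr_correction < 0.75)
        sr_correction = 0.75;
      else if (sr_correction > 1.25)
        sr_correction = 1.25;
    }
    return sr_correction;
  }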
- sr_err_diff = - (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = (sr_err_diff / 32.0); - sr_correction = pow(sr_correction, 0.25); - if (sr_correction < 0.75) + if (fpstats->sr_coded_error > fpstats->coded_error) { + sr_err_diff = + (fpstats->sr_coded_error - fpstats->coded_error) / + (fpstats->count * cpi->common.MBs); + sr_correction = (sr_err_diff / 32.0); + sr_correction = pow(sr_correction, 0.25); + if (sr_correction < 0.75) + sr_correction = 0.75; + else if (sr_correction > 1.25) + sr_correction = 1.25; + } else { sr_correction = 0.75; - else if (sr_correction > 1.25) - sr_correction = 1.25; + } // II ratio correction factor for clip as a whole clip_iiratio = cpi->twopass.total_stats->intra_error / @@ -1073,23 +1067,8 @@ static int estimate_cq(VP9_COMP *cpi, calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) * sr_correction * speed_correction * clip_iifactor; - if (err_correction_factor < 0.05) - err_correction_factor = 0.05; - else if (err_correction_factor > 5.0) - err_correction_factor = 5.0; - bits_per_mb_at_this_q = - vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb; - - bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * - (double)bits_per_mb_at_this_q); - - // Mode and motion overhead - // As Q rises in real encode loop rd code will force overhead down - // We make a crude adjustment for this here as *.98 per Q step. - // PGW TODO.. This code is broken for the extended Q range - // for now overhead set to 0. - overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98); + vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor); if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) break; @@ -1209,12 +1188,16 @@ static double get_prediction_decay_rate(VP9_COMP *cpi, mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) / (cpi->common.MBs); - second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0); - second_ref_decay = pow(second_ref_decay, 0.5); - if (second_ref_decay < 0.85) + if (mb_sr_err_diff <= 512.0) { + second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0); + second_ref_decay = pow(second_ref_decay, 0.5); + if (second_ref_decay < 0.85) + second_ref_decay = 0.85; + else if (second_ref_decay > 1.0) + second_ref_decay = 1.0; + } else { second_ref_decay = 0.85; - else if (second_ref_decay > 1.0) - second_ref_decay = 1.0; + } if (second_ref_decay < prediction_decay_rate) prediction_decay_rate = second_ref_decay; @@ -1459,11 +1442,14 @@ static int calc_arf_boost( return arf_boost; } -static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { +static void configure_arnr_filter(VP9_COMP *cpi, + FIRSTPASS_STATS *this_frame, + int group_boost) { int half_gf_int; int frames_after_arf; int frames_bwd = cpi->oxcf.arnr_max_frames - 1; int frames_fwd = cpi->oxcf.arnr_max_frames - 1; + int q; // Define the arnr filter width for this group of frames: // We only filter frames that lie within a distance of half @@ -1508,6 +1494,25 @@ static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; + + // Adjust the strength based on active max q + q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1); + if (q > 8) { + cpi->active_arnr_strength = cpi->oxcf.arnr_strength; + } else { + cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q); + if (cpi->active_arnr_strength < 0) + cpi->active_arnr_strength = 0; + } + + // Adjust number of frames in filter and strength based on gf boost 
level. + if (cpi->active_arnr_frames > (group_boost / 150)) { + cpi->active_arnr_frames = (group_boost / 150); + cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1); + } + if (cpi->active_arnr_strength > (group_boost / 300)) { + cpi->active_arnr_strength = (group_boost / 300); + } } // Analyse and define a gf/arf group . @@ -1531,7 +1536,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - + double mv_ratio_accumulator_thresh; int max_bits = frame_max_bits(cpi); // Max for a single frame unsigned int allow_alt_ref = @@ -1540,6 +1545,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int f_boost = 0; int b_boost = 0; int flash_detected; + int active_max_gf_interval; cpi->twopass.gf_group_bits = 0; @@ -1562,11 +1568,22 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->common.frame_type == KEY_FRAME) gf_group_err -= gf_first_frame_err; - // Scan forward to try and work out how many frames the next gf group - // should contain and what level of boost is appropriate for the GF - // or ARF that will be coded with the group - i = 0; + // Motion breakout threshold for loop below depends on image size. + mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 10.0; + // Work out a maximum interval for the GF. + // If the image appears completely static we can extend beyond this. + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = + 12 + ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 5); + + if (active_max_gf_interval > cpi->max_gf_interval) + active_max_gf_interval = cpi->max_gf_interval; + + i = 0; while (((i < cpi->twopass.static_scene_max_gf_interval) || ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->twopass.frames_to_key)) { @@ -1618,7 +1635,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Break out conditions. 
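Two Q-adaptive limits are introduced above: the temporal filter strength is scaled down at low Q and capped by the group boost, and the maximum GF interval grows with the active quantizer so that the GF cost is spread further when bits are scarce. An isolated sketch, assuming q is the linear quantizer returned by vp9_convert_qindex_to_q() (the break-out test the comment above introduces continues right after this sketch):

  /* Filter strength: full strength once q/2 exceeds 8, reduced below that,
   * never negative, and capped at group_boost / 300 as in the patch. */
  static int arnr_strength(int base_strength, int q, int group_boost) {
    int strength = (q >> 1) > 8 ? base_strength
                                : base_strength - (8 - (q >> 1));
    if (strength < 0)
      strength = 0;
    if (strength > group_boost / 300)
      strength = group_boost / 300;
    return strength;
  }

  /* GF interval cap: 12 plus one frame per 32 steps of linear Q, clamped
   * to the configured maximum. */
  static int gf_interval_cap(int q, int max_gf_interval) {
    const int interval = 12 + (q >> 5);
    return interval < max_gf_interval ? interval : max_gf_interval;
  }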
if ( // Break at cpi->max_gf_interval unless almost totally static - (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) || + (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( // Dont break out with a very short interval (i > MIN_GF_INTERVAL) && @@ -1626,7 +1643,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && - ((mv_ratio_accumulator > 100.0) || + ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > 3.0) || (mv_in_out_accumulator < -2.0) || ((boost_score - old_boost_score) < IIFACTOR)) @@ -1673,7 +1690,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); cpi->source_alt_ref_pending = TRUE; - configure_arnr_filter(cpi, this_frame); + configure_arnr_filter(cpi, this_frame, cpi->gfu_boost); } else { cpi->gfu_boost = (int)boost_score; cpi->source_alt_ref_pending = FALSE; @@ -1945,7 +1962,8 @@ static int adjust_active_maxq(int old_maxqi, int new_maxqi) { void vp9_second_pass(VP9_COMP *cpi) { int tmp_q; - int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame); + int frames_left = (int)(cpi->twopass.total_stats->count - + cpi->common.current_video_frame); FIRSTPASS_STATS this_frame; FIRSTPASS_STATS this_frame_copy; @@ -1953,76 +1971,12 @@ void vp9_second_pass(VP9_COMP *cpi) { double this_frame_intra_error; double this_frame_coded_error; - int overhead_bits; - if (!cpi->twopass.stats_in) { return; } vp9_clear_system_state(); - vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS)); - - if (EOF == input_stats(cpi, &this_frame)) - return; - - this_frame_intra_error = this_frame.intra_error; - this_frame_coded_error = this_frame.coded_error; - - // keyframe and section processing ! - if (cpi->twopass.frames_to_key == 0) { - // Define next KF group and assign bits to it - vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); - find_next_key_frame(cpi, &this_frame_copy); - } - - // Is this a GF / ARF (Note that a KF is always also a GF) - if (cpi->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it - vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); - define_gf_group(cpi, &this_frame_copy); - - // If we are going to code an altref frame at the end of the group and the current frame is not a key frame.... - // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits - // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well - if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) { - // Assign a standard frames worth of bits from those allocated to the GF group - int bak = cpi->per_frame_bandwidth; - vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); - assign_std_frame_bits(cpi, &this_frame_copy); - cpi->per_frame_bandwidth = bak; - } - } - - // Otherwise this is an ordinary frame - else { - // Assign bits from those allocated to the GF group - vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); - assign_std_frame_bits(cpi, &this_frame_copy); - } - - // Keep a globally available copy of this and the next frame's iiratio. 
- cpi->twopass.this_iiratio = (int)(this_frame_intra_error / - DOUBLE_DIVIDE_CHECK(this_frame_coded_error)); - { - FIRSTPASS_STATS next_frame; - if (lookup_next_frame_stats(cpi, &next_frame) != EOF) { - cpi->twopass.next_iiratio = (int)(next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); - } - } - - // Set nominal per second bandwidth for this frame - cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth - * cpi->output_frame_rate); - if (cpi->target_bandwidth < 0) - cpi->target_bandwidth = 0; - - - // Account for mv, mode and other overheads. - overhead_bits = (int)estimate_modemvcost( - cpi, cpi->twopass.total_left_stats); - // Special case code for first frame. if (cpi->common.current_video_frame == 0) { cpi->twopass.est_max_qcorrection_factor = 1.0; @@ -2034,8 +1988,7 @@ void vp9_second_pass(VP9_COMP *cpi) { est_cq = estimate_cq(cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); cpi->cq_target_quality = cpi->oxcf.cq_level; if (est_cq > cpi->cq_target_quality) @@ -2049,21 +2002,23 @@ void vp9_second_pass(VP9_COMP *cpi) { tmp_q = estimate_max_q( cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); cpi->active_worst_quality = tmp_q; cpi->ni_av_qi = tmp_q; cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); +#ifndef ONE_SHOT_Q_ESTIMATE // Limit the maxq value returned subsequently. // This increases the risk of overspend or underspend if the initial // estimate for the clip is bad, but helps prevent excessive // variation in Q, especially near the end of a clip // where for example a small overspend may cause Q to crash adjust_maxq_qrange(cpi); +#endif } +#ifndef ONE_SHOT_Q_ESTIMATE // The last few frames of a clip almost always have too few or too many // bits and for the sake of overly exact rate control we don't want to make // radical adjustments to the allowed quantizer range just to use up a @@ -2078,13 +2033,71 @@ void vp9_second_pass(VP9_COMP *cpi) { tmp_q = estimate_max_q( cpi, cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)(cpi->twopass.bits_left / frames_left)); // Make a damped adjustment to active max Q cpi->active_worst_quality = adjust_active_maxq(cpi->active_worst_quality, tmp_q); } +#endif + + vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS)); + if (EOF == input_stats(cpi, &this_frame)) + return; + + this_frame_intra_error = this_frame.intra_error; + this_frame_coded_error = this_frame.coded_error; + + // Keyframe and section processing! + if (cpi->twopass.frames_to_key == 0) { + // Define next KF group and assign bits to it + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + find_next_key_frame(cpi, &this_frame_copy); + } + + // Is this a GF / ARF (Note that a KF is always also a GF) + if (cpi->frames_till_gf_update_due == 0) { + // Define next gf group and assign bits to it + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + define_gf_group(cpi, &this_frame_copy); + + // If we are going to code an altref frame at the end of the group + // and the current frame is not a key frame....
+ // If the previous group used an arf this frame has already benefited + // from that arf boost and it should not be given extra bits + // If the previous group was NOT coded using arf we may want to apply + // some boost to this GF as well + if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) { + // Assign a standard frames worth of bits from those allocated + // to the GF group + int bak = cpi->per_frame_bandwidth; + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + cpi->per_frame_bandwidth = bak; + } + } else { + // Otherwise this is an ordinary frame + // Assign bits from those allocated to the GF group + vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + assign_std_frame_bits(cpi, &this_frame_copy); + } + + // Keep a globally available copy of this and the next frame's iiratio. + cpi->twopass.this_iiratio = (int)(this_frame_intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_coded_error)); + { + FIRSTPASS_STATS next_frame; + if (lookup_next_frame_stats(cpi, &next_frame) != EOF) { + cpi->twopass.next_iiratio = (int)(next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); + } + } + + // Set nominal per second bandwidth for this frame + cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth + * cpi->output_frame_rate); + if (cpi->target_bandwidth < 0) + cpi->target_bandwidth = 0; cpi->twopass.frames_to_key--; @@ -2092,7 +2105,6 @@ void vp9_second_pass(VP9_COMP *cpi) { subtract_stats(cpi->twopass.total_left_stats, &this_frame); } - static int test_candidate_kf(VP9_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 19bc4d67d9dbde96e9d7fcdc6801d889a60f2dc6..2296a66695a30ca8a27eee810ebb927c54948369 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -11,12 +11,12 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ -extern void vp9_init_first_pass(VP9_COMP *cpi); -extern void vp9_first_pass(VP9_COMP *cpi); -extern void vp9_end_first_pass(VP9_COMP *cpi); +void vp9_init_first_pass(VP9_COMP *cpi); +void vp9_first_pass(VP9_COMP *cpi); +void vp9_end_first_pass(VP9_COMP *cpi); -extern void vp9_init_second_pass(VP9_COMP *cpi); -extern void vp9_second_pass(VP9_COMP *cpi); -extern void vp9_end_second_pass(VP9_COMP *cpi); +void vp9_init_second_pass(VP9_COMP *cpi); +void vp9_second_pass(VP9_COMP *cpi); +void vp9_end_second_pass(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index 1bca9d267c2a300c5e32b62dd4af99b284b94775..a89d2547e16b6706677e26fe8e9f000bab228c13 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -9,7 +9,9 @@ */ #include <assert.h> #include <stdlib.h> + #include "vpx_config.h" +#include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/common/vp9_extend.h" @@ -25,10 +27,9 @@ struct lookahead_ctx { /* Return the buffer at the given absolute index and increment the index */ -static struct lookahead_entry * -pop(struct lookahead_ctx *ctx, - unsigned int *idx) { - unsigned int index = *idx; +static struct lookahead_entry * pop(struct lookahead_ctx *ctx, + unsigned int *idx) { + unsigned int index = *idx; struct lookahead_entry *buf = ctx->buf + index; assert(index < ctx->max_sz); @@ -39,8 +40,7 @@ pop(struct lookahead_ctx *ctx, } -void -vp9_lookahead_destroy(struct lookahead_ctx *ctx) { +void 
vp9_lookahead_destroy(struct lookahead_ctx *ctx) { if (ctx) { if (ctx->buf) { unsigned int i; @@ -54,23 +54,15 @@ vp9_lookahead_destroy(struct lookahead_ctx *ctx) { } -struct lookahead_ctx * -vp9_lookahead_init(unsigned int width, - unsigned int height, - unsigned int depth) { +struct lookahead_ctx * vp9_lookahead_init(unsigned int width, + unsigned int height, + unsigned int depth) { struct lookahead_ctx *ctx = NULL; - /* Clamp the lookahead queue depth */ - if (depth < 1) - depth = 1; - else if (depth > MAX_LAG_BUFFERS) - depth = MAX_LAG_BUFFERS; - - /* Align the buffer dimensions */ - width = (width + 15) &~15; - height = (height + 15) &~15; + // Clamp the lookahead queue depth + depth = clamp(depth, 1, MAX_LAG_BUFFERS); - /* Allocate the lookahead structures */ + // Allocate the lookahead structures ctx = calloc(1, sizeof(*ctx)); if (ctx) { unsigned int i; @@ -90,13 +82,9 @@ bail: } -int -vp9_lookahead_push(struct lookahead_ctx *ctx, - YV12_BUFFER_CONFIG *src, - int64_t ts_start, - int64_t ts_end, - unsigned int flags, - unsigned char *active_map) { +int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, unsigned int flags, + unsigned char *active_map) { struct lookahead_entry *buf; int row, col, active_end; int mb_rows = (src->y_height + 15) >> 4; @@ -156,9 +144,8 @@ vp9_lookahead_push(struct lookahead_ctx *ctx, } -struct lookahead_entry * -vp9_lookahead_pop(struct lookahead_ctx *ctx, - int drain) { +struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx, + int drain) { struct lookahead_entry *buf = NULL; if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) { @@ -169,9 +156,8 @@ vp9_lookahead_pop(struct lookahead_ctx *ctx, } -struct lookahead_entry * -vp9_lookahead_peek(struct lookahead_ctx *ctx, - int index) { +struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx, + int index) { struct lookahead_entry *buf = NULL; assert(index < (int)ctx->max_sz); @@ -184,8 +170,6 @@ vp9_lookahead_peek(struct lookahead_ctx *ctx, return buf; } - -unsigned int -vp9_lookahead_depth(struct lookahead_ctx *ctx) { +unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; } diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h index a7aad46a598359b6b8f079dff831b0a72a61319a..2406618b9c42ba83265ef82a7777df41b2dabf48 100644 --- a/vp9/encoder/vp9_lookahead.h +++ b/vp9/encoder/vp9_lookahead.h @@ -28,17 +28,13 @@ struct lookahead_ctx; * * The lookahead stage is a queue of frame buffers on which some analysis * may be done when buffers are enqueued. 
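Underneath, this queue is a fixed-size ring: the pop() helper above hands back the slot at the read index and advances it with wrap-around, which is what lets push and pop reuse buffers without copying. A toy model of that indexing (types and the capacity here are illustrative, not the encoder's):

  #include <assert.h>

  struct ring {
    int buf[8];
    unsigned int max_sz;  /* capacity in use, at most 8 in this model */
    unsigned int sz;      /* current number of queued entries */
  };

  /* Return the slot at *idx and advance the index, wrapping at max_sz,
   * mirroring the static pop() helper in vp9_lookahead.c above. */
  static int *ring_pop(struct ring *r, unsigned int *idx) {
    unsigned int index = *idx;
    int *slot = &r->buf[index];
    assert(index < r->max_sz);
    if (++index >= r->max_sz)
      index -= r->max_sz;
    *idx = index;
    r->sz--;
    return slot;
  }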
- * - * */ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, unsigned int height, - unsigned int depth - ); + unsigned int depth); /**\brief Destroys the lookahead stage - * */ void vp9_lookahead_destroy(struct lookahead_ctx *ctx); @@ -58,13 +54,9 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx); * \param[in] flags Flags set on this frame * \param[in] active_map Map that specifies which macroblock is active */ -int -vp9_lookahead_push(struct lookahead_ctx *ctx, - YV12_BUFFER_CONFIG *src, - int64_t ts_start, - int64_t ts_end, - unsigned int flags, - unsigned char *active_map); +int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, unsigned int flags, + unsigned char *active_map); /**\brief Get the next source buffer to encode @@ -76,11 +68,9 @@ vp9_lookahead_push(struct lookahead_ctx *ctx, * * \retval NULL, if drain set and queue is empty * \retval NULL, if drain not set and queue not of the configured depth - * */ -struct lookahead_entry * -vp9_lookahead_pop(struct lookahead_ctx *ctx, - int drain); +struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx, + int drain); /**\brief Get a future source buffer to encode @@ -89,18 +79,15 @@ vp9_lookahead_pop(struct lookahead_ctx *ctx, * \param[in] index Index of the frame to be returned, 0 == next frame * * \retval NULL, if no buffer exists at the specified index - * */ -struct lookahead_entry * -vp9_lookahead_peek(struct lookahead_ctx *ctx, - int index); +struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx, + int index); /**\brief Get the number of frames currently in the lookahead queue * * \param[in] ctx Pointer to the lookahead context */ -unsigned int -vp9_lookahead_depth(struct lookahead_ctx *ctx); +unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); #endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 0ff60c8b060ef7b27154fba1114da717c5e74c81..715d683778a3dfc596c6ddfb8961b8020e6d8e62 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -20,14 +20,16 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv *ref_mv, - int_mv *dst_mv) { + int_mv *dst_mv, + int mb_row, + int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; BLOCK *b = &x->block[0]; BLOCKD *d = &xd->block[0]; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; unsigned int best_err; - int step_param; + int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -36,11 +38,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv ref_full; // Further step/diamond searches as necessary - if (cpi->Speed < 8) { - step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0); - } else { - step_param = cpi->sf.first_step + 2; - } + int step_param = cpi->sf.first_step + + (cpi->Speed < 8 ? (cpi->Speed > 5 ? 
1 : 0) : 2); vp9_clamp_mv_min_max(x, ref_mv); @@ -72,7 +71,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, } vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv); - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); + vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col); best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, INT_MAX); @@ -93,8 +92,9 @@ static int do_16x16_motion_search YV12_BUFFER_CONFIG *buf, int buf_mb_y_offset, YV12_BUFFER_CONFIG *ref, - int mb_y_offset -) { + int mb_y_offset, + int mb_row, + int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; unsigned int err, tmp_err; @@ -124,7 +124,7 @@ static int do_16x16_motion_search // Test last reference frame using the previous best mv as the // starting point (best reference) for the search - tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv); + tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { err = tmp_err; dst_mv->as_int = tmp_mv.as_int; @@ -136,7 +136,8 @@ static int do_16x16_motion_search int_mv zero_ref_mv, tmp_mv; zero_ref_mv.as_int = 0; - tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv); + tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, + mb_row, mb_col); if (tmp_err < err) { dst_mv->as_int = tmp_mv.as_int; err = tmp_err; @@ -229,7 +230,9 @@ static void update_mbgraph_mb_stats int gld_y_offset, YV12_BUFFER_CONFIG *alt_ref, int_mv *prev_alt_ref_mv, - int arf_y_offset + int arf_y_offset, + int mb_row, + int mb_col ) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -249,7 +252,8 @@ static void update_mbgraph_mb_stats int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv, &stats->ref[GOLDEN_FRAME].m.mv, buf, mb_y_offset, - golden_ref, gld_y_offset); + golden_ref, gld_y_offset, + mb_row, mb_col); stats->ref[GOLDEN_FRAME].err = g_motion_error; } else { stats->ref[GOLDEN_FRAME].err = INT_MAX; @@ -292,6 +296,9 @@ static void update_mbgraph_frame_stats int_mv arf_top_mv, gld_top_mv; MODE_INFO mi_local; + // Make sure the mi context starts in a consistent state. + memset(&mi_local, 0, sizeof(mi_local)); + // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; gld_top_mv.as_int = 0; @@ -323,7 +330,8 @@ static void update_mbgraph_frame_stats update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref, &gld_left_mv, gld_y_in_offset, - alt_ref, &arf_left_mv, arf_y_in_offset); + alt_ref, &arf_left_mv, arf_y_in_offset, + mb_row, mb_col); arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int; gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int; if (mb_col == 0) { @@ -412,7 +420,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs; // This error case should not be reachable as this function should - // never be called with the common data structure unititialized. + // never be called with the common data structure uninitialized. 
else cpi->static_mb_pct = 0; @@ -427,13 +435,11 @@ static void separate_arf_mbs(VP9_COMP *cpi) { vpx_free(arf_not_zz); } -void vp9_update_mbgraph_stats -( - VP9_COMP *cpi -) { +void vp9_update_mbgraph_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i, n_frames = vp9_lookahead_depth(cpi->lookahead); - YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *golden_ref = + &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]]; // we need to look ahead beyond where the ARF transitions into // being a GF - so exit if we don't look ahead beyond that diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index db23eca330b5b80c2e5ffdc47efaaf16f9dbc033..c5bca4d01f53c8ad3b156e6437cdaf0f72fe6858 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -11,6 +11,6 @@ #ifndef VP9_ENCODER_VP9_MBGRAPH_H_ #define VP9_ENCODER_VP9_MBGRAPH_H_ -extern void vp9_update_mbgraph_stats(VP9_COMP *cpi); +void vp9_update_mbgraph_stats(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 4694a92c60520be2d3df1f3c0de447e607720746..e642b7487b3218a39a2e180e5c9999b5f8bfdbd6 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -8,27 +8,22 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <stdio.h> +#include <limits.h> +#include <math.h> #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_mcomp.h" #include "vpx_mem/vpx_mem.h" #include "./vpx_config.h" -#include <stdio.h> -#include <limits.h> -#include <math.h> #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" -#ifdef ENTROPY_STATS -static int mv_ref_ct [31] [4] [2]; -static int mv_mode_cts [4] [2]; -#endif - void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); + ((ref_mv->as_mv.col & 7) ? 1 : 0); int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 1 : 0); + ((ref_mv->as_mv.row & 7) ? 
1 : 0); int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; @@ -43,36 +38,47 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { x->mv_row_max = row_max; } +int vp9_init_search_range(int width, int height) { + int sr = 0; + int frm = MIN(width, height); + + while ((frm << sr) < MAX_FULL_PEL_VAL) + sr++; + + if (sr) + sr--; + + return sr; +} + int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int Weight, int ishp) { + int weight, int ishp) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjcost[vp9_get_mv_joint(v)] + - mvcost[0][v.row] + mvcost[1][v.col]) * - Weight) >> 7; + mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int error_per_bit, int ishp) { if (mvcost) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjcost[vp9_get_mv_joint(v)] + mvcost[0][v.row] + mvcost[1][v.col]) * - error_per_bit + 128) >> 8; + error_per_bit + 4096) >> 13; } return 0; } static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, int *mvsadcost[2], int error_per_bit) { - if (mvsadcost) { MV v; - v.row = (mv->as_mv.row - ref->as_mv.row); - v.col = (mv->as_mv.col - ref->as_mv.col); + v.row = mv->as_mv.row - ref->as_mv.row; + v.col = mv->as_mv.col - ref->as_mv.col; return ((mvjsadcost[vp9_get_mv_joint(v)] + mvsadcost[0][v.row] + mvsadcost[1][v.col]) * error_per_bit + 128) >> 8; @@ -81,45 +87,39 @@ static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, } void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { - int Len; + int len; int search_site_count = 0; - // Generate offsets for 4 search sites per step. - Len = MAX_FIRST_STEP; x->ss[search_site_count].mv.col = 0; x->ss[search_site_count].mv.row = 0; x->ss[search_site_count].offset = 0; search_site_count++; - while (Len > 0) { - + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride; search_site_count++; // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.col = -len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -Len; + x->ss[search_site_count].offset = -len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.col = len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = Len; + x->ss[search_site_count].offset = len; search_site_count++; - - // Contract. 
- Len /= 2; } x->ss_count = search_site_count; @@ -127,68 +127,63 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { } void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { - int Len; + int len; int search_site_count = 0; // Generate offsets for 8 search sites per step. - Len = MAX_FIRST_STEP; x->ss[search_site_count].mv.col = 0; x->ss[search_site_count].mv.row = 0; x->ss[search_site_count].offset = 0; search_site_count++; - while (Len > 0) { - + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride; search_site_count++; // Compute offsets for search sites. x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; + x->ss[search_site_count].mv.col = -len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -Len; + x->ss[search_site_count].offset = -len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; + x->ss[search_site_count].mv.col = len; x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = Len; + x->ss[search_site_count].offset = len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride - Len; + x->ss[search_site_count].mv.col = -len; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride - len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; - x->ss[search_site_count].mv.row = -Len; - x->ss[search_site_count].offset = -Len * stride + Len; + x->ss[search_site_count].mv.col = len; + x->ss[search_site_count].mv.row = -len; + x->ss[search_site_count].offset = -len * stride + len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -Len; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride - Len; + x->ss[search_site_count].mv.col = -len; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride - len; search_site_count++; // Compute offsets for search sites. - x->ss[search_site_count].mv.col = Len; - x->ss[search_site_count].mv.row = Len; - x->ss[search_site_count].offset = Len * stride + Len; + x->ss[search_site_count].mv.col = len; + x->ss[search_site_count].mv.row = len; + x->ss[search_site_count].offset = len * stride + len; search_site_count++; - - // Contract. - Len /= 2; } x->ss_count = search_site_count; @@ -210,7 +205,8 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { (mvcost ? 
\ ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \ mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \ - error_per_bit + 128) >> 8 : 0) + error_per_bit + 4096) >> 13 : 0) + #define SP(x) (((x) & 7) << 1) // convert motion vector component to offset // for svf calc @@ -1546,7 +1542,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; int bestsad = INT_MAX; int r, c; @@ -1641,7 +1637,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1770,7 +1766,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int in_what_stride = d->pre_stride; int mv_stride = d->pre_stride; uint8_t *bestaddress; - int_mv *best_mv = &d->bmi.as_mv.first; + int_mv *best_mv = &d->bmi.as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1787,7 +1783,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int col_min = ref_col - distance; int col_max = ref_col + distance; - DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8); + DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8); unsigned int sad_array[3]; int_mv fcenter_mv; @@ -2023,12 +2019,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, for (i = 0; i < search_range; i++) { int best_site = -1; - int all_in = 1; - - all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min); - all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max); - all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min); - all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max); + int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) & + ((ref_mv->as_mv.row + 1) < x->mv_row_max) & + ((ref_mv->as_mv.col - 1) > x->mv_col_min) & + ((ref_mv->as_mv.col + 1) < x->mv_col_max); if (all_in) { unsigned int sad_array[4]; @@ -2103,21 +2097,22 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #ifdef ENTROPY_STATS -void print_mode_context(void) { +void print_mode_context(VP9_COMMON *pc) { FILE *f = fopen("vp9_modecont.c", "a"); int i, j; fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf(f, "const int vp9_mode_contexts[6][4] ="); + fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] ="); fprintf(f, "{\n"); - for (j = 0; j < 6; j++) { + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { fprintf(f, " {/* %d */ ", j); fprintf(f, " "); for (i = 0; i < 4; i++) { int this_prob; // context probs - this_prob = get_binary_prob(mv_ref_ct[j][i][0], mv_ref_ct[j][i][1]); + this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0], + pc->fc.mv_ref_ct[j][i][1]); fprintf(f, "%5d, ", this_prob); } @@ -2128,44 +2123,4 @@ void print_mode_context(void) { fclose(f); } -/* MV ref count ENTROPY_STATS stats code */ -void init_mv_ref_counts() { - vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); - vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); -} - -void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) { - if (m == ZEROMV) { - ++mv_ref_ct [ct[0]] [0] [0]; - ++mv_mode_cts[0][0]; - } else { - ++mv_ref_ct [ct[0]] [0] [1]; - ++mv_mode_cts[0][1]; - - if (m == NEARESTMV) { - ++mv_ref_ct [ct[1]] [1] [0]; - ++mv_mode_cts[1][0]; - } else { - ++mv_ref_ct [ct[1]] [1] [1]; - 
++mv_mode_cts[1][1]; - - if (m == NEARMV) { - ++mv_ref_ct [ct[2]] [2] [0]; - ++mv_mode_cts[2][0]; - } else { - ++mv_ref_ct [ct[2]] [2] [1]; - ++mv_mode_cts[2][1]; - - if (m == NEWMV) { - ++mv_ref_ct [ct[3]] [3] [0]; - ++mv_mode_cts[3][0]; - } else { - ++mv_ref_ct [ct[3]] [3] [1]; - ++mv_mode_cts[3][1]; - } - } - } - } -} - #endif/* END MV ref count ENTROPY_STATS stats code */ diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 358d10bc69ca3986b32cd6e2170fb0f47aec6369..fd1bb2b4e47e48b873d7d410bbf38ed27ae5ff1a 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -16,21 +16,25 @@ #include "vp9/encoder/vp9_variance.h" #ifdef ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); -void print_mode_context(void); +void print_mode_context(VP9_COMMON *pc); #endif +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in 1 pel units +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) -#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step -#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Max full pel mv specified in 1 pel units -#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units +void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); +int vp9_init_search_range(int width, int height); + +int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, + int *mvcost[2], int weight, int ishp); +void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); -extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); -extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int Weight, int ishp); -extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); // Runs sequence of diamond searches in smaller steps for RD struct VP9_COMP; int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b, @@ -39,20 +43,13 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b, vp9_variance_fn_ptr_t *fn_ptr, int_mv *ref_mv, int_mv *dst_mv); -extern int vp9_hex_search -( - MACROBLOCK *x, - BLOCK *b, - BLOCKD *d, - int_mv *ref_mv, - int_mv *best_mv, - int search_param, - int error_per_bit, - const vp9_variance_fn_ptr_t *vf, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv -); +int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, + int_mv *ref_mv, int_mv *best_mv, + int search_param, int error_per_bit, + const vp9_variance_fn_ptr_t *vf, + int *mvjsadcost, int *mvsadcost[2], + int *mvjcost, int *mvcost[2], + int_mv *center_mv); typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 27e0e48a3e522c22d54082b391efed4351dafdce..656975aa46700224cf13fce962b6343a496e49c2 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -10,7 +10,9 @@ #include "vpx_config.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" +#include 
"vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -22,6 +24,7 @@ #include "vp9/common/vp9_extend.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_segmentation.h" #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" @@ -111,6 +114,13 @@ extern void init_nmvstats(); extern void print_nmvstats(); #endif +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_STATS +extern void init_nzcstats(); +extern void print_nzcstats(); +#endif +#endif + #ifdef SPEEDSTATS unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; #endif @@ -146,31 +156,24 @@ static int inter_minq[QINDEX_RANGE]; // The formulae were derived from computing a 3rd order polynomial best // fit to the original data (after plotting real maxq vs minq (not q index)) static int calculate_minq_index(double maxq, - double x3, double x2, double x, double c) { + double x3, double x2, double x1, double c) { int i; - double minqtarget; - - minqtarget = ((x3 * maxq * maxq * maxq) + - (x2 * maxq * maxq) + - (x * maxq) + - c); - - if (minqtarget > maxq) - minqtarget = maxq; + const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c, + maxq); for (i = 0; i < QINDEX_RANGE; i++) { if (minqtarget <= vp9_convert_qindex_to_q(i)) return i; } + return QINDEX_RANGE - 1; } static void init_minq_luts(void) { int i; - double maxq; for (i = 0; i < QINDEX_RANGE; i++) { - maxq = vp9_convert_qindex_to_q(i); + const double maxq = vp9_convert_qindex_to_q(i); kf_low_motion_minq[i] = calculate_minq_index(maxq, @@ -206,7 +209,6 @@ static void set_mvcost(MACROBLOCK *mb) { if (mb->e_mbd.allow_high_precision_mv) { mb->mvcost = mb->nmvcost_hp; mb->mvsadcost = mb->nmvsadcost_hp; - } else { mb->mvcost = mb->nmvcost; mb->mvsadcost = mb->nmvsadcost; @@ -214,15 +216,13 @@ static void set_mvcost(MACROBLOCK *mb) { } static void init_base_skip_probs(void) { int i; - double q; - int t; for (i = 0; i < QINDEX_RANGE; i++) { - q = vp9_convert_qindex_to_q(i); + const double q = vp9_convert_qindex_to_q(i); // Exponential decay caluclation of baseline skip prob with clamping // Based on crude best fit of old table. 
- t = (int)(564.25 * pow(2.71828, (-0.012 * q))); + const int t = (int)(564.25 * pow(2.71828, (-0.012 * q))); base_skip_false_prob[i][1] = clip_prob(t); base_skip_false_prob[i][2] = clip_prob(t * 3 / 4); @@ -236,12 +236,12 @@ static void update_base_skip_probs(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME) { vp9_update_skip_probs(cpi); - if (cm->refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k]; cpi->last_skip_probs_q[2] = cm->base_qindex; - } else if (cpi->common.refresh_golden_frame) { + } else if (cpi->refresh_golden_frame) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k]; @@ -258,7 +258,6 @@ static void update_base_skip_probs(VP9_COMP *cpi) { cm->mbskip_pred_probs[k]; } } - } void vp9_initialize_enc() { @@ -299,7 +298,6 @@ static void setup_features(VP9_COMP *cpi) { vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); set_default_lf_deltas(cpi); - } @@ -332,9 +330,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); -#if VP9_TEMPORAL_ALT_REF vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer); -#endif vp9_lookahead_destroy(cpi->lookahead); vpx_free(cpi->tok); @@ -388,7 +384,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { return target_index - start_index; } -static void init_seg_features(VP9_COMP *cpi) { +static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -408,10 +404,8 @@ static void init_seg_features(VP9_COMP *cpi) { // Clear down the segment features. vp9_clearall_segfeatures(xd); - } - - // If this is an alt ref frame - else if (cm->refresh_alt_ref_frame) { + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols)); xd->update_mb_segmentation_map = 0; @@ -448,7 +442,7 @@ static void init_seg_features(VP9_COMP *cpi) { else if (xd->segmentation_enabled) { // First normal frame in a valid gf or alt ref group if (cpi->common.frames_since_golden == 0) { - // Set up segment features for normal frames in an af group + // Set up segment features for normal frames in an arf group if (cpi->source_alt_ref_active) { xd->update_mb_segmentation_map = 0; xd->update_mb_segmentation_data = 1; @@ -465,16 +459,9 @@ static void init_seg_features(VP9_COMP *cpi) { // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { - // set_segref(xd, 1, LAST_FRAME); vp9_set_segref(xd, 1, ALTREF_FRAME); vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - - vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV); - vp9_enable_segfeature(xd, 1, SEG_LVL_MODE); - - // EOB segment coding not fixed for 8x8 yet - vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0); - vp9_enable_segfeature(xd, 1, SEG_LVL_EOB); + vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); } } // Disable segmentation and clear down features if alt ref @@ -493,29 +480,23 @@ static void init_seg_features(VP9_COMP *cpi) { } // Special case where we are coding over the top of a previous - // alt ref frame + // alt ref frame. 
// Segment coding disabled for compred testing else if (cpi->is_src_frame_alt_ref) { - // Enable mode and ref frame features for segment 0 as well + // Enable ref frame features for segment 0 as well vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 0, SEG_LVL_MODE); vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_MODE); - // All mbs should use ALTREF_FRAME, ZEROMV exclusively + // All mbs should use ALTREF_FRAME vp9_clear_segref(xd, 0); vp9_set_segref(xd, 0, ALTREF_FRAME); vp9_clear_segref(xd, 1); vp9_set_segref(xd, 1, ALTREF_FRAME); - vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV); - vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV); - // Skip all MBs if high Q + // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(xd, 0, SEG_LVL_EOB); - vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0); - vp9_enable_segfeature(xd, 1, SEG_LVL_EOB); - vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0); + vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); } // Enable data update xd->update_mb_segmentation_data = 1; @@ -534,17 +515,13 @@ static void print_seg_map(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int row, col; int map_index = 0; - FILE *statsfile; - - statsfile = fopen("segmap.stt", "a"); + FILE *statsfile = fopen("segmap.stt", "a"); - fprintf(statsfile, "%10d\n", - cm->current_video_frame); + fprintf(statsfile, "%10d\n", cm->current_video_frame); for (row = 0; row < cpi->common.mb_rows; row++) { for (col = 0; col < cpi->common.mb_cols; col++) { - fprintf(statsfile, "%10d", - cpi->segmentation_map[map_index]); + fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]); map_index++; } fprintf(statsfile, "\n"); @@ -590,368 +567,88 @@ static void set_default_lf_deltas(VP9_COMP *cpi) { cpi->mb.e_mbd.mode_lf_deltas[3] = 4; // Split mv } -void vp9_set_speed_features(VP9_COMP *cpi) { +static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { SPEED_FEATURES *sf = &cpi->sf; - int Mode = cpi->compressor_speed; - int Speed = cpi->Speed; + int speed_multiplier = speed + 1; int i; - VP9_COMMON *cm = &cpi->common; - - // Only modes 0 and 1 supported for now in experimental code basae - if (Mode > 1) - Mode = 1; - // Initialise default mode frequency sampling variables - for (i = 0; i < MAX_MODES; i ++) { - cpi->mode_check_freq[i] = 0; - cpi->mode_test_hit_counts[i] = 0; - cpi->mode_chosen_counts[i] = 0; + // Set baseline threshold values + for (i = 0; i < MAX_MODES; ++i) { + sf->thresh_mult[i] = (mode == 0) ?
-500 : 0; } - // best quality defaults - sf->RD = 1; - sf->search_method = NSTEP; - sf->improved_dct = 1; - sf->auto_filter = 1; - sf->recode_loop = 1; - sf->quarter_pixel_search = 1; - sf->half_pixel_search = 1; - sf->iterative_sub_pixel = 1; -#if CONFIG_LOSSLESS - sf->optimize_coefficients = 0; -#else - sf->optimize_coefficients = 1; -#endif - sf->no_skip_block4x4_search = 1; + sf->thresh_mult[THR_ZEROMV ] = 0; + sf->thresh_mult[THR_ZEROG ] = 0; + sf->thresh_mult[THR_ZEROA ] = 0; - sf->first_step = 0; - sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->thresh_mult[THR_NEARESTMV] = 0; + sf->thresh_mult[THR_NEARESTG ] = 0; + sf->thresh_mult[THR_NEARESTA ] = 0; - // default thresholds to 0 - for (i = 0; i < MAX_MODES; i++) - sf->thresh_mult[i] = 0; + sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000; - switch (Mode) { - case 0: // best quality mode - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_DC ] = 0; - - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_D45_PRED ] = 1000; - sf->thresh_mult[THR_D135_PRED] = 1000; - sf->thresh_mult[THR_D117_PRED] = 1000; - sf->thresh_mult[THR_D153_PRED] = 1000; - sf->thresh_mult[THR_D27_PRED ] = 1000; - sf->thresh_mult[THR_D63_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2000; - sf->thresh_mult[THR_I8X8_PRED] = 2000; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 5000; - sf->thresh_mult[THR_SPLITA ] = 5000; - - sf->thresh_mult[THR_COMP_ZEROLG ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLG] = 0; - sf->thresh_mult[THR_COMP_NEARLG ] = 0; - sf->thresh_mult[THR_COMP_ZEROLA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLA] = 0; - sf->thresh_mult[THR_COMP_NEARLA ] = 0; - sf->thresh_mult[THR_COMP_ZEROGA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTGA] = 0; - sf->thresh_mult[THR_COMP_NEARGA ] = 0; - - sf->thresh_mult[THR_COMP_NEWLG ] = 1000; - sf->thresh_mult[THR_COMP_NEWLA ] = 1000; - sf->thresh_mult[THR_COMP_NEWGA ] = 1000; - - sf->thresh_mult[THR_COMP_SPLITLA ] = 2500; - sf->thresh_mult[THR_COMP_SPLITGA ] = 5000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 5000; + sf->thresh_mult[THR_DC ] = 0; + sf->thresh_mult[THR_TM ] += speed_multiplier * 1000; + sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000; + sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000; + sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500; + sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500; + sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500; + sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - 
sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif + sf->thresh_mult[THR_B_PRED ] += speed_multiplier * 2500; + sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500; - sf->first_step = 0; - sf->max_step_search_steps = MAX_MVSEARCH_STEPS; - sf->search_best_filter = SEARCH_BEST_FILTER; - break; - case 1: - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_V_PRED ] = 1000; - sf->thresh_mult[THR_H_PRED ] = 1000; - sf->thresh_mult[THR_D45_PRED ] = 1000; - sf->thresh_mult[THR_D135_PRED] = 1000; - sf->thresh_mult[THR_D117_PRED] = 1000; - sf->thresh_mult[THR_D153_PRED] = 1000; - sf->thresh_mult[THR_D27_PRED ] = 1000; - sf->thresh_mult[THR_D63_PRED ] = 1000; - sf->thresh_mult[THR_B_PRED ] = 2500; - sf->thresh_mult[THR_I8X8_PRED] = 2500; - sf->thresh_mult[THR_TM ] = 1000; - - sf->thresh_mult[THR_NEARESTG ] = 1000; - sf->thresh_mult[THR_NEARESTA ] = 1000; - - sf->thresh_mult[THR_ZEROG ] = 1000; - sf->thresh_mult[THR_ZEROA ] = 1000; - sf->thresh_mult[THR_NEARG ] = 1000; - sf->thresh_mult[THR_NEARA ] = 1000; - - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 0; - sf->thresh_mult[THR_NEARG ] = 0; - sf->thresh_mult[THR_NEARA ] = 0; - - sf->thresh_mult[THR_NEWMV ] = 1000; - sf->thresh_mult[THR_NEWG ] = 1000; - sf->thresh_mult[THR_NEWA ] = 1000; - - sf->thresh_mult[THR_SPLITMV ] = 1700; - sf->thresh_mult[THR_SPLITG ] = 4500; - sf->thresh_mult[THR_SPLITA ] = 4500; - - sf->thresh_mult[THR_COMP_ZEROLG ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLG] = 0; - sf->thresh_mult[THR_COMP_NEARLG ] = 0; - sf->thresh_mult[THR_COMP_ZEROLA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTLA] = 0; - sf->thresh_mult[THR_COMP_NEARLA ] = 0; - sf->thresh_mult[THR_COMP_ZEROGA ] = 0; - sf->thresh_mult[THR_COMP_NEARESTGA] = 0; - sf->thresh_mult[THR_COMP_NEARGA ] = 0; - - sf->thresh_mult[THR_COMP_NEWLG ] = 1000; - sf->thresh_mult[THR_COMP_NEWLA ] = 1000; - sf->thresh_mult[THR_COMP_NEWGA ] = 1000; - - sf->thresh_mult[THR_COMP_SPLITLA ] = 1700; - sf->thresh_mult[THR_COMP_SPLITGA ] = 4500; - sf->thresh_mult[THR_COMP_SPLITLG ] = 4500; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif + sf->thresh_mult[THR_NEWMV ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWG ] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWA ] += speed_multiplier * 1000; - if (Speed > 0) { - /* Disable coefficient optimization above speed 0 */ - 
sf->optimize_coefficients = 0; - sf->no_skip_block4x4_search = 0; - - sf->first_step = 1; - - cpi->mode_check_freq[THR_SPLITG] = 2; - cpi->mode_check_freq[THR_SPLITA] = 2; - cpi->mode_check_freq[THR_SPLITMV] = 0; - - cpi->mode_check_freq[THR_COMP_SPLITGA] = 2; - cpi->mode_check_freq[THR_COMP_SPLITLG] = 2; - cpi->mode_check_freq[THR_COMP_SPLITLA] = 0; - } + sf->thresh_mult[THR_SPLITMV ] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500; - if (Speed > 1) { - cpi->mode_check_freq[THR_SPLITG] = 4; - cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 2; + sf->thresh_mult[THR_COMP_ZEROLG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500; - cpi->mode_check_freq[THR_COMP_SPLITGA] = 4; - cpi->mode_check_freq[THR_COMP_SPLITLG] = 4; - cpi->mode_check_freq[THR_COMP_SPLITLA] = 2; + sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500; - sf->thresh_mult[THR_TM ] = 1500; - sf->thresh_mult[THR_V_PRED ] = 1500; - sf->thresh_mult[THR_H_PRED ] = 1500; - sf->thresh_mult[THR_D45_PRED ] = 1500; - sf->thresh_mult[THR_D135_PRED] = 1500; - sf->thresh_mult[THR_D117_PRED] = 1500; - sf->thresh_mult[THR_D153_PRED] = 1500; - sf->thresh_mult[THR_D27_PRED ] = 1500; - sf->thresh_mult[THR_D63_PRED ] = 1500; - sf->thresh_mult[THR_B_PRED ] = 5000; - sf->thresh_mult[THR_I8X8_PRED] = 5000; - - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 10000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 20000; - } + sf->thresh_mult[THR_COMP_NEARLG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEARGA ] += speed_multiplier * 1500; - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - sf->thresh_mult[THR_NEARESTG ] = 1500; - sf->thresh_mult[THR_ZEROG ] = 1500; - sf->thresh_mult[THR_NEARG ] = 1500; - sf->thresh_mult[THR_NEWG ] = 2000; - sf->thresh_mult[THR_SPLITG ] = 20000; - sf->thresh_mult[THR_COMP_SPLITGA ] = 20000; - } + sf->thresh_mult[THR_COMP_NEWLG ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000; - if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - sf->thresh_mult[THR_NEARESTA ] = 1500; - sf->thresh_mult[THR_ZEROA ] = 1500; - sf->thresh_mult[THR_NEARA ] = 1500; - sf->thresh_mult[THR_NEWA ] = 2000; - sf->thresh_mult[THR_SPLITA ] = 20000; - sf->thresh_mult[THR_COMP_SPLITLA ] = 10000; - } + sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITLG ] += speed_multiplier * 4500; - sf->thresh_mult[THR_COMP_ZEROLG ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTLG] = 1500; - sf->thresh_mult[THR_COMP_NEARLG ] = 1500; - sf->thresh_mult[THR_COMP_ZEROLA ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTLA] = 1500; - sf->thresh_mult[THR_COMP_NEARLA ] = 1500; - sf->thresh_mult[THR_COMP_ZEROGA ] = 1500; - sf->thresh_mult[THR_COMP_NEARESTGA] = 1500; - sf->thresh_mult[THR_COMP_NEARGA ] = 1500; - - sf->thresh_mult[THR_COMP_NEWLG ] = 2000; - sf->thresh_mult[THR_COMP_NEWLA ] = 2000; - sf->thresh_mult[THR_COMP_NEWGA ] = 2000; #if CONFIG_COMP_INTERINTRA_PRED - 
sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; -#endif - } + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] += speed_multiplier * 1500; - if (Speed > 2) { - cpi->mode_check_freq[THR_SPLITG] = 15; - cpi->mode_check_freq[THR_SPLITA] = 15; - cpi->mode_check_freq[THR_SPLITMV] = 7; - - cpi->mode_check_freq[THR_COMP_SPLITGA] = 15; - cpi->mode_check_freq[THR_COMP_SPLITLG] = 15; - cpi->mode_check_freq[THR_COMP_SPLITLA] = 7; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500; - sf->thresh_mult[THR_TM ] = 2000; - sf->thresh_mult[THR_V_PRED ] = 2000; - sf->thresh_mult[THR_H_PRED ] = 2000; - sf->thresh_mult[THR_D45_PRED ] = 2000; - sf->thresh_mult[THR_D135_PRED] = 2000; - sf->thresh_mult[THR_D117_PRED] = 2000; - sf->thresh_mult[THR_D153_PRED] = 2000; - sf->thresh_mult[THR_D27_PRED ] = 2000; - sf->thresh_mult[THR_D63_PRED ] = 2000; - sf->thresh_mult[THR_B_PRED ] = 7500; - sf->thresh_mult[THR_I8X8_PRED] = 7500; - - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - sf->thresh_mult[THR_NEWMV ] = 2000; - sf->thresh_mult[THR_SPLITMV ] = 25000; - sf->thresh_mult[THR_COMP_SPLITLG ] = 50000; - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - sf->thresh_mult[THR_NEARESTG ] = 2000; - sf->thresh_mult[THR_ZEROG ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEWG ] = 2500; - sf->thresh_mult[THR_SPLITG ] = 50000; - sf->thresh_mult[THR_COMP_SPLITGA ] = 50000; - } + sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] += speed_multiplier * 1500; - if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - sf->thresh_mult[THR_NEARESTA ] = 2000; - sf->thresh_mult[THR_ZEROA ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; - sf->thresh_mult[THR_NEWA ] = 2500; - sf->thresh_mult[THR_SPLITA ] = 50000; - sf->thresh_mult[THR_COMP_SPLITLA ] = 25000; - } - - sf->thresh_mult[THR_COMP_ZEROLG ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTLG] = 2000; - sf->thresh_mult[THR_COMP_NEARLG ] = 2000; - sf->thresh_mult[THR_COMP_ZEROLA ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; - sf->thresh_mult[THR_COMP_NEARLA ] = 2000; - sf->thresh_mult[THR_COMP_ZEROGA ] = 2000; - sf->thresh_mult[THR_COMP_NEARESTGA] = 2000; - sf->thresh_mult[THR_COMP_NEARGA ] = 2000; - - sf->thresh_mult[THR_COMP_NEWLG ] = 2500; - sf->thresh_mult[THR_COMP_NEWLA ] = 2500; - sf->thresh_mult[THR_COMP_NEWGA ] = 2500; -#if CONFIG_COMP_INTERINTRA_PRED - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = 0; - 
sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = 0; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = 0; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] += speed_multiplier * 2000; #endif - sf->improved_dct = 0; - - // Only do recode loop on key frames, golden frames and - // alt ref frames - sf->recode_loop = 2; - - } - - break; - - }; /* switch */ - /* disable frame modes if flags not set */ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { sf->thresh_mult[THR_NEWMV ] = INT_MAX; @@ -959,67 +656,162 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->thresh_mult[THR_ZEROMV ] = INT_MAX; sf->thresh_mult[THR_NEARMV ] = INT_MAX; sf->thresh_mult[THR_SPLITMV ] = INT_MAX; +#if CONFIG_COMP_INTERINTRA_PRED + sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = INT_MAX; + sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = INT_MAX; +#endif } - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { sf->thresh_mult[THR_NEARESTG ] = INT_MAX; sf->thresh_mult[THR_ZEROG ] = INT_MAX; sf->thresh_mult[THR_NEARG ] = INT_MAX; sf->thresh_mult[THR_NEWG ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; #if CONFIG_COMP_INTERINTRA_PRED sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG ] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEARG ] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEWG ] = INT_MAX; #endif - sf->thresh_mult[THR_SPLITG ] = INT_MAX; } - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { sf->thresh_mult[THR_NEARESTA ] = INT_MAX; sf->thresh_mult[THR_ZEROA ] = INT_MAX; sf->thresh_mult[THR_NEARA ] = INT_MAX; sf->thresh_mult[THR_NEWA ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; #if CONFIG_COMP_INTERINTRA_PRED sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA ] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEARA ] = INT_MAX; sf->thresh_mult[THR_COMP_INTERINTRA_NEWA ] = INT_MAX; #endif - sf->thresh_mult[THR_SPLITA ] = INT_MAX; } - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) { + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != + (VP9_LAST_FLAG | VP9_GOLD_FLAG)) { sf->thresh_mult[THR_COMP_ZEROLG ] = INT_MAX; sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX; sf->thresh_mult[THR_COMP_NEARLG ] = INT_MAX; sf->thresh_mult[THR_COMP_NEWLG ] = INT_MAX; sf->thresh_mult[THR_COMP_SPLITLG ] = INT_MAX; } - - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) { + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) { sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; } - - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | 
VP9_ALT_FLAG)) { sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; } -#if CONFIG_COMP_INTERINTRA_PRED - if ((cpi->ref_frame_flags & VP9_LAST_FLAG) != VP9_LAST_FLAG) { - sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEARL ] = INT_MAX; - sf->thresh_mult[THR_COMP_INTERINTRA_NEWL ] = INT_MAX; +} + +void vp9_set_speed_features(VP9_COMP *cpi) { + SPEED_FEATURES *sf = &cpi->sf; + int mode = cpi->compressor_speed; + int speed = cpi->Speed; + int i; + + // Only modes 0 and 1 supported for now in experimental code base + if (mode > 1) + mode = 1; + + // Initialise default mode frequency sampling variables + for (i = 0; i < MAX_MODES; i++) { + cpi->mode_check_freq[i] = 0; + cpi->mode_test_hit_counts[i] = 0; + cpi->mode_chosen_counts[i] = 0; } -#endif + + // best quality defaults + sf->RD = 1; + sf->search_method = NSTEP; + sf->improved_dct = 1; + sf->auto_filter = 1; + sf->recode_loop = 1; + sf->quarter_pixel_search = 1; + sf->half_pixel_search = 1; + sf->iterative_sub_pixel = 1; + sf->no_skip_block4x4_search = 1; + if (cpi->oxcf.lossless) + sf->optimize_coefficients = 0; + else + sf->optimize_coefficients = 1; + + sf->first_step = 0; + sf->max_step_search_steps = MAX_MVSEARCH_STEPS; + sf->static_segmentation = 1; + sf->splitmode_breakout = 0; + sf->mb16_breakout = 0; + + switch (mode) { + case 0: // best quality mode + sf->search_best_filter = SEARCH_BEST_FILTER; + break; + + case 1: + sf->static_segmentation = 1; + sf->splitmode_breakout = 1; + sf->mb16_breakout = 0; + + if (speed > 0) { + /* Disable coefficient optimization above speed 0 */ + sf->optimize_coefficients = 0; + sf->no_skip_block4x4_search = 0; + + sf->first_step = 1; + + cpi->mode_check_freq[THR_SPLITG] = 2; + cpi->mode_check_freq[THR_SPLITA] = 2; + cpi->mode_check_freq[THR_SPLITMV] = 0; + + cpi->mode_check_freq[THR_COMP_SPLITGA] = 2; + cpi->mode_check_freq[THR_COMP_SPLITLG] = 2; + cpi->mode_check_freq[THR_COMP_SPLITLA] = 0; + } + + if (speed > 1) { + cpi->mode_check_freq[THR_SPLITG] = 4; + cpi->mode_check_freq[THR_SPLITA] = 4; + cpi->mode_check_freq[THR_SPLITMV] = 2; + + cpi->mode_check_freq[THR_COMP_SPLITGA] = 4; + cpi->mode_check_freq[THR_COMP_SPLITLG] = 4; + cpi->mode_check_freq[THR_COMP_SPLITLA] = 2; + } + + if (speed > 2) { + cpi->mode_check_freq[THR_SPLITG] = 15; + cpi->mode_check_freq[THR_SPLITA] = 15; + cpi->mode_check_freq[THR_SPLITMV] = 7; + + cpi->mode_check_freq[THR_COMP_SPLITGA] = 15; + cpi->mode_check_freq[THR_COMP_SPLITLG] = 15; + cpi->mode_check_freq[THR_COMP_SPLITLA] = 7; + + sf->improved_dct = 0; + + // Only do recode loop on key frames, golden frames and + // alt ref frames + sf->recode_loop = 2; + } + + break; + + } /* switch */ + + // Set rd thresholds based on mode and speed setting + set_rd_speed_thresholds(cpi, mode, speed); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off.
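The restructuring above collapses the hand-maintained per-speed threshold tables into one linear ramp in set_rd_speed_thresholds(). A minimal sketch of that scheme, assuming illustrative base and increment values (only the speed + 1 multiplier and the INT_MAX disabling come from the patch itself):

#include <limits.h>

/* Illustrative sketch, not part of the patch: one RD threshold under the
 * new scheme. base and per_speed stand in for the per-mode constants in
 * set_rd_speed_thresholds() above. */
static int rd_speed_thresh(int base, int per_speed, int speed,
                           int ref_frame_available) {
  const int speed_multiplier = speed + 1;

  /* A mode whose reference frame is unavailable is priced out entirely. */
  if (!ref_frame_available)
    return INT_MAX;

  /* Otherwise the threshold grows linearly with the speed setting, so
   * costlier modes drop out of the RD search first as speed rises. */
  return base + speed_multiplier * per_speed;
}

At speed 2, for example, a NEW-mv class mode with a 1000 increment lands at base + 3000 while a SPLIT class mode with a 2500 increment lands at base + 7500, preserving the relative ordering of the old per-speed tables.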
@@ -1028,36 +820,19 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->improved_dct = 0; } - if (cpi->sf.search_method == NSTEP) { - vp9_init3smotion_compensation(&cpi->mb, - cm->yv12_fb[cm->lst_fb_idx].y_stride); - } else if (cpi->sf.search_method == DIAMOND) { - vp9_init_dsmotion_compensation(&cpi->mb, - cm->yv12_fb[cm->lst_fb_idx].y_stride); - } - - cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16; - cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8; - cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4; - cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4; - cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2; - -#if CONFIG_LOSSLESS - if (cpi->oxcf.lossless) { - cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8; - cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4; - cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2; - cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless; + cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16; + cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8; + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; } -#endif cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair; cpi->mb.quantize_b_8x8 = vp9_regular_quantize_b_8x8; cpi->mb.quantize_b_16x16 = vp9_regular_quantize_b_16x16; - cpi->mb.quantize_b_2x2 = vp9_regular_quantize_b_2x2; vp9_init_quantizer(cpi); @@ -1078,24 +853,19 @@ void vp9_set_speed_features(VP9_COMP *cpi) { frames_at_speed[cpi->Speed]++; #endif } -static void alloc_raw_frame_buffers(VP9_COMP *cpi) { - int width = (cpi->oxcf.Width + 15) & ~15; - int height = (cpi->oxcf.Height + 15) & ~15; - cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height, +static void alloc_raw_frame_buffers(VP9_COMP *cpi) { + cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height, cpi->oxcf.lag_in_frames); if (!cpi->lookahead) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); -#if VP9_TEMPORAL_ALT_REF - if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer, - width, height, VP9BORDERINPIXELS)) + cpi->oxcf.width, cpi->oxcf.height, + VP9BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); - -#endif } static int alloc_partition_data(VP9_COMP *cpi) { @@ -1115,10 +885,7 @@ static int alloc_partition_data(VP9_COMP *cpi) { void vp9_alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - int width = cm->Width; - int height = cm->Height; - - if (vp9_alloc_frame_buffers(cm, width, height)) + if (vp9_alloc_frame_buffers(cm, cm->width, cm->height)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); @@ -1126,25 +893,16 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate partition data"); - - if ((width & 0xf) != 0) - width += 16 - (width & 0xf); - - if ((height & 0xf) != 0) - height += 16 - (height & 0xf); - - if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf, - width, height, VP9BORDERINPIXELS)) + cm->width, cm->height, VP9BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, - width, height, VP9BORDERINPIXELS)) + cm->width, 
cm->height, VP9BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); - vpx_free(cpi->tok); { @@ -1199,6 +957,48 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { } +static void update_frame_size(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + /* our internal buffers are always multiples of 16 */ + int aligned_width = (cm->width + 15) & ~15; + int aligned_height = (cm->height + 15) & ~15; + + cm->mb_rows = aligned_height >> 4; + cm->mb_cols = aligned_width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + memset(cm->mip, 0, + (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO)); + vp9_update_mode_info_border(cm, cm->mip); + + cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + vp9_update_mode_info_in_image(cm, cm->mi); + + /* Update size of buffers local to this frame */ + if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf, + cm->width, cm->height, VP9BORDERINPIXELS)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate last frame buffer"); + + if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source, + cm->width, cm->height, VP9BORDERINPIXELS)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer"); + + { + int y_stride = cpi->scaled_source.y_stride; + + if (cpi->sf.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->mb, y_stride); + } else if (cpi->sf.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->mb, y_stride); + } + } +} + + // TODO perhaps change the number of steps exposed to the outside world when setting // max and min limits. Also this will likely want refining for the extended Q // range.
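update_frame_size() above derives all macroblock bookkeeping from a 16-pel-aligned frame size. A self-contained sketch of that arithmetic (the helper name is hypothetical, for illustration only):

/* Hypothetical helper mirroring the alignment math in update_frame_size();
 * not part of the patch. */
static void frame_to_mb_dims(int width, int height,
                             int *mb_cols, int *mb_rows) {
  /* Round each dimension up to the next multiple of 16: add 15, then
   * clear the low four bits (the usual power-of-two round-up). */
  const int aligned_width = (width + 15) & ~15;
  const int aligned_height = (height + 15) & ~15;

  /* One macroblock covers 16x16 pels. */
  *mb_cols = aligned_width >> 4;
  *mb_rows = aligned_height >> 4;
}

A 350x286 input is therefore treated as 352x288 internally, i.e. 22x18 macroblocks, and mode_info_stride is mb_cols + 1 so that each row of mode info keeps one extra border entry.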
@@ -1239,15 +1039,12 @@ void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) { cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS; // Set Maximum gf/arf interval - cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2); - - if (cpi->max_gf_interval < 12) - cpi->max_gf_interval = 12; + cpi->max_gf_interval = 16; // Extended interval for genuinely static scenes cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; - // Special conditions when altr ref frame enabled in lagged compress mode + // Special conditions when alt ref frame enabled in lagged compress mode if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) { if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1) cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1; @@ -1260,28 +1057,45 @@ void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) { cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval; } - -static int -rescale(int val, int num, int denom) { +static int64_t rescale(int val, int64_t num, int denom) { int64_t llnum = num; int64_t llden = denom; int64_t llval = val; - return (int)(llval * llnum / llden); + return (llval * llnum / llden); } +static void set_tile_limits(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int min_log2_tiles, max_log2_tiles; + + cm->log2_tile_columns = cpi->oxcf.tile_columns; + cm->log2_tile_rows = cpi->oxcf.tile_rows; + + vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles); + max_log2_tiles += min_log2_tiles; + if (cm->log2_tile_columns < min_log2_tiles) + cm->log2_tile_columns = min_log2_tiles; + else if (cm->log2_tile_columns > max_log2_tiles) + cm->log2_tile_columns = max_log2_tiles; + cm->tile_columns = 1 << cm->log2_tile_columns; + cm->tile_rows = 1 << cm->log2_tile_rows; +} static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; cpi->goldfreq = 7; - cm->version = oxcf->Version; + cm->version = oxcf->version; vp9_setup_version(cm); + cm->width = oxcf->width; + cm->height = oxcf->height; + // change includes all joint functionality vp9_change_config(ptr, oxcf); @@ -1304,31 +1118,30 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->static_mb_pct = 0; -#if VP9_TEMPORAL_ALT_REF + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + + set_tile_limits(cpi); + { int i; - cpi->fixed_divide[0] = 0; - for (i = 1; i < 512; i++) cpi->fixed_divide[i] = 0x80000 / i; } -#endif } void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; - - if (!cpi) - return; + VP9_COMMON *const cm = &cpi->common; - if (!oxcf) + if (!cpi || !oxcf) return; - if (cm->version != oxcf->Version) { - cm->version = oxcf->Version; + if (cm->version != oxcf->version) { + cm->version = oxcf->version; vp9_setup_version(cm); } @@ -1351,7 +1164,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { if (cpi->oxcf.cpu_used > 5) cpi->oxcf.cpu_used = 5; - break; case MODE_SECONDPASS_BEST: @@ -1364,20 +1176,14 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; - cpi->mb.e_mbd.inv_xform4x4_1_x8 = vp9_short_idct4x4llm_1; - cpi->mb.e_mbd.inv_xform4x4_x8 = vp9_short_idct4x4llm; - cpi->mb.e_mbd.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1; - cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4; - -#if CONFIG_LOSSLESS cpi->oxcf.lossless = 
oxcf->lossless; if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.inv_xform4x4_1_x8 = vp9_short_inv_walsh4x4_1_x8; - cpi->mb.e_mbd.inv_xform4x4_x8 = vp9_short_inv_walsh4x4_x8; - cpi->mb.e_mbd.inv_walsh4x4_1 = vp9_short_inv_walsh4x4_1_lossless; - cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; + } else { + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; } -#endif cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -1385,8 +1191,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // cpi->use_golden_frame_only = 0; // cpi->use_last_frame_only = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; cm->refresh_entropy_probs = 1; setup_features(cpi); @@ -1414,31 +1220,28 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; - cpi->oxcf.starting_buffer_level = - rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, + cpi->oxcf.target_bandwidth, 1000); // Set or reset optimal and maximum buffer levels. if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.optimal_buffer_level = - rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); + cpi->oxcf.optimal_buffer_level = rescale(cpi->oxcf.optimal_buffer_level, + cpi->oxcf.target_bandwidth, 1000); if (cpi->oxcf.maximum_buffer_size == 0) cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; else - cpi->oxcf.maximum_buffer_size = - rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); + cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size, + cpi->oxcf.target_bandwidth, 1000); // Set up frame rate and related parameters rate control values. 
vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate); // Set absolute upper and lower quality limits - cpi->worst_quality = cpi->oxcf.worst_allowed_q; - cpi->best_quality = cpi->oxcf.best_allowed_q; + cpi->worst_quality = cpi->oxcf.worst_allowed_q; + cpi->best_quality = cpi->oxcf.best_allowed_q; // active values should only be modified if out of new range if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) { @@ -1467,11 +1270,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - cm->Width = cpi->oxcf.Width; - cm->Height = cpi->oxcf.Height; - - cm->horiz_scale = cpi->horiz_scale; - cm->vert_scale = cpi->vert_scale; + cm->display_width = cpi->oxcf.width; + cm->display_height = cpi->oxcf.height; // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) @@ -1479,26 +1279,18 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); - - Scale2Ratio(cm->horiz_scale, &hr, &hs); - Scale2Ratio(cm->vert_scale, &vr, &vs); - - // always go to the next whole number - cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs; - cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; - } - - if (((cm->Width + 15) & 0xfffffff0) != - cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != - cm->yv12_fb[cm->lst_fb_idx].y_height || - cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { + // Increasing the size of the frame beyond the first seen frame, or some + // otherwise signalled maximum size, is not supported. + // TODO(jkoleszar): exit gracefully. 
+ if (!cpi->initial_width) { alloc_raw_frame_buffers(cpi); vp9_alloc_compressor_data(cpi); + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; } + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); if (cpi->oxcf.fixed_q >= 0) { cpi->last_q[0] = cpi->oxcf.fixed_q; @@ -1526,6 +1318,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->last_frame_distortion = 0; #endif + set_tile_limits(cpi); } #define M_LOG2_E 0.693147180559945309417 @@ -1541,30 +1334,30 @@ static void cal_nmvjointsadcost(int *mvjointsadcost) { static void cal_nmvsadcosts(int *mvsadcost[2]) { int i = 1; - mvsadcost [0] [0] = 0; - mvsadcost [1] [0] = 0; + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; do { double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost [0][i] = (int) z; - mvsadcost [1][i] = (int) z; - mvsadcost [0][-i] = (int) z; - mvsadcost [1][-i] = (int) z; + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; } while (++i <= MV_MAX); } static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { int i = 1; - mvsadcost [0] [0] = 0; - mvsadcost [1] [0] = 0; + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; do { double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost [0][i] = (int) z; - mvsadcost [1][i] = (int) z; - mvsadcost [0][-i] = (int) z; - mvsadcost [1][-i] = (int) z; + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; } while (++i <= MV_MAX); } @@ -1681,6 +1474,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { #endif #ifdef NMV_STATS init_nmvstats(); +#endif +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_STATS + init_nzcstats(); +#endif #endif /*Initialize the feed-forward activity masking.*/ @@ -1693,7 +1491,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->source_alt_ref_pending = FALSE; cpi->source_alt_ref_active = FALSE; - cpi->common.refresh_alt_ref_frame = 0; + cpi->refresh_alt_ref_frame = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1795,10 +1593,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->rd_thresh_mult[i] = 128; } -#ifdef ENTROPY_STATS - init_mv_ref_counts(); -#endif - #define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].vf = VF; \ @@ -1838,14 +1632,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) -#if ARCH_X86 || ARCH_X86_64 - cpi->fn_ptr[BLOCK_16X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_16X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X16].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_8X8].copymem = vp9_copy32xn; - cpi->fn_ptr[BLOCK_4X4].copymem = vp9_copy32xn; -#endif - cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; cpi->refining_search_sad = vp9_refining_search_sad; @@ -1865,6 +1651,13 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->common.error.setjmp = 0; vp9_zero(cpi->y_uv_mode_count) +#if CONFIG_CODE_NONZEROCOUNT + vp9_zero(cm->fc.nzc_counts_4x4); + vp9_zero(cm->fc.nzc_counts_8x8); + vp9_zero(cm->fc.nzc_counts_16x16); + vp9_zero(cm->fc.nzc_counts_32x32); + vp9_zero(cm->fc.nzc_pcat_counts); +#endif return (VP9_PTR) cpi; } @@ -1885,13 +1678,19 @@ void vp9_remove_compressor(VP9_PTR *ptr) { if (cpi->pass != 1) { print_context_counters(); print_tree_update_probs(); - 
print_mode_context(); + print_mode_context(&cpi->common); } #endif #ifdef NMV_STATS if (cpi->pass != 1) print_nmvstats(); #endif +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_STATS + if (cpi->pass != 1) + print_nzcstats(); +#endif +#endif #if CONFIG_INTERNAL_STATS @@ -1908,7 +1707,8 @@ void vp9_remove_compressor(VP9_PTR *ptr) { print_mode_contexts(&cpi->common); #endif if (cpi->b_calculate_psnr) { - YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + YV12_BUFFER_CONFIG *lst_yv12 = + &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]]; double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height; double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error); double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2); @@ -2176,8 +1976,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) { struct vpx_codec_cx_pkt pkt; uint64_t sse; int i; - unsigned int width = cpi->common.Width; - unsigned int height = cpi->common.Height; + unsigned int width = cpi->common.width; + unsigned int height = cpi->common.height; pkt.kind = VPX_CODEC_PSNR_PKT; sse = calc_plane_error(orig->y_buffer, orig->y_stride, @@ -2230,34 +2030,34 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) { if (ref_frame_flags > 7) return -1; - cpi->common.refresh_golden_frame = 0; - cpi->common.refresh_alt_ref_frame = 0; - cpi->common.refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->refresh_last_frame = 0; if (ref_frame_flags & VP9_LAST_FLAG) - cpi->common.refresh_last_frame = 1; + cpi->refresh_last_frame = 1; if (ref_frame_flags & VP9_GOLD_FLAG) - cpi->common.refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; if (ref_frame_flags & VP9_ALT_FLAG) - cpi->common.refresh_alt_ref_frame = 1; + cpi->refresh_alt_ref_frame = 1; return 0; } -int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { +int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { VP9_COMP *cpi = (VP9_COMP *)(ptr); VP9_COMMON *cm = &cpi->common; int ref_fb_idx; if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->lst_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->gld_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->alt_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; else return -1; @@ -2266,6 +2066,17 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, return 0; } +int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { + VP9_COMP *cpi = (VP9_COMP *)(ptr); + VP9_COMMON *cm = &cpi->common; + + if (index < 0 || index >= NUM_REF_FRAMES) + return -1; + + *fb = &cm->yv12_fb[cm->ref_frame_map[index]]; + return 0; +} + int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { VP9_COMP *cpi = (VP9_COMP *)(ptr); @@ -2274,11 +2085,11 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, int ref_fb_idx; if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->lst_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->gld_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->alt_fb_idx; + ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; else return -1; @@ -2327,7 +2138,7 @@ void 
vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) { void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; uint8_t *src = s->y_buffer; - int h = cm->Height; + int h = cm->height; do { fwrite(src, s->y_width, 1, yuv_rec_file); @@ -2335,7 +2146,7 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { } while (--h); src = s->u_buffer; - h = (cm->Height + 1) / 2; + h = (cm->height + 1) / 2; do { fwrite(src, s->uv_width, 1, yuv_rec_file); @@ -2343,15 +2154,79 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { } while (--h); src = s->v_buffer; - h = (cm->Height + 1) / 2; + h = (cm->height + 1) / 2; do { fwrite(src, s->uv_width, 1, yuv_rec_file); src += s->uv_stride; } while (--h); + fflush(yuv_rec_file); } #endif +static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, + YV12_BUFFER_CONFIG *dst_fb) { + const int in_w = src_fb->y_crop_width; + const int in_h = src_fb->y_crop_height; + const int out_w = dst_fb->y_crop_width; + const int out_h = dst_fb->y_crop_height; + int x, y; + + for (y = 0; y < out_h; y += 16) { + for (x = 0; x < out_w; x += 16) { + int x_q4 = x * 16 * in_w / out_w; + int y_q4 = y * 16 * in_h / out_h; + uint8_t *src, *dst; + int src_stride, dst_stride; + + + src = src_fb->y_buffer + + y * in_h / out_h * src_fb->y_stride + + x * in_w / out_w; + dst = dst_fb->y_buffer + + y * dst_fb->y_stride + + x; + src_stride = src_fb->y_stride; + dst_stride = dst_fb->y_stride; + + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 16, 16); + + x_q4 >>= 1; + y_q4 >>= 1; + src_stride = src_fb->uv_stride; + dst_stride = dst_fb->uv_stride; + + src = src_fb->u_buffer + + y / 2 * in_h / out_h * src_fb->uv_stride + + x / 2 * in_w / out_w; + dst = dst_fb->u_buffer + + y / 2 * dst_fb->uv_stride + + x / 2; + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 8, 8); + + src = src_fb->v_buffer + + y / 2 * in_h / out_h * src_fb->uv_stride + + x / 2 * in_w / out_w; + dst = dst_fb->v_buffer + + y / 2 * dst_fb->uv_stride + + x / 2; + vp9_convolve8(src, src_stride, dst, dst_stride, + vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, + vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, + 8, 8); + } + } + + vp8_yv12_extend_frame_borders(dst_fb); +} + + static void update_alt_ref_frame_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; @@ -2374,13 +2249,13 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; // Update the Golden frame usage counts. - if (cm->refresh_golden_frame) { + if (cpi->refresh_golden_frame) { // Update data structure that monitors level of reference to last GF vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; // this frame refreshes means next frames don't unless specified by user - cm->refresh_golden_frame = 0; + cpi->refresh_golden_frame = 0; cpi->common.frames_since_golden = 0; // if ( cm->frame_type == KEY_FRAME ) @@ -2402,7 +2277,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // ******** Fixed Q test code only ************ // If we are going to use the ALT reference for the next group of frames set a flag to say so. 
if (cpi->oxcf.fixed_q >= 0 && - cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) { + cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { cpi->source_alt_ref_pending = TRUE; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; } @@ -2414,7 +2289,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - } else if (!cpi->common.refresh_alt_ref_frame) { + } else if (!cpi->refresh_alt_ref_frame) { // Decrement count down till next gf if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; @@ -2535,8 +2410,8 @@ static int recode_loop_test(VP9_COMP *cpi, if ((cpi->sf.recode_loop == 1) || ((cpi->sf.recode_loop == 2) && ((cm->frame_type == KEY_FRAME) || - cm->refresh_golden_frame || - cm->refresh_alt_ref_frame))) { + cpi->refresh_golden_frame || + cpi->refresh_alt_ref_frame))) { // General over and under shoot tests if (((cpi->projected_frame_size > high_limit) && (q < maxq)) || ((cpi->projected_frame_size < low_limit) && (q > minq))) { @@ -2563,86 +2438,56 @@ static int recode_loop_test(VP9_COMP *cpi, return force_recode; } -static void update_reference_frames(VP9_COMMON *cm) { - YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb; +static void update_reference_frames(VP9_COMP * const cpi) { + VP9_COMMON * const cm = &cpi->common; // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. - if (cm->frame_type == KEY_FRAME) { - yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG; - - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - - cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx; - } else { /* For non key frames */ - if (cm->refresh_alt_ref_frame) { - assert(!cm->copy_buffer_to_arf); - - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG; - cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->new_fb_idx; - } else if (cm->copy_buffer_to_arf) { - assert(!(cm->copy_buffer_to_arf & ~0x3)); - - if (cm->copy_buffer_to_arf == 1) { - if (cm->alt_fb_idx != cm->lst_fb_idx) { - yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->lst_fb_idx; - } - } else { /* if (cm->copy_buffer_to_arf == 2) */ - if (cm->alt_fb_idx != cm->gld_fb_idx) { - yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG; - yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG; - cm->alt_fb_idx = cm->gld_fb_idx; - } - } + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + /* Preserve the previously existing golden frame and update the frame in + * the alt ref slot instead. This is highly specific to the current use of + * alt-ref as a forward reference, and this needs to be generalized as + * other uses are implemented (like RTC/temporal scaling) + * + * The update to the buffer in the alt ref slot was signalled in + * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated + * as the golden frame next time. 
+ */ + int tmp; + + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + + tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else { /* For non key/golden frames */ + if (cpi->refresh_alt_ref_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); } - if (cm->refresh_golden_frame) { - assert(!cm->copy_buffer_to_gf); - - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG; - cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->new_fb_idx; - } else if (cm->copy_buffer_to_gf) { - assert(!(cm->copy_buffer_to_arf & ~0x3)); - - if (cm->copy_buffer_to_gf == 1) { - if (cm->gld_fb_idx != cm->lst_fb_idx) { - yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG; - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->lst_fb_idx; - } - } else { /* if (cm->copy_buffer_to_gf == 2) */ - if (cm->alt_fb_idx != cm->gld_fb_idx) { - yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG; - yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG; - cm->gld_fb_idx = cm->alt_fb_idx; - } - } + if (cpi->refresh_golden_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); } } - if (cm->refresh_last_frame) { - cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG; - cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG; - cm->lst_fb_idx = cm->new_fb_idx; + if (cpi->refresh_last_frame) { + ref_cnt_fb(cm->fb_idx_ref_cnt, + &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); } } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { - if (cm->no_lpf) { + if (cm->no_lpf || cpi->mb.e_mbd.lossless) { cm->filter_level = 0; - } -#if CONFIG_LOSSLESS - else if (cpi->oxcf.lossless) { - cm->filter_level = 0; - } -#endif - else { + } else { struct vpx_usec_timer timer; vp9_clear_system_state(); @@ -2659,14 +2504,15 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { if (cm->filter_level > 0) { vp9_set_alt_lf_level(cpi, cm->filter_level); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0, + cm->dering_enabled); } vp8_yv12_extend_frame_borders(cm->frame_to_show); } -void select_interp_filter_type(VP9_COMP *cpi) { +void vp9_select_interp_filter_type(VP9_COMP *cpi) { int i; int high_filter_index = 0; unsigned int thresh; @@ -2719,6 +2565,38 @@ static void select_interintra_mode(VP9_COMP *cpi) { } #endif +static void scale_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + + for (i = 0; i < 3; i++) { + YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]]; + + if (ref->y_crop_width != cm->width || + ref->y_crop_height != cm->height) { + int new_fb = get_free_fb(cm); + + vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb], + cm->width, cm->height, + VP9BORDERINPIXELS); + scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); + cpi->scaled_ref_idx[i] = new_fb; + } else { + cpi->scaled_ref_idx[i] = cm->ref_frame_map[i]; + cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++; + } + } +} + +static void release_scaled_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + + for (i = 0; i < 3; i++) { + cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--; + } +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, @@ -2735,8 +2613,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int q_low; int q_high; - int zbin_oq_high; - int zbin_oq_low = 0; int top_index; int bottom_index; @@ -2749,11 +2625,7 
@@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if RESET_FOREACH_FILTER int q_low0; int q_high0; - int zbin_oq_high0; - int zbin_oq_low0 = 0; int Q0; - int last_zbin_oq; - int last_zbin_oq0; int active_best_quality0; int active_worst_quality0; double rate_correction_factor0; @@ -2773,39 +2645,42 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int mcomp_filter_index = 0; int64_t mcomp_filter_cost[4]; + /* Scale the source buffer, if required */ + if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width || + cm->mb_rows * 16 != cpi->un_scaled_source->y_height) { + scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); + cpi->Source = &cpi->scaled_source; + } else { + cpi->Source = cpi->un_scaled_source; + } + + scale_references(cpi); + // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); // For an alt ref frame in 2 pass we skip the call to the second // pass function that sets the target bandwidth so must set it here - if (cpi->common.refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // Per frame bit target for the alt ref frame // per second target bitrate cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_frame_rate); } - // Default turn off buffer to buffer copying - cm->copy_buffer_to_gf = 0; - cm->copy_buffer_to_arf = 0; - // Clear zbin over-quant value and mode boost values. - cpi->zbin_over_quant = 0; cpi->zbin_mode_boost = 0; // Enable or disable mode based tweaking of the zbin // For 2 Pass Only used where GF/ARF prediction quality // is above a threshold cpi->zbin_mode_boost = 0; -#if CONFIG_LOSSLESS - cpi->zbin_mode_boost_enabled = FALSE; -#else - cpi->zbin_mode_boost_enabled = TRUE; -#endif - if (cpi->gfu_boost <= 400) { + + // if (cpi->oxcf.lossless) cpi->zbin_mode_boost_enabled = FALSE; - } + // else + // cpi->zbin_mode_boost_enabled = TRUE; // Current default encoder behaviour for the altref sign bias if (cpi->source_alt_ref_active) @@ -2846,10 +2721,22 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, for (i = 0; i < MAX_MODES; i++) { cpi->rd_thresh_mult[i] = 128; } + + cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); + cm->frame_parallel_decoding_mode = + (cpi->oxcf.frame_parallel_decoding_mode != 0); + if (cm->error_resilient_mode) { + cm->frame_parallel_decoding_mode = 1; + cm->refresh_entropy_probs = 0; + } } - // Test code for new segment features - init_seg_features(cpi); + // Configure use of segmentation for enhanced coding of static regions. + // Only allowed for now in second pass of two pass (as requires lagged coding) + // and if the relevant speed feature flag is set.
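/* [Editor's sketch, not part of the patch] The error-resilience flags set
 * in the hunk above pair with gating that appears later in this function:
 * backward probability adaptation runs only when decoding is neither
 * error resilient nor frame parallel, so such decoders never depend on
 * adapted contexts. Condensed into a hypothetical helper: */
static void adapt_probs_if_allowed(VP9_COMP *cpi) {
  VP9_COMMON *cm = &cpi->common;
  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
    vp9_adapt_coef_probs(cm);
    vp9_adapt_mode_probs(cm);
    vp9_adapt_nmv_probs(cm, cpi->mb.e_mbd.allow_high_precision_mv);
  }
}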
+ if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { + configure_static_seg_features(cpi); + } // Decide how big to make the frame vp9_pick_frame_size(cpi); @@ -2896,9 +2783,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->active_best_quality < cpi->best_quality) cpi->active_best_quality = cpi->best_quality; } - } - - else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) { + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { int high = 2000; int low = 400; @@ -2935,7 +2820,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->active_best_quality * 15 / 16; } } else { +#ifdef ONE_SHOT_Q_ESTIMATE +#ifdef STRICT_ONE_SHOT_Q + cpi->active_best_quality = Q; +#else + cpi->active_best_quality = inter_minq[Q]; +#endif +#else cpi->active_best_quality = inter_minq[Q]; +#endif // For the constant/constrained quality mode we dont want // q to fall below the cq level. @@ -2971,17 +2864,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Determine initial Q to try Q = vp9_regulate_q(cpi, cpi->this_frame_target); } -#if RESET_FOREACH_FILTER - last_zbin_oq = cpi->zbin_over_quant; -#endif - - // Set highest allowed value for Zbin over quant - if (cm->frame_type == KEY_FRAME) - zbin_oq_high = 0; // ZBIN_OQ_MAX/16 - else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active)) - zbin_oq_high = 16; - else - zbin_oq_high = ZBIN_OQ_MAX; vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); @@ -3016,7 +2898,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { - uint8_t *src; int l = 0; switch (cpi->oxcf.noise_sensitivity) { @@ -3030,7 +2911,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, l = 60; break; case 4: - case 5: l = 100; break; @@ -3039,18 +2919,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, break; } - - if (cm->frame_type == KEY_FRAME) { - vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0); - } else { - vp9_de_noise(cpi->Source, cpi->Source, l, 1, 0); - - src = cpi->Source->y_buffer; - - if (cpi->Source->y_stride < 0) { - src += cpi->Source->y_stride * (cpi->Source->y_height - 1); - } - } + vp9_denoise(cpi->Source, cpi->Source, l, 1, 0); } #endif @@ -3064,9 +2933,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, q_low0 = q_low; q_high0 = q_high; Q0 = Q; - zbin_oq_low0 = zbin_oq_low; - zbin_oq_high0 = zbin_oq_high; - last_zbin_oq0 = last_zbin_oq; rate_correction_factor0 = cpi->rate_correction_factor; gf_rate_correction_factor0 = cpi->gf_rate_correction_factor; active_best_quality0 = cpi->active_best_quality; @@ -3087,12 +2953,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k]; if (cm->frame_type != KEY_FRAME) { - if (cpi->common.refresh_alt_ref_frame) { + if (cpi->refresh_alt_ref_frame) { for (k = 0; k < MBSKIP_CONTEXTS; k++) { if (cpi->last_skip_false_probs[2][k] != 0) cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k]; } - } else if (cpi->common.refresh_golden_frame) { + } else if (cpi->refresh_golden_frame) { for (k = 0; k < MBSKIP_CONTEXTS; k++) { if (cpi->last_skip_false_probs[1][k] != 0) cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k]; @@ -3124,13 +2990,28 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set up entropy depending on frame type. - if (cm->frame_type == KEY_FRAME) + if (cm->frame_type == KEY_FRAME) { + /* Choose which entropy context to use. 
When using a forward reference + * frame, it immediately follows the keyframe, and thus benefits from + * using the same entropy context established by the keyframe. Otherwise, + * use the default context 0. + */ + cm->frame_context_idx = cpi->oxcf.play_alternate; vp9_setup_key_frame(cpi); - else + } else { + /* Choose which entropy context to use. Currently there are only two + * contexts used, one for normal frames and one for alt ref frames. + */ + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; vp9_setup_inter_frame(cpi); + } } // transform / motion compensation build reconstruction frame +#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS + if (cm->frame_type == KEY_FRAME) + vp9_adjust_default_coef_probs(cm); +#endif vp9_encode_frame(cpi); @@ -3214,23 +3095,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->projected_frame_size > cpi->this_frame_target) { q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value - if (cpi->zbin_over_quant > 0) // If we are using over quant do the same for zbin_oq_low - zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; - if (undershoot_seen || (loop_count > 1)) { // Update rate_correction_factor unless cpi->active_worst_quality has changed. if (!active_worst_qchanged) vp9_update_rate_correction_factors(cpi, 1); Q = (q_high + q_low + 1) / 2; - - // Adjust cpi->zbin_over_quant (only allowed when Q is max) - if (Q < MAXQ) - cpi->zbin_over_quant = 0; - else { - zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high; - cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; - } } else { // Update rate_correction_factor unless cpi->active_worst_quality has changed. if (!active_worst_qchanged) @@ -3238,7 +3108,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, Q = vp9_regulate_q(cpi, cpi->this_frame_target); - while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) { + while ((Q < q_low) && (Retries < 10)) { vp9_update_rate_correction_factors(cpi, 0); Q = vp9_regulate_q(cpi, cpi->this_frame_target); Retries++; @@ -3249,10 +3119,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Frame is too small else { - if (cpi->zbin_over_quant == 0) - q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant - else // else lower zbin_oq_high - zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low; + q_high = (Q > q_low) ? (Q - 1) : q_low; if (overshoot_seen || (loop_count > 1)) { // Update rate_correction_factor unless cpi->active_worst_quality has changed. @@ -3260,12 +3127,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_update_rate_correction_factors(cpi, 1); Q = (q_high + q_low) / 2; - - // Adjust cpi->zbin_over_quant (only allowed when Q is max) - if (Q < MAXQ) - cpi->zbin_over_quant = 0; - else - cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2; } else { // Update rate_correction_factor unless cpi->active_worst_quality has changed. 
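/* [Editor's sketch, not part of the patch] With zbin_over_quant removed,
 * the recode loop in these hunks reduces to a plain bisection on Q over
 * the running bracket [q_low, q_high]; the overshoot and undershoot
 * branches differ only in which bound they tighten and how the midpoint
 * rounds: */
static int bisect_q(int q_low, int q_high, int overshoot) {
  /* overshoot: move Q up, rounding the midpoint up;
   * undershoot: move Q down, rounding the midpoint down */
  return overshoot ? (q_high + q_low + 1) / 2 : (q_high + q_low) / 2;
}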
if (!active_worst_qchanged) @@ -3282,7 +3143,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, q_low = Q; } - while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) { + while ((Q > q_high) && (Retries < 10)) { vp9_update_rate_correction_factors(cpi, 0); Q = vp9_regulate_q(cpi, cpi->this_frame_target); Retries++; @@ -3293,21 +3154,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Clamp Q to upper and lower limits: - if (Q > q_high) - Q = q_high; - else if (Q < q_low) - Q = q_low; + Q = clamp(Q, q_low, q_high); - // Clamp cpi->zbin_over_quant - cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? - zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? - zbin_oq_high : cpi->zbin_over_quant; - - // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE; - Loop = ((Q != last_q)) ? TRUE : FALSE; -#if RESET_FOREACH_FILTER - last_zbin_oq = cpi->zbin_over_quant; -#endif + Loop = Q != last_q; } else Loop = FALSE; @@ -3351,12 +3200,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (Loop == TRUE) { overshoot_seen = FALSE; undershoot_seen = FALSE; - zbin_oq_low = zbin_oq_low0; - zbin_oq_high = zbin_oq_high0; q_low = q_low0; q_high = q_high0; Q = Q0; - cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0; cpi->rate_correction_factor = rate_correction_factor0; cpi->gf_rate_correction_factor = gf_rate_correction_factor0; cpi->active_best_quality = active_best_quality0; @@ -3412,12 +3258,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_update_gf_useage_maps(cpi, cm, &cpi->mb); if (cm->frame_type == KEY_FRAME) - cm->refresh_last_frame = 1; + cpi->refresh_last_frame = 1; #if 0 { FILE *f = fopen("gfactive.stt", "a"); - fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame); + fprintf(f, "%8d %8d %8d %8d %8d\n", + cm->current_video_frame, + (100 * cpi->gf_active_count) + / (cpi->common.mb_rows * cpi->common.mb_cols), + cpi->this_iiratio, + cpi->next_iiratio, + cpi->refresh_golden_frame); fclose(f); } #endif @@ -3444,18 +3296,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, update_reference_segmentation_map(cpi); } - update_reference_frames(cm); + release_scaled_references(cpi); + update_reference_frames(cpi); vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4); - vp9_copy(cpi->common.fc.hybrid_coef_counts_4x4, - cpi->hybrid_coef_counts_4x4); vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8); - vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, - cpi->hybrid_coef_counts_8x8); vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16); - vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16, - cpi->hybrid_coef_counts_16x16); vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32); - vp9_adapt_coef_probs(&cpi->common); + if (!cpi->common.error_resilient_mode && + !cpi->common.frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(&cpi->common); +#if CONFIG_CODE_NONZEROCOUNT + vp9_adapt_nzc_probs(&cpi->common); +#endif + } if (cpi->common.frame_type != KEY_FRAME) { vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count); vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count); @@ -3467,14 +3320,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_COMP_INTERINTRA_PRED vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count); #endif - vp9_adapt_mode_probs(&cpi->common); - 
cpi->common.fc.NMVcount = cpi->NMVcount; - /* - printf("2: %d %d %d %d\n", cpi->NMVcount.joints[0], cpi->NMVcount.joints[1], - cpi->NMVcount.joints[2], cpi->NMVcount.joints[3]); - */ - vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + if (!cpi->common.error_resilient_mode && + !cpi->common.frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(&cpi->common); + vp9_adapt_mode_context(&cpi->common); + vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + } } #if CONFIG_COMP_INTERINTRA_PRED if (cm->frame_type != KEY_FRAME) @@ -3502,8 +3354,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if ((cm->base_qindex < cpi->last_boosted_qindex) || ((cpi->static_mb_pct < 100) && ((cm->frame_type == KEY_FRAME) || - cm->refresh_alt_ref_frame || - (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) { + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) { cpi->last_boosted_qindex = cm->base_qindex; } @@ -3516,7 +3368,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; // Keep a record from which we can calculate the average Q excluding GF updates and key frames - if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) { + if ((cm->frame_type != KEY_FRAME) + && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { cpi->ni_frames++; cpi->tot_q += vp9_convert_qindex_to_q(Q); cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames; @@ -3538,11 +3391,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass. - cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; - cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; - cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; - cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32; + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. + if (cm->frame_type != KEY_FRAME) { + cpi->rolling_target_bits = + ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; + cpi->rolling_actual_bits = + ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; + cpi->long_rolling_target_bits = + ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; + cpi->long_rolling_actual_bits = + ((cpi->long_rolling_actual_bits * 31) + + cpi->projected_frame_size + 16) / 32; + } // Actual bits spent cpi->total_actual_bits += cpi->projected_frame_size; @@ -3558,7 +3419,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->twopass.kf_group_bits < 0) cpi->twopass.kf_group_bits = 0; - } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) { + } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; if (cpi->twopass.gf_group_bits < 0) @@ -3569,7 +3430,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // in this frame. 
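/* [Editor's sketch, not part of the patch] The rolling monitors in the
 * hunk above are fixed-point exponential moving averages with
 * round-to-nearest; the 3/4-weighted pair tracks short-term rate drift
 * and the 31/32-weighted pair the long-term trend. Generic form (helper
 * name is illustrative): */
static int ema_update(int avg, int sample, int window) {
  /* window = 4 yields (3 * avg + sample + 2) / 4 and window = 32 yields
   * (31 * avg + sample + 16) / 32, matching the code above */
  return ((window - 1) * avg + sample + window / 2) / window;
}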
update_base_skip_probs(cpi); -#if 0// 1 && CONFIG_INTERNAL_STATS +#if 0 // 1 && CONFIG_INTERNAL_STATS { FILE *f = fopen("tmp.stt", "a"); int recon_err; @@ -3582,13 +3443,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->twopass.total_left_stats->coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" "%10.3f %8d %10d %10d %10d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, 0, //loop_size_estimate, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, - (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), + (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, @@ -3597,9 +3458,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_q, vp9_convert_qindex_to_q(cpi->ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->zbin_over_quant, - // cpi->avg_frame_qindex, cpi->zbin_over_quant, - cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cpi->refresh_last_frame, + cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, @@ -3611,14 +3471,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, else fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%5d %5d %5d %8d %8d %8.2f %10d %10.3f" "%8d %10d %10d %10d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, 0, //loop_size_estimate, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, - (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), + (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, @@ -3627,9 +3487,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_q, vp9_convert_qindex_to_q(cpi->ni_av_qi), vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->zbin_over_quant, - // cpi->avg_frame_qindex, cpi->zbin_over_quant, - cm->refresh_golden_frame, cm->refresh_alt_ref_frame, + cpi->refresh_last_frame, + cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, @@ -3645,8 +3504,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, - cm->frame_type, cm->refresh_golden_frame, - cm->refresh_alt_ref_frame); + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); for (i = 0; i < MAX_MODES; i++) fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); @@ -3665,33 +3524,34 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #endif // If this was a kf or Gf note the Q - if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame) + if ((cm->frame_type == KEY_FRAME) + || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) cm->last_kf_gf_q = cm->base_qindex; - if (cm->refresh_golden_frame == 1) + if (cpi->refresh_golden_frame == 1) cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; else cm->frame_flags = 
cm->frame_flags&~FRAMEFLAGS_GOLDEN; - if (cm->refresh_alt_ref_frame == 1) + if (cpi->refresh_alt_ref_frame == 1) cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; else cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF; - if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed + if (cpi->refresh_last_frame & cpi->refresh_golden_frame) cpi->gold_is_last = 1; - else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other + else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame) cpi->gold_is_last = 0; - if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed + if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame) cpi->alt_is_last = 1; - else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other + else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame) cpi->alt_is_last = 0; - if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed + if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame) cpi->gold_is_alt = 1; - else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other + else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame) cpi->gold_is_alt = 0; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -3705,7 +3565,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->gold_is_alt) cpi->ref_frame_flags &= ~VP9_ALT_FLAG; - if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) + if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame + && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); else @@ -3727,6 +3588,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, xd->update_mb_segmentation_data = 0; xd->mode_ref_lf_delta_update = 0; + // keep track of the last coded dimensions + cm->last_width = cm->width; + cm->last_height = cm->height; // Dont increment frame counters if this was an altref buffer update not a real frame if (cm->show_frame) { @@ -3744,8 +3608,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, FILE *recon_file; sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); recon_file = fopen(filename, "wb"); - fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc, - cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file); + fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc, + cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size, + 1, recon_file); fclose(recon_file); } #endif @@ -3765,13 +3630,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { - if (!cpi->common.refresh_alt_ref_frame) + if (!cpi->refresh_alt_ref_frame) vp9_second_pass(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); + +#ifdef DISABLE_RC_LONG_TERM_MEM + cpi->twopass.bits_left -= cpi->this_frame_target; +#else cpi->twopass.bits_left -= 8 * *size; +#endif - if (!cpi->common.refresh_alt_ref_frame) { + if (!cpi->refresh_alt_ref_frame) { double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -3808,9 +3678,8 @@ static int frame_is_reference(const VP9_COMP *cpi) { const VP9_COMMON *cm = &cpi->common; const MACROBLOCKD *xd = &cpi->mb.e_mbd; - return cm->frame_type == KEY_FRAME || 
cm->refresh_last_frame - || cm->refresh_golden_frame || cm->refresh_alt_ref_frame - || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf + return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame + || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_entropy_probs || xd->mode_ref_lf_delta_update || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data; @@ -3846,9 +3715,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; - cm->refresh_alt_ref_frame = 1; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 0; + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; cm->show_frame = 0; cpi->source_alt_ref_pending = FALSE; // Clear Pending altf Ref flag. cpi->is_src_frame_alt_ref = 0; @@ -3862,8 +3731,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->is_src_frame_alt_ref = cpi->alt_ref_source && (cpi->source == cpi->alt_ref_source); - if (cpi->is_src_frame_alt_ref) + if (cpi->is_src_frame_alt_ref) { + cpi->refresh_last_frame = 0; cpi->alt_ref_source = NULL; + } } } @@ -3889,7 +3760,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } // adjust frame rates based on timestamps given - if (!cm->refresh_alt_ref_frame) { + if (!cpi->refresh_alt_ref_frame) { int64_t this_duration; int step = 0; @@ -3945,28 +3816,34 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #if 0 - if (cm->refresh_alt_ref_frame) { - // cm->refresh_golden_frame = 1; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 0; + if (cpi->refresh_alt_ref_frame) { + // cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; } else { - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; } #endif - /* find a free buffer for the new frame */ - { - int i = 0; - for (; i < NUM_YV12_BUFFERS; i++) { - if (!cm->yv12_fb[i].flags) { - cm->new_fb_idx = i; - break; - } - } - assert(i < NUM_YV12_BUFFERS); - } + /* find a free buffer for the new frame, releasing the reference previously + * held. 
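/* [Editor's sketch, not part of the patch] get_free_fb() is defined
 * elsewhere; judging from the inline buffer scan it replaces below and
 * the surrounding reference counting, the assumed behaviour is to find an
 * unreferenced buffer in the pool, claim it, and return its index
 * (requires <assert.h>): */
static int get_free_fb_sketch(VP9_COMMON *cm) {
  int i;
  for (i = 0; i < NUM_YV12_BUFFERS; i++)
    if (cm->fb_idx_ref_cnt[i] == 0)
      break;
  assert(i < NUM_YV12_BUFFERS);   /* pool must never be exhausted */
  cm->fb_idx_ref_cnt[i] = 1;      /* take the initial reference */
  return i;
}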
+ */ + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + cm->new_fb_idx = get_free_fb(cm); + + /* Get the mapping of L/G/A to the reference buffer pool */ + cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx]; + cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx]; + cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx]; + + /* Reset the frame pointers to the current frame size */ + vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], + cm->width, cm->height, + VP9BORDERINPIXELS); + + vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { @@ -3976,21 +3853,19 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } if (cm->refresh_entropy_probs) { - if (cm->refresh_alt_ref_frame) - vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc)); - else - vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc)); + vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc, + sizeof(cm->fc)); } - // if its a dropped frame honor the requests on subsequent frames if (*size > 0) { + // if its a dropped frame honor the requests on subsequent frames cpi->droppable = !frame_is_reference(cpi); // return to normal state cm->refresh_entropy_probs = 1; - cm->refresh_alt_ref_frame = 0; - cm->refresh_golden_frame = 0; - cm->refresh_last_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; cm->frame_type = INTER_FRAME; } @@ -4113,7 +3988,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { VP9_COMP *cpi = (VP9_COMP *) comp; - if (cpi->common.refresh_alt_ref_frame) + if (!cpi->common.show_frame) return -1; else { int ret; @@ -4123,9 +3998,9 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, if (cpi->common.frame_to_show) { *dest = *cpi->common.frame_to_show; - dest->y_width = cpi->common.Width; - dest->y_height = cpi->common.Height; - dest->uv_height = cpi->common.Height / 2; + dest->y_width = cpi->common.width; + dest->y_height = cpi->common.height; + dest->uv_height = cpi->common.height / 2; ret = 0; } else { ret = -1; @@ -4217,17 +4092,25 @@ int vp9_set_active_map(VP9_PTR comp, unsigned char *map, int vp9_set_internal_size(VP9_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { VP9_COMP *cpi = (VP9_COMP *) comp; + VP9_COMMON *cm = &cpi->common; + int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode <= ONETWO) - cpi->common.horiz_scale = horiz_mode; - else + if (horiz_mode > ONETWO) return -1; - if (vert_mode <= ONETWO) - cpi->common.vert_scale = vert_mode; - else + if (vert_mode > ONETWO) return -1; + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; + cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; + + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); return 0; } @@ -4235,16 +4118,17 @@ int vp9_set_internal_size(VP9_PTR comp, int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { int i, j; - int Total = 0; + int total = 0; uint8_t *src = source->y_buffer; uint8_t *dst = dest->y_buffer; - // Loop through the Y plane raw and reconstruction data summing (square differences) + // Loop through the Y plane raw and reconstruction data summing + // (square differences) for (i = 0; i < source->y_height; i += 16) { for (j = 0; j < source->y_width; j += 16) { unsigned int 
sse; - Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, + total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); } @@ -4252,7 +4136,7 @@ int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { dst += 16 * dest->y_stride; } - return Total; + return total; } diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 74a58b4300a3b37c77c25e15afd617dee2e16fba..891cc3f5223fb748e340c7b91f0a7494c72618fd 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -29,6 +29,11 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/encoder/vp9_lookahead.h" +// Experimental rate control switches +// #define ONE_SHOT_Q_ESTIMATE 1 +// #define STRICT_ONE_SHOT_Q 1 +// #define DISABLE_RC_LONG_TERM_MEM 1 + // #define SPEEDSTATS 1 #define MIN_GF_INTERVAL 4 #define DEFAULT_GF_INTERVAL 7 @@ -37,10 +42,6 @@ #define MAX_LAG_BUFFERS 25 -#define AF_THRESH 25 -#define AF_THRESH2 100 -#define ARF_DECAY_THRESH 12 - #if CONFIG_COMP_INTERINTRA_PRED #define MAX_MODES 54 #else @@ -50,12 +51,11 @@ #define MIN_THRESHMULT 32 #define MAX_THRESHMULT 512 -#define GF_ZEROMV_ZBIN_BOOST 12 -#define LF_ZEROMV_ZBIN_BOOST 6 -#define MV_ZBIN_BOOST 4 -#define ZBIN_OQ_MAX 192 - -#define VP9_TEMPORAL_ALT_REF 1 +#define GF_ZEROMV_ZBIN_BOOST 0 +#define LF_ZEROMV_ZBIN_BOOST 0 +#define MV_ZBIN_BOOST 0 +#define SPLIT_MV_ZBIN_BOOST 0 +#define INTRA_ZBIN_BOOST 0 typedef struct { nmv_context nmvc; @@ -86,13 +86,10 @@ typedef struct { // 0 = BPRED, ZERO_MV, MV, SPLIT signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32]; + vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES]; + vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES]; vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ @@ -111,6 +108,18 @@ typedef struct { int mv_ref_ct[INTER_MODE_CONTEXTS][4][2]; int vp9_mode_contexts[INTER_MODE_CONTEXTS][4]; +#if CONFIG_CODE_NONZEROCOUNT + vp9_prob nzc_probs_4x4 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES]; + vp9_prob nzc_probs_8x8 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES]; + vp9_prob nzc_probs_16x16 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES]; + vp9_prob nzc_probs_32x32 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES]; + vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS] + [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA]; +#endif } CODING_CONTEXT; typedef struct { @@ -259,7 +268,9 @@ typedef struct { int optimize_coefficients; int no_skip_block4x4_search; int search_best_filter; - + int splitmode_breakout; + int mb16_breakout; + int static_segmentation; } SPEED_FEATURES; typedef struct { @@ -301,41 +312,14 @@ typedef struct VP9_COMP { DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); - 
DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); - DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]); - DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]); - - DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]); - - DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]); - MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; @@ -357,11 +341,17 @@ typedef struct VP9_COMP { int alt_is_last; // Alt reference frame same as last ( short circuit altref search) int gold_is_alt; // don't do both alt and gold search ( just do gold). 
- // int refresh_alt_ref_frame; + int scaled_ref_idx[3]; + int lst_fb_idx; + int gld_fb_idx; + int alt_fb_idx; + int refresh_last_frame; + int refresh_golden_frame; + int refresh_alt_ref_frame; YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tok; - unsigned int tok_count; + unsigned int tok_count[1 << 6]; unsigned int frames_since_key; @@ -396,11 +386,6 @@ typedef struct VP9_COMP { CODING_CONTEXT coding_context; // Rate targetting variables - int64_t prediction_error; - int64_t last_prediction_error; - int64_t intra_error; - int64_t last_intra_error; - int this_frame_target; int projected_frame_size; int last_q[2]; // Separate values for Intra/Inter @@ -422,6 +407,7 @@ typedef struct VP9_COMP { int max_gf_interval; int baseline_gf_interval; int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames + int active_arnr_strength; // <= cpi->oxcf.arnr_max_strength int64_t key_frame_count; int prior_key_frame_distance[KEY_FRAME_CONTEXT]; @@ -441,7 +427,6 @@ typedef struct VP9_COMP { double tot_q; double avg_q; - int zbin_over_quant; int zbin_mode_boost; int zbin_mode_boost_enabled; @@ -484,37 +469,47 @@ typedef struct VP9_COMP { nmv_context_counts NMVcount; - vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_probs frame_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]; - vp9_coeff_stats frame_hybrid_branch_ct_4x4[BLOCK_TYPES_4X4]; - - vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_probs frame_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]; - vp9_coeff_stats frame_hybrid_branch_ct_8x8[BLOCK_TYPES_8X8]; - - vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_probs frame_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]; - vp9_coeff_stats frame_hybrid_branch_ct_16x16[BLOCK_TYPES_16X16]; - - vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32]; - vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32]; - vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES_32X32]; + vp9_coeff_count coef_counts_4x4[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES]; + + vp9_coeff_count coef_counts_8x8[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES]; + + vp9_coeff_count coef_counts_16x16[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES]; + + vp9_coeff_count coef_counts_32x32[BLOCK_TYPES]; + vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES]; + +#if CONFIG_CODE_NONZEROCOUNT + vp9_prob frame_nzc_probs_4x4 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES]; + unsigned int frame_nzc_branch_ct_4x4 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2]; + vp9_prob frame_nzc_probs_8x8 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES]; + unsigned int frame_nzc_branch_ct_8x8 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2]; + vp9_prob frame_nzc_probs_16x16 + 
[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES]; + unsigned int frame_nzc_branch_ct_16x16 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2]; + vp9_prob frame_nzc_probs_32x32 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES]; + unsigned int frame_nzc_branch_ct_32x32 + [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2]; +#endif int gfu_boost; int last_boost; int kf_boost; int kf_zeromotion_pct; - int target_bandwidth; + int64_t target_bandwidth; struct vpx_codec_pkt_list *output_pkt_list; #if 0 @@ -542,8 +537,6 @@ typedef struct VP9_COMP { int goldfreq; int auto_worst_q; int cpu_used; - int horiz_scale; - int vert_scale; int pass; vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS]; @@ -628,11 +621,9 @@ typedef struct VP9_COMP { double est_max_qcorrection_factor; } twopass; -#if VP9_TEMPORAL_ALT_REF YV12_BUFFER_CONFIG alt_ref_buffer; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; -#endif #if CONFIG_INTERNAL_STATS int count; @@ -683,9 +674,6 @@ typedef struct VP9_COMP { int droppable; - // TODO Do we still need this?? - int update_context; - int dummy_packing; /* flag to indicate if packing is dummy */ unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] @@ -696,6 +684,8 @@ typedef struct VP9_COMP { unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; #endif + int initial_width; + int initial_height; } VP9_COMP; void vp9_encode_frame(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index b443ede6fae6a121e19b3bba247a41ebdb716d89..645d66b258e0453a374cc0b57876b3035654429b 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -8,7 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ - +#include <assert.h> +#include <limits.h> #include "vp9/common/vp9_onyxc_int.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_picklpf.h" @@ -27,6 +28,7 @@ void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, int yoffset; int linestocopy; + assert(src_ybc->y_stride == dst_ybc->y_stride); yheight = src_ybc->y_height; ystride = src_ybc->y_stride; @@ -246,7 +248,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int Bias = 0; // Bias against raising loop filter and in favour of lowering it // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); + vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -266,7 +268,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { // Get baseline error score vp9_set_alt_lf_level(cpi, filt_mid); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, 0); best_err = vp9_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; @@ -291,7 +293,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { if ((filt_direction <= 0) && (filt_low != filt_mid)) { // Get Low filter error score vp9_set_alt_lf_level(cpi, filt_low); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, 0); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); @@ -311,7 +313,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { // Now look at filt_high if ((filt_direction >= 0) && (filt_high != filt_mid)) { vp9_set_alt_lf_level(cpi, filt_high); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1); + 
vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, 0); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); @@ -336,4 +338,30 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { } cm->filter_level = filt_best; + +#if CONFIG_LOOP_DERING + /* Decide whether to turn on deringing filter */ + { // NOLINT + int best_dering = 0; + int this_dering; + int last_err_diff = INT_MAX; + + for (this_dering = 1; this_dering <= 16; this_dering++) { + vp9_set_alt_lf_level(cpi, filt_best); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, this_dering); + filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + if (filt_err < best_err) { + best_err = filt_err; + best_dering = this_dering; + last_err_diff = INT_MAX; + } else { + if (filt_err - best_err > last_err_diff) + break; + last_err_diff = filt_err - best_err; + } + } + cm->dering_enabled = best_dering; + } +#endif } diff --git a/vp9/encoder/vp9_picklpf.h b/vp9/encoder/vp9_picklpf.h index cb015006ff24b0ecc78da6839164578bb5f963f9..ca3cab618009c108c122e9288c474f0c536ce97f 100644 --- a/vp9/encoder/vp9_picklpf.h +++ b/vp9/encoder/vp9_picklpf.h @@ -15,12 +15,12 @@ struct yv12_buffer_config; struct VP9_COMP; -extern void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd, - struct VP9_COMP *cpi); +void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP9_COMP *cpi); -extern void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val); +void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val); -extern void vp9_pick_filter_level(struct yv12_buffer_config *sd, - struct VP9_COMP *cpi); +void vp9_pick_filter_level(struct yv12_buffer_config *sd, + struct VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c index eb00f41590b470466215a1f8b0a85ae053901482..94394341d8ea38cc51f6f9853395faae081b7518 100644 --- a/vp9/encoder/vp9_psnr.c +++ b/vp9/encoder/vp9_psnr.c @@ -11,17 +11,16 @@ #include "vpx_scale/yv12config.h" #include "math.h" -#include "vp9/common/vp9_systemdependent.h" /* for vp9_clear_system_state() */ #define MAX_PSNR 100 -double vp9_mse2psnr(double Samples, double Peak, double Mse) { +double vp9_mse2psnr(double samples, double peak, double mse) { double psnr; - if ((double)Mse > 0.0) - psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); else - psnr = MAX_PSNR; // Limit to prevent / 0 + psnr = MAX_PSNR; // Limit to prevent / 0 if (psnr > MAX_PSNR) psnr = MAX_PSNR; diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h index 121f0dc98dcffca589623c8daaf7446d8750a1d1..15dd8366bd830c16eb28b7be47e95861752bf577 100644 --- a/vp9/encoder/vp9_psnr.h +++ b/vp9/encoder/vp9_psnr.h @@ -12,6 +12,6 @@ #ifndef VP9_ENCODER_VP9_PSNR_H_ #define VP9_ENCODER_VP9_PSNR_H_ -extern double vp9_mse2psnr(double Samples, double Peak, double Mse); +double vp9_mse2psnr(double samples, double peak, double mse); #endif // VP9_ENCODER_VP9_PSNR_H_ diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 36b6567135cb5d8b05ba75e0f04d736ec4453781..881fce50f4f11feeef99d96898d89e4366b3623b 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -21,32 +21,46 @@ extern int enc_debug; #endif -void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) { +static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + if (b_idx < (16 << (sb_type * 
2))) + return 0; // Y + else if (b_idx < (20 << (sb_type * 2))) + return 16; // U + assert(b_idx < (24 << (sb_type * 2))); + return 20; // V +} + +void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { + MACROBLOCKD *const xd = &mb->e_mbd; + BLOCK *const b = &mb->block[0]; + BLOCKD *const d = &xd->block[0]; int i, rc, eob; int zbin; int x, y, z, sz; + int16_t *coeff_ptr = mb->coeff + b_idx * 16; + int16_t *qcoeff_ptr = xd->qcoeff + b_idx * 16; + int16_t *dqcoeff_ptr = xd->dqcoeff + b_idx * 16; int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int16_t *coeff_ptr = b->coeff; int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant; uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; int zbin_oq_value = b->zbin_extra; + const int *pt_scan; +#if CONFIG_CODE_NONZEROCOUNT + int nzc = 0; +#endif - int const *pt_scan ; - + assert(plane_idx(xd, b_idx) == 0); switch (tx_type) { case ADST_DCT: pt_scan = vp9_row_scan_4x4; break; - case DCT_ADST: pt_scan = vp9_col_scan_4x4; break; - default: pt_scan = vp9_default_zig_zag1d_4x4; break; @@ -57,251 +71,266 @@ void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) { eob = -1; - for (i = 0; i < b->eob_max_offset; i++) { - rc = pt_scan[i]; - z = coeff_ptr[rc]; - - zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; - zbin_boost_ptr ++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + if (!b->skip_block) { + for (i = 0; i < 16; i++) { + rc = pt_scan[i]; + z = coeff_ptr[rc]; + + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + zbin_boost_ptr++; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += round_ptr[rc]; + y = (((x * quant_ptr[rc]) >> 16) + x) + >> quant_shift_ptr[rc]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs +#if CONFIG_CODE_NONZEROCOUNT + ++nzc; // number of nonzero coeffs +#endif + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + } } } } - d->eob = eob + 1; + xd->eobs[b_idx] = eob + 1; +#if CONFIG_CODE_NONZEROCOUNT + xd->nzcs[b_idx] = nzc; +#endif } -void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) { +void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; int i, rc, eob; int zbin; int x, y, z, sz; + int16_t *coeff_ptr = mb->coeff + b_idx * 16; + int16_t *qcoeff_ptr = xd->qcoeff + b_idx * 16; + int16_t *dqcoeff_ptr = xd->dqcoeff + b_idx * 16; int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int16_t *coeff_ptr = b->coeff; int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant; uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; int zbin_oq_value = b->zbin_extra; +#if 
CONFIG_CODE_NONZEROCOUNT + int nzc = 0; +#endif vpx_memset(qcoeff_ptr, 0, 32); vpx_memset(dqcoeff_ptr, 0, 32); eob = -1; - for (i = 0; i < b->eob_max_offset; i++) { - rc = vp9_default_zig_zag1d_4x4[i]; - z = coeff_ptr[rc]; + if (!b->skip_block) { + for (i = 0; i < 16; i++) { + rc = vp9_default_zig_zag1d_4x4[i]; + z = coeff_ptr[rc]; - zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; - zbin_boost_ptr ++; + zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; + zbin_boost_ptr++; - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) - if (x >= zbin) { - x += round_ptr[rc]; + if (x >= zbin) { + x += round_ptr[rc]; - y = (((x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + y = (((x * quant_ptr[rc]) >> 16) + x) + >> quant_shift_ptr[rc]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + if (y) { + eob = i; // last nonzero coeffs +#if CONFIG_CODE_NONZEROCOUNT + ++nzc; // number of nonzero coeffs +#endif + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength + } } } } - d->eob = eob + 1; + xd->eobs[b_idx] = eob + 1; +#if CONFIG_CODE_NONZEROCOUNT + xd->nzcs[b_idx] = nzc; +#endif } -void vp9_quantize_mby_4x4_c(MACROBLOCK *x) { +void vp9_quantize_mby_4x4(MACROBLOCK *x) { int i; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); for (i = 0; i < 16; i++) { - TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]); + TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i); if (tx_type != DCT_DCT) { - assert(has_2nd_order == 0); - vp9_ht_quantize_b_4x4(&x->block[i], &x->e_mbd.block[i], tx_type); + vp9_ht_quantize_b_4x4(x, i, tx_type); } else { - x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_4x4(x, i); } } - if (has_2nd_order) { - x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]); - } else { - vpx_memset(x->e_mbd.block[24].qcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].qcoeff[0])); - vpx_memset(x->e_mbd.block[24].dqcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].dqcoeff[0])); - x->e_mbd.block[24].eob = 0; - } } -void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) { +void vp9_quantize_mbuv_4x4(MACROBLOCK *x) { int i; for (i = 16; i < 24; i++) - x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_4x4(x, i); } -void vp9_quantize_mb_4x4_c(MACROBLOCK *x) { - vp9_quantize_mby_4x4_c(x); - vp9_quantize_mbuv_4x4_c(x); +void vp9_quantize_mb_4x4(MACROBLOCK *x) { + vp9_quantize_mby_4x4(x); + vp9_quantize_mbuv_4x4(x); } -void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) { - int i, rc, eob; - int zbin; - int x, y, z, sz; - int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int zbin_zrun_index = 0; - int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin; - int16_t *round_ptr = b->round; - int16_t *quant_ptr = b->quant; - uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; - int16_t *dequant_ptr = d->dequant; - int zbin_oq_value = b->zbin_extra; - // double q2nd = 4; - vpx_memset(qcoeff_ptr, 0, 32); - vpx_memset(dqcoeff_ptr, 0, 32); - - eob = -1; - - for (i = 0; i < b->eob_max_offset_8x8; i++) { - rc = vp9_default_zig_zag1d_4x4[i]; - z = 
coeff_ptr[rc]; - - zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index]; - zbin_zrun_index += 4; - zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value); - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) +void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { + MACROBLOCKD *const xd = &mb->e_mbd; + int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx; + int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; + const int *pt_scan; - if (x >= zbin) { - x += (round_ptr[rc]); - y = ((int)((int)(x * quant_ptr[rc]) >> 16) + x) - >> quant_shift_ptr[rc]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_zrun_index = 0; - } - } + switch (tx_type) { + case ADST_DCT: + pt_scan = vp9_row_scan_8x8; + break; + case DCT_ADST: + pt_scan = vp9_col_scan_8x8; + break; + default: + pt_scan = vp9_default_zig_zag1d_8x8; + break; } - d->eob = eob + 1; -} - -void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) { - int i, rc, eob; - int zbin; - int x, y, z, sz; - int16_t *zbin_boost_ptr = b->zrun_zbin_boost_8x8; - int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin_8x8; - int16_t *round_ptr = b->round; - int16_t *quant_ptr = b->quant; - uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; - int16_t *dequant_ptr = d->dequant; - int zbin_oq_value = b->zbin_extra; - vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t)); - eob = -1; - - for (i = 0; i < b->eob_max_offset_8x8; i++) { - rc = vp9_default_zig_zag1d_8x8[i]; - z = coeff_ptr[rc]; - - zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value); - zbin_boost_ptr++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += (round_ptr[rc != 0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc != 0]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value + if (!b->skip_block) { + int i, rc, eob; + int zbin; + int x, y, z, sz; + int zero_run; + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = mb->coeff + 16 * b_idx; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + uint8_t *quant_shift_ptr = b->quant_shift; + int16_t *dequant_ptr = d->dequant; + int zbin_oq_value = b->zbin_extra; +#if CONFIG_CODE_NONZEROCOUNT + int nzc = 0; +#endif - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = b->zrun_zbin_boost_8x8; + eob = -1; + + // Special case for DC as it is the one triggering access in various + // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0] + { + z = coeff_ptr[0]; + zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value); + zero_run = 1; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[0]); + y = ((int)(((int)(x * quant_ptr[0]) >> 16) + x)) + >> quant_shift_ptr[0]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[0] = x; // write to destination + dqcoeff_ptr[0] = x * dequant_ptr[0]; // dequantized value + + if (y) { + eob = 0; // last nonzero coeffs +#if CONFIG_CODE_NONZEROCOUNT + 
++nzc; // number of nonzero coeffs +#endif + zero_run = 0; + } + } + } + for (i = 1; i < 64; i++) { + rc = pt_scan[i]; + z = coeff_ptr[rc]; + zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value); + // The original code incremented zero_run, clamping it to at most 15 by + // adding "(zero_run < 15)". Subtracting the sign mask of "(zero_run - 15)", + // which is -1 while zero_run < 15 and 0 afterwards, achieves the same. + zero_run -= (zero_run - 15) >> 31; + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[1]) >> 16) + x)) + >> quant_shift_ptr[1]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[1]; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs +#if CONFIG_CODE_NONZEROCOUNT + ++nzc; // number of nonzero coeffs +#endif + zero_run = 0; + } } } + xd->eobs[b_idx] = eob + 1; +#if CONFIG_CODE_NONZEROCOUNT + xd->nzcs[b_idx] = nzc; +#endif + } else { + xd->eobs[b_idx] = 0; +#if CONFIG_CODE_NONZEROCOUNT + xd->nzcs[b_idx] = 0; +#endif } - - d->eob = eob + 1; } void vp9_quantize_mby_8x8(MACROBLOCK *x) { int i; - int has_2nd_order = get_2nd_order_usage(&x->e_mbd); +#if CONFIG_CODE_NONZEROCOUNT for (i = 0; i < 16; i ++) { - x->e_mbd.block[i].eob = 0; + x->e_mbd.nzcs[i] = 0; } - x->e_mbd.block[24].eob = 0; +#endif for (i = 0; i < 16; i += 4) { - int ib = (i & 8) + ((i & 4) >> 1); - TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, &x->e_mbd.block[ib]); - if (tx_type != DCT_DCT) - assert(has_2nd_order == 0); - x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]); - } - - if (has_2nd_order) { - x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]); - } else { - vpx_memset(x->e_mbd.block[24].qcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].qcoeff[0])); - vpx_memset(x->e_mbd.block[24].dqcoeff, 0, - 16 * sizeof(x->e_mbd.block[24].dqcoeff[0])); - x->e_mbd.block[24].eob = 0; + TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, (i & 8) + ((i & 4) >> 1)); + x->quantize_b_8x8(x, i, tx_type); } } void vp9_quantize_mbuv_8x8(MACROBLOCK *x) { int i; - for (i = 16; i < 24; i ++) - x->e_mbd.block[i].eob = 0; +#if CONFIG_CODE_NONZEROCOUNT + for (i = 16; i < 24; i ++) { + x->e_mbd.nzcs[i] = 0; + } +#endif for (i = 16; i < 24; i += 4) - x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]); + x->quantize_b_8x8(x, i, DCT_DCT); } void vp9_quantize_mb_8x8(MACROBLOCK *x) { @@ -310,12 +339,14 @@ void vp9_quantize_mb_8x8(MACROBLOCK *x) { } void vp9_quantize_mby_16x16(MACROBLOCK *x) { + TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, 0); +#if CONFIG_CODE_NONZEROCOUNT int i; - - for (i = 0; i < 16; i++) - x->e_mbd.block[i].eob = 0; - x->e_mbd.block[24].eob = 0; - x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]); + for (i = 0; i < 16; i++) { + x->e_mbd.nzcs[i] = 0; + } +#endif + x->quantize_b_16x16(x, 0, tx_type); } void vp9_quantize_mb_16x16(MACROBLOCK *x) { @@ -324,107 +355,256 @@ void vp9_quantize_mb_16x16(MACROBLOCK *x) { } static void quantize(int16_t *zbin_boost_orig_ptr, - int16_t *coeff_ptr, int n_coeffs, int max_coeffs, + int16_t *coeff_ptr, int n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, uint8_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, - int *eob_ptr, const int *scan, int mul) { + uint16_t *eob_ptr, +#if CONFIG_CODE_NONZEROCOUNT + uint16_t *nzc_ptr, +#endif + const int *scan, int mul) { int i, rc, eob; int zbin; int x, y, z, sz; + int
zero_run = 0; int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; +#if CONFIG_CODE_NONZEROCOUNT + int nzc = 0; +#endif vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); eob = -1; - for (i = 0; i < max_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc] * mul; - - zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value); - zbin_boost_ptr ++; - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) - - if (x >= zbin) { - x += (round_ptr[rc!=0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc!=0]; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - zbin_boost_ptr = zbin_boost_orig_ptr; + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * mul; + + zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value); + zero_run += (zero_run < 15); + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) + >> quant_shift_ptr[rc != 0]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs + zero_run = 0; +#if CONFIG_CODE_NONZEROCOUNT + ++nzc; // number of nonzero coeffs +#endif + } } } } *eob_ptr = eob + 1; +#if CONFIG_CODE_NONZEROCOUNT + *nzc_ptr = nzc; +#endif } -void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) { - quantize(b->zrun_zbin_boost_16x16, - b->coeff, - 256, b->eob_max_offset_16x16, - b->zbin_16x16, b->round, b->quant, b->quant_shift, - d->qcoeff, - d->dqcoeff, +void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { + MACROBLOCKD *const xd = &mb->e_mbd; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; + const int *pt_scan; + + switch (tx_type) { + case ADST_DCT: + pt_scan = vp9_row_scan_16x16; + break; + case DCT_ADST: + pt_scan = vp9_col_scan_16x16; + break; + default: + pt_scan = vp9_default_zig_zag1d_16x16; + break; + } + + quantize(b->zrun_zbin_boost, + mb->coeff + 16 * b_idx, + 256, b->skip_block, + b->zbin, b->round, b->quant, b->quant_shift, + xd->qcoeff + 16 * b_idx, + xd->dqcoeff + 16 * b_idx, d->dequant, b->zbin_extra, - &d->eob, vp9_default_zig_zag1d_16x16, 1); + &xd->eobs[b_idx], +#if CONFIG_CODE_NONZEROCOUNT + &xd->nzcs[b_idx], +#endif + pt_scan, 1); } -void vp9_quantize_sby_32x32(MACROBLOCK *x) { - x->e_mbd.block[0].eob = 0; - quantize(x->block[0].zrun_zbin_boost_32x32, - x->sb_coeff_data.coeff, - 1024, x->block[0].eob_max_offset_32x32, - x->block[0].zbin_32x32, - x->block[0].round, x->block[0].quant, x->block[0].quant_shift, - x->e_mbd.sb_coeff_data.qcoeff, - x->e_mbd.sb_coeff_data.dqcoeff, - x->e_mbd.block[0].dequant, - x->block[0].zbin_extra, - &x->e_mbd.block[0].eob, +void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; + + quantize(b->zrun_zbin_boost, + mb->coeff + b_idx * 16, + 1024, b->skip_block, + b->zbin, + b->round, b->quant, b->quant_shift, + xd->qcoeff + b_idx * 16, + xd->dqcoeff + b_idx * 16, + 
d->dequant, + b->zbin_extra, + &xd->eobs[b_idx], +#if CONFIG_CODE_NONZEROCOUNT + &xd->nzcs[b_idx], +#endif vp9_default_zig_zag1d_32x32, 2); } +void vp9_quantize_sby_32x32(MACROBLOCK *x) { + vp9_regular_quantize_b_32x32(x, 0); +} + +void vp9_quantize_sby_16x16(MACROBLOCK *x) { + int n; + + for (n = 0; n < 4; n++) { + TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, + (16 * (n & 2)) + ((n & 1) * 4)); + x->quantize_b_16x16(x, n * 16, tx_type); + } +} + +void vp9_quantize_sby_8x8(MACROBLOCK *x) { + int n; + + for (n = 0; n < 16; n++) { + TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, + (4 * (n & 12)) + ((n & 3) * 2)); + x->quantize_b_8x8(x, n * 4, tx_type); + } +} + +void vp9_quantize_sby_4x4(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + int n; + + for (n = 0; n < 64; n++) { + const TX_TYPE tx_type = get_tx_type_4x4(xd, n); + if (tx_type != DCT_DCT) { + vp9_ht_quantize_b_4x4(x, n, tx_type); + } else { + x->quantize_b_4x4(x, n); + } + } +} + void vp9_quantize_sbuv_16x16(MACROBLOCK *x) { + x->quantize_b_16x16(x, 64, DCT_DCT); + x->quantize_b_16x16(x, 80, DCT_DCT); +} + +void vp9_quantize_sbuv_8x8(MACROBLOCK *x) { int i; - x->e_mbd.block[16].eob = 0; - x->e_mbd.block[20].eob = 0; - for (i = 16; i < 24; i += 4) - quantize(x->block[i].zrun_zbin_boost_16x16, - x->sb_coeff_data.coeff + 1024 + (i - 16) * 64, - 256, x->block[i].eob_max_offset_16x16, - x->block[i].zbin_16x16, - x->block[i].round, x->block[0].quant, x->block[i].quant_shift, - x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64, - x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64, - x->e_mbd.block[i].dequant, - x->block[i].zbin_extra, - &x->e_mbd.block[i].eob, - vp9_default_zig_zag1d_16x16, 1); + for (i = 64; i < 96; i += 4) + x->quantize_b_8x8(x, i, DCT_DCT); +} + +void vp9_quantize_sbuv_4x4(MACROBLOCK *x) { + int i; + + for (i = 64; i < 96; i++) + x->quantize_b_4x4(x, i); +} + +void vp9_quantize_sb64y_32x32(MACROBLOCK *x) { + int n; + + for (n = 0; n < 4; n++) + vp9_regular_quantize_b_32x32(x, n * 64); +} + +void vp9_quantize_sb64y_16x16(MACROBLOCK *x) { + int n; + + for (n = 0; n < 16; n++) { + TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, + (16 * (n & 12)) + ((n & 3) * 4)); + x->quantize_b_16x16(x, n * 16, tx_type); + } +} + +void vp9_quantize_sb64y_8x8(MACROBLOCK *x) { + int n; + + for (n = 0; n < 64; n++) { + TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, + (4 * (n & 56)) + ((n & 7) * 2)); + x->quantize_b_8x8(x, n * 4, tx_type); + } +} + +void vp9_quantize_sb64y_4x4(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + int n; + + for (n = 0; n < 256; n++) { + const TX_TYPE tx_type = get_tx_type_4x4(xd, n); + if (tx_type != DCT_DCT) { + vp9_ht_quantize_b_4x4(x, n, tx_type); + } else { + x->quantize_b_4x4(x, n); + } + } +} + +void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) { + vp9_regular_quantize_b_32x32(x, 256); + vp9_regular_quantize_b_32x32(x, 320); +} + +void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) { + int i; + + for (i = 256; i < 384; i += 16) + x->quantize_b_16x16(x, i, DCT_DCT); +} + +void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) { + int i; + + for (i = 256; i < 384; i += 4) + x->quantize_b_8x8(x, i, DCT_DCT); +} + +void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) { + int i; + + for (i = 256; i < 384; i++) + x->quantize_b_4x4(x, i); } /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of * these two C functions if corresponding optimized routine is not available. * NEON optimized version implements currently the fast quantization for pair * of blocks. 
*/ -void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2, - BLOCKD *d1, BLOCKD *d2) { - vp9_regular_quantize_b_4x4(b1, d1); - vp9_regular_quantize_b_4x4(b2, d2); +void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) { + vp9_regular_quantize_b_4x4(x, b_idx1); + vp9_regular_quantize_b_4x4(x, b_idx2); } -static void invert_quant(int16_t *quant, - uint8_t *shift, int d) { +static void invert_quant(int16_t *quant, uint8_t *shift, int d) { unsigned t; int l; t = d; @@ -438,247 +618,52 @@ static void invert_quant(int16_t *quant, void vp9_init_quantizer(VP9_COMP *cpi) { int i; int quant_val; - int Q; - static const int zbin_boost[16] = { 0, 0, 8, 10, 12, 14, 16, 20, - 24, 28, 32, 36, 40, 44, 44, 44 - }; - - static const int zbin_boost_8x8[64] = { 0, 0, 0, 8, 8, 8, 10, 12, - 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, - 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48 - }; - static const int zbin_boost_16x16[256] = { - 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - }; - static const int zbin_boost_32x32[1024] = { - 0, 0, 0, 8, 8, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, - 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, - }; - int qrounding_factor = 48; - - - for (Q = 0; Q < QINDEX_RANGE; Q++) { - int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80; - -#if CONFIG_LOSSLESS - if (cpi->oxcf.lossless) { - if (Q == 0) { - qzbin_factor = 64; - qrounding_factor = 64; - } - } -#endif + int q; + + static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12, + 14, 16, 20, 24, 28, 32, 36, 40 }; + for (q = 0; q < QINDEX_RANGE; q++) { + int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 
84 : 80; + int qrounding_factor = 48; + if (q == 0) { + qzbin_factor = 64; + qrounding_factor = 64; + } // dc values - quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q); - invert_quant(cpi->Y1quant[Q] + 0, - cpi->Y1quant_shift[Q] + 0, quant_val); - cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y1dequant[Q][0] = quant_val; - cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_y1_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; - cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_32x32[Q][0] = - ((quant_val * zbin_boost_32x32[0]) + 64) >> 7; - - - quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q); - invert_quant(cpi->Y2quant[Q] + 0, - cpi->Y2quant_shift[Q] + 0, quant_val); - cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y2dequant[Q][0] = quant_val; - cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_y2_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_y2_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; - - quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q); - invert_quant(cpi->UVquant[Q] + 0, - cpi->UVquant_shift[Q] + 0, quant_val); - cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.UVdequant[Q][0] = quant_val; - cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7; - cpi->zrun_zbin_boost_uv_8x8[Q][0] = - ((quant_val * zbin_boost_8x8[0]) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][0] = - ((quant_val * zbin_boost_16x16[0]) + 64) >> 7; + quant_val = vp9_dc_quant(q, cpi->common.y1dc_delta_q); + invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val); + cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7; + cpi->common.Y1dequant[q][0] = quant_val; + cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7; + + quant_val = vp9_dc_uv_quant(q, cpi->common.uvdc_delta_q); + invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val); + cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7; + cpi->common.UVdequant[q][0] = quant_val; + cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7; // all the 4x4 ac values =; for (i = 1; i < 16; i++) { int rc = vp9_default_zig_zag1d_4x4[i]; - quant_val = vp9_ac_yquant(Q); - invert_quant(cpi->Y1quant[Q] + rc, - cpi->Y1quant_shift[Q] + rc, quant_val); - cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y1dequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_y1[Q][i] = - ((quant_val * zbin_boost[i]) + 64) >> 7; - - quant_val = vp9_ac2quant(Q, 
cpi->common.y2ac_delta_q); - invert_quant(cpi->Y2quant[Q] + rc, - cpi->Y2quant_shift[Q] + rc, quant_val); - cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->common.Y2dequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_y2[Q][i] = - ((quant_val * zbin_boost[i]) + 64) >> 7; - - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); - invert_quant(cpi->UVquant[Q] + rc, - cpi->UVquant_shift[Q] + rc, quant_val); - cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->common.UVdequant[Q][rc] = quant_val; - cpi->zrun_zbin_boost_uv[Q][i] = - ((quant_val * zbin_boost[i]) + 64) >> 7; - } - - // 8x8 structures... only zbin seperated out for now - // This needs cleaning up for 8x8 especially if we are to add - // support for non flat Q matices - for (i = 1; i < 64; i++) { - int rc = vp9_default_zig_zag1d_8x8[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - - quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); - cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y2_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); - cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_uv_8x8[Q][i] = - ((quant_val * zbin_boost_8x8[i]) + 64) >> 7; - } - - // 16x16 structures. Same comment above applies. - for (i = 1; i < 256; i++) { - int rc = vp9_default_zig_zag1d_16x16[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - - quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q); - cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y2_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - - quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q); - cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_uv_16x16[Q][i] = - ((quant_val * zbin_boost_16x16[i]) + 64) >> 7; - } - // 32x32 structures. Same comment above applies. 
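
For reference while reading the rewritten vp9_init_quantizer() in this hunk (the removed 32x32 loop continues below): the open-coded rounding ((qzbin_factor * quant_val) + 64) >> 7 is replaced by ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7). A minimal sketch of the equivalence, assuming the usual libvpx definition of the macro, which is not itself part of this diff:

#include <assert.h>

/* Assumed definition, matching vpx's common headers. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

void round_power_of_two_sketch(void) {
  const int qzbin_factor = 84;   /* one of the factors used in this hunk */
  const int quant_val = 100;     /* arbitrary illustrative quantizer value */
  /* Adding half of 2^7 before the shift rounds to nearest rather than
   * truncating, exactly what the open-coded "+ 64) >> 7" did. */
  assert(ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7) ==
         (((qzbin_factor * quant_val) + 64) >> 7));
}
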
- for (i = 1; i < 1024; i++) { - int rc = vp9_default_zig_zag1d_32x32[i]; - - quant_val = vp9_ac_yquant(Q); - cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7; - cpi->zrun_zbin_boost_y1_32x32[Q][i] = - ((quant_val * zbin_boost_32x32[i]) + 64) >> 7; + quant_val = vp9_ac_yquant(q); + invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val); + cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7; + cpi->common.Y1dequant[q][rc] = quant_val; + cpi->zrun_zbin_boost_y1[q][i] = + ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7); + + quant_val = vp9_ac_uv_quant(q, cpi->common.uvac_delta_q); + invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val); + cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7; + cpi->common.UVdequant[q][rc] = quant_val; + cpi->zrun_zbin_boost_uv[q][i] = + ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7); } } } @@ -709,106 +694,40 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { // Y zbin_extra = (cpi->common.Y1dequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 0; i < 16; i++) { x->block[i].quant = cpi->Y1quant[QIndex]; x->block[i].quant_shift = cpi->Y1quant_shift[QIndex]; x->block[i].zbin = cpi->Y1zbin[QIndex]; - x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex]; - x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex]; - x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex]; x->block[i].round = cpi->Y1round[QIndex]; x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex]; x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex]; - x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex]; - x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex]; - x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex]; x->block[i].zbin_extra = (int16_t)zbin_extra; - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[i].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_16x16 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_32x32 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[i].eob_max_offset = 16; - x->block[i].eob_max_offset_8x8 = 64; - x->block[i].eob_max_offset_16x16 = 256; - x->block[i].eob_max_offset_32x32 = 1024; - } + // Segment skip feature. 
+ x->block[i].skip_block = + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); } // UV zbin_extra = (cpi->common.UVdequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 16; i < 24; i++) { x->block[i].quant = cpi->UVquant[QIndex]; x->block[i].quant_shift = cpi->UVquant_shift[QIndex]; x->block[i].zbin = cpi->UVzbin[QIndex]; - x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex]; - x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex]; x->block[i].round = cpi->UVround[QIndex]; x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex]; x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex]; - x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex]; - x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex]; - x->block[i].zbin_extra = (int16_t)zbin_extra; - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[i].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[i].eob_max_offset_16x16 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[i].eob_max_offset = 16; - x->block[i].eob_max_offset_8x8 = 64; - x->block[i].eob_max_offset_16x16 = 256; - } - } - - // Y2 - zbin_extra = (cpi->common.Y2dequant[QIndex][1] * - ((cpi->zbin_over_quant / 2) + - cpi->zbin_mode_boost + - x->act_zbin_adj)) >> 7; - - x->block[24].quant = cpi->Y2quant[QIndex]; - x->block[24].quant_shift = cpi->Y2quant_shift[QIndex]; - x->block[24].zbin = cpi->Y2zbin[QIndex]; - x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex]; - x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex]; - x->block[24].round = cpi->Y2round[QIndex]; - x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex]; - x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex]; - x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex]; - x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex]; - x->block[24].zbin_extra = (int16_t)zbin_extra; - - // TBD perhaps not use for Y2 - // Segment max eob offset feature. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) { - x->block[24].eob_max_offset = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - x->block[24].eob_max_offset_8x8 = - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); - } else { - x->block[24].eob_max_offset = 16; - x->block[24].eob_max_offset_8x8 = 4; + // Segment skip feature. 
+ x->block[i].skip_block = + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); } /* save this macroblock QIndex for vp9_update_zbin_extra() */ @@ -822,8 +741,7 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { // Y zbin_extra = (cpi->common.Y1dequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 0; i < 16; i++) { x->block[i].zbin_extra = (int16_t)zbin_extra; @@ -831,21 +749,12 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { // UV zbin_extra = (cpi->common.UVdequant[QIndex][1] * - (cpi->zbin_over_quant + - cpi->zbin_mode_boost + + (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; for (i = 16; i < 24; i++) { x->block[i].zbin_extra = (int16_t)zbin_extra; } - - // Y2 - zbin_extra = (cpi->common.Y2dequant[QIndex][1] * - ((cpi->zbin_over_quant / 2) + - cpi->zbin_mode_boost + - x->act_zbin_adj)) >> 7; - - x->block[24].zbin_extra = (int16_t)zbin_extra; } void vp9_frame_init_quantizer(VP9_COMP *cpi) { @@ -861,13 +770,15 @@ void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) { cm->base_qindex = Q; + // Set lossless mode + if (cm->base_qindex <= 4) + cm->base_qindex = 0; + // if any of the delta_q values are changing update flag will // have to be set. cm->y1dc_delta_q = 0; - cm->y2ac_delta_q = 0; cm->uvdc_delta_q = 0; cm->uvac_delta_q = 0; - cm->y2dc_delta_q = 0; // quantizer has to be reinitialized if any delta_q changes. // As there are not any here for now this is inactive code. diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index ac44a751c4c08ef1fc109ed7b30938a5ebfefd54..6ba6cbdd9d1cfc7bac197dba99618b3ecbf663b7 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -14,10 +14,10 @@ #include "vp9/encoder/vp9_block.h" #define prototype_quantize_block(sym) \ - void (sym)(BLOCK *b,BLOCKD *d) + void (sym)(MACROBLOCK *mb, int b_idx) #define prototype_quantize_block_pair(sym) \ - void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2) + void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2) #define prototype_quantize_mb(sym) \ void (sym)(MACROBLOCK *x) @@ -26,60 +26,41 @@ #include "x86/vp9_quantize_x86.h" #endif -#define prototype_quantize_block_type(sym) \ - void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type) -extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4); +void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type); +void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx); +void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2); +void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type); +void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type); +void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx); -#ifndef vp9_quantize_quantb_4x4 -#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_4x4); - -#ifndef vp9_quantize_quantb_4x4_pair -#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair -#endif -extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair); - -#ifndef vp9_quantize_quantb_8x8 -#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_8x8); - -#ifndef vp9_quantize_quantb_16x16 -#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_16x16); - -#ifndef vp9_quantize_quantb_2x2 -#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2 -#endif -extern 
prototype_quantize_block(vp9_quantize_quantb_2x2); - -#ifndef vp9_quantize_mb_4x4 -#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mb_4x4); +void vp9_quantize_mb_4x4(MACROBLOCK *x); void vp9_quantize_mb_8x8(MACROBLOCK *x); -#ifndef vp9_quantize_mbuv_4x4 -#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mbuv_4x4); - -#ifndef vp9_quantize_mby_4x4 -#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mby_4x4); +void vp9_quantize_mbuv_4x4(MACROBLOCK *x); +void vp9_quantize_mby_4x4(MACROBLOCK *x); -extern prototype_quantize_mb(vp9_quantize_mby_8x8); -extern prototype_quantize_mb(vp9_quantize_mbuv_8x8); +void vp9_quantize_mby_8x8(MACROBLOCK *x); +void vp9_quantize_mbuv_8x8(MACROBLOCK *x); void vp9_quantize_mb_16x16(MACROBLOCK *x); -extern prototype_quantize_block(vp9_quantize_quantb_16x16); -extern prototype_quantize_mb(vp9_quantize_mby_16x16); +void vp9_quantize_mby_16x16(MACROBLOCK *x); void vp9_quantize_sby_32x32(MACROBLOCK *x); +void vp9_quantize_sby_16x16(MACROBLOCK *x); +void vp9_quantize_sby_8x8(MACROBLOCK *x); +void vp9_quantize_sby_4x4(MACROBLOCK *x); void vp9_quantize_sbuv_16x16(MACROBLOCK *x); +void vp9_quantize_sbuv_8x8(MACROBLOCK *x); +void vp9_quantize_sbuv_4x4(MACROBLOCK *x); + +void vp9_quantize_sb64y_32x32(MACROBLOCK *x); +void vp9_quantize_sb64y_16x16(MACROBLOCK *x); +void vp9_quantize_sb64y_8x8(MACROBLOCK *x); +void vp9_quantize_sb64y_4x4(MACROBLOCK *x); +void vp9_quantize_sb64uv_32x32(MACROBLOCK *x); +void vp9_quantize_sb64uv_16x16(MACROBLOCK *x); +void vp9_quantize_sb64uv_8x8(MACROBLOCK *x); +void vp9_quantize_sb64uv_4x4(MACROBLOCK *x); struct VP9_COMP; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index f663b56c914119c059d8be0296bc7b580993b631..d26f5ec460d78cddc432be3c934b0a0a3449b5f6 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -14,8 +14,8 @@ #include <string.h> #include <limits.h> #include <assert.h> +#include <math.h> -#include "math.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_common.h" @@ -25,9 +25,10 @@ #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" -#define MIN_BPB_FACTOR 0.005 -#define MAX_BPB_FACTOR 50 +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 #ifdef MODE_STATS extern unsigned int y_modes[VP9_YMODES]; @@ -88,38 +89,33 @@ static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, // tables if and when things settle down in the experimental bitstream double vp9_convert_qindex_to_q(int qindex) { // Convert the index to a real Q value (scaled down to match old Q values) - return (double)vp9_ac_yquant(qindex) / 4.0; + return vp9_ac_yquant(qindex) / 4.0; } int vp9_gfboost_qadjust(int qindex) { - int retval; - double q; - - q = vp9_convert_qindex_to_q(qindex); - retval = (int)((0.00000828 * q * q * q) + - (-0.0055 * q * q) + - (1.32 * q) + 79.3); - return retval; + const double q = vp9_convert_qindex_to_q(qindex); + return (int)((0.00000828 * q * q * q) + + (-0.0055 * q * q) + + (1.32 * q) + 79.3); } static int kfboost_qadjust(int qindex) { - int retval; - double q; - - q = vp9_convert_qindex_to_q(qindex); - retval = (int)((0.00000973 * q * q * q) + - (-0.00613 * q * q) + - (1.316 * q) + 121.2); - return retval; + const double q = 
vp9_convert_qindex_to_q(qindex); + return (int)((0.00000973 * q * q * q) + + (-0.00613 * q * q) + + (1.316 * q) + 121.2); } -int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) { - if (frame_type == KEY_FRAME) - return (int)(4500000 / vp9_convert_qindex_to_q(qindex)); - else - return (int)(2850000 / vp9_convert_qindex_to_q(qindex)); -} +int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor) { + const double q = vp9_convert_qindex_to_q(qindex); + int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000; + + // q-based adjustment to the baseline enumerator + enumerator += (int)(enumerator * q) >> 12; + return (int)(0.5 + (enumerator * correction_factor / q)); +} void vp9_save_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; @@ -168,16 +164,20 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas); vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4); - vp9_copy(cc->hybrid_coef_probs_4x4, cm->fc.hybrid_coef_probs_4x4); vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8); - vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8); vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16); - vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16); vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED cc->interintra_prob = cm->fc.interintra_prob; #endif +#if CONFIG_CODE_NONZEROCOUNT + vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4); + vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8); + vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16); + vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32); + vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs); +#endif } void vp9_restore_coding_context(VP9_COMP *cpi) { @@ -226,89 +226,55 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4); - vp9_copy(cm->fc.hybrid_coef_probs_4x4, cc->hybrid_coef_probs_4x4); vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8); - vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8); vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16); - vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16); vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); #if CONFIG_COMP_INTERINTRA_PRED cm->fc.interintra_prob = cc->interintra_prob; #endif +#if CONFIG_CODE_NONZEROCOUNT + vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4); + vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8); + vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16); + vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32); + vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs); +#endif } - void vp9_setup_key_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - // Setup for Key frame: - vp9_default_coef_probs(& cpi->common); - vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob); - vp9_init_mbmode_probs(& cpi->common); - vp9_default_bmode_probs(cm->fc.bmode_prob); - - if(cm->last_frame_seg_map) - vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols)); - - vp9_init_mv_probs(& cpi->common); + MACROBLOCKD *xd = &cpi->mb.e_mbd; - // cpi->common.filter_level = 0; // Reset every key frame.
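
Before the remainder of the key-frame setup changes, a note on the reworked vp9_bits_per_mb() above: the rate correction factor is now folded into the per-macroblock estimate itself rather than applied by each caller. A rough stand-alone sketch of the new estimate; the q value of 16.0 below is purely illustrative, not a value taken from the codec:

#include <stdio.h>

/* Mirrors the new estimate; frame type reduced to a flag for brevity. */
static int bits_per_mb_sketch(int is_key_frame, double q,
                              double correction_factor) {
  int enumerator = is_key_frame ? 4000000 : 2500000;
  /* Small q-dependent bump to the baseline constant, as in the patch. */
  enumerator += (int)(enumerator * q) >> 12;
  return (int)(0.5 + (enumerator * correction_factor / q));
}

int main(void) {
  /* An inter frame at q = 16 with a neutral correction factor yields
   * 156860; the factor scales the estimate up or down as the encoder
   * observes over- or undershoot on previous frames. */
  printf("%d\n", bits_per_mb_sketch(0, 16.0, 1.0));
  return 0;
}
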
- cpi->common.filter_level = cpi->common.base_qindex * 3 / 8; + vp9_setup_past_independence(cm, xd); // interval before next GF cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - - cpi->common.refresh_golden_frame = TRUE; - cpi->common.refresh_alt_ref_frame = TRUE; - - vp9_init_mode_contexts(&cpi->common); - vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc)); - vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); - - vpx_memset(cm->prev_mip, 0, - (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO)); - vpx_memset(cm->mip, 0, - (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO)); - - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); - -#if CONFIG_NEW_MVREF - if (1) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - - // Defaults probabilities for encoding the MV ref id signal - vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB, - sizeof(xd->mb_mv_ref_probs)); - } -#endif + /* All buffers are implicitly updated on key frames. */ + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; } void vp9_setup_inter_frame(VP9_COMP *cpi) { - if (cpi->common.refresh_alt_ref_frame) { - vpx_memcpy(&cpi->common.fc, - &cpi->common.lfc_a, - sizeof(cpi->common.fc)); - } else { - vpx_memcpy(&cpi->common.fc, - &cpi->common.lfc, - sizeof(cpi->common.fc)); - } -} + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->mb.e_mbd; + if (cm->error_resilient_mode) + vp9_setup_past_independence(cm, xd); + assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); + vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx], + sizeof(cm->fc)); +} -static int estimate_bits_at_q(int frame_kind, int Q, int MBs, +static int estimate_bits_at_q(int frame_kind, int q, int mbs, double correction_factor) { - int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q)); + const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor)); - /* Attempt to retain reasonable accuracy without overflow. The cutoff is - * chosen such that the maximum product of Bpm and MBs fits 31 bits. The - * largest Bpm takes 20 bits. - */ - if (MBs > (1 << 11)) - return (Bpm >> BPER_MB_NORMBITS) * MBs; - else - return (Bpm * MBs) >> BPER_MB_NORMBITS; + // Attempt to retain reasonable accuracy without overflow. The cutoff is + // chosen such that the maximum product of Bpm and MBs fits 31 bits. The + // largest Bpm takes 20 bits. + return (mbs > (1 << 11)) ? 
(bpm >> BPER_MB_NORMBITS) * mbs + : (bpm * mbs) >> BPER_MB_NORMBITS; } @@ -331,7 +297,6 @@ static void calc_iframe_target_size(VP9_COMP *cpi) { } cpi->this_frame_target = target; - } @@ -347,25 +312,15 @@ static void calc_gf_params(VP9_COMP *cpi) { static void calc_pframe_target_size(VP9_COMP *cpi) { - int min_frame_target; - - min_frame_target = 0; - - min_frame_target = cpi->min_frame_bandwidth; - - if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5)) - min_frame_target = cpi->av_per_frame_bandwidth >> 5; - - - // Special alt reference frame case - if (cpi->common.refresh_alt_ref_frame) { + const int min_frame_target = MAX(cpi->min_frame_bandwidth, + cpi->av_per_frame_bandwidth >> 5); + if (cpi->refresh_alt_ref_frame) { + // Special alt reference frame case // Per frame bit target for the alt ref frame cpi->per_frame_bandwidth = cpi->twopass.gf_bits; cpi->this_frame_target = cpi->per_frame_bandwidth; - } - - // Normal frames (gf,and inter) - else { + } else { + // Normal frames (gf and inter) cpi->this_frame_target = cpi->per_frame_bandwidth; } @@ -377,16 +332,16 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { if (cpi->this_frame_target < min_frame_target) cpi->this_frame_target = min_frame_target; - if (!cpi->common.refresh_alt_ref_frame) + if (!cpi->refresh_alt_ref_frame) // Note the baseline target data rate for this inter frame. cpi->inter_frame_target = cpi->this_frame_target; // Adjust target frame size for Golden Frames: if (cpi->frames_till_gf_update_due == 0) { - // int Boost = 0; - int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; - cpi->common.refresh_golden_frame = TRUE; + cpi->refresh_golden_frame = 1; calc_gf_params(cpi); @@ -398,17 +353,17 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { // The spend on the GF is defined in the two pass code // for two pass encodes cpi->this_frame_target = cpi->per_frame_bandwidth; - } else + } else { cpi->this_frame_target = - (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) + (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0) * cpi->last_boost) / 100; + } - } - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a contructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for contstructed ARFs. - else { + } else { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer ensures that an appropriate + // number of bits will be spent if needed for constructed ARFs.
cpi->this_frame_target = 0; } @@ -418,12 +373,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { - int Q = cpi->common.base_qindex; - int correction_factor = 100; + const int q = cpi->common.base_qindex; + int correction_factor = 100; double rate_correction_factor; double adjustment_limit; - int projected_size_based_on_q = 0; + int projected_size_based_on_q = 0; // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); // __asm emms; @@ -431,35 +386,18 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { rate_correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) rate_correction_factor = cpi->gf_rate_correction_factor; else rate_correction_factor = cpi->rate_correction_factor; } - // Work out how big we would have expected the frame to be at this Q given the current correction factor. + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. // Stay in double to avoid int overflow when values are large - projected_size_based_on_q = - (int)(((.5 + rate_correction_factor * - vp9_bits_per_mb(cpi->common.frame_type, Q)) * - cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); - - // Make some allowance for cpi->zbin_over_quant - if (cpi->zbin_over_quant > 0) { - int Z = cpi->zbin_over_quant; - double Factor = 0.99; - double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX; - - while (Z > 0) { - Z--; - projected_size_based_on_q = - (int)(Factor * projected_size_based_on_q); - Factor += factor_adjustment; - - if (Factor >= 0.999) - Factor = 0.999; - } - } + projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q, + cpi->common.MBs, + rate_correction_factor); // Work out a size correction factor. // if ( cpi->this_frame_target > 0 ) @@ -505,7 +443,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) cpi->key_frame_rate_correction_factor = rate_correction_factor; else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) cpi->gf_rate_correction_factor = rate_correction_factor; else cpi->rate_correction_factor = rate_correction_factor; @@ -514,7 +452,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { - int Q = cpi->active_worst_quality; + int q = cpi->active_worst_quality; int i; int last_error = INT_MAX; @@ -522,14 +460,11 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { int bits_per_mb_at_this_q; double correction_factor; - // Reset Zbin OQ value - cpi->zbin_over_quant = 0; - // Select the appropriate correction factor based upon type of frame. 
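
The projected size above is now obtained directly from estimate_bits_at_q(), whose overflow-aware scaling (introduced earlier in this patch) is easy to miss; the hunk resumes below with the correction-factor selection. A condensed sketch of that scaling, assuming BPER_MB_NORMBITS equals 9 as in the encoder headers; that value is an assumption here, not shown in this diff:

#define BPER_MB_NORMBITS 9  /* assumed; defined in the encoder's headers */

/* bpm carries BPER_MB_NORMBITS bits of fixed-point fraction; per the
 * patch's own comment, bpm may need up to 20 bits, so for frames with
 * more than 2^11 macroblocks the product bpm * mbs could overflow 31
 * bits. The fraction is therefore shifted away before the multiply, at
 * a small cost in precision. */
int scale_bits_sketch(int bpm, int mbs) {
  return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs
                           : (bpm * mbs) >> BPER_MB_NORMBITS;
}
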
if (cpi->common.frame_type == KEY_FRAME) correction_factor = cpi->key_frame_rate_correction_factor; else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) correction_factor = cpi->gf_rate_correction_factor; else correction_factor = cpi->rate_correction_factor; @@ -544,61 +479,22 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { i = cpi->active_best_quality; do { - bits_per_mb_at_this_q = - (int)(.5 + correction_factor * - vp9_bits_per_mb(cpi->common.frame_type, i)); + bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i, + correction_factor); if (bits_per_mb_at_this_q <= target_bits_per_mb) { if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) - Q = i; + q = i; else - Q = i - 1; + q = i - 1; break; - } else + } else { last_error = bits_per_mb_at_this_q - target_bits_per_mb; - } while (++i <= cpi->active_worst_quality); - - - // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like - // the RD multiplier and zero bin size. - if (Q >= MAXQ) { - int zbin_oqmax; - - double Factor = 0.99; - double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX; - - if (cpi->common.frame_type == KEY_FRAME) - zbin_oqmax = 0; // ZBIN_OQ_MAX/16 - else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active)) - zbin_oqmax = 16; - else - zbin_oqmax = ZBIN_OQ_MAX; - - // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true. - // The effect will be highly clip dependent and may well have sudden steps. - // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero - // bin and hence decreasing the number of low magnitude non zero coefficients. 
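
The removed zbin over-quant loop continues below; what survives of vp9_regulate_q() is the plain linear search shown earlier in this hunk. A self-contained sketch of that search logic with the cpi plumbing stripped out; est_bits_per_mb() is a hypothetical stand-in for vp9_bits_per_mb(), not an encoder function:

#include <limits.h>

/* Stand-in for vp9_bits_per_mb(): any estimate that decreases as qindex
 * rises will do for illustrating the search. */
static int est_bits_per_mb(int qindex, double correction_factor) {
  return (int)(correction_factor * 2500000 / (qindex + 1));
}

int regulate_q_sketch(int active_best, int active_worst,
                      int target_bits_per_mb, double correction_factor) {
  int q = active_worst;
  int last_error = INT_MAX;
  int i = active_best;
  /* Walk from best (lowest q) toward worst quality and stop at the first
   * q whose estimated rate drops to or below the target; back up one
   * step if the previous q was actually closer to the target. */
  do {
    const int bits_per_mb = est_bits_per_mb(i, correction_factor);
    if (bits_per_mb <= target_bits_per_mb) {
      q = (target_bits_per_mb - bits_per_mb <= last_error) ? i : i - 1;
      break;
    }
    last_error = bits_per_mb - target_bits_per_mb;
  } while (++i <= active_worst);
  return q;
}
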
- while (cpi->zbin_over_quant < zbin_oqmax) { - cpi->zbin_over_quant++; - - if (cpi->zbin_over_quant > zbin_oqmax) - cpi->zbin_over_quant = zbin_oqmax; - - // Adjust bits_per_mb_at_this_q estimate - bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q); - Factor += factor_adjustment; - - if (Factor >= 0.999) - Factor = 0.999; - - if (bits_per_mb_at_this_q <= target_bits_per_mb) // Break out if we get down to the target rate - break; } + } while (++i <= cpi->active_worst_quality); - } - - return Q; + return q; } @@ -643,7 +539,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) { total_weight += prior_key_frame_weight[i]; } - av_key_frame_frequency /= total_weight; + av_key_frame_frequency /= total_weight; } return av_key_frame_frequency; @@ -671,7 +567,7 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit, *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; } else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) { + if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) { *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; } else { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index c6484817fcfffe8d23935b3bdd3cb11c5c77089a..473317605d475d96072ea5bb502d6238fc1ae619 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -16,23 +16,24 @@ #define FRAME_OVERHEAD_BITS 200 -extern void vp9_save_coding_context(VP9_COMP *cpi); -extern void vp9_restore_coding_context(VP9_COMP *cpi); +void vp9_save_coding_context(VP9_COMP *cpi); +void vp9_restore_coding_context(VP9_COMP *cpi); -extern void vp9_setup_key_frame(VP9_COMP *cpi); -extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var); -extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame); -extern void vp9_adjust_key_frame_context(VP9_COMP *cpi); -extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi, - int *frame_under_shoot_limit, - int *frame_over_shoot_limit); +void vp9_setup_key_frame(VP9_COMP *cpi); +void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var); +int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame); +void vp9_adjust_key_frame_context(VP9_COMP *cpi); +void vp9_compute_frame_size_bounds(VP9_COMP *cpi, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); // return of 0 means drop frame -extern int vp9_pick_frame_size(VP9_COMP *cpi); +int vp9_pick_frame_size(VP9_COMP *cpi); -extern double vp9_convert_qindex_to_q(int qindex); -extern int vp9_gfboost_qadjust(int qindex); -extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex); +double vp9_convert_qindex_to_q(int qindex); +int vp9_gfboost_qadjust(int qindex); +extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); void vp9_setup_inter_frame(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e8d0cc68e188392f8663c33d24c6b47f5f06b2b6..0083e8ae1a5000b5a547afe276ad2ad1b269e8f3 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -23,7 +23,6 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconintra4x4.h" #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/encoder/vp9_encodemb.h" @@ -151,21 +150,70 @@ const 
MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
 static void fill_token_costs(vp9_coeff_count *c,
                              vp9_coeff_probs *p,
                              int block_type_counts) {
-  int i, j, k;
+  int i, j, k, l;
 
   for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < COEF_BANDS; j++)
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
-          vp9_cost_tokens_skip((int *)(c[i][j][k]),
-                               p[i][j][k],
+    for (j = 0; j < REF_TYPES; j++)
+      for (k = 0; k < COEF_BANDS; k++)
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
+                               p[i][j][k][l],
                                vp9_coef_tree);
-        else
-          vp9_cost_tokens((int *)(c[i][j][k]),
-                          p[i][j][k],
-                          vp9_coef_tree);
+        }
 }
 
+#if CONFIG_CODE_NONZEROCOUNT
+static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {
+  int nzc_context, r, b, nzc, values;
+  int cost[16];
+  values = block_size * block_size + 1;
+
+  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
+    for (r = 0; r < REF_TYPES; ++r) {
+      for (b = 0; b < BLOCK_TYPES; ++b) {
+        unsigned int *nzc_costs;
+        if (block_size == 4) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
+                          vp9_nzc4x4_tree);
+          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];
+        } else if (block_size == 8) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
+                          vp9_nzc8x8_tree);
+          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];
+        } else if (block_size == 16) {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
+                          vp9_nzc16x16_tree);
+          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];
+        } else {
+          vp9_cost_tokens(cost,
+                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
+                          vp9_nzc32x32_tree);
+          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];
+        }
+
+        for (nzc = 0; nzc < values; ++nzc) {
+          int e, c, totalcost = 0;
+          c = codenzc(nzc);
+          totalcost = cost[c];
+          if ((e = vp9_extranzcbits[c])) {
+            int x = nzc - vp9_basenzcvalue[c];
+            while (e--) {
+              totalcost += vp9_cost_bit(
+                  cpi->common.fc.nzc_pcat_probs[nzc_context]
+                                               [c - NZC_TOKENS_NOEXTRA][e],
+                  ((x >> e) & 1));
+            }
+          }
+          nzc_costs[nzc] = totalcost;
+        }
+      }
+    }
+  }
+}
+#endif
 
 static int rd_iifactor[32] = {  4,   4,   3,   2,   1,   0,   0,   0,
@@ -193,19 +241,17 @@ void vp9_init_me_luts() {
 }
 
 static int compute_rd_mult(int qindex) {
-  int q;
-
-  q = vp9_dc_quant(qindex, 0);
-  return (11 * q * q) >> 6;
+  int q = vp9_dc_quant(qindex, 0);
+  return (11 * q * q) >> 2;
 }
 
-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
-  cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
-  cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
+void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
+  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
+  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 }
 
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   int q, i;
 
   vp9_clear_system_state();  // __asm emms;
@@ -214,40 +260,23 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
-
-  cpi->RDMULT = compute_rd_mult(QIndex);
-
-  // Extend rate multiplier along side quantizer zbin increases
-  if (cpi->zbin_over_quant > 0) {
-    double oq_factor;
-
-    // Experimental code using the same basic equation as used for Q above
-    // The units of cpi->zbin_over_quant are 1/128 of Q bin size
-    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
-    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
-  }
+  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  cpi->RDMULT = compute_rd_mult(qindex);
 
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     if (cpi->twopass.next_iiratio > 31)
       cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
     else
       cpi->RDMULT +=
-        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
   }
-
-  if (cpi->RDMULT < 7)
-    cpi->RDMULT = 7;
-
-  cpi->mb.errorperbit = (cpi->RDMULT / 110);
+  cpi->mb.errorperbit = cpi->RDMULT >> 6;
   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
 
   vp9_set_speed_features(cpi);
 
-  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
-  q = q << 2;
-  cpi->RDMULT = cpi->RDMULT << 4;
-
+  q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
+  q <<= 2;
   if (q < 8)
     q = 8;
@@ -279,22 +308,19 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
   }
 
   fill_token_costs(cpi->mb.token_costs[TX_4X4],
-                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES_4X4);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_4X4],
-                   cpi->common.fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
-
+                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_8X8],
-                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES_8X8);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_8X8],
-                   cpi->common.fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
-
+                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_16X16],
-                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES_16X16);
-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_16X16],
-                   cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16);
-
+                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
   fill_token_costs(cpi->mb.token_costs[TX_32X32],
-                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32);
+                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
+#if CONFIG_CODE_NONZEROCOUNT
+  fill_nzc_costs(cpi, 4);
+  fill_nzc_costs(cpi, 8);
+  fill_nzc_costs(cpi, 16);
+  fill_nzc_costs(cpi, 32);
+#endif
 
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
@@ -321,26 +347,7 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
   return error;
 }
 
-int vp9_mbblock_error_8x8_c(MACROBLOCK *mb, int dc) {
-  BLOCK *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i+=4) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-    berror = 0;
-    for (j = dc; j < 64; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-    error += berror;
-  }
-  return error;
-}
-
-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
+int vp9_mbblock_error_c(MACROBLOCK *mb) {
   BLOCK *be;
   BLOCKD *bd;
   int i, j;
@@ -350,7 +357,7 @@ int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
     be = &mb->block[i];
     bd = &mb->e_mbd.block[i];
     berror = 0;
-    for (j = dc; j < 16; j++) {
+    for (j = 0; j < 16; j++) {
       int this_diff = be->coeff[j] - bd->dqcoeff[j];
       berror += this_diff * this_diff;
     }
@@ -417,75 +424,143 @@ int vp9_uvsse(MACROBLOCK *x) {
     sse2 += sse1;
   }
   return sse2;
-
 }
 
-#if CONFIG_NEWCOEFCONTEXT
-#define PT pn
-#else
-#define PT pt
-#endif
-static int cost_coeffs(MACROBLOCK *mb,
-                       BLOCKD *b, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a,
-                       ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size) {
+static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
+                              int ib, PLANE_TYPE type,
+                              ENTROPY_CONTEXT *a,
+                              ENTROPY_CONTEXT *l,
+                              TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt;
-  const int eob = b->eob;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  const int ib = (int)(b - xd->block);
-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
-  int cost = 0, seg_eob;
+  const int eob = xd->eobs[ib];
+  int c = 0;
+  int cost = 0, pad;
+  const int *scan, *nb;
+  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
+  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs[tx_size][type][ref];
+  ENTROPY_CONTEXT a_ec, l_ec;
+  ENTROPY_CONTEXT *const a1 = a +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT *const l1 = l +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+
+#if CONFIG_CODE_NONZEROCOUNT
+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
+  unsigned int *nzc_cost;
+#else
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int *scan, *band;
-  int16_t *qcoeff_ptr = b->qcoeff;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-#if CONFIG_NEWCOEFCONTEXT
-  const int *neighbors;
-  int pn;
+  vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
+                        [ENTROPY_NODES];
 #endif
+  int seg_eob, default_eob;
+  uint8_t token_cache[1024];
 
-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  // Check for consistency of tx_size with mode info
+  if (type == PLANE_TYPE_Y_WITH_DC) {
+    assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
+  } else {
+    TX_SIZE tx_size_uv = get_uv_tx_size(xd);
+    assert(tx_size == tx_size_uv);
+  }
 
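+  // Per-transform-size setup: each case below picks the scan order (DCT vs.
+  // ADST row/column variants), points at the matching probability or
+  // nonzero-count cost tables, sets the segment eob limit, and folds the
+  // above/left entropy contexts spanned by the block into single a/l flags.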
   switch (tx_size) {
-    case TX_4X4:
-      scan = vp9_default_zig_zag1d_4x4;
-      band = vp9_coef_bands_4x4;
+    case TX_4X4: {
+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                              get_tx_type_4x4(xd, ib) : DCT_DCT;
+      a_ec = *a;
+      l_ec = *l;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
+#else
+      coef_probs = cm->fc.coef_probs_4x4;
+#endif
       seg_eob = 16;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_4x4;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_4x4;
-        }
+      if (tx_type == ADST_DCT) {
+        scan = vp9_row_scan_4x4;
+      } else if (tx_type == DCT_ADST) {
+        scan = vp9_col_scan_4x4;
+      } else {
+        scan = vp9_default_zig_zag1d_4x4;
       }
       break;
-    case TX_8X8:
-      if (type == PLANE_TYPE_Y2) {
-        scan = vp9_default_zig_zag1d_4x4;
-        band = vp9_coef_bands_4x4;
-        seg_eob = 4;
+    }
+    case TX_8X8: {
+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
+      if (tx_type == ADST_DCT) {
+        scan = vp9_row_scan_8x8;
+      } else if (tx_type == DCT_ADST) {
+        scan = vp9_col_scan_8x8;
       } else {
         scan = vp9_default_zig_zag1d_8x8;
-        band = vp9_coef_bands_8x8;
-        seg_eob = 64;
       }
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
+#else
+      coef_probs = cm->fc.coef_probs_8x8;
+#endif
+      seg_eob = 64;
       break;
-    case TX_16X16:
-      scan = vp9_default_zig_zag1d_16x16;
-      band = vp9_coef_bands_16x16;
+    }
+    case TX_16X16: {
+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      if (tx_type == ADST_DCT) {
+        scan = vp9_row_scan_16x16;
+      } else if (tx_type == DCT_ADST) {
+        scan = vp9_col_scan_16x16;
+      } else {
+        scan = vp9_default_zig_zag1d_16x16;
+      }
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
+#else
+      coef_probs = cm->fc.coef_probs_16x16;
+#endif
      seg_eob = 256;
      if (type == PLANE_TYPE_UV) {
-        const int uv_idx = ib - 16;
-        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
      }
      break;
+    }
    case TX_32X32:
      scan = vp9_default_zig_zag1d_32x32;
-      band = vp9_coef_bands_32x32;
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
+#else
+      coef_probs = cm->fc.coef_probs_32x32;
+#endif
      seg_eob = 1024;
-      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      if (type == PLANE_TYPE_UV) {
+        ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
+        a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3] +
+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3] +
+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
+      }
      break;
    default:
      abort();
@@ -493,202 +568,152 @@ static int cost_coeffs(MACROBLOCK *mb,
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-#if CONFIG_NEWCOEFCONTEXT
-  neighbors = vp9_get_coef_neighbors_handle(scan);
-  pn = pt;
-#endif
+  nb = vp9_get_coef_neighbors_handle(scan, &pad);
+  default_eob = seg_eob;
 
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+#if CONFIG_CODE_NONZEROCOUNT == 0
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
+    seg_eob = 0;
+#endif
 
-  if (tx_type != DCT_DCT) {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][PT][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
+  {
+#if CONFIG_CODE_NONZEROCOUNT
+    int nzc = 0;
#endif
-    }
-    if (c < seg_eob)
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-              [PT][DCT_EOB_TOKEN];
-  } else {
    for (; c < eob; c++) {
      int v = qcoeff_ptr[scan[c]];
      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
+#if CONFIG_CODE_NONZEROCOUNT
+      nzc += (v != 0);
+#endif
+      token_cache[c] = t;
+      cost += token_costs[get_coef_band(scan, tx_size, c)][pt][t];
      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-#if CONFIG_NEWCOEFCONTEXT
-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
-        pn = vp9_get_coef_neighbor_context(
-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
-      else
-        pn = pt;
+#if !CONFIG_CODE_NONZEROCOUNT
+      if (!c || token_cache[c - 1])
+        cost += vp9_cost_bit(coef_probs[type][ref]
+                                       [get_coef_band(scan, tx_size, c)]
+                                       [pt][0], 1);
#endif
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1,
+                                default_eob);
    }
+#if CONFIG_CODE_NONZEROCOUNT
+    cost += nzc_cost[nzc];
+#else
    if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][band[c]]
-              [PT][DCT_EOB_TOKEN];
+      cost += mb->token_costs[tx_size][type][ref]
+                             [get_coef_band(scan, tx_size, c)]
+                             [pt][DCT_EOB_TOKEN];
+#endif
  }

   // is eob first coefficient;
-  pt = (c > !type);
+  pt = (c > 0);
   *a = *l = pt;
+  if (tx_size >= TX_8X8) {
+    a[1] = l[1] = pt;
+    if (tx_size >= TX_16X16) {
+      if (type == PLANE_TYPE_UV) {
+        a1[0] = a1[1] = l1[0] = l1[1] = pt;
+      } else {
+        a[2] = a[3] = l[2] = l[3] = pt;
+        if (tx_size >= TX_32X32) {
+          a1[0] = a1[1] = a1[2] = a1[3] = pt;
+          l1[0] = l1[1] = l1[2] = l1[3] = pt;
+        }
+      }
+    }
+  }
   return cost;
 }
 
-static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) {
+static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
 
   for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, xd->block + b,
-                        (has_2nd_order ?
-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_4X4][b],
                         tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
 
-  if (has_2nd_order)
-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                        ta + vp9_block2above[TX_4X4][24],
-                        tl + vp9_block2left[TX_4X4][24],
-                        TX_4X4);
-
   return cost;
 }
 
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                int *skippable, int backup) {
+static void macro_block_yrd_4x4(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2 = xd->block + 24;
-  int d, has_2nd_order;
 
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  has_2nd_order = get_2nd_order_usage(xd);
-  // Fdct and building the 2nd order block
   vp9_transform_mby_4x4(mb);
   vp9_quantize_mby_4x4(mb);
-  d = vp9_mbblock_error(mb, has_2nd_order);
-  if (has_2nd_order)
-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_4x4(mb, has_2nd_order, backup);
-  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, has_2nd_order);
+
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_4x4(cm, mb);
+  *skippable = vp9_mby_is_skippable_4x4(xd);
 }
 
-static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) {
+static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
 
   for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, xd->block + b,
-                        (has_2nd_order ?
-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
                         ta + vp9_block2above[TX_8X8][b],
                         tl + vp9_block2left[TX_8X8][b],
                        TX_8X8);
 
-  if (has_2nd_order)
-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                        ta + vp9_block2above[TX_8X8][24],
-                        tl + vp9_block2left[TX_8X8][24],
-                        TX_8X8);
   return cost;
 }
 
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                int *skippable, int backup) {
+static void macro_block_yrd_8x8(VP9_COMMON *const cm,
+                                MACROBLOCK *mb,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2 = xd->block + 24;
-  int d, has_2nd_order;
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
   vp9_transform_mby_8x8(mb);
   vp9_quantize_mby_8x8(mb);
-  has_2nd_order = get_2nd_order_usage(xd);
-  d = vp9_mbblock_error_8x8_c(mb, has_2nd_order);
-  if (has_2nd_order)
-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_8x8(mb, has_2nd_order, backup);
-  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, has_2nd_order);
+
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_8x8(cm, mb);
+  *skippable = vp9_mby_is_skippable_8x8(xd);
 }
 
-static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {
-  int cost;
-  MACROBLOCKD *xd = &mb->e_mbd;
+static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {
  MACROBLOCKD *const xd = &mb->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
 
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
 
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-  return cost;
+  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
 }
 
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  int *skippable, int backup) {
-  int d;
-  MACROBLOCKD *xd = &mb->e_mbd;
+static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
   vp9_transform_mby_16x16(mb);
@@ -696,15 +721,13 @@ static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
   // TODO(jingning) is it possible to quickly determine whether to force
   // trailing coefficients to be zero, instead of running trellis
   // optimization in the rate-distortion optimization loop?
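+  // Run the trellis pass only when the encoder's coefficient optimization
+  // flag is set and a whole-macroblock mode (below I8X8_PRED) is being
+  // costed; otherwise the quantized coefficients are costed as-is.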
-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb);
-
-  d = vp9_mbblock_error(mb, 0);
+  if (mb->optimize &&
+      xd->mode_info_context->mbmi.mode < I8X8_PRED)
+    vp9_optimize_mby_16x16(cm, mb);
 
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_16x16(mb, backup);
-  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
+  *distortion = vp9_mbblock_error(mb) >> 2;
+  *rate = rdcost_mby_16x16(cm, mb);
+  *skippable = vp9_mby_is_skippable_16x16(xd);
 }
 
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -795,15 +818,16 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
                            txfm_cache, TX_16X16);
@@ -818,27 +842,8 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
   d[12] = p[12];
 }
 
-static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    ta = (ENTROPY_CONTEXT *) &t_above,
-    tl = (ENTROPY_CONTEXT *) &t_left;
-
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    ta = (ENTROPY_CONTEXT *) xd->above_context;
-    tl = (ENTROPY_CONTEXT *) xd->left_context;
-  }
-
-  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
-}
-
 static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                                int block_size) {
+                                int block_size, int shift) {
   int i;
   int64_t error = 0;
 
@@ -846,217 +851,293 @@ static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
     unsigned int this_diff = coeff[i] - dqcoeff[i];
     error += this_diff * this_diff;
   }
+  error >>= shift;
 
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-#define DEBUG_ERROR 0
-static void super_block_yrd_32x32(MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  int backup) {
-  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
+static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 64; b++)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_4X4][b],
+                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);
+
+  return cost;
+}
+
+static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  vp9_transform_sby_4x4(x);
+  vp9_quantize_sby_4x4(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate = rdcost_sby_4x4(cm, x);
+  *skippable = vp9_sby_is_skippable_4x4(xd);
+}
+
+static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 64; b += 4)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_8X8][b],
+                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8);
+
+  return cost;
+}
+
+static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                                int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  vp9_transform_sby_8x8(x);
+  vp9_quantize_sby_8x8(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate = rdcost_sby_8x8(cm, x);
+  *skippable = vp9_sby_is_skippable_8x8(xd);
+}
+
+static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 64; b += 16)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb[TX_16X16][b],
+                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16);
+
+  return cost;
+}
+
+static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
+  vp9_transform_sby_16x16(x);
+  vp9_quantize_sby_16x16(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
+  *rate = rdcost_sby_16x16(cm, x);
+  *skippable = vp9_sby_is_skippable_16x16(xd);
+}
+
+static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD * const xd = &x->e_mbd;
-  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
-  int16_t out[1024];
-#endif
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+}
+
+static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
 
+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
   vp9_transform_sby_32x32(x);
   vp9_quantize_sby_32x32(x);
-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
-  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
-#endif
 
-#if !CONFIG_DWTDCTHYBRID
-  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
-#else
-  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
-#endif
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
-         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
-#endif
-  *rate = rdcost_sby_32x32(x, backup);
-  *skippable = vp9_sby_is_skippable_32x32(&x->e_mbd);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);
+  *rate = rdcost_sby_32x32(cm, x);
+  *skippable = vp9_sby_is_skippable_32x32(xd);
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
                             int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
-      *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
-      *orig_left = xd->left_context;
-
-  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[n][0] = 0;
-    d[n] = 0;
-    s[n] = 1;
-  }
-
-  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
-                       dst, dst_y_stride);
-  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
-
-#if DEBUG_ERROR
-  int err[3] = { 0, 0, 0 };
-#endif
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-    int r_tmp, d_tmp, s_tmp;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-
-    xd->above_context = &t_above[TX_16X16][x_idx];
-    xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_16X16] += d_tmp;
-    r[TX_16X16][0] += r_tmp;
-    s[TX_16X16] = s[TX_16X16] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_16x16(xd);
-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-    xd->above_context = &t_above[TX_4X4][x_idx];
-    xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_4X4] += d_tmp;
-    r[TX_4X4][0] += r_tmp;
-    s[TX_4X4] = s[TX_4X4] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_4x4(xd);
-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
+  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
+  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  super_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+  super_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
 
-    xd->above_context = &t_above[TX_8X8][x_idx];
-    xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_8X8] += d_tmp;
-    r[TX_8X8][0] += r_tmp;
-    s[TX_8X8] = s[TX_8X8] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_8x8(xd);
-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-  }
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
-#endif
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_SIZE_MAX_SB - 1);
+}
+
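+// The rdcost_sb64y_* / super_block64_yrd_* pairs below mirror the 32x32
+// superblock routines above, but cover the 256 4x4 units of a 64x64 block.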
+static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 256; b++)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_4X4][b],
+                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);
-  xd->above_context = orig_above;
-  xd->left_context = orig_left;
+
+  return cost;
+}
+
+static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+  vp9_transform_sb64y_4x4(x);
+  vp9_quantize_sb64y_4x4(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate = rdcost_sb64y_4x4(cm, x);
+  *skippable = vp9_sb64y_is_skippable_4x4(xd);
+}
+
+static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 256; b += 4)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_8X8][b],
+                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8);
+
+  return cost;
+}
+
+static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  vp9_transform_sb64y_8x8(x);
+  vp9_quantize_sb64y_8x8(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate = rdcost_sb64y_8x8(cm, x);
+  *skippable = vp9_sb64y_is_skippable_8x8(xd);
+}
+
+static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 256; b += 16)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_16X16][b],
+                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16);
+
+  return cost;
+}
+
+static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                    int *rate, int *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
+  vp9_transform_sb64y_16x16(x);
+  vp9_quantize_sb64y_16x16(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
+  *rate = rdcost_sb64y_16x16(cm, x);
+  *skippable = vp9_sb64y_is_skippable_16x16(xd);
+}
+
+static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
+  int cost = 0, b;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
+
+  for (b = 0; b < 256; b += 64)
+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+                        ta + vp9_block2above_sb64[TX_32X32][b],
+                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32);
+
+  return cost;
+}
+
+static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                                    int *rate, int *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
+  vp9_transform_sb64y_32x32(x);
+  vp9_quantize_sb64y_32x32(x);
+
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);
+  *rate = rdcost_sb64y_32x32(cm, x);
+  *skippable = vp9_sb64y_is_skippable_32x32(xd);
 }
 
 static void super_block_64_yrd(VP9_COMP *cpi,
                                MACROBLOCK *x, int *rate, int *distortion,
                                int *skip,
                                int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4],
-      *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4],
-      *orig_left = xd->left_context;
-
-  for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) {
-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[n][0] = 0;
-    d[n] = 0;
-    s[n] = 1;
-  }
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-    int r_tmp, d_tmp, s_tmp;
-
-    xd->above_context = &t_above[TX_32X32][x_idx << 1];
-    xd->left_context = &t_left[TX_32X32][y_idx << 1];
-    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
-                         src + 32 * x_idx + 32 * y_idx * src_y_stride,
-                         src_y_stride,
-                         dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
-                         dst_y_stride);
-    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    r[TX_32X32][0] += r_tmp;
-    d[TX_32X32] += d_tmp;
-    s[TX_32X32] = s[TX_32X32] && s_tmp;
-  }
-
-#if DEBUG_ERROR
-  int err[3] = { 0, 0, 0 };
-#endif
-  for (n = 0; n < 16; n++) {
-    int x_idx = n & 3, y_idx = n >> 2;
-    int r_tmp, d_tmp, s_tmp;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-
-    xd->above_context = &t_above[TX_16X16][x_idx];
-    xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_16X16] += d_tmp;
-    r[TX_16X16][0] += r_tmp;
-    s[TX_16X16] = s[TX_16X16] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_16x16(xd);
-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-    xd->above_context = &t_above[TX_4X4][x_idx];
-    xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_4X4] += d_tmp;
-    r[TX_4X4][0] += r_tmp;
-    s[TX_4X4] = s[TX_4X4] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_4x4(xd);
-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
+  vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
+  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
+  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
+  super_block64_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
+  super_block64_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
 
-    xd->above_context = &t_above[TX_8X8][x_idx];
-    xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
-    d[TX_8X8] += d_tmp;
-    r[TX_8X8][0] += r_tmp;
-    s[TX_8X8] = s[TX_8X8] && s_tmp;
-#if DEBUG_ERROR
-    vp9_inverse_transform_mby_8x8(xd);
-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
-#endif
-  }
-#if DEBUG_ERROR
-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
-#endif
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_SIZE_MAX_SB - 1);
-
-  xd->above_context = orig_above;
-  xd->left_context = orig_left;
 }
 
 static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
@@ -1091,6 +1172,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
+  VP9_COMMON *const cm = &cpi->common;
   ENTROPY_CONTEXT ta = *a, tempa = *a;
   ENTROPY_CONTEXT tl = *l, templ = *l;
@@ -1105,8 +1187,9 @@
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
#if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(b);
+  b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
#endif
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
     int64_t this_rd;
     int ratey;
@@ -1129,23 +1212,24 @@
     rate = bmode_costs[mode];
#endif
 
-    vp9_intra4x4_predict(b, mode, b->predictor);
+    vp9_intra4x4_predict(xd, b, mode, b->predictor);
     vp9_subtract_b(be, b, 16);
 
     b->bmi.as_mode.first = mode;
-    tx_type = get_tx_type_4x4(xd, b);
+    tx_type = get_tx_type_4x4(xd, be - x->block);
     if (tx_type != DCT_DCT) {
-      vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-      vp9_ht_quantize_b_4x4(be, b, tx_type);
+      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+      vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
     } else {
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, b);
+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(x, be - x->block);
     }
 
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+    ratey = cost_coeffs(cm, x, b - xd->block,
+                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
     rate += ratey;
     distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
@@ -1168,9 +1252,9 @@
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);
+    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
   else
-    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);
+    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
 
   vp9_recon_b(best_predictor, b->diff,
               *(b->base_dst) + b->dst, b->dst_stride);
@@ -1179,8 +1263,7 @@
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd,
-                                         int update_contexts) {
+                                         int *Distortion, int64_t best_rd) {
   int i;
   MACROBLOCKD *const xd = &mb->e_mbd;
   int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
@@ -1191,18 +1274,13 @@
   ENTROPY_CONTEXT *ta, *tl;
   int *bmode_costs;
 
-  if (update_contexts) {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  } else {
-    vpx_memcpy(&t_above, xd->above_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES));
 
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  }
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
 
   xd->mode_info_context->mbmi.mode = B_PRED;
   bmode_costs = mb->inter_bmode_costs;
@@ -1220,7 +1298,7 @@
       bmode_costs = mb->bmode_costs[A][L];
     }
#if CONFIG_NEWBINTRAMODES
-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i);
+    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
#endif
 
     total_rd += rd_pick_intra4x4block(
@@ -1401,14 +1479,16 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                     int *bestrate, int *bestratey,
                                     int *bestdistortion) {
+  VP9_COMMON *const cm = &cpi->common;
   MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int distortion = 0, rate = 0;
   BLOCK  *be = x->block + ib;
   BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+  ENTROPY_CONTEXT_PLANES ta, tl;
+  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
 
  /*
   * The predictor buffer is a 2d buffer with a stride of 16.  Create
@@ -1430,58 +1510,76 @@
     rate = mode_costs[mode];
     b->bmi.as_mode.first = mode;
 
-    vp9_intra8x8_predict(b, mode, b->predictor);
+    vp9_intra8x8_predict(xd, b, mode, b->predictor);
 
     vp9_subtract_4b_c(be, b, 16);
 
-    assert(get_2nd_order_usage(xd) == 0);
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
       if (tx_type != DCT_DCT)
-        vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
      else
-        x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x, idx, tx_type);
 
       // compute quantization mse of 8x8 block
      distortion = vp9_block_error_c((x->block + idx)->coeff,
                                     (xd->block + idx)->dqcoeff, 64);
-      ta0 = a[vp9_block2above[TX_8X8][idx]];
-      tl0 = l[vp9_block2left[TX_8X8][idx]];
 
-      rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                           &ta0, &tl0, TX_8X8);
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
+
+      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
+                           ta0, tl0, TX_8X8);
 
       rate += rate_t;
-      ta1 = ta0;
-      tl1 = tl0;
    } else {
      static const int iblock[4] = {0, 1, 4, 5};
      TX_TYPE tx_type;
      int i;
-      ta0 = a[vp9_block2above[TX_4X4][ib]];
-      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];
-      tl0 = l[vp9_block2left[TX_4X4][ib]];
-      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
      distortion = 0;
      rate_t = 0;
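+      // Horizontally adjacent 4x4 blocks that both use DCT_DCT are
+      // transformed and quantized as one 8x4 pair (do_two); any block with
+      // an ADST component is handled individually.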
      for (i = 0; i < 4; ++i) {
+        int do_two = 0;
        b = &xd->block[ib + iblock[i]];
        be = &x->block[ib + iblock[i]];
-        tx_type = get_tx_type_4x4(xd, b);
+        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
        if (tx_type != DCT_DCT) {
-          vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
-          vp9_ht_quantize_b_4x4(be, b, tx_type);
+          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
+          vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
+        } else if (!(i & 1) &&
+                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
+          do_two = 1;
        } else {
-          x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4(be, b);
+          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4(x, ib + iblock[i]);
        }
-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16);
-        rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
-                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                              &ta0, &tl0,
+        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
+        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                              TX_4X4);
+        if (do_two) {
+          i++;
+          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
+                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
+                                TX_4X4);
+        }
      }
+      b = &xd->block[ib];
+      be = &x->block[ib];
      rate += rate_t;
    }
@@ -1491,10 +1589,10 @@
      *bestrate = rate;
      *bestratey = rate_t;
      *bestdistortion = distortion;
-      besta0 = ta0;
-      besta1 = ta1;
-      bestl0 = tl0;
-      bestl1 = tl1;
+      besta0 = *ta0;
+      besta1 = *ta1;
+      bestl0 = *tl0;
+      bestl1 = *tl1;
      best_rd = this_rd;
      *best_mode = mode;
      copy_predictor_8x8(best_predictor, b->predictor);
@@ -1563,7 +1661,80 @@
  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
+static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
+                                                  int *rate, int *rate_y,
+                                                  int *distortion,
+                                                  int *mode8x8,
+                                                  int64_t best_yrd,
+                                                  int64_t *txfm_cache) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
+  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
+  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
+  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
+  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+
+  mbmi->txfm_size = TX_4X4;
+  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
+                                         &d4x4, best_yrd);
+  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+  mbmi->txfm_size = TX_8X8;
+  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
+                                         &d8x8, best_yrd);
+  txfm_cache[ONLY_4X4] = tmp_rd_4x4;
+  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
+  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
+  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
+  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
+  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
+                               tmp_rd_4x4s : tmp_rd_8x8s;
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    if (tmp_rd_4x4s < tmp_rd_8x8s) {
+      *rate = r4x4 + cost0;
+      *rate_y = tok4x4 + cost0;
+      *distortion = d4x4;
+      mbmi->txfm_size = TX_4X4;
+      tmp_rd = tmp_rd_4x4s;
+    } else {
+      *rate = r8x8 + cost1;
+      *rate_y = tok8x8 + cost1;
+      *distortion = d8x8;
+      mbmi->txfm_size = TX_8X8;
+      tmp_rd = tmp_rd_8x8s;
+
+      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+    }
+  } else if (cm->txfm_mode == ONLY_4X4) {
+    *rate = r4x4;
+    *rate_y = tok4x4;
+    *distortion = d4x4;
+    mbmi->txfm_size = TX_4X4;
+    tmp_rd = tmp_rd_4x4;
+  } else {
+    *rate = r8x8;
+    *rate_y = tok8x8;
+    *distortion = d8x8;
+    mbmi->txfm_size = TX_8X8;
+    tmp_rd = tmp_rd_8x8;
+
+    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
+  }
+
+  return tmp_rd;
+}
+
-static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {
+static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
  int b;
  int cost = 0;
  MACROBLOCKD *xd = &mb->e_mbd;
@@ -1582,7 +1753,7 @@
  }
 
  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                        ta + vp9_block2above[TX_4X4][b],
                        tl + vp9_block2left[TX_4X4][b],
                        TX_4X4);
@@ -1597,14 +1768,14 @@ static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
  vp9_transform_mbuv_4x4(x);
  vp9_quantize_mbuv_4x4(x);
 
-  *rate = rd_cost_mbuv_4x4(x, do_ctx_backup);
+  *rate = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
  *distortion = vp9_mbuverror(x) / 4;
  *skip = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
 
  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
  int b;
  int cost = 0;
  MACROBLOCKD *xd = &mb->e_mbd;
@@ -1623,7 +1794,7 @@
  }
 
  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
                        ta + vp9_block2above[TX_8X8][b],
                        tl + vp9_block2left[TX_8X8][b],
                        TX_8X8);
@@ -1636,23 +1807,23 @@ static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
  vp9_transform_mbuv_8x8(x);
  vp9_quantize_mbuv_8x8(x);
 
-  *rate = rd_cost_mbuv_8x8(x, do_ctx_backup);
+  *rate = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
  *distortion = vp9_mbuverror(x) / 4;
  *skip = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
 
  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
+static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
  int b;
  int cost = 0;
  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
  ENTROPY_CONTEXT *ta, *tl;
 
  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 
    ta = (ENTROPY_CONTEXT *) &t_above;
    tl = (ENTROPY_CONTEXT *) &t_left;
@@ -1662,24 +1833,24 @@
  }
 
  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
+    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
                        ta + vp9_block2above[TX_8X8][b],
                        tl + vp9_block2left[TX_8X8][b], TX_16X16);
 
  return cost;
 }
 
-static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
-                                   int *distortion, int *skip,
+static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
                                   int backup) {
  MACROBLOCKD *const xd = &x->e_mbd;
 
  vp9_transform_sbuv_16x16(x);
  vp9_quantize_sbuv_16x16(x);
 
-  *rate = rd_cost_sbuv_16x16(x, backup);
-  *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
-                                  xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
+  *rate = rd_cost_sbuv_16x16(cm, x, backup);
+  *distortion = vp9_sb_block_error_c(x->coeff + 1024,
+                                     xd->dqcoeff + 1024, 512, 2);
  *skip = vp9_sbuv_is_skippable_16x16(xd);
 }
 
@@ -1691,11 +1862,11 @@ static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+  if (mbmi->txfm_size >= TX_16X16) {
+    vp9_subtract_sbuv_s_c(x->src_diff,
                          usrc, vsrc, src_uv_stride,
                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
+    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
  } else {
    int n, r = 0, d = 0;
    int skippable = 1;
@@ -1743,22 +1914,14 @@
  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x, int *rate,
+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
                                int *distortion, int *skip);
 static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                int *distortion, int fullpixel, int *skip) {
-  super_block_64_uvrd(x, rate, distortion, skip);
+  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skip, int fullpixel) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
-}
-
 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
                                    MACROBLOCK *x,
                                    int *rate,
@@ -1773,6 +1936,7 @@
  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
  int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
    int rate;
    int distortion;
@@ -1786,7 +1950,7 @@
    vp9_transform_mbuv_4x4(x);
    vp9_quantize_mbuv_4x4(x);
 
-    rate_to = rd_cost_mbuv_4x4(x, 1);
+    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
    rate = rate_to
           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
@@ -1825,6 +1989,7 @@
  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
  int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
    int rate;
    int distortion;
@@ -1838,7 +2003,7 @@ static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
    vp9_quantize_mbuv_8x8(x);
 
-    rate_to = rd_cost_mbuv_8x8(x, 1);
+    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
    rate = rate_to
           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
    distortion = vp9_mbuverror(x) / 4;
@@ -1860,7 +2025,8 @@
 }
 
 // TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMMON *const cm,
+                             MACROBLOCK *x,
                             int *rate,
                             int *distortion,
                             int *skippable) {
@@ -1870,11 +2036,11 @@
  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+  if (mbmi->txfm_size >= TX_16X16) {
+    vp9_subtract_sbuv_s_c(x->src_diff,
                          usrc, vsrc, src_uv_stride,
                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
+    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
  } else {
    int d = 0, r = 0, n, s = 1;
    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
@@ -1908,9 +2074,9 @@
      xd->above_context = t_above + x_idx;
      xd->left_context = t_left + y_idx;
      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(x, 0);
+        r += rd_cost_mbuv_4x4(cm, x, 0);
      } else {
-        r += rd_cost_mbuv_8x8(x, 0);
+        r += rd_cost_mbuv_8x8(cm, x, 0);
      }
    }
 
@@ -1923,7 +2089,48 @@
  }
 }
 
-static void super_block_64_uvrd(MACROBLOCK *x,
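+// Rate cost of the two 32x32 chroma blocks (U and V) of a 64x64 superblock.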
= n >> 1; int r_tmp, d_tmp, s_tmp; - vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff, + vp9_subtract_sbuv_s_c(x->src_diff, usrc + x_idx * 16 + y_idx * 16 * src_uv_stride, vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride, src_uv_stride, @@ -1957,7 +2169,7 @@ static void super_block_64_uvrd(MACROBLOCK *x, dst_uv_stride); xd->above_context = t_above + x_idx * 2; xd->left_context = t_left + y_idx * 2; - rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0); + rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0); r += r_tmp; d += d_tmp; s = s && s_tmp; @@ -1987,9 +2199,9 @@ static void super_block_64_uvrd(MACROBLOCK *x, xd->left_context = t_left + y_idx; d += vp9_mbuverror(x) >> 2; if (mbmi->txfm_size == TX_4X4) { - r += rd_cost_mbuv_4x4(x, 0); + r += rd_cost_mbuv_4x4(cm, x, 0); } else { - r += rd_cost_mbuv_8x8(x, 0); + r += rd_cost_mbuv_8x8(cm, x, 0); } } } @@ -2018,7 +2230,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, x->e_mbd.mode_info_context->mbmi.uv_mode = mode; vp9_build_intra_predictors_sbuv_s(&x->e_mbd); - super_block_uvrd(x, &this_rate_tokenonly, + super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, &this_distortion, &s); this_rate = this_rate_tokenonly + x->intra_uv_mode_cost[x->e_mbd.frame_type][mode]; @@ -2055,7 +2267,7 @@ static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi, x->e_mbd.mode_info_context->mbmi.uv_mode = mode; vp9_build_intra_predictors_sb64uv_s(&x->e_mbd); - super_block_64_uvrd(x, &this_rate_tokenonly, + super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly, &this_distortion, &s); this_rate = this_rate_tokenonly + x->intra_uv_mode_cost[x->e_mbd.frame_type][mode]; @@ -2082,12 +2294,8 @@ int vp9_cost_mv_ref(VP9_COMP *cpi, MACROBLOCKD *xd = &cpi->mb.e_mbd; int segment_id = xd->mode_info_context->mbmi.segment_id; - // If the mode coding is done entirely at the segment level - // we should not account for it at the per mb level in rd code. - // Note that if the segment level coding is expanded from single mode - // to multiple mode masks as per reference frame coding we will need - // to do something different here. - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + // Dont account for mode here if segment skip is enabled. + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { VP9_COMMON *pc = &cpi->common; vp9_prob p [VP9_MVREFS - 1]; @@ -2156,14 +2364,18 @@ static int labels2mode( } break; case LEFT4X4: - this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i); + this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int : + left_block_mv(xd, mic, i); if (mbmi->second_ref_frame > 0) - this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i); + this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int : + left_block_second_mv(xd, mic, i); break; case ABOVE4X4: - this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis); + this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int : + above_block_mv(mic, i, mis); if (mbmi->second_ref_frame > 0) - this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis); + this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int : + above_block_second_mv(mic, i, mis); break; case ZERO4X4: this_mv->as_int = 0; @@ -2178,11 +2390,11 @@ static int labels2mode( int_mv left_mv, left_second_mv; left_second_mv.as_int = 0; - left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int : - left_block_mv(mic, i); + left_mv.as_int = col ? 
d[-1].bmi.as_mv[0].as_int : + left_block_mv(xd, mic, i); if (mbmi->second_ref_frame > 0) - left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int : - left_block_second_mv(mic, i); + left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int : + left_block_second_mv(xd, mic, i); if (left_mv.as_int == this_mv->as_int && (mbmi->second_ref_frame <= 0 || @@ -2198,9 +2410,9 @@ static int labels2mode( #endif } - d->bmi.as_mv.first.as_int = this_mv->as_int; + d->bmi.as_mv[0].as_int = this_mv->as_int; if (mbmi->second_ref_frame > 0) - d->bmi.as_mv.second.as_int = this_second_mv->as_int; + d->bmi.as_mv[1].as_int = this_second_mv->as_int; x->partition_info->bmi[i].mode = m; x->partition_info->bmi[i].mv.as_int = this_mv->as_int; @@ -2212,7 +2424,8 @@ static int labels2mode( return cost; } -static int64_t encode_inter_mb_segment(MACROBLOCK *x, +static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, + MACROBLOCK *x, int const *labels, int which_label, int *labelyrate, @@ -2230,15 +2443,30 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x, BLOCK *be = &x->block[i]; int thisdistortion; - vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4); + vp9_build_inter_predictor(*(bd->base_pre) + bd->pre, + bd->pre_stride, + bd->predictor, 16, + &bd->bmi.as_mv[0], + &xd->scale_factor[0], + 4, 4, 0 /* no avg */, &xd->subpix); + + // TODO(debargha): Make this work properly with the + // implicit-compoundinter-weight experiment when implicit + // weighting for splitmv modes is turned on. + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + vp9_build_inter_predictor( + *(bd->base_second_pre) + bd->pre, bd->pre_stride, bd->predictor, 16, + &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4, + 1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT) /* avg */, + &xd->subpix); + } + vp9_subtract_b(be, bd, 16); - x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4(be, bd); + x->fwd_txm4x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4(x, i); thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16); *distortion += thisdistortion; - *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, + *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC, ta + vp9_block2above[TX_4X4][i], tl + vp9_block2left[TX_4X4][i], TX_4X4); } @@ -2247,7 +2475,8 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x, return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } -static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, +static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm, + MACROBLOCK *x, int const *labels, int which_label, int *labelyrate, @@ -2274,67 +2503,90 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x, int ib = vp9_i8x8_block[i]; if (labels[ib] == which_label) { + const int use_second_ref = + xd->mode_info_context->mbmi.second_ref_frame > 0; + int which_mv; int idx = (ib & 8) + ((ib & 2) << 1); BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx]; BLOCK *be = &x->block[ib], *be2 = &x->block[idx]; int thisdistortion; - vp9_build_inter_predictors4b(xd, bd, 16); - if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors4b(xd, bd, 16); + for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { + uint8_t **base_pre = which_mv ? 
bd->base_second_pre : bd->base_pre; + + // TODO(debargha): Make this work properly with the + // implicit-compoundinter-weight experiment when implicit + // weighting for splitmv modes is turned on. + vp9_build_inter_predictor( + *base_pre + bd->pre, bd->pre_stride, bd->predictor, 16, + &bd->bmi.as_mv[which_mv], &xd->scale_factor[which_mv], 8, 8, + which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), + &xd->subpix); + } + vp9_subtract_4b_c(be, bd, 16); if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { if (otherrd) { - x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32); - x->quantize_b_8x8(be2, bd2); + x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->quantize_b_8x8(x, idx, DCT_DCT); thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64); otherdist += thisdistortion; - othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC, - tacp + vp9_block2above[TX_8X8][idx], - tlcp + vp9_block2left[TX_8X8][idx], - TX_8X8); + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC, + tacp + vp9_block2above[TX_8X8][idx], + tlcp + vp9_block2left[TX_8X8][idx], + TX_8X8); + xd->mode_info_context->mbmi.txfm_size = TX_4X4; } for (j = 0; j < 4; j += 2) { bd = &xd->block[ib + iblock[j]]; be = &x->block[ib + iblock[j]]; - x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1); + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1); thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32); *distortion += thisdistortion; - *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, - ta + vp9_block2above[TX_4X4][ib + iblock[j]], - tl + vp9_block2left[TX_4X4][ib + iblock[j]], - TX_4X4); - *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC, - ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1], - tl + vp9_block2left[TX_4X4][ib + iblock[j]], - TX_4X4); + *labelyrate += + cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above[TX_4X4][ib + iblock[j]], + tl + vp9_block2left[TX_4X4][ib + iblock[j]], + TX_4X4); + *labelyrate += + cost_coeffs(cm, x, ib + iblock[j] + 1, + PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1], + tl + vp9_block2left[TX_4X4][ib + iblock[j]], + TX_4X4); } } else /* 8x8 */ { if (otherrd) { for (j = 0; j < 4; j += 2) { BLOCKD *bd = &xd->block[ib + iblock[j]]; BLOCK *be = &x->block[ib + iblock[j]]; - x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32); - x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1); + x->fwd_txm8x4(be->src_diff, be->coeff, 32); + x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1); thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32); otherdist += thisdistortion; - othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC, - tacp + vp9_block2above[TX_4X4][ib + iblock[j]], - tlcp + vp9_block2left[TX_4X4][ib + iblock[j]], - TX_4X4); - othercost += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC, - tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1], - tlcp + vp9_block2left[TX_4X4][ib + iblock[j]], - TX_4X4); + xd->mode_info_context->mbmi.txfm_size = TX_4X4; + othercost += + cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC, + tacp + vp9_block2above[TX_4X4][ib + iblock[j]], + tlcp + vp9_block2left[TX_4X4][ib + iblock[j]], + TX_4X4); + othercost += + cost_coeffs(cm, x, ib + iblock[j] + 1, + PLANE_TYPE_Y_WITH_DC, + tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1], + tlcp + vp9_block2left[TX_4X4][ib + iblock[j]], + TX_4X4); +
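/*
 * Editor's note: a minimal standalone sketch (names and types assumed, not
 * the libvpx API) of the reference-prediction loop introduced above in
 * encode_inter_mb_segment_8x8(): iterate over one or two reference frames,
 * letting the first pass overwrite the destination and the second pass
 * average into it, which is how a compound prediction is composed.
 */
#include <stdint.h>

typedef void (*build_pred_fn)(const uint8_t *ref, uint8_t *dst, int avg);

static void predict_refs(build_pred_fn build_pred, const uint8_t *ref[2],
                         uint8_t *dst, int use_second_ref) {
  int which_mv;
  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
    /* which_mv == 0 writes the prediction outright; which_mv == 1 averages
     * the second reference's prediction into the same destination. */
    build_pred(ref[which_mv], dst, which_mv);
  }
}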
xd->mode_info_context->mbmi.txfm_size = TX_8X8; } } - x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32); - x->quantize_b_8x8(be2, bd2); + x->fwd_txm8x8(be->src_diff, be2->coeff, 32); + x->quantize_b_8x8(x, idx, DCT_DCT); thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64); *distortion += thisdistortion; - *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC, + *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC, ta + vp9_block2above[TX_8X8][idx], tl + vp9_block2left[TX_8X8][idx], TX_8X8); } @@ -2373,8 +2625,7 @@ typedef struct { } BEST_SEG_INFO; -static __inline -int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { +static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { int r = 0; r |= (mv->as_mv.row >> 3) < x->mv_row_min; r |= (mv->as_mv.row >> 3) > x->mv_row_max; @@ -2487,9 +2738,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // use previous block's result as next block's MV predictor. if (segmentation == PARTITIONING_4X4 && i > 0) { - bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int; + bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int; if (i == 4 || i == 8 || i == 12) - bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int; + bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int; step_param = 2; } } @@ -2528,11 +2779,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (thissme < bestsme) { bestsme = thissme; - mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int; + mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int; } else { /* The full search result is actually worse so re-instate the * previous best vector */ - e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int; + e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int; } } } @@ -2575,11 +2826,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, continue; if (segmentation == PARTITIONING_4X4) { - this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate, + this_rd = encode_inter_mb_segment(&cpi->common, + x, labels, i, &labelyrate, &distortion, ta_s, tl_s); other_rd = this_rd; } else { - this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate, + this_rd = encode_inter_mb_segment_8x8(&cpi->common, + x, labels, i, &labelyrate, &distortion, &other_rd, ta_s, tl_s); } @@ -2595,13 +2848,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) { for (j = 0; j < 16; j++) if (labels[j] == i) - best_eobs[j] = x->e_mbd.block[j].eob; + best_eobs[j] = x->e_mbd.eobs[j]; } else { for (j = 0; j < 4; j++) { int ib = vp9_i8x8_block[j], idx = j * 4; if (labels[ib] == i) - best_eobs[idx] = x->e_mbd.block[idx].eob; + best_eobs[idx] = x->e_mbd.eobs[idx]; } } if (other_rd < best_other_rd) @@ -2734,8 +2987,9 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x, if (base_rd < txfm_cache[ONLY_4X4]) { txfm_cache[ONLY_4X4] = base_rd; } - if (base_rd + diff < txfm_cache[1]) { - txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff; + if (base_rd + diff < txfm_cache[ALLOW_8X8]) { + txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = + txfm_cache[ALLOW_32X32] = base_rd + diff; } if (diff < 0) { base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0); @@ -2749,7 +3003,7 @@ static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x, } } -static __inline void cal_step_param(int sr, int *sp) { +static INLINE void cal_step_param(int sr, int *sp) { int step = 0; if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP; @@ -2872,18 +3126,18 @@ static int 
rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 16; i++) { BLOCKD *bd = &x->e_mbd.block[i]; - bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int; + bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int; if (mbmi->second_ref_frame > 0) - bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int; - bd->eob = bsi.eobs[i]; + bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int; + x->e_mbd.eobs[i] = bsi.eobs[i]; } *returntotrate = bsi.r; *returndistortion = bsi.d; *returnyrate = bsi.segment_yrate; *skippable = bsi.txfm_size == TX_4X4 ? - vp9_mby_is_skippable_4x4(&x->e_mbd, 0) : - vp9_mby_is_skippable_8x8(&x->e_mbd, 0); + vp9_mby_is_skippable_4x4(&x->e_mbd) : + vp9_mby_is_skippable_8x8(&x->e_mbd); /* save partitions */ mbmi->txfm_size = bsi.txfm_size; @@ -3016,7 +3270,8 @@ static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], } } -static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) { +static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, + int idx, int val, int weight) { unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0; unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0; // weight is 16-bit fixed point, so this basically calculates: @@ -3145,7 +3400,9 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, // UV cost and distortion vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); - if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4) + if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 && + x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel, &uv_skippable, 1); else @@ -3160,41 +3417,104 @@ static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, int block_size, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], int frame_mdcounts[4][4], - uint8_t *y_buffer[4], - uint8_t *u_buffer[4], - uint8_t *v_buffer[4]) { - YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx]; + YV12_BUFFER_CONFIG yv12_mb[4], + struct scale_factors scale[MAX_REF_FRAMES]) { + VP9_COMMON *cm = &cpi->common; + YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + int use_prev_in_find_mv_refs, use_prev_in_find_best_ref; - y_buffer[frame_type] = yv12->y_buffer + recon_yoffset; - u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset; - v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset; + // set up scaling factors + scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; + scale[frame_type].x_offset_q4 = + (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf; + scale[frame_type].y_offset_q4 = + (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf; + + // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this + // use the UV scaling factors. 
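/*
 * Editor's note: a small sketch of the q4 offset arithmetic used in
 * setup_buffer_inter() above (struct and field names are illustrative).
 * A macroblock at column mb_col starts at pixel 16 * mb_col; mapping that
 * position through the num/den scale factor and keeping the low four bits
 * yields the subpel phase, in 1/16-pel (q4) units, at which prediction from
 * the scaled reference must start.
 */
struct scale_q4 {
  int num, den;
};

static int mb_offset_q4(int mb_idx, struct scale_q4 s) {
  return (mb_idx * 16 * s.num / s.den) & 0xf;  /* low 4 bits = subpel phase */
}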
+ setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col, + &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(xd, xd->mode_info_context, - xd->prev_mode_info_context, + use_prev_in_find_mv_refs = cm->width == cm->last_width && + cm->height == cm->last_height && + !cpi->common.error_resilient_mode; + vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context, + use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL, frame_type, mbmi->ref_mvs[frame_type], cpi->common.ref_frame_sign_bias); // Candidate refinement carried out at encoder and decoder - vp9_find_best_ref_mvs(xd, y_buffer[frame_type], + use_prev_in_find_best_ref = + scale[frame_type].x_num == scale[frame_type].x_den && + scale[frame_type].y_num == scale[frame_type].y_den && + !cm->error_resilient_mode && + !cm->frame_parallel_decoding_mode; + vp9_find_best_ref_mvs(xd, + use_prev_in_find_best_ref ? + yv12_mb[frame_type].y_buffer : NULL, yv12->y_stride, mbmi->ref_mvs[frame_type], &frame_nearest_mv[frame_type], &frame_near_mv[frame_type]); - // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. - mv_pred(cpi, x, y_buffer[frame_type], yv12->y_stride, - frame_type, block_size); + // The current implementation doesn't support scaling. + if (scale[frame_type].x_num == scale[frame_type].x_den && + scale[frame_type].y_num == scale[frame_type].y_den) + mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride, + frame_type, block_size); +} +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + // The function is implemented as piecewise approximation to the + // exact computation. 
+ // TODO(debargha): Implement the functions by interpolating from a + // look-up table + vp9_clear_system_state(); + { + double D, R; + double s2 = (double) var / n; + double s = sqrt(s2); + double x = qstep / s; + if (x > 1.0) { + double y = exp(-x / 2); + double y2 = y * y; + D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275; + R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017; + } else { + double x2 = x * x; + D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807; + if (x > 0.125) + R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x + + 0.1626989668625); + else + R = -1.442252874826093 * log(x) + 1.944647760719664; + } + if (R < 0) { + *rate = 0; + *dist = var; + } else { + *rate = (n * R * 256 + 0.5); + *dist = (n * D * s2 + 0.5); + } + } + vp9_clear_system_state(); } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, @@ -3209,9 +3529,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate_y, int *distortion_y, int *rate_uv, int *distortion_uv, int *mode_excluded, int *disable_skip, - int recon_yoffset, int mode_index, + int mode_index, + INTERPOLATIONFILTERTYPE *best_filter, int_mv frame_mv[MB_MODE_COUNT] - [MAX_REF_FRAMES]) { + [MAX_REF_FRAMES], + YV12_BUFFER_CONFIG *scaled_ref_frame, + int mb_row, int mb_col) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; @@ -3229,6 +3552,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv cur_mv[2]; int_mv ref_mv[2]; int64_t this_rd = 0; + unsigned char tmp_ybuf[64 * 64]; + unsigned char tmp_ubuf[32 * 32]; + unsigned char tmp_vbuf[32 * 32]; + int pred_exists = 0; + int interpolating_intpel_seen = 0; + int intpel_mv; + int64_t rd, best_rd = INT64_MAX; switch (this_mode) { case NEWMV: @@ -3248,6 +3578,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, 96, x->e_mbd.allow_high_precision_mv); } else { + YV12_BUFFER_CONFIG backup_yv12 = xd->pre; int bestsme = INT_MAX; int further_steps, step_param = cpi->sf.first_step; int sadpb = x->sadperbit16; @@ -3259,17 +3590,26 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
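/*
 * Editor's note: a sketch of how the Laplacian model defined in
 * model_rd_from_var_lapndz() above is driven (this wrapper name is
 * illustrative, not part of the patch). The transform pipeline scales an
 * orthogonal transform by 8, so the AC dequantizer step is divided by 8 to
 * get the effective quantizer step before calling the model; the rate comes
 * back in 1/256-bit units (n * R * 256) and the distortion in the same
 * units as the input variance.
 */
void model_rd_from_var_lapndz(int var, int n, int qstep,
                              int *rate, int *dist);  /* defined above */

static void model_block_rd(unsigned int var, int n_pixels, int dequant_ac,
                           int *rate_q8, int *dist) {
  const int qstep = dequant_ac >> 3;  /* effective quantizer step */
  model_rd_from_var_lapndz((int)var, n_pixels, qstep, rate_q8, dist);
}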
+ xd->pre = *scaled_ref_frame; + xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16; + xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8; + xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8; + } + vp9_clamp_mv_min_max(x, &ref_mv[0]); + sr = vp9_init_search_range(cpi->common.width, cpi->common.height); + // mvp_full.as_int = ref_mv[0].as_int; mvp_full.as_int = mbmi->ref_mvs[refs[0]][x->mv_best_ref_index[refs[0]]].as_int; mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; - if (mvp_full.as_int != mvp_full.as_int) { - mvp_full.as_int = mvp_full.as_int; - } // adjust search range according to sr from mv prediction step_param = MAX(step_param, sr); @@ -3297,22 +3637,22 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->nmvjointcost, x->mvcost, &dis, &sse); } - d->bmi.as_mv.first.as_int = tmp_mv.as_int; - frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int; + d->bmi.as_mv[0].as_int = tmp_mv.as_int; + frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int; // Add the new motion vector cost to our rolling cost variable *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0], x->nmvjointcost, x->mvcost, 96, xd->allow_high_precision_mv); + + // restore the predictor, if required + if (scaled_ref_frame) { + xd->pre = backup_yv12; + } } break; - case NEARESTMV: case NEARMV: - // Do not bother proceeding if the vector (from newmv, nearest or - // near) is 0,0 as this should then be coded using the zeromv mode. - for (i = 0; i < num_refs; ++i) - if (frame_mv[this_mode][refs[i]].as_int == 0) - return INT64_MAX; + case NEARESTMV: case ZEROMV: default: break; @@ -3326,11 +3666,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[i].as_int = cur_mv[i].as_int; } - if (cpi->common.mcomp_filter_type == SWITCHABLE) { - const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); - const int m = vp9_switchable_interp_map[mbmi->interp_filter]; - *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; - } /* We don't include the cost of the second reference here, because there * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other @@ -3355,36 +3690,332 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #endif + pred_exists = 0; + interpolating_intpel_seen = 0; + // Are all MVs integer pel for Y and UV + intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 && + (mbmi->mv[0].as_mv.col & 15) == 0; + if (is_comp_pred) + intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 && + (mbmi->mv[1].as_mv.col & 15) == 0; + // Search for best switchable filter by checking the variance of + // pred error irrespective of whether the filter will be used if (block_size == BLOCK_64X64) { - vp9_build_inter64x64_predictors_sb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * 
x->switchable_interp_costs[c][m]; + } + if (interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + vp9_build_inter64x64_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + var = vp9_variance64x64(*(b->base_src), b->src_stride, + xd->dst.y_buffer, xd->dst.y_stride, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride, + xd->dst.u_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride, + xd->dst.v_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + int i; + for (i = 0; i < 64; ++i) + vpx_memcpy(tmp_ybuf + i * 64, + xd->dst.y_buffer + i * xd->dst.y_stride, + sizeof(unsigned char) * 64); + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_ubuf + i * 32, + xd->dst.u_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 32); + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_vbuf + i * 32, + xd->dst.v_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 32); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; + } } else if (block_size == BLOCK_32X32) { - vp9_build_inter32x32_predictors_sb(xd, - xd->dst.y_buffer, - xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.y_stride, - xd->dst.uv_stride); + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; + } + if (interpolating_intpel_seen && intpel_mv && + 
vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + vp9_build_inter32x32_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + var = vp9_variance32x32(*(b->base_src), b->src_stride, + xd->dst.y_buffer, xd->dst.y_stride, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride, + xd->dst.u_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride, + xd->dst.v_buffer, xd->dst.uv_stride, &sse); + model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + int i; + for (i = 0; i < 32; ++i) + vpx_memcpy(tmp_ybuf + i * 64, + xd->dst.y_buffer + i * xd->dst.y_stride, + sizeof(unsigned char) * 32); + for (i = 0; i < 16; ++i) + vpx_memcpy(tmp_ubuf + i * 32, + xd->dst.u_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 16); + for (i = 0; i < 16; ++i) + vpx_memcpy(tmp_vbuf + i * 32, + xd->dst.v_buffer + i * xd->dst.uv_stride, + sizeof(unsigned char) * 16); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; + } } else { + int switchable_filter_index, newbest; + int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0; + int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0; assert(block_size == BLOCK_16X16); - vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0); - if (is_comp_pred) - vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16); -#if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred) { - vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16); + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int rs = 0; + mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; + } + if 
(interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i, + tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i); + } else { + unsigned int sse, var; + int tmp_rate_y, tmp_rate_u, tmp_rate_v; + int tmp_dist_y, tmp_dist_u, tmp_dist_v; + vp9_build_inter16x16_predictors_mb(xd, xd->predictor, + xd->predictor + 256, + xd->predictor + 320, + 16, 8, mb_row, mb_col); + var = vp9_variance16x16(*(b->base_src), b->src_stride, + xd->predictor, 16, &sse); + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3, + &tmp_rate_y, &tmp_dist_y); + var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride, + &xd->predictor[256], 8, &sse); + model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3, + &tmp_rate_u, &tmp_dist_u); + var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride, + &xd->predictor[320], 8, &sse); + model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3, + &tmp_rate_v, &tmp_dist_v); + rd = RDCOST(x->rdmult, x->rddiv, + rs + tmp_rate_y + tmp_rate_u + tmp_rate_v, + tmp_dist_y + tmp_dist_u + tmp_dist_v); + if (!interpolating_intpel_seen && intpel_mv && + vp9_is_interpolating_filter[mbmi->interp_filter]) { + tmp_rate_y_i = tmp_rate_y; + tmp_rate_u_i = tmp_rate_u; + tmp_rate_v_i = tmp_rate_v; + tmp_dist_y_i = tmp_dist_y; + tmp_dist_u_i = tmp_dist_u; + tmp_dist_v_i = tmp_dist_v; + } + } + newbest = (switchable_filter_index == 0 || rd < best_rd); + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + } + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256); + vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64); + vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64); + pred_exists = 1; + } + interpolating_intpel_seen |= + intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter]; } -#endif + } + + // Set the appropriate filter + if (cm->mcomp_filter_type != SWITCHABLE) + mbmi->interp_filter = cm->mcomp_filter_type; + else + mbmi->interp_filter = *best_filter; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + if (pred_exists) { + if (block_size == BLOCK_64X64) { + for (i = 0; i < 64; ++i) + vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride, tmp_ybuf + i * 64, + sizeof(unsigned char) * 64); + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32, + sizeof(unsigned char) * 32); + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32, + sizeof(unsigned char) * 32); + } else if (block_size == BLOCK_32X32) { + for (i = 0; i < 32; ++i) + vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride, tmp_ybuf + i * 64, + sizeof(unsigned char) * 32); + for (i = 0; i < 16; ++i) + vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32, + sizeof(unsigned char) * 16); + for (i = 0; i < 16; ++i) + vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32, + sizeof(unsigned char) * 16); + } else { + vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256); + vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned
char) * 64); + vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64); + } + } else { + // Handles the special case when a filter that is not in the + // switchable list (ex. bilinear, 6-tap) is indicated at the frame level + if (block_size == BLOCK_64X64) { + vp9_build_inter64x64_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + } else if (block_size == BLOCK_32X32) { + vp9_build_inter32x32_predictors_sb(xd, + xd->dst.y_buffer, + xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.y_stride, + xd->dst.uv_stride, + mb_row, mb_col); + } else { + vp9_build_inter16x16_predictors_mb(xd, xd->predictor, + xd->predictor + 256, + xd->predictor + 320, + 16, 8, mb_row, mb_col); + } + } + + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int m = vp9_switchable_interp_map[mbmi->interp_filter]; + *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; } if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { - unsigned int sse, var; + unsigned int var, sse; int threshold = (xd->block[0].dequant[1] * xd->block[0].dequant[1] >> 4); @@ -3404,9 +4035,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if ((int)sse < threshold) { - unsigned int q2dc = xd->block[24].dequant[0]; + unsigned int q2dc = xd->block[0].dequant[0]; /* If there is no codeable 2nd order dc - or a very small uniform pixel change change */ + or a very small uniform pixel change */ if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok @@ -3447,17 +4078,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (!(*mode_excluded)) { - if (is_comp_pred) { - *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); - } else { - *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); - } -#if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1; -#endif - } - if (!x->skip) { if (block_size == BLOCK_64X64) { int skippable_y, skippable_uv; @@ -3491,30 +4111,32 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = skippable_y && skippable_uv; } else { assert(block_size == BLOCK_16X16); - - vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256], - &xd->predictor[320], 8); - if (is_comp_pred) - vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256], - &xd->predictor[320], 8); -#if CONFIG_COMP_INTERINTRA_PRED - if (is_comp_interintra_pred) { - vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256], - &xd->predictor[320], 8); - } -#endif inter_mode_cost(cpi, x, rate2, distortion, rate_y, distortion_y, rate_uv, distortion_uv, skippable, txfm_cache); } } + + if (!(*mode_excluded)) { + if (is_comp_pred) { + *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); + } else { + *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); + } +#if CONFIG_COMP_INTERINTRA_PRED + if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1; +#endif + } + return this_rd; // if 0, this will be re-calculated by caller } static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion, int64_t *returnintra) { + static const int flag_list[4] = { 0,
VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; union b_mode_info best_bmodes[16]; @@ -3540,10 +4162,14 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_COMP_INTERINTRA_PRED int is_best_interintra = 0; int64_t best_intra16_rd = INT64_MAX; - int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED; + int best_intra16_mode = DC_PRED; +#if SEPARATE_INTERINTRA_UV + int best_intra16_uv_mode = DC_PRED; +#endif #endif int64_t best_overall_rd = INT64_MAX; INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly; int uv_intra_skippable = 0; int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0; @@ -3551,7 +4177,6 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int rate_y, UNINITIALIZED_IS_SAFE(rate_uv); int distortion_uv = INT_MAX; int64_t best_yrd = INT64_MAX; - int switchable_filter_index = 0; MB_PREDICTION_MODE uv_intra_mode; MB_PREDICTION_MODE uv_intra_mode_8x8 = 0; @@ -3561,7 +4186,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; int frame_mdcounts[4][4]; - uint8_t *y_buffer[4], *u_buffer[4], *v_buffer[4]; + YV12_BUFFER_CONFIG yv12_mb[4]; unsigned int ref_costs[MAX_REF_FRAMES]; int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1]; @@ -3569,6 +4194,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex, cpi->common.y1dc_delta_q); + struct scale_factors scale_factor[4]; + vpx_memset(mode8x8, 0, sizeof(mode8x8)); vpx_memset(&frame_mv, 0, sizeof(frame_mv)); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -3592,24 +4219,24 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->lst_fb_idx, + LAST_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + frame_mdcounts, yv12_mb, scale_factor); } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->gld_fb_idx, + GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + frame_mdcounts, yv12_mb, scale_factor); } if (cpi->ref_frame_flags & VP9_ALT_FLAG) { - setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME, - BLOCK_16X16, recon_yoffset, recon_uvoffset, + setup_buffer_inter(cpi, x, cpi->alt_fb_idx, + ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, y_buffer, u_buffer, v_buffer); + frame_mdcounts, yv12_mb, scale_factor); } *returnintra = INT64_MAX; @@ -3620,6 +4247,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, cpi->zbin_mode_boost = 0; vp9_update_zbin_extra(cpi, x); + xd->mode_info_context->mbmi.mode = DC_PRED; + rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion, &uv_intra_skippable); @@ -3638,8 +4267,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // that depend on the current prediction etc. 
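/*
 * Editor's note: an illustrative reduction (bit values assumed here, not
 * the real VP9_*_FLAG constants) of the ref_frame_flags gating used around
 * the setup_buffer_inter() calls above and again in the mode loop below:
 * each usable reference frame maps to one bit, and any mode whose
 * references are not flagged available is skipped before any
 * rate-distortion work is done for it.
 */
enum { NONE = 0, LAST = 1, GOLDEN = 2, ALTREF = 3 };
static const int kFlagList[4] = { 0, 1 << 0, 1 << 1, 1 << 2 };

static int refs_available(int ref_frame_flags, int ref, int second_ref) {
  if (ref != NONE && !(ref_frame_flags & kFlagList[ref]))
    return 0;
  return !(second_ref > NONE && !(ref_frame_flags & kFlagList[second_ref]));
}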
estimate_ref_frame_costs(cpi, segment_id, ref_costs); - for (mode_index = 0; mode_index < MAX_MODES; - mode_index += (!switchable_filter_index)) { + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int64_t this_rd = INT64_MAX; int disable_skip = 0, skippable = 0; int other_cost = 0; @@ -3649,6 +4277,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif int mode_excluded = 0; int64_t txfm_cache[NB_TXFM_MODES] = { 0 }; + YV12_BUFFER_CONFIG *scaled_ref_frame; // These variables hold the rolling total cost and distortion for this mode rate2 = 0; @@ -3664,24 +4293,38 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame; mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - if (this_mode >= NEARESTMV && this_mode <= SPLITMV) { - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index++]; - if (switchable_filter_index == VP9_SWITCHABLE_FILTERS) - switchable_filter_index = 0; - if ((cm->mcomp_filter_type != SWITCHABLE) && - (cm->mcomp_filter_type != mbmi->interp_filter)) { - mode_excluded = 1; - } - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - } + mbmi->interp_filter = cm->mcomp_filter_type; + + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); + + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); // Test best rd so far against threshold for trying this mode. if (best_rd <= cpi->rd_threshes[mode_index]) continue; + // Ensure that the references used by this mode are available. + if (mbmi->ref_frame && + !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame])) + continue; + + if (mbmi->second_ref_frame > 0 && + !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame])) + continue; + + // Only allow scaled reference frames with ZEROMV. + if (mbmi->ref_frame > 0 && + (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 || + yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) && + this_mode != ZEROMV) + continue; + if (mbmi->second_ref_frame > 0 && + (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 || + yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) && + this_mode != ZEROMV) + continue; + // current coding mode under rate-distortion optimization test loop #if CONFIG_COMP_INTERINTRA_PRED mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); @@ -3693,18 +4336,16 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) { continue; - // If the segment mode feature is enabled.... + // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && - (this_mode != - vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + (this_mode != ZEROMV)) { continue; - // Disable this drop out case if either the mode or ref frame - // segment level feature is enabled for this segment. This is to + // Disable this drop out case if the ref frame segment + // level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode.
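/*
 * Editor's note: the two ZEROMV-only drop-outs added above, reduced to a
 * predicate (parameter names assumed). A reference whose stored dimensions
 * differ from the current frame has to be scaled, and the search currently
 * supports scaling only for ZEROMV; likewise a segment with SEG_LVL_SKIP
 * active can only be coded as ZEROMV, so every other mode is skipped.
 */
static int zeromv_only_dropout(int is_zeromv, int ref_is_scaled,
                               int seg_skip_active) {
  if (!is_zeromv && (ref_is_scaled || seg_skip_active))
    return 1;  /* skip this mode before any RD work */
  return 0;
}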
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { @@ -3716,22 +4357,31 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } /* everything but intra */ + scaled_ref_frame = NULL; if (mbmi->ref_frame) { int ref = mbmi->ref_frame; + int fb; - xd->pre.y_buffer = y_buffer[ref]; - xd->pre.u_buffer = u_buffer[ref]; - xd->pre.v_buffer = v_buffer[ref]; + xd->pre = yv12_mb[ref]; best_ref_mv = mbmi->ref_mvs[ref][0]; vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts)); + + if (mbmi->ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; } if (mbmi->second_ref_frame > 0) { int ref = mbmi->second_ref_frame; - xd->second_pre.y_buffer = y_buffer[ref]; - xd->second_pre.u_buffer = u_buffer[ref]; - xd->second_pre.v_buffer = v_buffer[ref]; + xd->second_pre = yv12_mb[ref]; second_best_ref_mv = mbmi->ref_mvs[ref][0]; } @@ -3798,8 +4448,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED]; mbmi->txfm_size = TX_4X4; tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, - &distortion, best_yrd, - cpi->update_context); + &distortion, best_yrd); rate2 += rate; rate2 += intra_cost_penalty; distortion2 += distortion; @@ -3816,65 +4465,11 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } break; case I8X8_PRED: { - int cost0 = vp9_cost_bit(cm->prob_tx[0], 0); - int cost1 = vp9_cost_bit(cm->prob_tx[0], 1); - int64_t tmp_rd_4x4s, tmp_rd_8x8s; - int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd; - int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8; - mbmi->txfm_size = TX_4X4; - tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4, - &d4x4, best_yrd); - mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first; - mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first; - mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first; - mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first; - mbmi->txfm_size = TX_8X8; - tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8, - &d8x8, best_yrd); - txfm_cache[ONLY_4X4] = tmp_rd_4x4; - txfm_cache[ALLOW_8X8] = tmp_rd_8x8; - txfm_cache[ALLOW_16X16] = tmp_rd_8x8; - tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0); - tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0); - txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ? 
tmp_rd_4x4s : tmp_rd_8x8s; - if (cm->txfm_mode == TX_MODE_SELECT) { - if (tmp_rd_4x4s < tmp_rd_8x8s) { - rate = r4x4 + cost0; - rate_y = tok4x4 + cost0; - distortion = d4x4; - mbmi->txfm_size = TX_4X4; - tmp_rd = tmp_rd_4x4s; - } else { - rate = r8x8 + cost1; - rate_y = tok8x8 + cost1; - distortion = d8x8; - mbmi->txfm_size = TX_8X8; - tmp_rd = tmp_rd_8x8s; - - mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first; - mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first; - mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first; - mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first; - } - } else if (cm->txfm_mode == ONLY_4X4) { - rate = r4x4; - rate_y = tok4x4; - distortion = d4x4; - mbmi->txfm_size = TX_4X4; - tmp_rd = tmp_rd_4x4; - } else { - rate = r8x8; - rate_y = tok8x8; - distortion = d8x8; - mbmi->txfm_size = TX_8X8; - tmp_rd = tmp_rd_8x8; - - mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first; - mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first; - mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first; - mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first; - } + int64_t tmp_rd; + tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y, + &distortion, mode8x8, + best_yrd, txfm_cache); rate2 += rate; rate2 += intra_cost_penalty; distortion2 += distortion; @@ -3898,37 +4493,120 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // special case it. else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->second_ref_frame > 0; - int64_t tmp_rd, this_rd_thresh; + int64_t this_rd_thresh; + int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; + int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; + int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL; + union b_mode_info tmp_best_bmodes[16]; + MB_MODE_INFO tmp_best_mbmode; + PARTITION_INFO tmp_best_partition; + int pred_exists = 0; this_rd_thresh = - (mbmi->ref_frame == LAST_FRAME) ? + (mbmi->ref_frame == LAST_FRAME) ? cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA]; this_rd_thresh = - (mbmi->ref_frame == GOLDEN_FRAME) ? + (mbmi->ref_frame == GOLDEN_FRAME) ? 
cpi->rd_threshes[THR_NEWG] : this_rd_thresh; + xd->mode_info_context->mbmi.txfm_size = TX_4X4; + + for (switchable_filter_index = 0; + switchable_filter_index < VP9_SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int newbest; + mbmi->interp_filter = + vp9_switchable_interp[switchable_filter_index]; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, + second_ref, best_yrd, mdcounts, + &rate, &rate_y, &distortion, + &skippable, + (int)this_rd_thresh, seg_mvs, + txfm_cache); + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs + [vp9_get_pred_context(&cpi->common, xd, + PRED_SWITCHABLE_INTERP)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; + tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0); + } + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { + tmp_best_filter = mbmi->interp_filter; + tmp_best_rd = tmp_rd; + } + if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || + (mbmi->interp_filter == cm->mcomp_filter_type && + cm->mcomp_filter_type != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_skippable = skippable; + vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO)); + vpx_memcpy(&tmp_best_partition, x->partition_info, + sizeof(PARTITION_INFO)); + for (i = 0; i < 16; i++) { + tmp_best_bmodes[i] = xd->block[i].bmi; + } + pred_exists = 1; + } + } // switchable_filter_index loop + + mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ? + tmp_best_filter : cm->mcomp_filter_type); + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + if (!pred_exists) { + // Handles the special case when a filter that is not in the + // switchable list (bilinear, 6-tap) is indicated at the frame level + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, + second_ref, best_yrd, mdcounts, + &rate, &rate_y, &distortion, + &skippable, + (int)this_rd_thresh, seg_mvs, + txfm_cache); + } else { + if (cpi->common.mcomp_filter_type == SWITCHABLE) { + int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs + [vp9_get_pred_context(&cpi->common, xd, + PRED_SWITCHABLE_INTERP)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; + tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); + } + tmp_rd = tmp_best_rdu; + rate = tmp_best_rate; + rate_y = tmp_best_ratey; + distortion = tmp_best_distortion; + skippable = tmp_best_skippable; + vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO)); + vpx_memcpy(x->partition_info, &tmp_best_partition, + sizeof(PARTITION_INFO)); + for (i = 0; i < 16; i++) { + xd->block[i].bmi = xd->mode_info_context->bmi[i] = tmp_best_bmodes[i]; + } + } - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, - second_ref, best_yrd, mdcounts, - &rate, &rate_y, &distortion, - &skippable, - (int)this_rd_thresh, seg_mvs, - txfm_cache); rate2 += rate; distortion2 += distortion; if (cpi->common.mcomp_filter_type == SWITCHABLE) rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + [vp9_switchable_interp_map[mbmi->interp_filter]]; // If even the 'Y' rd value of split is higher than best so far // then don't bother looking at UV if (tmp_rd < best_yrd) { int uv_skippable; - rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, - cpi->common.full_pixel); +
vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col); + vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, + x->e_mbd.predictor, x->src.uv_stride); + rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv, + cpi->common.full_pixel, &uv_skippable, 1); rate2 += rate_uv; distortion2 += distortion_uv; skippable = skippable && uv_skippable; @@ -3969,8 +4647,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, #endif &rate_y, &distortion, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, recon_yoffset, - mode_index, frame_mv); + &mode_excluded, &disable_skip, + mode_index, &tmp_best_filter, frame_mv, + scaled_ref_frame, mb_row, mb_col); if (this_rd == INT64_MAX) continue; } @@ -3995,10 +4674,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mb_no_coeff_skip) { int mb_skip_allowed; - // Is Mb level skip allowed for this mb. - mb_skip_allowed = - !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + // Is Mb level skip allowed (i.e. not coded at segment level). + mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); if (skippable) { mbmi->mb_skip_coeff = 1; @@ -4050,8 +4727,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (this_rd < best_intra16_rd)) { best_intra16_rd = this_rd; best_intra16_mode = this_mode; +#if SEPARATE_INTERINTRA_UV best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ? uv_intra_mode_8x8 : uv_intra_mode); +#endif } #endif @@ -4061,7 +4740,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_overall_rd) { best_overall_rd = this_rd; - best_filter = mbmi->interp_filter; + best_filter = tmp_best_filter; best_mode = this_mode; #if CONFIG_COMP_INTERINTRA_PRED is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME); @@ -4175,7 +4854,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (x->skip && !mode_excluded) break; - } + } assert((cm->mcomp_filter_type == SWITCHABLE) || (cm->mcomp_filter_type == best_mbmode.interp_filter) || @@ -4204,12 +4883,11 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_thresh_mult[best_mode_index]; } - // This code force Altref,0,0 and skip for the frame that overlays a + // This code forces Altref,0,0 and skip for the frame that overlays // an altref unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame or mode is enabled for this + // segment level coding of ref frame is enabled for this // segment. if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0) && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { @@ -4224,6 +4902,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ?
1 : 0; mbmi->partitioning = 0; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); @@ -4244,10 +4924,12 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (best_mbmode.mode == SPLITMV) { for (i = 0; i < 16; i++) - xd->mode_info_context->bmi[i].as_mv.first.as_int = best_bmodes[i].as_mv.first.as_int; + xd->mode_info_context->bmi[i].as_mv[0].as_int = + best_bmodes[i].as_mv[0].as_int; if (mbmi->second_ref_frame > 0) for (i = 0; i < 16; i++) - xd->mode_info_context->bmi[i].as_mv.second.as_int = best_bmodes[i].as_mv.second.as_int; + xd->mode_info_context->bmi[i].as_mv[1].as_int = + best_bmodes[i].as_mv[1].as_int; vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO)); @@ -4265,7 +4947,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) - best_txfm_diff[i] = INT_MIN; + best_txfm_diff[i] = 0; else best_txfm_diff[i] = best_rd - best_txfm_rd[i]; } @@ -4274,6 +4956,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } end: + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index], best_mode_index, &best_partition, &mbmi->ref_mvs[mbmi->ref_frame][0], @@ -4291,22 +4975,29 @@ void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, int rate_y_tokenonly = 0, rate_uv_tokenonly; int dist_y = 0, dist_uv; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES]; + int64_t txfm_cache[NB_TXFM_MODES], err; + int i; - rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, txfm_cache); + xd->mode_info_context->mbmi.mode = DC_PRED; + err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, txfm_cache); rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip); + &dist_uv, &uv_skip); if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); *returndist = dist_y + (dist_uv >> 2); + memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0, + sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff)); } else { *returnrate = rate_y + rate_uv; if (cpi->common.mb_no_coeff_skip) *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist_y + (dist_uv >> 2); + for (i = 0; i < NB_TXFM_MODES; i++) { + x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i]; + } } } @@ -4319,22 +5010,29 @@ void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, int rate_y_tokenonly = 0, rate_uv_tokenonly; int dist_y = 0, dist_uv; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES]; + int64_t txfm_cache[NB_TXFM_MODES], err; + int i; - rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, txfm_cache); + xd->mode_info_context->mbmi.mode = DC_PRED; + err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, txfm_cache); rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip); + &dist_uv, &uv_skip); if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); *returndist = dist_y + (dist_uv >> 2); + 
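/*
 * Editor's note: a sketch of the per-transform-mode bookkeeping added above
 * (the mode count is an assumption for the sketch). txfm_cache[i] holds the
 * best RD cost reachable under transform mode i; recording err -
 * txfm_cache[i] against the cost err of the mode actually chosen tells the
 * caller how much each alternative transform mode would have saved, and an
 * all-skip block zeroes the array since the transform size is then moot.
 */
#define N_TXFM_MODES 5  /* assumed: ONLY_4X4 ... TX_MODE_SELECT */

static void record_txfm_rd_diff(long long err,
                                const long long cache[N_TXFM_MODES],
                                long long diff[N_TXFM_MODES]) {
  int i;
  for (i = 0; i < N_TXFM_MODES; i++)
    diff[i] = err - cache[i];  /* > 0: transform mode i was cheaper */
}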
memset(x->sb64_context.txfm_rd_diff, 0, + sizeof(x->sb64_context.txfm_rd_diff)); } else { *returnrate = rate_y + rate_uv; if (cm->mb_no_coeff_skip) *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); *returndist = dist_y + (dist_uv >> 2); + for (i = 0; i < NB_TXFM_MODES; i++) { + x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i]; + } } } @@ -4356,77 +5054,96 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, int mode16x16; int mode8x8[4]; int dist; - int modeuv, uv_intra_skippable, uv_intra_skippable_8x8; + int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8; int y_intra16x16_skippable = 0; - int64_t txfm_cache[NB_TXFM_MODES]; - TX_SIZE txfm_size_16x16; + int64_t txfm_cache[2][NB_TXFM_MODES]; + TX_SIZE txfm_size_16x16, txfm_size_8x8; int i; mbmi->ref_frame = INTRA_FRAME; + mbmi->mode = DC_PRED; rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv, &uv_intra_skippable); modeuv = mbmi->uv_mode; if (cpi->common.txfm_mode != ONLY_4X4) { rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly, &distuv8x8, &uv_intra_skippable_8x8); + modeuv8x8 = mbmi->uv_mode; } else { uv_intra_skippable_8x8 = uv_intra_skippable; rateuv8x8 = rateuv; distuv8x8 = distuv; rateuv8x8_tokenonly = rateuv_tokenonly; + modeuv8x8 = modeuv; } // current macroblock under rate-distortion optimization test loop error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16, - &y_intra16x16_skippable, txfm_cache); + &y_intra16x16_skippable, + txfm_cache[1]); mode16x16 = mbmi->mode; txfm_size_16x16 = mbmi->txfm_size; + if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable && + ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) || + (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) { + error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0); + rate16x16 -= rate16x16_tokenonly; + } + for (i = 0; i < NB_TXFM_MODES; i++) { + txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] + + txfm_cache[1][i]; + } - // FIXME(rbultje) support transform-size selection - mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? 
TX_4X4 : TX_8X8; - error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly, - &dist8x8, error16x16); - mode8x8[0]= xd->mode_info_context->bmi[0].as_mode.first; - mode8x8[1]= xd->mode_info_context->bmi[2].as_mode.first; - mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first; - mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first; + error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8, + &rate8x8_tokenonly, + &dist8x8, mode8x8, + error16x16, txfm_cache[1]); + txfm_size_8x8 = mbmi->txfm_size; + for (i = 0; i < NB_TXFM_MODES; i++) { + int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i]; + if (tmp_rd < txfm_cache[0][i]) + txfm_cache[0][i] = tmp_rd; + } + mbmi->txfm_size = TX_4X4; error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, - &dist4x4, error16x16, - cpi->update_context); + &dist4x4, error16x16); + for (i = 0; i < NB_TXFM_MODES; i++) { + if (error4x4 < txfm_cache[0][i]) + txfm_cache[0][i] = error4x4; + } mbmi->mb_skip_coeff = 0; - if (cpi->common.mb_no_coeff_skip && - y_intra16x16_skippable && uv_intra_skippable_8x8) { + if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable && + ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) || + (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) { mbmi->mb_skip_coeff = 1; mbmi->mode = mode16x16; - mbmi->uv_mode = modeuv; - rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); - dist = dist16x16 + (distuv8x8 >> 2); + mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8; + rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); + dist = dist16x16; + if (cm->txfm_mode == ONLY_4X4) { + rate += rateuv - rateuv_tokenonly; + dist += (distuv >> 2); + } else { + rate += rateuv8x8 - rateuv8x8_tokenonly; + dist += (distuv8x8 >> 2); + } mbmi->txfm_size = txfm_size_16x16; - memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0, - sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff)); } else if (error8x8 > error16x16) { if (error4x4 < error16x16) { rate = rateuv + rate4x4; mbmi->mode = B_PRED; mbmi->txfm_size = TX_4X4; dist = dist4x4 + (distuv >> 2); - memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0, - sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff)); } else { mbmi->txfm_size = txfm_size_16x16; mbmi->mode = mode16x16; rate = rate16x16 + rateuv8x8; dist = dist16x16 + (distuv8x8 >> 2); - for (i = 0; i < NB_TXFM_MODES; i++) { - x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] = - error16x16 - txfm_cache[i]; - } } if (cpi->common.mb_no_coeff_skip) rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); @@ -4436,28 +5153,28 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mode = B_PRED; mbmi->txfm_size = TX_4X4; dist = dist4x4 + (distuv >> 2); - memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0, - sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff)); } else { - // FIXME(rbultje) support transform-size selection mbmi->mode = I8X8_PRED; - mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? 
TX_4X4 : TX_8X8; + mbmi->txfm_size = txfm_size_8x8; set_i8x8_block_modes(x, mode8x8); rate = rate8x8 + rateuv; dist = dist8x8 + (distuv >> 2); - memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0, - sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff)); } if (cpi->common.mb_no_coeff_skip) rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); } + for (i = 0; i < NB_TXFM_MODES; i++) { + x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] = + txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i]; + } + *returnrate = rate; *returndist = dist; } static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion, int block_size) { @@ -4471,13 +5188,13 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; int frame_mdcounts[4][4]; - uint8_t *y_buffer[4]; - uint8_t *u_buffer[4]; - uint8_t *v_buffer[4]; + YV12_BUFFER_CONFIG yv12_mb[4]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; - int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, - cpi->common.alt_fb_idx }; + int idx_list[4] = {0, + cpi->lst_fb_idx, + cpi->gld_fb_idx, + cpi->alt_fb_idx}; int mdcounts[4]; int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; int saddone = 0; @@ -4492,20 +5209,23 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_COMP_INTERINTRA_PRED int is_best_interintra = 0; int64_t best_intra16_rd = INT64_MAX; - int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED; + int best_intra16_mode = DC_PRED; +#if SEPARATE_INTERINTRA_UV + int best_intra16_uv_mode = DC_PRED; +#endif #endif int64_t best_overall_rd = INT64_MAX; INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0, rate_uv_tokenonly_8x8 = 0; int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0; MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV; - int switchable_filter_index = 0; int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0; int dist_uv_16x16 = 0, uv_skip_16x16 = 0; MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV; + struct scale_factors scale_factor[4]; - x->skip = 0; xd->mode_info_context->mbmi.segment_id = segment_id; estimate_ref_frame_costs(cpi, segment_id, ref_costs); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -4518,9 +5238,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV], + mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV], frame_mdcounts, - y_buffer, u_buffer, v_buffer); + yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -4570,8 +5290,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - for (mode_index = 0; mode_index < MAX_MODES; - mode_index += (!switchable_filter_index)) { + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; int disable_skip = 0; @@ -4588,10 +5307,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Test best 
rd so far against threshold for trying this mode. if (best_rd <= cpi->rd_threshes[mode_index] || cpi->rd_threshes[mode_index] == INT_MAX) { - switchable_filter_index = 0; continue; } + x->skip = 0; this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame; if (!(ref_frame == INTRA_FRAME || @@ -4600,6 +5319,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } mbmi->ref_frame = ref_frame; mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); comp_pred = mbmi->second_ref_frame > INTRA_FRAME; mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; @@ -4607,19 +5328,11 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1); #endif + // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. - if (this_mode >= NEARESTMV && this_mode <= SPLITMV) { - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index++]; - if (switchable_filter_index == VP9_SWITCHABLE_FILTERS) - switchable_filter_index = 0; - if ((cm->mcomp_filter_type != SWITCHABLE) && - (cm->mcomp_filter_type != mbmi->interp_filter)) { - mode_excluded = 1; - } - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - } + mbmi->interp_filter = cm->mcomp_filter_type; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); // if (!(cpi->ref_frame_flags & flag_list[ref_frame])) // continue; @@ -4640,10 +5353,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->ref_frame_flags & flag_list[second_ref])) continue; mbmi->second_ref_frame = second_ref; + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); - xd->second_pre.y_buffer = y_buffer[second_ref]; - xd->second_pre.u_buffer = u_buffer[second_ref]; - xd->second_pre.v_buffer = v_buffer[second_ref]; + xd->second_pre = yv12_mb[second_ref]; mode_excluded = mode_excluded ? mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; @@ -4661,9 +5374,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - xd->pre.y_buffer = y_buffer[ref_frame]; - xd->pre.u_buffer = u_buffer[ref_frame]; - xd->pre.v_buffer = v_buffer[ref_frame]; + xd->pre = yv12_mb[ref_frame]; vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts)); // If the segment reference frame feature is enabled.... @@ -4671,16 +5382,15 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && !vp9_check_segref(xd, segment_id, ref_frame)) { continue; - // If the segment mode feature is enabled.... + // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && - (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) { + } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + (this_mode != ZEROMV)) { continue; - // Disable this drop out case if either the mode or ref frame + // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. 
- } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) { + } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative @@ -4722,6 +5432,20 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv; distortion2 = distortion_y + distortion_uv; } else { + YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; + int fb; + + if (mbmi->ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + #if CONFIG_COMP_INTERINTRA_PRED if (mbmi->second_ref_frame == INTRA_FRAME) { if (best_intra16_mode == DC_PRED - 1) continue; @@ -4742,8 +5466,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, #endif &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, recon_yoffset, - mode_index, frame_mv); + &mode_excluded, &disable_skip, + mode_index, &tmp_best_filter, frame_mv, + scaled_ref_frame, mb_row, mb_col); if (this_rd == INT64_MAX) continue; } @@ -4769,10 +5494,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mb_no_coeff_skip) { int mb_skip_allowed; - // Is Mb level skip allowed for this mb. - mb_skip_allowed = - !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + // Is Mb level skip allowed (i.e. not coded at segment level). + mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); if (skippable) { // Back out the coefficient coding costs @@ -4821,8 +5544,10 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, (this_rd < best_intra16_rd)) { best_intra16_rd = this_rd; best_intra16_mode = this_mode; +#if SEPARATE_INTERINTRA_UV best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ? mode_uv_8x8 : mode_uv_4x4); +#endif } #endif @@ -4832,7 +5557,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_overall_rd) { best_overall_rd = this_rd; - best_filter = mbmi->interp_filter; + best_filter = tmp_best_filter; best_mode = this_mode; #if CONFIG_COMP_INTERINTRA_PRED is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME); @@ -4956,10 +5681,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // This code forces Altref,0,0 and skip for the frame that overlays a // an alrtef unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame or mode is enabled for this - // segment. + // segment level coding of ref frame is enabled for this segment. if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) && cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0) && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) { @@ -4971,7 +5694,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; mbmi->partitioning = 0; mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ? 
- TX_16X16 : cm->txfm_mode; + TX_32X32 : cm->txfm_mode; vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); @@ -4991,7 +5714,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) - best_txfm_diff[i] = INT_MIN; + best_txfm_diff[i] = 0; else best_txfm_diff[i] = best_rd - best_txfm_rd[i]; } @@ -5000,6 +5723,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } end: + set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame, + scale_factor); { PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ? &x->sb32_context[xd->sb_index] : @@ -5015,24 +5740,23 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion) { - return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset, + return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, returnrate, returndistortion, BLOCK_32X32); } int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col, int *returnrate, int *returndistortion) { - return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset, + return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, returnrate, returndistortion, BLOCK_64X64); } void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, - int recon_uvoffset, + int mb_row, int mb_col, int *totalrate, int *totaldist) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; @@ -5050,7 +5774,7 @@ void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, { int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled; - rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate, &distortion, &intra_error); /* restore cpi->zbin_mode_boost_enabled */ diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 8ee2c0bf9204f12ab05f7a001d6a9c311c16b5db..d1b4777171505d6ab5ba59dde14cc11f05b58e10 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -15,34 +15,34 @@ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) -extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue); +void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); -extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex); +void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); -extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d); +void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, + int *r, int *d); -extern void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d); +void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, + int *r, int *d); -extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d); +void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, + int *r, int *d); -extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, - int *r, int *d); +void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, + int mb_row, int mb_col, + int *r, int *d); -extern int64_t 
vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, - int *r, int *d); +int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x, + int mb_row, int mb_col, + int *r, int *d); -extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, - int ref_yoffset, int ref_uvoffset, - int *r, int *d); +int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x, + int mb_row, int mb_col, + int *r, int *d); -extern void vp9_init_me_luts(); +void vp9_init_me_luts(); -extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, - MB_PREDICTION_MODE mb, int_mv *mv); +void vp9_set_mbmode_and_mvs(MACROBLOCK *x, + MB_PREDICTION_MODE mb, int_mv *mv); #endif // VP9_ENCODER_VP9_RDOPT_H_ diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index 84121f79c39f4b1d1ac1f905aba41a462c261e97..96d993863e2095b29c01ad4dd48c7c690fc84582 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -13,12 +13,13 @@ #include "vp9/common/vp9_sadmxn.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "./vp9_rtcd.h" unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64); } @@ -26,7 +27,7 @@ unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32); } @@ -34,7 +35,7 @@ unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16); } @@ -42,7 +43,7 @@ unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); } @@ -51,7 +52,7 @@ unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); } @@ -59,7 +60,7 @@ unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); } @@ -68,7 +69,7 @@ unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - int max_sad) { + unsigned int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); } @@ -77,12 +78,12 @@ void vp9_sad64x64x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride, + 0x7fffffff); } void vp9_sad32x32x3_c(const uint8_t *src_ptr, @@ -90,74 +91,74 @@ void vp9_sad32x32x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, 
int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + unsigned int *sad_array) { + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + 
ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x16x3_c(const uint8_t *src_ptr, @@ -165,43 +166,43 @@ void vp9_sad16x16x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad16x8x3_c(const uint8_t *src_ptr, @@ -209,43 +210,43 @@ void vp9_sad16x8x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + 
ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x8x3_c(const uint8_t *src_ptr, @@ -253,43 +254,43 @@ void vp9_sad8x8x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = 
vp9_sad8x8(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad8x16x3_c(const uint8_t *src_ptr, @@ -297,43 +298,43 @@ void vp9_sad8x16x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad4x4x3_c(const uint8_t *src_ptr, @@ -341,204 +342,146 @@ void vp9_sad4x4x3_c(const uint8_t *src_ptr, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride, - 
ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr, ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, 0x7fffffff); } void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint16_t *sad_array) { - sad_array[0] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + uint32_t *sad_array) { + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr, ref_stride, + 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 1, ref_stride, + 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 2, ref_stride, + 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 3, ref_stride, + 0x7fffffff); + sad_array[4] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 4, ref_stride, + 0x7fffffff); + sad_array[5] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 5, ref_stride, + 0x7fffffff); + sad_array[6] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 6, ref_stride, + 0x7fffffff); + sad_array[7] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr + 7, ref_stride, + 0x7fffffff); } void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad64x64(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad32x32(src_ptr, 
src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad32x32(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad16x16(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad16x8(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad8x8(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + sad_array[0] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad8x16(src_ptr, src_stride, + 
ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad8x16(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -/* Copy 2 macroblocks to a buffer */ -void vp9_copy32xn_c(uint8_t *src_ptr, - int src_stride, - uint8_t *dst_ptr, - int dst_stride, - int height) { - int r; - - for (r = 0; r < height; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - dst_ptr[8] = src_ptr[8]; - dst_ptr[9] = src_ptr[9]; - dst_ptr[10] = src_ptr[10]; - dst_ptr[11] = src_ptr[11]; - dst_ptr[12] = src_ptr[12]; - dst_ptr[13] = src_ptr[13]; - dst_ptr[14] = src_ptr[14]; - dst_ptr[15] = src_ptr[15]; - dst_ptr[16] = src_ptr[16]; - dst_ptr[17] = src_ptr[17]; - dst_ptr[18] = src_ptr[18]; - dst_ptr[19] = src_ptr[19]; - dst_ptr[20] = src_ptr[20]; - dst_ptr[21] = src_ptr[21]; - dst_ptr[22] = src_ptr[22]; - dst_ptr[23] = src_ptr[23]; - dst_ptr[24] = src_ptr[24]; - dst_ptr[25] = src_ptr[25]; - dst_ptr[26] = src_ptr[26]; - dst_ptr[27] = src_ptr[27]; - dst_ptr[28] = src_ptr[28]; - dst_ptr[29] = src_ptr[29]; - dst_ptr[30] = src_ptr[30]; - dst_ptr[31] = src_ptr[31]; -#else - ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0]; - ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1]; - ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2]; - ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3]; - ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4]; - ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5]; - ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6]; - ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7]; -#endif - src_ptr += src_stride; - dst_ptr += dst_stride; - - } + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } diff --git a/vp9/encoder/vp9_satd_c.c b/vp9/encoder/vp9_satd_c.c deleted file mode 100644 index 212c2243d6db385acc37e40d35209e485832ad3f..0000000000000000000000000000000000000000 --- a/vp9/encoder/vp9_satd_c.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <stdlib.h> -#include "vpx_ports/mem.h" -#include "./vp9_rtcd.h" - -unsigned int vp9_satd16x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *psatd) { - int r, c, i; - unsigned int satd = 0; - DECLARE_ALIGNED(16, int16_t, diff_in[256]); - DECLARE_ALIGNED(16, int16_t, diff_out[16]); - int16_t *in; - - for (r = 0; r < 16; r++) { - for (c = 0; c < 16; c++) { - diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c]; - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - in = diff_in; - for (r = 0; r < 16; r += 4) { - for (c = 0; c < 16; c += 4) { - vp9_short_walsh4x4_c(in + c, diff_out, 32); - for (i = 0; i < 16; i++) - satd += abs(diff_out[i]); - } - in += 64; - } - - if (psatd) - *psatd = satd; - - return satd; -} diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 49195e80c2a794194b2110e54eb61769b4b9e392..a04a20c29a68683f8f0e3bc3af7a755a577640d8 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -9,10 +9,11 @@ */ -#include "limits.h" +#include <limits.h> #include "vpx_mem/vpx_mem.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_tile_common.h" void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; @@ -21,7 +22,7 @@ void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; - if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { + if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) { // Reset Gf useage monitors vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; @@ -143,11 +144,74 @@ static int cost_segmap(MACROBLOCKD *xd, return cost; } +// Based on a set of segment counts, calculate a probability tree +static void calc_segtree_probs_pred(MACROBLOCKD *xd, + int (*segcounts)[MAX_MB_SEGMENTS], + vp9_prob *segment_tree_probs, + vp9_prob *mod_probs) { + int count[4]; + + assert(!segcounts[0][0] && !segcounts[1][1] && + !segcounts[2][2] && !segcounts[3][3]); + + // Total count for all segments + count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0]; + count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1]; + count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2]; + count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3]; + + // Work out probabilities of each segment + segment_tree_probs[0] = get_binary_prob(count[0] + count[1], + count[2] + count[3]); + segment_tree_probs[1] = get_binary_prob(count[0], count[1]); + segment_tree_probs[2] = get_binary_prob(count[2], count[3]); + + // Now work out the modified counts that the decoder would have + count[0] = segment_tree_probs[0] * segment_tree_probs[1]; + count[1] = segment_tree_probs[0] * (256 - segment_tree_probs[1]); + count[2] = (256 - segment_tree_probs[0]) * segment_tree_probs[2]; + count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]); + + // Work out modified probabilities depending on which segment was predicted + mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]); + mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]); + mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]); + mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]); +} + +// Based on a set of segment counts and probabilities, calculate a cost estimate +static int cost_segmap_pred(MACROBLOCKD *xd, +
int (*segcounts)[MAX_MB_SEGMENTS], + vp9_prob *probs, vp9_prob *mod_probs) { + int pred_seg, cost = 0; + + for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) { + int count1, count2; + + // Cost the top node of the tree + count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1]; + count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3]; + cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) + + count2 * vp9_cost_one(mod_probs[pred_seg]); + + // Now add the cost of each individual segment branch + if (pred_seg >= 2 && count1) { + cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) + + segcounts[pred_seg][1] * vp9_cost_one(probs[1]); + } else if (pred_seg < 2 && count2 > 0) { + cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) + + segcounts[pred_seg][3] * vp9_cost_one(probs[2]); + } + } + + return cost; +} + static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], - int *t_unpred_seg_counts, + int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS], int mb_size, int mb_row, int mb_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; @@ -155,10 +219,8 @@ static void count_segs(VP9_COMP *cpi, const int segment_id = mi->mbmi.segment_id; xd->mode_info_context = mi; - xd->mb_to_top_edge = -((mb_row * 16) << 3); - xd->mb_to_left_edge = -((mb_col * 16) << 3); - xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3; - xd->mb_to_right_edge = ((cm->mb_cols - mb_size - mb_col) * 16) << 3; + set_mb_row(cm, xd, mb_row, mb_size); + set_mb_col(cm, xd, mb_col, mb_size); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; @@ -166,8 +228,8 @@ static void count_segs(VP9_COMP *cpi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { // Test to see if the segment id matches the predicted value. 
- const int seg_predicted = - (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index)); + const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index); + const int seg_predicted = (segment_id == pred_seg_id); // Get the segment id prediction context const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID); @@ -179,7 +241,7 @@ static void count_segs(VP9_COMP *cpi, if (!seg_predicted) // Update the "unpredicted" segment count - t_unpred_seg_counts[segment_id]++; + t_unpred_seg_counts[pred_seg_id][segment_id]++; } } @@ -191,18 +253,19 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { int t_pred_cost = INT_MAX; int i; - int mb_row, mb_col; + int tile_col, mb_row, mb_col; int temporal_predictor_count[PREDICTION_PROBS][2]; int no_pred_segcounts[MAX_MB_SEGMENTS]; - int t_unpred_seg_counts[MAX_MB_SEGMENTS]; + int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS]; vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS]; vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS]; + vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; - MODE_INFO *mi_ptr = cm->mi, *mi; + MODE_INFO *mi_ptr, *mi; // Set default state for the segment tree probabilities and the // temporal coding probabilities @@ -218,42 +281,49 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one - for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) { - mi = mi_ptr; - for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) { - if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 4, mb_row, mb_col); - } else { - for (i = 0; i < 4; i++) { - int x_idx = (i & 1) << 1, y_idx = i & 2; - MODE_INFO *sb_mi = mi + y_idx * mis + x_idx; - - if (mb_col + x_idx >= cm->mb_cols || - mb_row + y_idx >= cm->mb_rows) { - continue; - } - - if (sb_mi->mbmi.sb_type) { - assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32); - count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx); - } else { - int j; - - for (j = 0; j < 4; j++) { - const int x_idx_mb = x_idx + (j & 1), y_idx_mb = y_idx + (j >> 1); - MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis; + for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + vp9_get_tile_col_offsets(cm, tile_col); + mi_ptr = cm->mi + cm->cur_tile_mb_col_start; + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) { + mi = mi_ptr; + for (mb_col = cm->cur_tile_mb_col_start; + mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) { + if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) { + count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, 4, mb_row, mb_col); + } else { + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) << 1, y_idx = i & 2; + MODE_INFO *sb_mi = mi + y_idx * mis + x_idx; + + if (mb_col + x_idx >= cm->mb_cols || + mb_row + y_idx >= cm->mb_rows) { + continue; + } - if (mb_col + x_idx_mb >= cm->mb_cols || - mb_row + y_idx_mb >= cm->mb_rows) { - continue; + if (sb_mi->mbmi.sb_type) { + assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32); + count_segs(cpi, sb_mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, 2, + mb_row + y_idx, mb_col + x_idx); + } else { + int j; + + for (j = 0; j < 4; j++) { + const int x_idx_mb = x_idx + (j & 1); + const int y_idx_mb = y_idx + (j >> 1); + MODE_INFO *mb_mi = mi + 
x_idx_mb + y_idx_mb * mis; + + if (mb_col + x_idx_mb >= cm->mb_cols || + mb_row + y_idx_mb >= cm->mb_rows) { + continue; + } + + assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16); + count_segs(cpi, mb_mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + 1, mb_row + y_idx_mb, mb_col + x_idx_mb); } - - assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16); - count_segs(cpi, mb_mi, no_pred_segcounts, - temporal_predictor_count, t_unpred_seg_counts, - 1, mb_row + y_idx_mb, mb_col + x_idx_mb); } } } @@ -270,8 +340,10 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. - calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree); - t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree); + calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree, + t_pred_tree_mod); + t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree, + t_pred_tree_mod); // Add in the cost of the signalling for each prediction context for (i = 0; i < PREDICTION_PROBS; i++) { @@ -291,6 +363,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { cm->temporal_update = 1; vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree)); + vpx_memcpy(xd->mb_segment_mispred_tree_probs, + t_pred_tree_mod, sizeof(t_pred_tree_mod)); vpx_memcpy(&cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); } else { diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h index 3c75c68d80fbbd428a595a883381de38eca7591e..1c90c2f2d9001af15767bad6225910eb5ecdde5a 100644 --- a/vp9/encoder/vp9_segmentation.h +++ b/vp9/encoder/vp9_segmentation.h @@ -9,23 +9,20 @@ */ -#include "string.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/encoder/vp9_onyx_int.h" - #ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ #define VP9_ENCODER_VP9_SEGMENTATION_H_ -extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, - MACROBLOCK *x); +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_onyx_int.h" + +void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x); -extern void vp9_enable_segmentation(VP9_PTR ptr); -extern void vp9_disable_segmentation(VP9_PTR ptr); +void vp9_enable_segmentation(VP9_PTR ptr); +void vp9_disable_segmentation(VP9_PTR ptr); // Valid values for a segment are 0 to 3 // Segmentation map is arrange as [Rows][Columns] -extern void vp9_set_segmentation_map(VP9_PTR ptr, - unsigned char *segmentation_map); +void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map); // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. @@ -37,10 +34,9 @@ extern void vp9_set_segmentation_map(VP9_PTR ptr, // // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use // the absolute values given). 
-// -extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, - unsigned char abs_delta); +void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, + unsigned char abs_delta); -extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8bbe534860c99df912bbc75f14329cccf6d3e1ce..22a12f4a8558978014a7aaaf1e8cbb44f6f957c4 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -8,8 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <math.h> +#include <limits.h> #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -26,15 +29,9 @@ #include "vp9/common/vp9_swapyv12buffer.h" #include "vpx_ports/vpx_timer.h" -#include <math.h> -#include <limits.h> - #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering -#if VP9_TEMPORAL_ALT_REF - - static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, @@ -43,39 +40,44 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int mv_row, int mv_col, uint8_t *pred) { - int offset; - uint8_t *yptr, *uptr, *vptr; - int omv_row, omv_col; - - // Y - yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3); - - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict16x16(yptr, stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16); - } else { - vp9_copy_mem16x16(yptr, stride, &pred[0], 16); - } + const int which_mv = 0; + int_mv subpel_mv; + int_mv fullpel_mv; + + subpel_mv.as_mv.row = mv_row; + subpel_mv.as_mv.col = mv_col; + // TODO(jkoleszar): Make this rounding consistent with the rest of the code + fullpel_mv.as_mv.row = (mv_row >> 1) & ~7; + fullpel_mv.as_mv.col = (mv_col >> 1) & ~7; + + vp9_build_inter_predictor(y_mb_ptr, stride, + &pred[0], 16, + &subpel_mv, + &xd->scale_factor[which_mv], + 16, 16, + which_mv << + (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), + &xd->subpix); - // U & V - omv_row = mv_row; - omv_col = mv_col; - mv_row >>= 1; - mv_col >>= 1; stride = (stride + 1) >> 1; - offset = (mv_row >> 3) * stride + (mv_col >> 3); - uptr = u_mb_ptr + offset; - vptr = v_mb_ptr + offset; - - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict8x8(uptr, stride, - (omv_col & 15), (omv_row & 15), &pred[256], 8); - xd->subpixel_predict8x8(vptr, stride, - (omv_col & 15), (omv_row & 15), &pred[320], 8); - } else { - vp9_copy_mem8x8(uptr, stride, &pred[256], 8); - vp9_copy_mem8x8(vptr, stride, &pred[320], 8); - } + + vp9_build_inter_predictor_q4(u_mb_ptr, stride, + &pred[256], 8, + &fullpel_mv, &subpel_mv, + &xd->scale_factor_uv[which_mv], + 8, 8, + which_mv << + (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), + &xd->subpix); + + vp9_build_inter_predictor_q4(v_mb_ptr, stride, + &pred[320], 8, + &fullpel_mv, &subpel_mv, + &xd->scale_factor_uv[which_mv], + 8, 8, + which_mv << + (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), + &xd->subpix); } void vp9_temporal_filter_apply_c(uint8_t *frame1, @@ -170,7 +172,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost 
arrays - bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first, + bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0], step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, NULL, NULL, &best_ref_mv1); @@ -182,7 +184,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, + bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0], &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], @@ -262,8 +264,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (cpi->frames[frame] == NULL) continue; - mbd->block[0].bmi.as_mv.first.as_mv.row = 0; - mbd->block[0].bmi.as_mv.first.as_mv.col = 0; + mbd->block[0].bmi.as_mv[0].as_mv.row = 0; + mbd->block[0].bmi.as_mv[0].as_mv.col = 0; if (frame == alt_ref_index) { filter_weight = 2; @@ -296,8 +298,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->u_buffer + mb_uv_offset, cpi->frames[frame]->v_buffer + mb_uv_offset, cpi->frames[frame]->y_stride, - mbd->block[0].bmi.as_mv.first.as_mv.row, - mbd->block[0].bmi.as_mv.first.as_mv.col, + mbd->block[0].bmi.as_mv[0].as_mv.row, + mbd->block[0].bmi.as_mv[0].as_mv.col, predictor); // Apply the filter (YUV) @@ -375,11 +377,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, mbd->pre.v_buffer = v_buffer; } -void vp9_temporal_filter_prepare -( - VP9_COMP *cpi, - int distance -) { +void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { int frame = 0; int num_frames_backward = 0; @@ -389,10 +387,8 @@ void vp9_temporal_filter_prepare int frames_to_blur = 0; int start_frame = 0; - int strength = cpi->oxcf.arnr_strength; - + int strength = cpi->active_arnr_strength; int blur_type = cpi->oxcf.arnr_type; - int max_frames = cpi->active_arnr_frames; num_frames_backward = distance; @@ -464,6 +460,13 @@ void vp9_temporal_filter_prepare , start_frame); #endif + // Setup scaling factors. Scaling on each of the arnr frames is not supported + vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0], + &cpi->common.yv12_fb[cpi->common.new_fb_idx], + cpi->common.width, + cpi->common.height); + cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0]; + // Setup frame pointers, NULL indicates frame not included in filter vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); for (frame = 0; frame < frames_to_blur; frame++) { @@ -479,4 +482,3 @@ void vp9_temporal_filter_prepare frames_to_blur_backward, strength); } -#endif diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h index 27fc35f826557c769efedc48ce9733c18946f8d0..f3ca8c6169107123325d73914498f69c5812f74d 100644 --- a/vp9/encoder/vp9_temporal_filter.h +++ b/vp9/encoder/vp9_temporal_filter.h @@ -11,6 +11,6 @@ #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -extern void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); +void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); #endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index fc99311ae62b70074433f0a3fcd0a8ede538bcb0..21401d1759c598c2ae41c4377a07f730d83a1853 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -25,23 +25,32 @@ compressions, then generating vp9_context.c = initial stats. 
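In practice (a sketch inferred from the code below rather than stated by it): build with ENTROPY_STATS defined and run the encoder so the counters accumulate; init_context_counters() re-seeds the accumulators from previously dumped context.bin / treeupdate.bin files when present, and print_context_counters() regenerates the default count and probability tables.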
*/ #ifdef ENTROPY_STATS -vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4]; -vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8]; -vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8]; -vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16]; -vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16]; -vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32]; - -extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16]; -extern vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16]; -extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32]; +vp9_coeff_accum context_counters_4x4[BLOCK_TYPES]; +vp9_coeff_accum context_counters_8x8[BLOCK_TYPES]; +vp9_coeff_accum context_counters_16x16[BLOCK_TYPES]; +vp9_coeff_accum context_counters_32x32[BLOCK_TYPES]; + +extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES]; #endif /* ENTROPY_STATS */ +#if CONFIG_CODE_NONZEROCOUNT +#ifdef NZC_STATS +unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC4X4_TOKENS]; +unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC8X8_TOKENS]; +unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC16X16_TOKENS]; +unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES] + [NZC32X32_TOKENS]; +unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA] + [NZC_BITS_EXTRA][2]; +#endif +#endif + static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; const TOKENVALUE *vp9_dct_value_tokens_ptr; static int dct_value_cost[DCT_MAX_VALUE * 2]; @@ -100,11 +109,7 @@ static void fill_value_tokens() { vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } -#if CONFIG_NEWCOEFCONTEXT -#define PT pn -#else -#define PT pt -#endif +extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); static void tokenize_b(VP9_COMP *cpi, MACROBLOCKD *xd, @@ -113,79 +118,92 @@ static void tokenize_b(VP9_COMP *cpi, PLANE_TYPE type, TX_SIZE tx_size, int dry_run) { + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int pt; /* near block/prev token context index */ - int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0; - const BLOCKD * const b = xd->block + ib; - const int eob = b->eob; /* one beyond last nonzero coeff */ + int c = 0; + const int eob = xd->eobs[ib]; /* one beyond last nonzero coeff */ TOKENEXTRA *t = *tp; /* store tokens starting here */ - int16_t *qcoeff_ptr = b->qcoeff; - int seg_eob; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - const int *bands, *scan; + int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib; + int seg_eob, default_eob, pad; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; + const int *scan, *nb; vp9_coeff_count *counts; vp9_coeff_probs *probs; - const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type(xd, b) : DCT_DCT; -#if CONFIG_NEWCOEFCONTEXT - const int *neighbors; - int pn; + const int ref = mbmi->ref_frame != INTRA_FRAME; + ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec; + uint8_t token_cache[1024]; +#if CONFIG_CODE_NONZEROCOUNT + int zerosleft, nzc = 0; + if (eob == 0) + assert(xd->nzcs[ib] == 0); #endif - ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context + - vp9_block2above[tx_size][ib]; - ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context + - vp9_block2left[tx_size][ib]; - ENTROPY_CONTEXT a_ec = *a, l_ec = *l; - - ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) + - vp9_block2above[tx_size][ib]; - ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) + - vp9_block2left[tx_size][ib]; - + if (sb_type == BLOCK_SIZE_SB64X64) { + a = (ENTROPY_CONTEXT *)xd->above_context + + vp9_block2above_sb64[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + } else if (sb_type == BLOCK_SIZE_SB32X32) { + a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a2 = a3 = l2 = l3 = NULL; + } else { + a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib]; + a1 = l1 = a2 = l2 = a3 = l3 = NULL; + } switch (tx_size) { default: - case TX_4X4: + case TX_4X4: { + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_4x4(xd, ib) : DCT_DCT; + a_ec = *a; + l_ec = *l; seg_eob = 16; - bands = vp9_coef_bands_4x4; scan = vp9_default_zig_zag1d_4x4; if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_4x4; - probs = cpi->common.fc.hybrid_coef_probs_4x4; if (tx_type == ADST_DCT) { scan = vp9_row_scan_4x4; } else if (tx_type == DCT_ADST) { scan = vp9_col_scan_4x4; } - } else { - counts = cpi->coef_counts_4x4; - probs = cpi->common.fc.coef_probs_4x4; } + counts = cpi->coef_counts_4x4; + probs = cpi->common.fc.coef_probs_4x4; break; - case TX_8X8: - if (type == PLANE_TYPE_Y2) { - seg_eob = 4; - bands = vp9_coef_bands_4x4; - scan = vp9_default_zig_zag1d_4x4; - } else { -#if CONFIG_CNVCONTEXT - a_ec = (a[0] + a[1]) != 0; - l_ec = (l[0] + l[1]) != 0; -#endif - seg_eob = 64; - bands = vp9_coef_bands_8x8; - scan = vp9_default_zig_zag1d_8x8; - } + } + case TX_8X8: { + const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x; + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
+ get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; + a_ec = (a[0] + a[1]) != 0; + l_ec = (l[0] + l[1]) != 0; + seg_eob = 64; + scan = vp9_default_zig_zag1d_8x8; if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_8x8; - probs = cpi->common.fc.hybrid_coef_probs_8x8; - } else { - counts = cpi->coef_counts_8x8; - probs = cpi->common.fc.coef_probs_8x8; + if (tx_type == ADST_DCT) { + scan = vp9_row_scan_8x8; + } else if (tx_type == DCT_ADST) { + scan = vp9_col_scan_8x8; + } } + counts = cpi->coef_counts_8x8; + probs = cpi->common.fc.coef_probs_8x8; break; - case TX_16X16: -#if CONFIG_CNVCONTEXT + } + case TX_16X16: { + const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x; + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; if (type != PLANE_TYPE_UV) { a_ec = (a[0] + a[1] + a[2] + a[3]) != 0; l_ec = (l[0] + l[1] + l[2] + l[3]) != 0; @@ -193,89 +211,99 @@ static void tokenize_b(VP9_COMP *cpi, a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; } -#endif seg_eob = 256; - bands = vp9_coef_bands_16x16; scan = vp9_default_zig_zag1d_16x16; if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_16x16; - probs = cpi->common.fc.hybrid_coef_probs_16x16; - } else { - counts = cpi->coef_counts_16x16; - probs = cpi->common.fc.coef_probs_16x16; - } - if (type == PLANE_TYPE_UV) { - int uv_idx = (ib - 16) >> 2; - qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx; + if (tx_type == ADST_DCT) { + scan = vp9_row_scan_16x16; + } else if (tx_type == DCT_ADST) { + scan = vp9_col_scan_16x16; + } } + counts = cpi->coef_counts_16x16; + probs = cpi->common.fc.coef_probs_16x16; break; + } case TX_32X32: -#if CONFIG_CNVCONTEXT - a_ec = a[0] + a[1] + a[2] + a[3] + - a1[0] + a1[1] + a1[2] + a1[3]; - l_ec = l[0] + l[1] + l[2] + l[3] + - l1[0] + l1[1] + l1[2] + l1[3]; - a_ec = a_ec != 0; - l_ec = l_ec != 0; -#endif + if (type != PLANE_TYPE_UV) { + a_ec = (a[0] + a[1] + a[2] + a[3] + + a1[0] + a1[1] + a1[2] + a1[3]) != 0; + l_ec = (l[0] + l[1] + l[2] + l[3] + + l1[0] + l1[1] + l1[2] + l1[3]) != 0; + } else { + a_ec = (a[0] + a[1] + a1[0] + a1[1] + + a2[0] + a2[1] + a3[0] + a3[1]) != 0; + l_ec = (l[0] + l[1] + l1[0] + l1[1] + + l2[0] + l2[1] + l3[0] + l3[1]) != 0; + } seg_eob = 1024; - bands = vp9_coef_bands_32x32; scan = vp9_default_zig_zag1d_32x32; counts = cpi->coef_counts_32x32; probs = cpi->common.fc.coef_probs_32x32; - qcoeff_ptr = xd->sb_coeff_data.qcoeff; break; } VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec); -#if CONFIG_NEWCOEFCONTEXT - neighbors = vp9_get_coef_neighbors_handle(scan); - pn = pt; -#endif + nb = vp9_get_coef_neighbors_handle(scan, &pad); + default_eob = seg_eob; - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) - seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB); + if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) + seg_eob = 0; do { - const int band = bands[c]; + const int band = get_coef_band(scan, tx_size, c); int token; - + int v = 0; +#if CONFIG_CODE_NONZEROCOUNT + zerosleft = seg_eob - xd->nzcs[ib] - c + nzc; +#endif if (c < eob) { const int rc = scan[c]; - const int v = qcoeff_ptr[rc]; + v = qcoeff_ptr[rc]; assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE); t->Extra = vp9_dct_value_tokens_ptr[v].Extra; token = vp9_dct_value_tokens_ptr[v].Token; } else { +#if CONFIG_CODE_NONZEROCOUNT + break; +#else token = DCT_EOB_TOKEN; +#endif } t->Token = token; - t->context_tree = probs[type][band][PT]; - t->skip_eob_node = (pt == 0) && ((band > 0 && type != 
PLANE_TYPE_Y_NO_DC) || - (band > 1 && type == PLANE_TYPE_Y_NO_DC)); + t->context_tree = probs[type][ref][band][pt]; +#if CONFIG_CODE_NONZEROCOUNT + // Skip zero node if there are no zeros left + t->skip_eob_node = 1 + (zerosleft == 0); +#else + t->skip_eob_node = (c > 0) && (token_cache[c - 1] == 0); +#endif assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0); if (!dry_run) { - ++counts[type][band][PT][token]; + ++counts[type][ref][band][pt][token]; + if (!t->skip_eob_node) + ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt]; } - pt = vp9_prev_token_class[token]; -#if CONFIG_NEWCOEFCONTEXT - if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1])) - pn = vp9_get_coef_neighbor_context( - qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]); - else - pn = pt; +#if CONFIG_CODE_NONZEROCOUNT + nzc += (v != 0); #endif + token_cache[c] = token; + + pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob); ++t; } while (c < eob && ++c < seg_eob); +#if CONFIG_CODE_NONZEROCOUNT + assert(nzc == xd->nzcs[ib]); +#endif *tp = t; - a_ec = l_ec = (c > !type); /* 0 <-> all coeff data is zero */ + a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */ a[0] = a_ec; l[0] = l_ec; - if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) { + if (tx_size == TX_8X8) { a[1] = a_ec; l[1] = l_ec; } else if (tx_size == TX_16X16) { @@ -287,25 +315,27 @@ static void tokenize_b(VP9_COMP *cpi, l1[0] = l1[1] = l[1] = l_ec; } } else if (tx_size == TX_32X32) { - a[1] = a[2] = a[3] = a_ec; - l[1] = l[2] = l[3] = l_ec; - a1[0] = a1[1] = a1[2] = a1[3] = a_ec; - l1[0] = l1[1] = l1[2] = l1[3] = l_ec; + if (type != PLANE_TYPE_UV) { + a[1] = a[2] = a[3] = a_ec; + l[1] = l[2] = l[3] = l_ec; + a1[0] = a1[1] = a1[2] = a1[3] = a_ec; + l1[0] = l1[1] = l1[2] = l1[3] = l_ec; + } else { + a[1] = a1[0] = a1[1] = a_ec; + l[1] = l1[0] = l1[1] = l_ec; + a2[0] = a2[1] = a3[0] = a3[1] = a_ec; + l2[0] = l2[1] = l3[0] = l3[1] = l_ec; + } } } -int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) { +int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; - if (has_2nd_order) { - for (i = 0; i < 16; i++) - skip &= (xd->block[i].eob < 2); - skip &= (!xd->block[24].eob); - } else { - for (i = 0; i < 16; i++) - skip &= (!xd->block[i].eob); - } + for (i = 0; i < 16; i++) + skip &= (!xd->eobs[i]); + return skip; } @@ -314,48 +344,41 @@ int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) { int i; for (i = 16; i < 24; i++) - skip &= (!xd->block[i].eob); + skip &= (!xd->eobs[i]); return skip; } -static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_4x4(xd, has_2nd_order) & +static int mb_is_skippable_4x4(MACROBLOCKD *xd) { + return (vp9_mby_is_skippable_4x4(xd) & vp9_mbuv_is_skippable_4x4(xd)); } -int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) { +int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; - if (has_2nd_order) { - for (i = 0; i < 16; i += 4) - skip &= (xd->block[i].eob < 2); - skip &= (!xd->block[24].eob); - } else { - for (i = 0; i < 16; i += 4) - skip &= (!xd->block[i].eob); - } + for (i = 0; i < 16; i += 4) + skip &= (!xd->eobs[i]); + return skip; } int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) { - return (!xd->block[16].eob) & (!xd->block[20].eob); + return (!xd->eobs[16]) & (!xd->eobs[20]); } -static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) & +static int mb_is_skippable_8x8(MACROBLOCKD *xd) { + return 
(vp9_mby_is_skippable_8x8(xd) & vp9_mbuv_is_skippable_8x8(xd)); } -static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_2nd_order) { - return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) & +static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) { + return (vp9_mby_is_skippable_8x8(xd) & vp9_mbuv_is_skippable_4x4(xd)); } int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) { - int skip = 1; - skip &= !xd->block[0].eob; - return skip; + return (!xd->eobs[0]); } static int mb_is_skippable_16x16(MACROBLOCKD *xd) { @@ -363,13 +386,11 @@ static int mb_is_skippable_16x16(MACROBLOCKD *xd) { } int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) { - int skip = 1; - skip &= !xd->block[0].eob; - return skip; + return (!xd->eobs[0]); } int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) { - return (!xd->block[16].eob) & (!xd->block[20].eob); + return (!xd->eobs[64]) & (!xd->eobs[80]); } static int sb_is_skippable_32x32(MACROBLOCKD *xd) { @@ -377,6 +398,68 @@ static int sb_is_skippable_32x32(MACROBLOCKD *xd) { vp9_sbuv_is_skippable_16x16(xd); } +int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 64; i += 16) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb_is_skippable_16x16(MACROBLOCKD *xd) { + return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd); +} + +int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 64; i += 4) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 64; i < 96; i += 4) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb_is_skippable_8x8(MACROBLOCKD *xd) { + return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd); +} + +int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 64; i++) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 64; i < 96; i++) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb_is_skippable_4x4(MACROBLOCKD *xd) { + return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd); +} + void vp9_tokenize_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, @@ -384,17 +467,26 @@ void vp9_tokenize_sb(VP9_COMP *cpi, VP9_COMMON * const cm = &cpi->common; MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; TOKENEXTRA *t_backup = *t; - ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0), - (ENTROPY_CONTEXT *) (xd->above_context + 1), }; - ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0), - (ENTROPY_CONTEXT *) (xd->left_context + 1), }; const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP); const int segment_id = mbmi->segment_id; - const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0); + const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); int b; - mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd); + switch (mbmi->txfm_size) { + case TX_32X32: + mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd); + break; + case TX_16X16: + mbmi->mb_skip_coeff = sb_is_skippable_16x16(xd); + break; + case TX_8X8: + mbmi->mb_skip_coeff = sb_is_skippable_8x8(xd); + break; + case TX_4X4: + mbmi->mb_skip_coeff = sb_is_skippable_4x4(xd); + break; + default: assert(0); + } if (mbmi->mb_skip_coeff) { if (!dry_run) @@ -402,7 +494,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, if 
(!cm->mb_no_coeff_skip) { vp9_stuff_sb(cpi, xd, t, dry_run); } else { - vp9_fix_contexts_sb(xd); + vp9_reset_sb_tokens_context(xd); } if (dry_run) *t = t_backup; @@ -412,14 +504,215 @@ void vp9_tokenize_sb(VP9_COMP *cpi, if (!dry_run) cpi->skip_false_count[mb_skip_context] += skip_inc; - tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, - TX_32X32, dry_run); + switch (mbmi->txfm_size) { + case TX_32X32: + tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, + TX_32X32, dry_run); + for (b = 64; b < 96; b += 16) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_16X16, dry_run); + break; + case TX_16X16: + for (b = 0; b < 64; b += 16) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_16X16, dry_run); + for (b = 64; b < 96; b += 16) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_16X16, dry_run); + break; + case TX_8X8: + for (b = 0; b < 64; b += 4) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_8X8, dry_run); + for (b = 64; b < 96; b += 4) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_8X8, dry_run); + break; + case TX_4X4: + for (b = 0; b < 64; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_4X4, dry_run); + for (b = 64; b < 96; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_4X4, dry_run); + break; + default: assert(0); + } - for (b = 16; b < 24; b += 4) { - tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, - TX_16X16, dry_run); + if (dry_run) + *t = t_backup; +} + +int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 256; i += 64) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) { + return (!xd->eobs[256]) & (!xd->eobs[320]); +} + +static int sb64_is_skippable_32x32(MACROBLOCKD *xd) { + return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd); +} + +int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 256; i += 16) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 256; i < 384; i += 16) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb64_is_skippable_16x16(MACROBLOCKD *xd) { + return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd); +} + +int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 256; i += 4) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 256; i < 384; i += 4) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb64_is_skippable_8x8(MACROBLOCKD *xd) { + return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd); +} + +int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 0; i < 256; i++) + skip &= (!xd->eobs[i]); + + return skip; +} + +int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) { + int skip = 1; + int i = 0; + + for (i = 256; i < 384; i++) + skip &= (!xd->eobs[i]); + + return skip; +} + +static int sb64_is_skippable_4x4(MACROBLOCKD *xd) { + return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd); +} + +void vp9_tokenize_sb64(VP9_COMP *cpi, + MACROBLOCKD *xd, + TOKENEXTRA **t, + int dry_run) { + VP9_COMMON * const cm = &cpi->common; + MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; + TOKENEXTRA *t_backup = *t; + const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP); + const int segment_id = mbmi->segment_id; + const int skip_inc = 
!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); + int b; + + switch (mbmi->txfm_size) { + case TX_32X32: + mbmi->mb_skip_coeff = sb64_is_skippable_32x32(xd); + break; + case TX_16X16: + mbmi->mb_skip_coeff = sb64_is_skippable_16x16(xd); + break; + case TX_8X8: + mbmi->mb_skip_coeff = sb64_is_skippable_8x8(xd); + break; + case TX_4X4: + mbmi->mb_skip_coeff = sb64_is_skippable_4x4(xd); + break; + default: assert(0); + } + + if (mbmi->mb_skip_coeff) { + if (!dry_run) + cpi->skip_true_count[mb_skip_context] += skip_inc; + if (!cm->mb_no_coeff_skip) { + vp9_stuff_sb64(cpi, xd, t, dry_run); + } else { + vp9_reset_sb64_tokens_context(xd); + } + if (dry_run) + *t = t_backup; + return; + } + + if (!dry_run) + cpi->skip_false_count[mb_skip_context] += skip_inc; + + switch (mbmi->txfm_size) { + case TX_32X32: + for (b = 0; b < 256; b += 64) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_32X32, dry_run); + for (b = 256; b < 384; b += 64) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_32X32, dry_run); + break; + case TX_16X16: + for (b = 0; b < 256; b += 16) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_16X16, dry_run); + for (b = 256; b < 384; b += 16) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_16X16, dry_run); + break; + case TX_8X8: + for (b = 0; b < 256; b += 4) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_8X8, dry_run); + for (b = 256; b < 384; b += 4) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_8X8, dry_run); + break; + case TX_4X4: + for (b = 0; b < 256; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, + TX_4X4, dry_run); + for (b = 256; b < 384; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, + TX_4X4, dry_run); + break; + default: assert(0); } - A[0][8] = L[0][8] = A[1][8] = L[1][8] = 0; + if (dry_run) *t = t_backup; } @@ -428,8 +721,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; - int has_2nd_order; int b; int tx_size = xd->mode_info_context->mbmi.txfm_size; int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP); @@ -441,14 +732,11 @@ void vp9_tokenize_mb(VP9_COMP *cpi, int skip_inc; int segment_id = xd->mode_info_context->mbmi.segment_id; - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) || - (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) { + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { skip_inc = 1; } else skip_inc = 0; - has_2nd_order = get_2nd_order_usage(xd); - switch (tx_size) { case TX_16X16: @@ -458,15 +746,15 @@ void vp9_tokenize_mb(VP9_COMP *cpi, if (xd->mode_info_context->mbmi.mode == I8X8_PRED || xd->mode_info_context->mbmi.mode == SPLITMV) xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_8x8_4x4uv(xd, 0); + mb_is_skippable_8x8_4x4uv(xd); else xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_8x8(xd, has_2nd_order); + mb_is_skippable_8x8(xd); break; default: xd->mode_info_context->mbmi.mb_skip_coeff = - mb_is_skippable_4x4(xd, has_2nd_order); + mb_is_skippable_4x4(xd); break; } @@ -487,15 +775,6 @@ void vp9_tokenize_mb(VP9_COMP *cpi, if (!dry_run) cpi->skip_false_count[mb_skip_context] += skip_inc; - if (has_2nd_order) { - tokenize_b(cpi, xd, 24, t, PLANE_TYPE_Y2, tx_size, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - if (tx_size == TX_16X16) { tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run); for (b = 16; b < 24; b += 4) { @@ -503,7 +782,7 @@ void 
vp9_tokenize_mb(VP9_COMP *cpi, } } else if (tx_size == TX_8X8) { for (b = 0; b < 16; b += 4) { - tokenize_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); } if (xd->mode_info_context->mbmi.mode == I8X8_PRED || xd->mode_info_context->mbmi.mode == SPLITMV) { @@ -516,11 +795,10 @@ void vp9_tokenize_mb(VP9_COMP *cpi, } } } else { - for (b = 0; b < 24; b++) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - tokenize_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run); - } + for (b = 0; b < 16; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 16; b < 24; b++) + tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); } if (dry_run) *t = t_backup; @@ -531,25 +809,13 @@ void init_context_counters(void) { FILE *f = fopen("context.bin", "rb"); if (!f) { vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4)); - vpx_memset(hybrid_context_counters_4x4, 0, - sizeof(hybrid_context_counters_4x4)); vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8)); - vpx_memset(hybrid_context_counters_8x8, 0, - sizeof(hybrid_context_counters_8x8)); vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16)); - vpx_memset(hybrid_context_counters_16x16, 0, - sizeof(hybrid_context_counters_16x16)); vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32)); } else { fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f); - fread(hybrid_context_counters_4x4, - sizeof(hybrid_context_counters_4x4), 1, f); fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f); - fread(hybrid_context_counters_8x8, - sizeof(hybrid_context_counters_8x8), 1, f); fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f); - fread(hybrid_context_counters_16x16, - sizeof(hybrid_context_counters_16x16), 1, f); fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f); fclose(f); } @@ -557,25 +823,13 @@ void init_context_counters(void) { f = fopen("treeupdate.bin", "rb"); if (!f) { vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4)); - vpx_memset(hybrid_tree_update_hist_4x4, 0, - sizeof(hybrid_tree_update_hist_4x4)); vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8)); - vpx_memset(hybrid_tree_update_hist_8x8, 0, - sizeof(hybrid_tree_update_hist_8x8)); vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16)); - vpx_memset(hybrid_tree_update_hist_16x16, 0, - sizeof(hybrid_tree_update_hist_16x16)); vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32)); } else { fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f); - fread(hybrid_tree_update_hist_4x4, - sizeof(hybrid_tree_update_hist_4x4), 1, f); fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f); - fread(hybrid_tree_update_hist_8x8, - sizeof(hybrid_tree_update_hist_8x8), 1, f); fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f); - fread(hybrid_tree_update_hist_16x16, - sizeof(hybrid_tree_update_hist_16x16), 1, f); fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f); fclose(f); } @@ -583,33 +837,38 @@ void init_context_counters(void) { static void print_counter(FILE *f, vp9_coeff_accum *context_counters, int block_types, const char *header) { - int type, band, pt, t; + int type, ref, band, pt, t; fprintf(f, "static const vp9_coeff_count %s = {\n", header); #define Comma(X) (X ? 
"," : "") type = 0; do { + ref = 0; fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - band = 0; do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - pt = 0; + fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra"); + band = 0; do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; + fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); + pt = 0; do { - const int64_t x = context_counters[type][band][pt][t]; - const int y = (int) x; - - assert(x == (int64_t) y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - } while (++t < MAX_ENTROPY_TOKENS); - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "%s\n {", Comma(pt)); + + t = 0; + do { + const int64_t x = context_counters[type][ref][band][pt][t]; + const int y = (int) x; + + assert(x == (int64_t) y); /* no overflow handling yet */ + fprintf(f, "%s %d", Comma(t), y); + } while (++t < 1 + MAX_ENTROPY_TOKENS); + fprintf(f, "}"); + } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "\n }"); + } while (++band < COEF_BANDS); fprintf(f, "\n }"); - } while (++band < COEF_BANDS); + } while (++ref < REF_TYPES); fprintf(f, "\n }"); } while (++type < block_types); fprintf(f, "\n};\n"); @@ -617,7 +876,7 @@ static void print_counter(FILE *f, vp9_coeff_accum *context_counters, static void print_probs(FILE *f, vp9_coeff_accum *context_counters, int block_types, const char *header) { - int type, band, pt, t; + int type, ref, band, pt, t; fprintf(f, "static const vp9_coeff_probs %s = {", header); @@ -626,32 +885,41 @@ static void print_probs(FILE *f, vp9_coeff_accum *context_counters, do { fprintf(f, "%s%s{ /* block Type %d */", Comma(type), Newline(type, " "), type); - band = 0; + ref = 0; do { - fprintf(f, "%s%s{ /* Coeff Band %d */", - Comma(band), Newline(band, " "), band); - pt = 0; + fprintf(f, "%s%s{ /* %s */", + Comma(band), Newline(band, " "), ref ? 
"Inter" : "Intra"); + band = 0; do { - unsigned int branch_ct[ENTROPY_NODES][2]; - unsigned int coef_counts[MAX_ENTROPY_TOKENS]; - vp9_prob coef_probs[ENTROPY_NODES]; - - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - coef_counts[t] = context_counters[type][band][pt][t]; - vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS, - vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, coef_counts); - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; + fprintf(f, "%s%s{ /* Coeff Band %d */", + Comma(band), Newline(band, " "), band); + pt = 0; do { - fprintf(f, "%s %3d", Comma(t), coef_probs[t]); - } while (++t < ENTROPY_NODES); - - fprintf(f, " }"); - } while (++pt < PREV_COEF_CONTEXTS); + unsigned int branch_ct[ENTROPY_NODES][2]; + unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1]; + vp9_prob coef_probs[ENTROPY_NODES]; + + if (pt >= 3 && band == 0) + break; + for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t) + coef_counts[t] = context_counters[type][ref][band][pt][t]; + vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs, + branch_ct, coef_counts, 0); + branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0]; + coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); + fprintf(f, "%s\n {", Comma(pt)); + + t = 0; + do { + fprintf(f, "%s %3d", Comma(t), coef_probs[t]); + } while (++t < ENTROPY_NODES); + + fprintf(f, " }"); + } while (++pt < PREV_COEF_CONTEXTS); + fprintf(f, "\n }"); + } while (++band < COEF_BANDS); fprintf(f, "\n }"); - } while (++band < COEF_BANDS); + } while (++ref < REF_TYPES); fprintf(f, "\n }"); } while (++type < block_types); fprintf(f, "\n};\n"); @@ -664,49 +932,31 @@ void print_context_counters() { fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); /* print counts */ - print_counter(f, context_counters_4x4, BLOCK_TYPES_4X4, - "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]"); - print_counter(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4, - "vp9_default_hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]"); - print_counter(f, context_counters_8x8, BLOCK_TYPES_8X8, - "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]"); - print_counter(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8, - "vp9_default_hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]"); - print_counter(f, context_counters_16x16, BLOCK_TYPES_16X16, - "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]"); - print_counter(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16, - "vp9_default_hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]"); - print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32, - "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]"); + print_counter(f, context_counters_4x4, BLOCK_TYPES, + "vp9_default_coef_counts_4x4[BLOCK_TYPES]"); + print_counter(f, context_counters_8x8, BLOCK_TYPES, + "vp9_default_coef_counts_8x8[BLOCK_TYPES]"); + print_counter(f, context_counters_16x16, BLOCK_TYPES, + "vp9_default_coef_counts_16x16[BLOCK_TYPES]"); + print_counter(f, context_counters_32x32, BLOCK_TYPES, + "vp9_default_coef_counts_32x32[BLOCK_TYPES]"); /* print coefficient probabilities */ - print_probs(f, context_counters_4x4, BLOCK_TYPES_4X4, - "default_coef_probs_4x4[BLOCK_TYPES_4X4]"); - print_probs(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4, - "default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]"); - print_probs(f, context_counters_8x8, BLOCK_TYPES_8X8, - "default_coef_probs_8x8[BLOCK_TYPES_8X8]"); - print_probs(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8, - "default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]"); - print_probs(f, context_counters_16x16, BLOCK_TYPES_16X16, - 
"default_coef_probs_16x16[BLOCK_TYPES_16X16]"); - print_probs(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16, - "default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]"); - print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32, - "default_coef_probs_32x32[BLOCK_TYPES_32X32]"); + print_probs(f, context_counters_4x4, BLOCK_TYPES, + "default_coef_probs_4x4[BLOCK_TYPES]"); + print_probs(f, context_counters_8x8, BLOCK_TYPES, + "default_coef_probs_8x8[BLOCK_TYPES]"); + print_probs(f, context_counters_16x16, BLOCK_TYPES, + "default_coef_probs_16x16[BLOCK_TYPES]"); + print_probs(f, context_counters_32x32, BLOCK_TYPES, + "default_coef_probs_32x32[BLOCK_TYPES]"); fclose(f); f = fopen("context.bin", "wb"); fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f); - fwrite(hybrid_context_counters_4x4, - sizeof(hybrid_context_counters_4x4), 1, f); fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f); - fwrite(hybrid_context_counters_8x8, - sizeof(hybrid_context_counters_8x8), 1, f); fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f); - fwrite(hybrid_context_counters_16x16, - sizeof(hybrid_context_counters_16x16), 1, f); fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f); fclose(f); } @@ -716,61 +966,65 @@ void vp9_tokenize_initialize() { fill_value_tokens(); } -static __inline void stuff_b(VP9_COMP *cpi, - MACROBLOCKD *xd, - const int ib, - TOKENEXTRA **tp, - PLANE_TYPE type, - TX_SIZE tx_size, - int dry_run) { - const BLOCKD * const b = xd->block + ib; - const int *bands; +static void stuff_b(VP9_COMP *cpi, + MACROBLOCKD *xd, + const int ib, + TOKENEXTRA **tp, + PLANE_TYPE type, + TX_SIZE tx_size, + int dry_run) { + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; +#if CONFIG_CODE_NONZEROCOUNT == 0 vp9_coeff_count *counts; vp9_coeff_probs *probs; int pt, band; TOKENEXTRA *t = *tp; - const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type(xd, b) : DCT_DCT; - ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context + - vp9_block2above[tx_size][ib]; - ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context + - vp9_block2left[tx_size][ib]; - ENTROPY_CONTEXT a_ec = *a, l_ec = *l; - ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) + - vp9_block2above[tx_size][ib]; - ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) + - vp9_block2left[tx_size][ib]; + const int ref = mbmi->ref_frame != INTRA_FRAME; +#endif + ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec; + + if (sb_type == BLOCK_SIZE_SB64X64) { + a = (ENTROPY_CONTEXT *)xd->above_context + + vp9_block2above_sb64[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + } else if (sb_type == BLOCK_SIZE_SB32X32) { + a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib]; + a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT); + a2 = l2 = a3 = l3 = NULL; + } else { + a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib]; + l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib]; + a1 = l1 = a2 = l2 = a3 = l3 = NULL; + } switch (tx_size) { default: case TX_4X4: - bands = vp9_coef_bands_4x4; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_4x4; - probs = cpi->common.fc.hybrid_coef_probs_4x4; - } else { - counts = cpi->coef_counts_4x4; - probs = cpi->common.fc.coef_probs_4x4; - } + a_ec = a[0]; + l_ec = l[0]; +#if CONFIG_CODE_NONZEROCOUNT == 0 + counts = cpi->coef_counts_4x4; + probs = cpi->common.fc.coef_probs_4x4; +#endif break; case TX_8X8: -#if CONFIG_CNVCONTEXT - if (type != PLANE_TYPE_Y2) { - a_ec = (a[0] + a[1]) != 0; - l_ec = (l[0] + l[1]) != 0; - } + a_ec = (a[0] + a[1]) != 0; + l_ec = (l[0] + l[1]) != 0; +#if CONFIG_CODE_NONZEROCOUNT == 0 + counts = cpi->coef_counts_8x8; + probs = cpi->common.fc.coef_probs_8x8; #endif - bands = vp9_coef_bands_8x8; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_8x8; - probs = cpi->common.fc.hybrid_coef_probs_8x8; - } else { - counts = cpi->coef_counts_8x8; - probs = cpi->common.fc.coef_probs_8x8; - } break; case TX_16X16: -#if CONFIG_CNVCONTEXT if (type != PLANE_TYPE_UV) { a_ec = (a[0] + a[1] + a[2] + a[3]) != 0; l_ec = (l[0] + l[1] + l[2] + l[3]) != 0; @@ -778,41 +1032,44 @@ static __inline void stuff_b(VP9_COMP *cpi, a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0; l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0; } +#if CONFIG_CODE_NONZEROCOUNT == 0 + counts = cpi->coef_counts_16x16; + probs = cpi->common.fc.coef_probs_16x16; #endif - bands = vp9_coef_bands_16x16; - if (tx_type != DCT_DCT) { - counts = cpi->hybrid_coef_counts_16x16; - probs = cpi->common.fc.hybrid_coef_probs_16x16; - } else { - counts = cpi->coef_counts_16x16; - probs = cpi->common.fc.coef_probs_16x16; - } break; case TX_32X32: -#if CONFIG_CNVCONTEXT - a_ec = a[0] + a[1] + a[2] + a[3] + - a1[0] + a1[1] + a1[2] + a1[3]; - 
l_ec = l[0] + l[1] + l[2] + l[3] + - l1[0] + l1[1] + l1[2] + l1[3]; - a_ec = a_ec != 0; - l_ec = l_ec != 0; -#endif - bands = vp9_coef_bands_32x32; + if (type != PLANE_TYPE_UV) { + a_ec = (a[0] + a[1] + a[2] + a[3] + + a1[0] + a1[1] + a1[2] + a1[3]) != 0; + l_ec = (l[0] + l[1] + l[2] + l[3] + + l1[0] + l1[1] + l1[2] + l1[3]) != 0; + } else { + a_ec = (a[0] + a[1] + a1[0] + a1[1] + + a2[0] + a2[1] + a3[0] + a3[1]) != 0; + l_ec = (l[0] + l[1] + l1[0] + l1[1] + + l2[0] + l2[1] + l3[0] + l3[1]) != 0; + } +#if CONFIG_CODE_NONZEROCOUNT == 0 counts = cpi->coef_counts_32x32; probs = cpi->common.fc.coef_probs_32x32; +#endif break; } +#if CONFIG_CODE_NONZEROCOUNT == 0 VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec); - - band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0]; + band = 0; t->Token = DCT_EOB_TOKEN; - t->context_tree = probs[type][band][pt]; + t->context_tree = probs[type][ref][band][pt]; t->skip_eob_node = 0; ++t; *tp = t; + if (!dry_run) { + ++counts[type][ref][band][pt][DCT_EOB_TOKEN]; + } +#endif *a = *l = 0; - if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) { + if (tx_size == TX_8X8) { a[1] = 0; l[1] = 0; } else if (tx_size == TX_16X16) { @@ -824,39 +1081,28 @@ static __inline void stuff_b(VP9_COMP *cpi, l1[0] = l1[1] = l[1] = l_ec; } } else if (tx_size == TX_32X32) { - a[1] = a[2] = a[3] = a_ec; - l[1] = l[2] = l[3] = l_ec; - a1[0] = a1[1] = a1[2] = a1[3] = a_ec; - l1[0] = l1[1] = l1[2] = l1[3] = l_ec; - } - - if (!dry_run) { - ++counts[type][band][pt][DCT_EOB_TOKEN]; + if (type != PLANE_TYPE_UV) { + a[1] = a[2] = a[3] = a_ec; + l[1] = l[2] = l[3] = l_ec; + a1[0] = a1[1] = a1[2] = a1[3] = a_ec; + l1[0] = l1[1] = l1[2] = l1[3] = l_ec; + } else { + a[1] = a1[0] = a1[1] = a_ec; + l[1] = l1[0] = l1[1] = l_ec; + a2[0] = a2[1] = a3[0] = a3[1] = a_ec; + l2[0] = l2[1] = l3[0] = l3[1] = l_ec; + } } } static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; int b; - int has_2nd_order = get_2nd_order_usage(xd); - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { -#if CONFIG_CNVCONTEXT - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; -#endif - plane_type = PLANE_TYPE_Y_WITH_DC; - } - - for (b = 0; b < 24; b += 4) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); - } + for (b = 0; b < 16; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 16; b < 24; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); } static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd, @@ -867,56 +1113,26 @@ static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd, for (b = 16; b < 24; b += 4) { stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); } -#if CONFIG_CNVCONTEXT - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; -#endif } static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { int b; - PLANE_TYPE plane_type; - int has_2nd_order = get_2nd_order_usage(xd); - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_4X4, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - - for (b = 0; b < 24; b++) { - if (b >= 16) - plane_type = PLANE_TYPE_UV; - stuff_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run); - } + for (b = 0; b < 16; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 16; b < 24; b++) + 
stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); } static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { - PLANE_TYPE plane_type; int b; - int has_2nd_order = get_2nd_order_usage(xd); - if (has_2nd_order) { - stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run); - plane_type = PLANE_TYPE_Y_NO_DC; - } else { - xd->above_context->y2 = 0; - xd->left_context->y2 = 0; - plane_type = PLANE_TYPE_Y_WITH_DC; - } - - for (b = 0; b < 16; b += 4) { - stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run); - } - - for (b = 16; b < 24; b++) { + for (b = 0; b < 16; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 16; b < 24; b++) stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); - } } void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { @@ -941,27 +1157,76 @@ void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { } } -static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run) { +void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { + TOKENEXTRA * const t_backup = *t; int b; - stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run); - for (b = 16; b < 24; b += 4) { - stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run); + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run); + for (b = 64; b < 96; b += 16) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run); + break; + case TX_16X16: + for (b = 0; b < 64; b += 16) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run); + for (b = 64; b < 96; b += 16) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run); + break; + case TX_8X8: + for (b = 0; b < 64; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 64; b < 96; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); + break; + case TX_4X4: + for (b = 0; b < 64; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 64; b < 96; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); + break; + default: assert(0); + } + + if (dry_run) { + *t = t_backup; } } -void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) { +void vp9_stuff_sb64(VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run) { TOKENEXTRA * const t_backup = *t; + int b; - stuff_sb_32x32(cpi, xd, t, dry_run); + switch (xd->mode_info_context->mbmi.txfm_size) { + case TX_32X32: + for (b = 0; b < 256; b += 64) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run); + for (b = 256; b < 384; b += 64) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_32X32, dry_run); + break; + case TX_16X16: + for (b = 0; b < 256; b += 16) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run); + for (b = 256; b < 384; b += 16) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run); + break; + case TX_8X8: + for (b = 0; b < 256; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run); + for (b = 256; b < 384; b += 4) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run); + break; + case TX_4X4: + for (b = 0; b < 256; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run); + for (b = 256; b < 384; b++) + stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run); + break; + default: assert(0); + } if (dry_run) { *t = t_backup; } } - -void vp9_fix_contexts_sb(MACROBLOCKD *xd) { - 
vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2); - vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2); -} diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 3eeb8fa5a66051e46ab5c85f215759177767f8aa..907f814ac1debe966574220eba544dee1a3a1a5a 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -28,42 +28,54 @@ typedef struct { uint8_t skip_eob_node; } TOKENEXTRA; -typedef int64_t vp9_coeff_accum[COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; +typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS + 1]; -extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block); -extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd); -extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block); -extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd); -extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd); -extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd); -extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd); struct VP9_COMP; -extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); -extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); +void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_tokenize_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); -extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); -extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run); +void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); +void vp9_stuff_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd, + TOKENEXTRA **t, int dry_run); -extern void vp9_fix_contexts_sb(MACROBLOCKD *xd); #ifdef ENTROPY_STATS void init_context_counters(); void print_context_counters(); -extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16]; -extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32]; - -extern vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4]; -extern vp9_coeff_accum 
hybrid_context_counters_8x8[BLOCK_TYPES_8X8]; -extern vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16]; +extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES]; +extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES]; +extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES]; +extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES]; #endif extern const int *vp9_dct_value_cost_ptr; diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c index 8e252813cc0cebd4e96d17cb4a607949027fcb61..52da3c6ce3f3f2a2aaefe7747ba1c468247f3ced 100644 --- a/vp9/encoder/vp9_treewriter.c +++ b/vp9/encoder/vp9_treewriter.c @@ -10,6 +10,7 @@ #include "vp9/encoder/vp9_treewriter.h" +#include "vp9/common/vp9_common.h" static void cost( int *const C, @@ -35,5 +36,7 @@ void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) { } void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) { + assert(t[1] > 0 && t[0] <= 0); + c[-t[0]] = vp9_cost_bit(p[0], 0); cost(c, t, p, 2, 0); } diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h index 4e0e5e12c5bf9d973e0905d5a46ccfefae2fed6c..832471aa80652d780d53f47e1e8df2bc55374b73 100644 --- a/vp9/encoder/vp9_treewriter.h +++ b/vp9/encoder/vp9_treewriter.h @@ -36,30 +36,28 @@ typedef BOOL_CODER vp9_writer; /* Both of these return bits, not scaled bits. */ - -static __inline unsigned int cost_branch(const unsigned int ct[2], - vp9_prob p) { +static INLINE unsigned int cost_branch256(const unsigned int ct[2], + vp9_prob p) { /* Imitate existing calculation */ - return ((ct[0] * vp9_cost_zero(p)) - + (ct[1] * vp9_cost_one(p))) >> 8; + return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); } -static __inline unsigned int cost_branch256(const unsigned int ct[2], - vp9_prob p) { +static INLINE unsigned int cost_branch(const unsigned int ct[2], + vp9_prob p) { /* Imitate existing calculation */ - return ((ct[0] * vp9_cost_zero(p)) - + (ct[1] * vp9_cost_one(p))); + return cost_branch256(ct, p) >> 8; } + /* Small functions to write explicit values and tokens, as well as estimate their lengths. 
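As a reference for the reader (the loop bodies are elided from this hunk, so treat this as a sketch): both treed_write() and treed_cost() walk from tree index i = 0, taking the next bit b = (v >> --n) & 1, coding or costing it against p[i >> 1], and stepping to the child i = t[i + b] until n reaches 0. E.g. a 2-bit value v = 2 codes bit 1 against p[0], steps to t[1], then codes bit 0 against the probability selected by that child index.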
 */
-static __inline void treed_write(vp9_writer *const w,
- vp9_tree t,
- const vp9_prob *const p,
- int v,
- /* number of bits in v, assumed nonzero */
- int n) {
+static INLINE void treed_write(vp9_writer *const w,
+ vp9_tree t,
+ const vp9_prob *const p,
+ int v,
+ /* number of bits in v, assumed nonzero */
+ int n) {
 vp9_tree_index i = 0;
 do {
@@ -69,18 +67,18 @@ static __inline void treed_write(vp9_writer *const w,
 } while (n);
 }
-static __inline void write_token(vp9_writer *const w,
- vp9_tree t,
- const vp9_prob *const p,
- vp9_token *const x) {
+static INLINE void write_token(vp9_writer *const w,
+ vp9_tree t,
+ const vp9_prob *const p,
+ vp9_token *const x) {
 treed_write(w, t, p, x->value, x->Len);
 }
-static __inline int treed_cost(vp9_tree t,
- const vp9_prob *const p,
- int v,
- /* number of bits in v, assumed nonzero */
- int n) {
+static INLINE int treed_cost(vp9_tree t,
+ const vp9_prob *const p,
+ int v,
+ /* number of bits in v, assumed nonzero */
+ int n) {
 int c = 0;
 vp9_tree_index i = 0;
@@ -93,9 +91,9 @@ static __inline int treed_cost(vp9_tree t,
 return c;
 }
-static __inline int cost_token(vp9_tree t,
- const vp9_prob *const p,
- vp9_token *const x) {
+static INLINE int cost_token(vp9_tree t,
+ const vp9_prob *const p,
+ vp9_token *const x) {
 return treed_cost(t, p, x->value, x->Len);
 }
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 675dbb63e73f8499cd03909d8c96aaddf7b78fce..13dabbda41d711a8dce115cf2892182450083a14 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -19,12 +19,6 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
 int ref_stride,
 unsigned int max_sad);
-typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int ref_stride,
- int n);
-
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
 int source_stride,
 const uint8_t *ref_ptr,
@@ -35,11 +29,11 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
 int source_stride,
 const uint8_t *ref_ptr,
 int ref_stride,
- unsigned short *sad_array);
+ unsigned int *sad_array);
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
 int source_stride,
- const uint8_t ** ref_ptr,
+ const uint8_t* const ref_ptr[],
 int ref_stride,
 unsigned int *sad_array);
 typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
@@ -79,7 +73,6 @@ typedef struct vp9_variance_vtable {
 vp9_sad_multi_fn_t sdx3f;
 vp9_sad_multi1_fn_t sdx8f;
 vp9_sad_multi_d_fn_t sdx4df;
- vp9_copy32xn_fn_t copymem;
 } vp9_variance_fn_ptr_t;
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index d03e285c6340f43a12dcf7fb2bca23a96b4db28e..d07a65b4551c8171c4946ead30d7c5c375fa3e97 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
 const int16_t *HFilter, *VFilter;
 uint16_t FData3[5 * 4];  // Temp data buffer used in filtering
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
+ HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 // First filter 1d Horizontal
 var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
 1, 5, 4, HFilter);
@@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
 uint8_t temp2[20 * 16];
 const int16_t *HFilter, *VFilter;
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
+ HFilter
= VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); @@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); @@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, uint8_t temp2[68 * 64]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 65, 64, HFilter); @@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, uint8_t temp2[36 * 32]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter); @@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); @@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c new file mode 100644 index 0000000000000000000000000000000000000000..358d979ebed215eb45ee08d3e4297d122d68e204 --- /dev/null +++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c @@ -0,0 +1,895 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> // SSE2 +#include "vp9/common/vp9_idct.h" // for cospi constants + +void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { + const int stride = pitch >> 1; + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // Load input + __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
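+ // (For reference, the two-pass structure is roughly equivalent to the
+ // scalar sketch below, where fdct8_1d stands for the column butterfly
+ // that the rest of this loop body implements:
+ //   for (pass = 0; pass < 2; ++pass) {
+ //     for (col = 0; col < 8; ++col)
+ //       fdct8_1d(col of in, col of res);
+ //     in = transpose(res);
+ //   }
+ // )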
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
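+ // (The three unpack stages below interleave 16-bit, 32-bit and then
+ // 64-bit units, transposing within 2x2, 4x4 and finally 8x8 blocks,
+ // so the net effect is out[j][i] = in[i][j]. The numeric comments
+ // track element positions written as <row><column>.)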
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ _mm_storeu_si128 ((__m128i *)(output + 0 * 8), in0);
+ _mm_storeu_si128 ((__m128i *)(output + 1 * 8), in1);
+ _mm_storeu_si128 ((__m128i *)(output + 2 * 8), in2);
+ _mm_storeu_si128 ((__m128i *)(output + 3 * 8), in3);
+ _mm_storeu_si128 ((__m128i *)(output + 4 * 8), in4);
+ _mm_storeu_si128 ((__m128i *)(output + 5 * 8), in5);
+ _mm_storeu_si128 ((__m128i *)(output + 6 * 8), in6);
+ _mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7);
+ }
+}
+
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ const int stride = pitch >> 1;
+ int pass;
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[256];
+ int16_t *in = input;
+ int16_t *out = intermediate;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
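+ // (Scaling note: pass 0 multiplies the input by 4 (x << 2) so the
+ // 16-bit intermediate keeps precision through the column transform,
+ // and pass 1 applies a rounded (x + 1) >> 2 when reloading it, which
+ // is intended to match the scaling of the scalar 16x16 forward
+ // transform.)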
+ if (0 == pass) { + in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); + // x = x << 2 + in00 = _mm_slli_epi16(in00, 2); + in01 = _mm_slli_epi16(in01, 2); + in02 = _mm_slli_epi16(in02, 2); + in03 = _mm_slli_epi16(in03, 2); + in04 = _mm_slli_epi16(in04, 2); + in05 = _mm_slli_epi16(in05, 2); + in06 = _mm_slli_epi16(in06, 2); + in07 = _mm_slli_epi16(in07, 2); + in08 = _mm_slli_epi16(in08, 2); + in09 = _mm_slli_epi16(in09, 2); + in10 = _mm_slli_epi16(in10, 2); + in11 = _mm_slli_epi16(in11, 2); + in12 = _mm_slli_epi16(in12, 2); + in13 = _mm_slli_epi16(in13, 2); + in14 = _mm_slli_epi16(in14, 2); + in15 = _mm_slli_epi16(in15, 2); + } else { + in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); + // x = (x + 1) >> 2 + in00 = _mm_add_epi16(in00, kOne); + in01 = _mm_add_epi16(in01, kOne); + in02 = _mm_add_epi16(in02, kOne); + in03 = _mm_add_epi16(in03, kOne); + in04 = _mm_add_epi16(in04, kOne); + in05 = _mm_add_epi16(in05, kOne); + in06 = _mm_add_epi16(in06, kOne); + in07 = _mm_add_epi16(in07, kOne); + in08 = _mm_add_epi16(in08, kOne); + in09 = _mm_add_epi16(in09, kOne); + in10 = _mm_add_epi16(in10, kOne); + in11 = _mm_add_epi16(in11, kOne); + in12 = _mm_add_epi16(in12, kOne); + in13 = _mm_add_epi16(in13, kOne); + in14 = _mm_add_epi16(in14, kOne); + in15 = _mm_add_epi16(in15, kOne); + in00 = _mm_srai_epi16(in00, 2); + in01 = _mm_srai_epi16(in01, 2); + in02 = _mm_srai_epi16(in02, 2); + in03 = _mm_srai_epi16(in03, 2); + in04 = _mm_srai_epi16(in04, 2); + in05 = _mm_srai_epi16(in05, 2); + in06 = _mm_srai_epi16(in06, 2); + in07 = _mm_srai_epi16(in07, 2); + in08 = _mm_srai_epi16(in08, 2); + in09 = _mm_srai_epi16(in09, 2); + in10 = _mm_srai_epi16(in10, 2); + in11 = _mm_srai_epi16(in11, 2); + in12 = 
_mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = _mm_add_epi16(in00, in15);
+ input1 = _mm_add_epi16(in01, in14);
+ input2 = _mm_add_epi16(in02, in13);
+ input3 = _mm_add_epi16(in03, in12);
+ input4 = _mm_add_epi16(in04, in11);
+ input5 = _mm_add_epi16(in05, in10);
+ input6 = _mm_add_epi16(in06, in09);
+ input7 = _mm_add_epi16(in07, in08);
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = _mm_sub_epi16(in07, in08);
+ step1_1 = _mm_sub_epi16(in06, in09);
+ step1_2 = _mm_sub_epi16(in05, in10);
+ step1_3 = _mm_sub_epi16(in04, in11);
+ step1_4 = _mm_sub_epi16(in03, in12);
+ step1_5 = _mm_sub_epi16(in02, in13);
+ step1_6 = _mm_sub_epi16(in01, in14);
+ step1_7 = _mm_sub_epi16(in00, in15);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(input0, input7);
+ const __m128i q1 = _mm_add_epi16(input1, input6);
+ const __m128i q2 = _mm_add_epi16(input2, input5);
+ const __m128i q3 = _mm_add_epi16(input3, input4);
+ const __m128i q4 = _mm_sub_epi16(input3, input4);
+ const __m128i q5 = _mm_sub_epi16(input2, input5);
+ const __m128i q6 = _mm_sub_epi16(input1, input6);
+ const __m128i q7 = _mm_sub_epi16(input0, input7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res00 = _mm_packs_epi32(w0, w1);
+ res08 = _mm_packs_epi32(w2, w3);
+ res04 = _mm_packs_epi32(w4, w5);
+ res12 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res02 = _mm_packs_epi32(w0, w1);
+ res14 = _mm_packs_epi32(w2, w3);
+ res10 = _mm_packs_epi32(w4, w5);
+ res06 = _mm_packs_epi32(w6, w7);
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u2 =
_mm_madd_epi16(t2, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_2 = _mm_packs_epi32(w0, w1); + step2_3 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_5 = _mm_packs_epi32(w0, w1); + step2_4 = _mm_packs_epi32(w2, w3); + } + // step 3 + { + step3_0 = _mm_add_epi16(step1_0, step2_3); + step3_1 = _mm_add_epi16(step1_1, step2_2); + step3_2 = _mm_sub_epi16(step1_1, step2_2); + step3_3 = _mm_sub_epi16(step1_0, step2_3); + step3_4 = _mm_sub_epi16(step1_7, step2_4); + step3_5 = _mm_sub_epi16(step1_6, step2_5); + step3_6 = _mm_add_epi16(step1_6, step2_5); + step3_7 = _mm_add_epi16(step1_7, step2_4); + } + // step 4 + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_1 = _mm_packs_epi32(w0, w1); + step2_2 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); + const __m128i u1 = 
_mm_madd_epi16(t1, k__cospi_p24_p08); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_6 = _mm_packs_epi32(w0, w1); + step2_5 = _mm_packs_epi32(w2, w3); + } + // step 5 + { + step1_0 = _mm_add_epi16(step3_0, step2_1); + step1_1 = _mm_sub_epi16(step3_0, step2_1); + step1_2 = _mm_sub_epi16(step3_3, step2_2); + step1_3 = _mm_add_epi16(step3_3, step2_2); + step1_4 = _mm_add_epi16(step3_4, step2_5); + step1_5 = _mm_sub_epi16(step3_4, step2_5); + step1_6 = _mm_sub_epi16(step3_7, step2_6); + step1_7 = _mm_add_epi16(step3_7, step2_6); + } + // step 6 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res01 = _mm_packs_epi32(w0, w1); + res09 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res05 = _mm_packs_epi32(w0, w1); + res13 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + const __m128i u0 = 
_mm_madd_epi16(t0, k__cospi_m10_p22); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res11 = _mm_packs_epi32(w0, w1); + res03 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res15 = _mm_packs_epi32(w0, w1); + res07 = _mm_packs_epi32(w2, w3); + } + } + // Transpose the results, do it as two 8x8 transposes. 
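+ // (Sixteen rows do not fit in eight xmm registers, so the 8x16 strip
+ // is transposed as two 8x8 blocks: after the stores below, each of the
+ // eight processed columns becomes a 16-wide output row whose first
+ // eight coefficients come from res00-res07 and last eight from
+ // res08-res15.)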
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ _mm_storeu_si128 ((__m128i *)(out + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 7 * 16), tr2_7);
+ }
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ // Store results
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ }
+ out += 8*16;
+ }
+ // Setup in/out for next pass.
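+ // (Pass 0 reads the caller's input and leaves the transposed column
+ // transform in `intermediate`; pass 1 re-reads that buffer and writes
+ // the final coefficients to `output`.)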
+ in = intermediate; + out = output; + } +} diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm index 5d9f7769d440d4d936c02002bf8c53935802dbe8..90c793d4fb84e871ca036ab20762785bc641d0ff 100644 --- a/vp9/encoder/x86/vp9_encodeopt.asm +++ b/vp9/encoder/x86/vp9_encodeopt.asm @@ -125,7 +125,7 @@ sym(vp9_block_error_mmx): ret -;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr); global sym(vp9_mbblock_error_mmx_impl) PRIVATE sym(vp9_mbblock_error_mmx_impl): push rbp @@ -142,10 +142,6 @@ sym(vp9_mbblock_error_mmx_impl): mov rdi, arg(1) ;dcoef_ptr pxor mm2, mm2 - movd mm1, dword ptr arg(2) ;dc - por mm1, mm2 - - pcmpeqw mm1, mm7 mov rcx, 16 .mberror_loop_mmx: @@ -160,7 +156,6 @@ sym(vp9_mbblock_error_mmx_impl): pmaddwd mm5, mm5 psubw mm3, mm4 - pand mm3, mm1 pmaddwd mm3, mm3 paddd mm2, mm5 @@ -202,28 +197,24 @@ sym(vp9_mbblock_error_mmx_impl): ret -;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr); global sym(vp9_mbblock_error_xmm_impl) PRIVATE sym(vp9_mbblock_error_xmm_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 6 + SAVE_XMM 5 push rsi push rdi ; end prolog mov rsi, arg(0) ;coeff_ptr - pxor xmm6, xmm6 + pxor xmm5, xmm5 mov rdi, arg(1) ;dcoef_ptr pxor xmm4, xmm4 - movd xmm5, dword ptr arg(2) ;dc - por xmm5, xmm4 - - pcmpeqw xmm5, xmm6 mov rcx, 16 .mberror_loop: @@ -238,7 +229,6 @@ sym(vp9_mbblock_error_xmm_impl): pmaddwd xmm2, xmm2 psubw xmm0, xmm1 - pand xmm0, xmm5 pmaddwd xmm0, xmm0 add rsi, 32 @@ -252,9 +242,9 @@ sym(vp9_mbblock_error_xmm_impl): jnz .mberror_loop movdqa xmm0, xmm4 - punpckldq xmm0, xmm6 + punpckldq xmm0, xmm5 - punpckhdq xmm4, xmm6 + punpckhdq xmm4, xmm5 paddd xmm0, xmm4 movdqa xmm1, xmm0 diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm new file mode 100644 index 0000000000000000000000000000000000000000..3716d91ecd04372ad60ca523afff88e99cb9e2c3 --- /dev/null +++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm @@ -0,0 +1,225 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m6, [ref1q+%5] + punpckldq m4, [ref2q+%5] + punpckldq m7, [ref3q+%5] + punpckldq m5, [ref4q+%5] + psadbw m6, m0 + psadbw m4, m0 + psadbw m7, m0 + psadbw m5, m0 + punpckldq m6, m4 + punpckldq m7, m5 +%else + movd m1, [ref1q+%3] + movd m2, [ref2q+%3] + movd m3, [ref3q+%3] + movd m4, [ref4q+%3] + punpckldq m0, [srcq +%4] + punpckldq m1, [ref1q+%5] + punpckldq m2, [ref2q+%5] + punpckldq m3, [ref3q+%5] + punpckldq m4, [ref4q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + punpckldq m1, m2 + punpckldq m3, m4 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; 
unsigned int res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if mmsize == 16 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 64, 64 +SADNXN4D 32, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 + +INIT_MMX sse +SADNXN4D 4, 4 diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm index 33271635c46bf1ef1df6c918fe31c6eadf284382..ea482e071a1091e2be52df22b8b37dffdaf87d71 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vp9/encoder/x86/vp9_sad_sse2.asm @@ -8,403 +8,175 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_sad16x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad16x16_wmt) PRIVATE -sym(vp9_sad16x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor xmm6, xmm6 - -.x16x16sad_wmt_loop: - - movq xmm0, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rsi+8] - - movq xmm1, QWORD PTR [rdi] - movq xmm3, QWORD PTR [rdi+8] - - movq xmm4, QWORD PTR [rsi+rax] - movq xmm5, QWORD PTR [rdi+rdx] - - - punpcklbw xmm0, xmm2 - punpcklbw xmm1, xmm3 - - psadbw xmm0, xmm1 - movq xmm2, QWORD PTR [rsi+rax+8] - - movq xmm3, QWORD PTR [rdi+rdx+8] - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - punpcklbw xmm4, xmm2 - - punpcklbw xmm5, xmm3 - psadbw xmm4, xmm5 - - paddw xmm6, xmm0 - paddw xmm6, xmm4 - - cmp rsi, rcx - jne .x16x16sad_wmt_loop - - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movq rax, xmm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp9_sad8x16_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_err) -global sym(vp9_sad8x16_wmt) PRIVATE -sym(vp9_sad8x16_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - - lea rcx, [rcx+rbx*8] - pxor mm7, mm7 - -.x8x16sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x8x16sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, QWORD PTR [rsi+rbx] - movq mm3, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw 
mm7, mm0 - paddw mm7, mm2 - - cmp rsi, rcx - jne .x8x16sad_wmt_loop - - movq rax, mm7 - -.x8x16sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad8x8_wmt) PRIVATE -sym(vp9_sad8x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x8x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x8x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - psadbw mm0, mm1 - lea rsi, [rsi+rbx] - - add rdi, rdx - paddw mm7, mm0 - - cmp rsi, rcx - jne .x8x8sad_wmt_loop - - movq rax, mm7 -.x8x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp9_sad4x4_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad4x4_wmt) PRIVATE -sym(vp9_sad4x4_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - movd mm0, DWORD PTR [rsi] - movd mm1, DWORD PTR [rdi] - - movd mm2, DWORD PTR [rsi+rax] - movd mm3, DWORD PTR [rdi+rdx] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - psadbw mm0, mm1 - lea rsi, [rsi+rax*2] - - lea rdi, [rdi+rdx*2] - movd mm4, DWORD PTR [rsi] - - movd mm5, DWORD PTR [rdi] - movd mm6, DWORD PTR [rsi+rax] - - movd mm7, DWORD PTR [rdi+rdx] - punpcklbw mm4, mm6 - - punpcklbw mm5, mm7 - psadbw mm4, mm5 - - paddw mm0, mm4 - movq rax, mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad16x8_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -global sym(vp9_sad16x8_wmt) PRIVATE -sym(vp9_sad16x8_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rbx - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rbx, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rbx*8] - pxor mm7, mm7 - -.x16x8sad_wmt_loop: - - movq rax, mm7 - cmp eax, arg(4) - jg .x16x8sad_wmt_early_exit - - movq mm0, QWORD PTR [rsi] - movq mm2, QWORD PTR [rsi+8] - - movq mm1, QWORD PTR [rdi] - movq mm3, QWORD PTR [rdi+8] - - movq mm4, QWORD PTR [rsi+rbx] - movq mm5, QWORD PTR [rdi+rdx] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - movq mm1, QWORD PTR [rsi+rbx+8] - movq mm3, QWORD PTR [rdi+rdx+8] - - psadbw mm4, mm5 - psadbw mm1, mm3 - - lea rsi, [rsi+rbx*2] - lea rdi, [rdi+rdx*2] - - paddw mm0, mm2 - paddw mm4, mm1 - - paddw mm7, mm0 - paddw mm7, mm4 - - cmp rsi, rcx - jne .x16x8sad_wmt_loop - - movq rax, mm7 - -.x16x8sad_wmt_early_exit: - - ; begin epilog - pop rdi - pop rsi - pop rbx - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_copy32xn_sse2( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp9_copy32xn_sse2) PRIVATE -sym(vp9_copy32xn_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;dst_ptr - - movsxd rax, 
dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;dst_stride - movsxd rcx, dword ptr arg(4) ;height - -.block_copy_sse2_loopx4: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - movdqu xmm2, XMMWORD PTR [rsi + rax] - movdqu xmm3, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqu xmm4, XMMWORD PTR [rsi] - movdqu xmm5, XMMWORD PTR [rsi + 16] - movdqu xmm6, XMMWORD PTR [rsi + rax] - movdqu xmm7, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - movdqa XMMWORD PTR [rdi + rdx], xmm2 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 - - lea rdi, [rdi+rdx*2] - - movdqa XMMWORD PTR [rdi], xmm4 - movdqa XMMWORD PTR [rdi + 16], xmm5 - movdqa XMMWORD PTR [rdi + rdx], xmm6 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 - - lea rdi, [rdi+rdx*2] - - sub rcx, 4 - cmp rcx, 4 - jge .block_copy_sse2_loopx4 - - cmp rcx, 0 - je .copy_is_done - -.block_copy_sse2_loop: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - lea rsi, [rsi+rax] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - lea rdi, [rdi+rdx] - - sub rcx, 1 - jne .block_copy_sse2_loop - -.copy_is_done: - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 64 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_XMM sse2 +cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov n_rowsd, 16 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1 +cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + 
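+  ; Reduction note (this pattern closes each SSE2 SAD routine in this
+  ; file): psadbw leaves one partial sum in the low 16 bits of each
+  ; 64-bit lane, so after the loop m0 carries two dword accumulators.
+  ; movhlps folds the high qword onto the low one, paddd combines the
+  ; two, and movd returns the final SAD in eax.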
+ movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 + +; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1 +cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 + +; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +INIT_MMX sse +cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + movd m0, [refq] + movd m1, [refq+ref_strideq] + movd m2, [srcq] + movd m3, [srcq+src_strideq] + lea refq, [refq+ref_strideq*2] + lea srcq, [srcq+src_strideq*2] + movd m4, [refq] + movd m5, [refq+ref_strideq] + movd m6, [srcq] + movd m7, [srcq+src_strideq] + punpckldq m0, m1 + punpckldq m2, m3 + punpckldq m4, m5 + punpckldq m6, m7 + psadbw m0, m2 + psadbw m4, m6 + paddd m0, m4 + movd eax, m0 + RET diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm index 1c39a08f8ac6530d0dd5e845863875f9934931d8..2b90a5d54789fa46787e0e2e7efe1dc6c978fdb8 100644 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ b/vp9/encoder/x86/vp9_sad_sse3.asm @@ -83,87 +83,6 @@ ret %endmacro -%macro STACK_FRAME_CREATE_X4 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define r0_ptr rcx - %define r1_ptr rdx - %define r2_ptr rbx - %define r3_ptr rdi - %define ref_stride rbp - %define result_ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - push rbp - mov rdi, arg(2) ; ref_ptr_base - - LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi - - mov rsi, arg(0) ; src_ptr - - movsxd rbx, dword ptr arg(1) ; src_stride - movsxd rbp, dword ptr arg(3) ; ref_stride - - xchg rbx, rax -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define r0_ptr rsi - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr r8 - %define ref_stride r9 - %define result_ptr [rsp+xmm_stack_space+16+4*8] - push rsi - - LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr - %else - %define src_ptr rdi - %define src_stride rsi - %define r0_ptr r9 - %define r1_ptr r10 - %define r2_ptr r11 - %define r3_ptr rdx - %define ref_stride rcx - %define result_ptr r8 - - LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr - - %endif -%endif -%endmacro - -%macro STACK_FRAME_DESTROY_X4 0 - %define src_ptr - %define src_stride - %define r0_ptr - %define r1_ptr - %define r2_ptr - %define r3_ptr - %define ref_stride - %define result_ptr - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - pop rsi - RESTORE_XMM - %endif -%endif - ret -%endmacro - %macro PROCESS_16X2X3 5 %if %1==0 movdqa xmm0, XMMWORD PTR [%2] @@ -250,130 +169,6 @@ paddw mm7, 
mm3 %endmacro -%macro LOAD_X4_ADDRESSES 5 - mov %2, [%1+REG_SZ_BYTES*0] - mov %3, [%1+REG_SZ_BYTES*1] - - mov %4, [%1+REG_SZ_BYTES*2] - mov %5, [%1+REG_SZ_BYTES*3] -%endmacro - -%macro PROCESS_16X2X4 8 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm4, XMMWORD PTR [%3] - lddqu xmm5, XMMWORD PTR [%4] - lddqu xmm6, XMMWORD PTR [%5] - lddqu xmm7, XMMWORD PTR [%6] - - psadbw xmm4, xmm0 - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%4] - lddqu xmm3, XMMWORD PTR [%5] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - - psadbw xmm1, xmm0 - paddw xmm7, xmm1 -%endif - movdqa xmm0, XMMWORD PTR [%2+%7] - lddqu xmm1, XMMWORD PTR [%3+%8] - lddqu xmm2, XMMWORD PTR [%4+%8] - lddqu xmm3, XMMWORD PTR [%5+%8] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm4, xmm1 - lddqu xmm1, XMMWORD PTR [%6+%8] - paddw xmm5, xmm2 - paddw xmm6, xmm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw xmm1, xmm0 - paddw xmm7, xmm1 - -%endmacro - -%macro PROCESS_8X2X4 8 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm4, QWORD PTR [%3] - movq mm5, QWORD PTR [%4] - movq mm6, QWORD PTR [%5] - movq mm7, QWORD PTR [%6] - - psadbw mm4, mm0 - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%4] - movq mm3, QWORD PTR [%5] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6] - paddw mm5, mm2 - paddw mm6, mm3 - - psadbw mm1, mm0 - paddw mm7, mm1 -%endif - movq mm0, QWORD PTR [%2+%7] - movq mm1, QWORD PTR [%3+%8] - movq mm2, QWORD PTR [%4+%8] - movq mm3, QWORD PTR [%5+%8] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm4, mm1 - movq mm1, QWORD PTR [%6+%8] - paddw mm5, mm2 - paddw mm6, mm3 - -%if %1==0 || %1==1 - lea %2, [%2+%7*2] - lea %3, [%3+%8*2] - - lea %4, [%4+%8*2] - lea %5, [%5+%8*2] - - lea %6, [%6+%8*2] -%endif - psadbw mm1, mm0 - paddw mm7, mm1 - -%endmacro - ;void int vp9_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, @@ -581,380 +376,3 @@ sym(vp9_sad4x4x3_sse3): movd [rcx+8], mm7 STACK_FRAME_DESTROY_X3 - -;unsigned int vp9_sad16x16_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int max_err) -;%define lddqu movdqu -global sym(vp9_sad16x16_sse3) PRIVATE -sym(vp9_sad16x16_sse3): - - STACK_FRAME_CREATE_X3 - - mov end_ptr, 4 - pxor xmm7, xmm7 - -.vp9_sad16x16_sse3_loop: - movdqa xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [ref_ptr] - movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] - movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movdqa xmm4, XMMWORD PTR [src_ptr] - movdqu xmm5, XMMWORD PTR [ref_ptr] - movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] - - psadbw xmm0, xmm1 - - movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] - - psadbw xmm2, xmm3 - psadbw xmm4, xmm5 - psadbw xmm6, xmm1 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - paddw xmm7, xmm0 - paddw xmm7, xmm2 - paddw xmm7, xmm4 - paddw xmm7, xmm6 - - sub end_ptr, 1 - jne .vp9_sad16x16_sse3_loop - - movq xmm0, xmm7 - psrldq xmm7, 8 - paddw xmm0, xmm7 - movq rax, xmm0 - - STACK_FRAME_DESTROY_X3 - -;void 
vp9_copy32xn_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp9_copy32xn_sse3) PRIVATE -sym(vp9_copy32xn_sse3): - - STACK_FRAME_CREATE_X3 - -.block_copy_sse3_loopx4: - lea end_ptr, [src_ptr+src_stride*2] - - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] - movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] - movdqu xmm4, XMMWORD PTR [end_ptr] - movdqu xmm5, XMMWORD PTR [end_ptr + 16] - movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] - movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] - - lea src_ptr, [src_ptr+src_stride*4] - - lea end_ptr, [ref_ptr+ref_stride*2] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 - movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 - movdqa XMMWORD PTR [end_ptr], xmm4 - movdqa XMMWORD PTR [end_ptr + 16], xmm5 - movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 - movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 - - lea ref_ptr, [ref_ptr+ref_stride*4] - - sub height, 4 - cmp height, 4 - jge .block_copy_sse3_loopx4 - - ;Check to see if there is more rows need to be copied. - cmp height, 0 - je .copy_is_done - -.block_copy_sse3_loop: - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - lea src_ptr, [src_ptr+src_stride] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - lea ref_ptr, [ref_ptr+ref_stride] - - sub height, 1 - jne .block_copy_sse3_loop - -.copy_is_done: - STACK_FRAME_DESTROY_X3 - -;void vp9_sad16x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x4d_sse3) PRIVATE -sym(vp9_sad16x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void vp9_sad16x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr_base, -; int ref_stride, -; int *results) -global sym(vp9_sad16x8x4d_sse3) PRIVATE -sym(vp9_sad16x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if 
ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - movq xmm0, xmm4 - psrldq xmm4, 8 - - paddw xmm0, xmm4 - movd [rcx], xmm0 -;- - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+8], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+12], xmm0 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x16x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x16x4d_sse3) PRIVATE -sym(vp9_sad8x16x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad8x8x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x8x4d_sse3) PRIVATE -sym(vp9_sad8x8x4d_sse3): - - STACK_FRAME_CREATE_X4 - - PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride - -%if ABI_IS_32BIT - pop rbp -%endif - mov rcx, result_ptr - - punpckldq mm4, mm5 - punpckldq mm6, mm7 - - movq [rcx], mm4 - movq [rcx+8], mm6 - - STACK_FRAME_DESTROY_X4 - -;void int vp9_sad4x4x4d_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad4x4x4d_sse3) PRIVATE -sym(vp9_sad4x4x4d_sse3): - - STACK_FRAME_CREATE_X4 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [r0_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [r1_ptr] - movd mm5, DWORD PTR [r2_ptr] - - movd mm6, DWORD PTR [r3_ptr] - movd mm2, DWORD PTR [r1_ptr+ref_stride] - - movd mm3, DWORD PTR [r2_ptr+ref_stride] - movd mm7, DWORD PTR [r3_ptr+ref_stride] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - punpcklbw mm6, mm7 - psadbw mm4, mm0 - - psadbw mm5, mm0 - psadbw mm6, mm0 - - - - lea src_ptr, [src_ptr+src_stride*2] - lea r0_ptr, [r0_ptr+ref_stride*2] - - lea r1_ptr, [r1_ptr+ref_stride*2] - lea r2_ptr, [r2_ptr+ref_stride*2] - - lea r3_ptr, [r3_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [r0_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm7, DWORD PTR [r0_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm7 - - movd mm3, DWORD PTR [r1_ptr] - movd mm7, DWORD PTR [r2_ptr] - - psadbw mm2, mm0 -%if ABI_IS_32BIT - mov rax, rbp - - pop rbp -%define ref_stride rax 
-%endif - mov rsi, result_ptr - - paddw mm1, mm2 - movd [rsi], mm1 - - movd mm2, DWORD PTR [r1_ptr+ref_stride] - movd mm1, DWORD PTR [r2_ptr+ref_stride] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm1 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - movd mm2, DWORD PTR [r3_ptr] - movd mm1, DWORD PTR [r3_ptr+ref_stride] - - paddw mm3, mm4 - paddw mm7, mm5 - - movd [rsi+4], mm3 - punpcklbw mm2, mm1 - - movd [rsi+8], mm7 - psadbw mm2, mm0 - - paddw mm2, mm6 - movd [rsi+12], mm2 - - - STACK_FRAME_DESTROY_X4 - diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm index b42982a1fb52418587bf352cc3deedf2753bf518..faf1768a983bff9d25cc973cdb790dce856414c3 100644 --- a/vp9/encoder/x86/vp9_sad_sse4.asm +++ b/vp9/encoder/x86/vp9_sad_sse4.asm @@ -154,6 +154,16 @@ paddw xmm1, xmm5 %endmacro +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro ;void vp9_sad16x16x8_sse4( ; const unsigned char *src_ptr, @@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - 
PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi @@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4): push rdi ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 - mov rdi, arg(4) ;Results - movdqa XMMWORD PTR [rdi], xmm1 + WRITE_AS_INTS ; begin epilog pop rdi diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 36fae6e8cdd2882324adc254f035ebdc6ef5b181..fc363b6b033aa022be2a2d822d3eab489376ebaa 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -186,6 +186,7 @@ unsigned int vp9_variance16x16_wmt *sse = sse0; return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); } + unsigned int vp9_mse16x16_wmt( const unsigned char *src_ptr, int source_stride, @@ -305,20 +306,16 @@ unsigned int vp9_sub_pixel_variance8x8_wmt return (xxsum - (((unsigned int)xsum * xsum) >> 6)); } -unsigned int vp9_sub_pixel_variance16x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { +static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, int *avg) { int xsum0, xsum1; unsigned int xxsum0, xxsum1; - // note we could avoid these if statements if the calling function // just called the appropriate functions inside. 
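  /* Dispatch note: (xoffset, yoffset) select the sub-pel phase, with
     HALFNDX the exact half-pel index; half-pel offsets take the cheaper
     averaging kernels, and anything else falls through to the generic
     bilinear filter. As a plain-C sketch of what each kernel
     accumulates (pred() is a hypothetical stand-in for the filtered
     prediction, not a function in this change):

         int d = pred(src_ptr, x, y, xoffset, yoffset)
                 - dst_ptr[y * dst_pixels_per_line + x];
         xsum0  += d;
         xxsum0 += d * d;

     The wrappers below then form variance = sse - sum * sum / N, where
     N is the pixel count (hence the >> 8, >> 10 and >> 12 shifts for
     the 16x16, 32x32 and 64x64 sizes). */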
if (xoffset == HALFNDX && yoffset == 0) { @@ -355,10 +352,136 @@ unsigned int vp9_sub_pixel_variance16x16_wmt } *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); + *avg = xsum0; +} + +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int avg; + unsigned int sse; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse, &avg); + *sse_ptr = sse; + + return (sse - (((unsigned int) avg * avg) >> 8)); +} + +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int avg0, avg1, avg2, avg3; + unsigned int sse0, sse1, sse2, sse3; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse0, &avg0); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse1, &avg1); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse3, &avg3); + sse0 += sse1 + sse2 + sse3; + avg0 += avg1 + avg2 + avg3; + *sse_ptr = sse0; + + return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); +} + +unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse_ptr) { + int avg0, avg1, avg2, avg3, avg4; + unsigned int sse0, sse1, sse2, sse3, sse4; + + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse0, &avg0); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse3, &avg3); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3; + sse0 += sse1 + sse2 + sse3; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + 
yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + src_ptr += 16 * src_pixels_per_line; + dst_ptr += 16 * dst_pixels_per_line; + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, + &sse1, &avg1); + sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 16, dst_pixels_per_line, + &sse2, &avg2); + sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 32, dst_pixels_per_line, + &sse3, &avg3); + sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, + yoffset, dst_ptr + 48, dst_pixels_per_line, + &sse4, &avg4); + avg0 += avg1 + avg2 + avg3 + avg4; + sse0 += sse1 + sse2 + sse3 + sse4; + *sse_ptr = sse0; + + return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); } -unsigned int vp9_sub_pixel_mse16x16_wmt( +unsigned int vp9_sub_pixel_mse16x16_sse2( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, @@ -367,7 +490,8 @@ unsigned int vp9_sub_pixel_mse16x16_wmt( int dst_pixels_per_line, unsigned int *sse ) { - vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); + vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, + yoffset, dst_ptr, dst_pixels_per_line, sse); return *sse; } diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c index 3beef53a2feebe4fd8c0f5f49d84068864ec3a2f..2bf32c569e7b2d6effca63aa0f7babe6123330f1 100644 --- a/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c @@ -23,11 +23,11 @@ void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); } -int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) { +int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr); +int vp9_mbblock_error_mmx(MACROBLOCK *mb) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); + return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr); } int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); @@ -51,11 +51,11 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { #endif #if HAVE_SSE2 -int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) { +int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr); +int vp9_mbblock_error_xmm(MACROBLOCK *mb) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); + return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr); } int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 714cefdcc9f0609bfc96edf1068bceecdc4004a3..5e1ff62f7b7d2a6e2ad223a3be3c380fada1ce38 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -9,6 +9,7 @@ ## VP9_COMMON_SRCS-yes += vp9_common.mk +VP9_COMMON_SRCS-yes += vp9_iface_common.h 
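+# Note: vp9_iface_common.h (added in this change) hosts the shared
+# yuvconfig2image() helper, now included by both vp9/vp9_cx_iface.c and
+# vp9/vp9_dx_iface.c.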
VP9_COMMON_SRCS-yes += common/vp9_pragmas.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_onyx.h @@ -16,6 +17,8 @@ VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c VP9_COMMON_SRCS-yes += common/vp9_blockd.c VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h +VP9_COMMON_SRCS-yes += common/vp9_convolve.c +VP9_COMMON_SRCS-yes += common/vp9_convolve.h VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h VP9_COMMON_SRCS-yes += common/vp9_entropy.c @@ -26,7 +29,7 @@ VP9_COMMON_SRCS-yes += common/vp9_filter.c VP9_COMMON_SRCS-yes += common/vp9_filter.h VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c -VP9_COMMON_SRCS-yes += common/vp9_idctllm.c +VP9_COMMON_SRCS-yes += common/vp9_idct.c VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h VP9_COMMON_SRCS-yes += common/vp9_blockd.h VP9_COMMON_SRCS-yes += common/vp9_common.h @@ -36,6 +39,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h VP9_COMMON_SRCS-yes += common/vp9_extend.h VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h VP9_COMMON_SRCS-yes += common/vp9_header.h +VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_invtrans.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h VP9_COMMON_SRCS-yes += common/vp9_modecont.h @@ -46,7 +50,6 @@ VP9_COMMON_SRCS-yes += common/vp9_pred_common.c VP9_COMMON_SRCS-yes += common/vp9_quant_common.h VP9_COMMON_SRCS-yes += common/vp9_reconinter.h VP9_COMMON_SRCS-yes += common/vp9_reconintra.h -VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.h VP9_COMMON_SRCS-yes += common/vp9_rtcd.c VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h @@ -54,10 +57,11 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h -VP9_COMMON_SRCS-yes += common/vp9_subpixel.h VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h VP9_COMMON_SRCS-yes += common/vp9_textblit.h +VP9_COMMON_SRCS-yes += common/vp9_tile_common.h +VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_treecoder.h VP9_COMMON_SRCS-yes += common/vp9_invtrans.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c @@ -79,7 +83,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c @@ -89,18 +92,15 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += 
common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm @@ -112,19 +112,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif -VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c -ifeq ($(HAVE_SSE4_1),yes) -vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4 -vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4 -endif - -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c ifeq ($(HAVE_SSE2),yes) -vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2 +vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2 +vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2 endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 1ef5ff19efe99025186ee667e641e7f293c30757..56453e2496aa4fb0d6bca8bd13ded5be7729f762 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -16,6 +16,7 @@ #include "vpx/vp8cx.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/common/vp9_onyx.h" +#include "vp9/vp9_iface_common.h" #include <stdlib.h> #include <string.h> @@ -26,7 +27,8 @@ struct vp8_extracfg { unsigned int noise_sensitivity; unsigned int Sharpness; unsigned int static_thresh; - unsigned int token_partitions; + unsigned int tile_columns; + unsigned int tile_rows; unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */ unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */ unsigned int arnr_type; /* alt_ref filter type */ @@ -34,9 +36,8 @@ struct vp8_extracfg { vp8e_tuning tuning; unsigned int cq_level; /* constrained quality level */ unsigned int rc_max_intra_bitrate_pct; -#if CONFIG_LOSSLESS unsigned int lossless; -#endif + unsigned int frame_parallel_decoding_mode; }; struct extraconfig_map { @@ -54,7 +55,8 @@ static const struct extraconfig_map extracfg_map[] = { 0, /* noise_sensitivity */ 0, /* Sharpness */ 0, /* static_thresh */ - VP8_ONE_TOKENPARTITION, /* token_partitions */ + 0, /* tile_columns */ + 0, /* tile_rows */ 0, /* arnr_max_frames */ 3, /* arnr_strength */ 3, /* arnr_type*/ @@ -62,9 +64,8 @@ static const struct extraconfig_map extracfg_map[] = { 0, /* tuning*/ 10, /* cq_level */ 0, /* rc_max_intra_bitrate_pct */ -#if CONFIG_LOSSLESS 0, /* lossless */ -#endif + 0, /* frame_parallel_decoding_mode */ } } }; @@ -79,8 +80,10 @@ struct vpx_codec_alg_priv { unsigned int cx_data_sz; unsigned char *pending_cx_data; unsigned int pending_cx_data_sz; + int pending_frame_count; + uint32_t pending_frame_sizes[8]; + uint32_t pending_frame_magnitude; vpx_image_t 
preview_img; - unsigned int next_frame_flag; vp8_postproc_cfg_t preview_ppcfg; vpx_codec_pkt_list_decl(64) pkt_list; // changed to accomendate the maximum number of lagged frames allowed unsigned int fixed_kf_cntr; @@ -129,21 +132,19 @@ update_error_state(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp8_extracfg *vp8_cfg) { - RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ - RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ + RANGE_CHECK(cfg, g_w, 1, 65535); /* 16 bits available */ + RANGE_CHECK(cfg, g_h, 1, 65535); /* 16 bits available */ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); RANGE_CHECK_HI(cfg, g_profile, 3); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); -#if CONFIG_LOSSLESS RANGE_CHECK_BOOL(vp8_cfg, lossless); if (vp8_cfg->lossless) { RANGE_CHECK_HI(cfg, rc_max_quantizer, 0); RANGE_CHECK_HI(cfg, rc_min_quantizer, 0); } -#endif RANGE_CHECK_HI(cfg, g_threads, 64); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); @@ -172,7 +173,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6); - RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION); + RANGE_CHECK(vp8_cfg, tile_columns, 0, 6); + RANGE_CHECK(vp8_cfg, tile_rows, 0, 2); RANGE_CHECK_HI(vp8_cfg, Sharpness, 7); RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15); RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); @@ -226,11 +228,9 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, struct vp8_extracfg vp8_cfg) { - oxcf->Version = cfg.g_profile; - oxcf->Version |= vp8_cfg.experimental ? 0x4 : 0; - - oxcf->Width = cfg.g_w; - oxcf->Height = cfg.g_h; + oxcf->version = cfg.g_profile | (vp8_cfg.experimental ? 
0x4 : 0); + oxcf->width = cfg.g_w; + oxcf->height = cfg.g_h; /* guess a frame rate if out of whack, use 30 */ oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); @@ -309,37 +309,43 @@ static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf, oxcf->tuning = vp8_cfg.tuning; -#if CONFIG_LOSSLESS + oxcf->tile_columns = vp8_cfg.tile_columns; + oxcf->tile_rows = vp8_cfg.tile_rows; + oxcf->lossless = vp8_cfg.lossless; -#endif + oxcf->error_resilient_mode = cfg.g_error_resilient; + oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; /* - printf("Current VP8 Settings: \n"); - printf("target_bandwidth: %d\n", oxcf->target_bandwidth); - printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity); - printf("Sharpness: %d\n", oxcf->Sharpness); - printf("cpu_used: %d\n", oxcf->cpu_used); - printf("Mode: %d\n", oxcf->Mode); - printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); - printf("auto_key: %d\n", oxcf->auto_key); - printf("key_freq: %d\n", oxcf->key_freq); - printf("end_usage: %d\n", oxcf->end_usage); - printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct); - printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct); - printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level); - printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level); - printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size); - printf("fixed_q: %d\n", oxcf->fixed_q); - printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q); - printf("best_allowed_q: %d\n", oxcf->best_allowed_q); - printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); - printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); - printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); - printf("allow_lag: %d\n", oxcf->allow_lag); - printf("lag_in_frames: %d\n", oxcf->lag_in_frames); - printf("play_alternate: %d\n", oxcf->play_alternate); - printf("Version: %d\n", oxcf->Version); - printf("encode_breakout: %d\n", oxcf->encode_breakout); + printf("Current VP9 Settings: \n"); + printf("target_bandwidth: %d\n", oxcf->target_bandwidth); + printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity); + printf("Sharpness: %d\n", oxcf->Sharpness); + printf("cpu_used: %d\n", oxcf->cpu_used); + printf("Mode: %d\n", oxcf->Mode); + // printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); + printf("auto_key: %d\n", oxcf->auto_key); + printf("key_freq: %d\n", oxcf->key_freq); + printf("end_usage: %d\n", oxcf->end_usage); + printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct); + printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct); + printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level); + printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level); + printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size); + printf("fixed_q: %d\n", oxcf->fixed_q); + printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q); + printf("best_allowed_q: %d\n", oxcf->best_allowed_q); + printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); + printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); + printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("allow_lag: %d\n", oxcf->allow_lag); + printf("lag_in_frames: %d\n", oxcf->lag_in_frames); + printf("play_alternate: %d\n", oxcf->play_alternate); + printf("Version: %d\n", oxcf->Version); + printf("encode_breakout: %d\n", oxcf->encode_breakout); + printf("error resilient: %d\n", oxcf->error_resilient_mode); + printf("frame parallel detokenization: 
%d\n", + oxcf->frame_parallel_decoding_mode); */ return VPX_CODEC_OK; } @@ -409,7 +415,8 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); - MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions); + MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); + MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); @@ -417,9 +424,8 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, MAP(VP8E_SET_TUNING, xcfg.tuning); MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); -#if CONFIG_LOSSLESS MAP(VP9E_SET_LOSSLESS, xcfg.lossless); -#endif + MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode); } res = validate_config(ctx, &ctx->cfg, &xcfg); @@ -540,6 +546,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = (1 + yv12->y_width) / 2; @@ -578,6 +586,46 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, } +static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { + uint8_t marker = 0xc0; + int mag, mask, index_sz; + + assert(ctx->pending_frame_count); + assert(ctx->pending_frame_count <= 8); + + /* Add the number of frames to the marker byte */ + marker |= ctx->pending_frame_count - 1; + + /* Choose the magnitude */ + for (mag = 0, mask = 0xff; mag < 4; mag++) { + if (ctx->pending_frame_magnitude < mask) + break; + mask <<= 8; + mask |= 0xff; + } + marker |= mag << 3; + + /* Write the index */ + index_sz = 2 + (mag + 1) * ctx->pending_frame_count; + if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) { + uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz; + int i, j; + + *x++ = marker; + for (i = 0; i < ctx->pending_frame_count; i++) { + int this_sz = ctx->pending_frame_sizes[i]; + + for (j = 0; j <= mag; j++) { + *x++ = this_sz & 0xff; + this_sz >>= 8; + } + } + *x++ = marker; + ctx->pending_cx_data_sz += index_sz; + } + return index_sz; +} + static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, @@ -670,14 +718,11 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (img != NULL) { res = image2yuvconfig(img, &sd); - if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, + if (vp9_receive_raw_frame(ctx->cpi, lib_flags, &sd, dst_time_stamp, dst_end_time_stamp)) { VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; res = update_error_state(ctx, &cpi->common.error); } - - /* reset for next frame */ - ctx->next_frame_flag = 0; } cx_data = ctx->cx_data; @@ -714,6 +759,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (!ctx->pending_cx_data) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; continue; @@ -773,10 +820,16 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, else*/ { if (ctx->pending_cx_data) { + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + ctx->pending_cx_data_sz += size; + size += 
write_superframe_index(ctx); pkt.data.frame.buf = ctx->pending_cx_data; - pkt.data.frame.sz = ctx->pending_cx_data_sz + size; + pkt.data.frame.sz = ctx->pending_cx_data_sz; ctx->pending_cx_data = NULL; ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + ctx->pending_frame_magnitude = 0; } else { pkt.data.frame.buf = cx_data; pkt.data.frame.sz = size; @@ -818,9 +871,9 @@ static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t vp8e_copy_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -829,12 +882,28 @@ static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx, YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd); + vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd); return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; } +static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + + if (data) { + YV12_BUFFER_CONFIG* fb; + + vp9_get_reference_enc(ctx->cpi, data->idx, &fb); + yuvconfig2image(&data->img, fb, NULL); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { @@ -979,8 +1048,6 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, scalemode.v_scaling_mode); if (!res) { - /*force next frame a key frame to effect scaling mode */ - ctx->next_frame_flag |= FRAMEFLAGS_KEY; return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; @@ -991,7 +1058,7 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { {VP8_SET_REFERENCE, vp8e_set_reference}, - {VP8_COPY_REFERENCE, vp8e_get_reference}, + {VP8_COPY_REFERENCE, vp8e_copy_reference}, {VP8_SET_POSTPROC, vp8e_set_previewpp}, {VP8E_UPD_ENTROPY, vp8e_update_entropy}, {VP8E_UPD_REFERENCE, vp8e_update_reference}, @@ -1004,7 +1071,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { {VP8E_SET_ENABLEAUTOALTREF, set_param}, {VP8E_SET_SHARPNESS, set_param}, {VP8E_SET_STATIC_THRESHOLD, set_param}, - {VP8E_SET_TOKEN_PARTITIONS, set_param}, + {VP9E_SET_TILE_COLUMNS, set_param}, + {VP9E_SET_TILE_ROWS, set_param}, {VP8E_GET_LAST_QUANTIZER, get_param}, {VP8E_GET_LAST_QUANTIZER_64, get_param}, {VP8E_SET_ARNR_MAXFRAMES, set_param}, @@ -1013,9 +1081,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { {VP8E_SET_TUNING, set_param}, {VP8E_SET_CQ_LEVEL, set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, -#if CONFIG_LOSSLESS {VP9E_SET_LOSSLESS, set_param}, -#endif + {VP9_GET_REFERENCE, get_reference}, { -1, NULL}, }; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 2d7e4136965d27cd7d3d019a19c1be1af687e604..d0c23f07a7d418246facb70e90f9ef97f9f829d6 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -17,6 +17,7 @@ #include "vpx_version.h" #include "decoder/vp9_onyxd.h" #include "decoder/vp9_onyxd_int.h" +#include "vp9/vp9_iface_common.h" #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? 
VPX_CODEC_CAP_POSTPROC : 0) typedef vpx_codec_stream_info_t vp8_stream_info_t; @@ -63,6 +64,7 @@ struct vpx_codec_alg_priv { vpx_image_t img; int img_setup; int img_avail; + int invert_tile_order; }; static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, @@ -229,8 +231,8 @@ static vpx_codec_err_t vp8_peek_si(const uint8_t *data, if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a) res = VPX_CODEC_UNSUP_BITSTREAM; - si->w = (c[3] | (c[4] << 8)) & 0x3fff; - si->h = (c[5] | (c[6] << 8)) & 0x3fff; + si->w = (c[3] | (c[4] << 8)); + si->h = (c[5] | (c[6] << 8)); /*printf("w=%d, h=%d\n", si->w, si->h);*/ if (!(si->h | si->w)) @@ -273,36 +275,6 @@ update_error_state(vpx_codec_alg_priv_t *ctx, return res; } -static void yuvconfig2image(vpx_image_t *img, - const YV12_BUFFER_CONFIG *yv12, - void *user_priv) { - /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ - img->fmt = yv12->clrtype == REG_YUV ? - VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; - img->w = yv12->y_stride; - img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15; - img->d_w = yv12->y_width; - img->d_h = yv12->y_height; - img->x_chroma_shift = 1; - img->y_chroma_shift = 1; - img->planes[VPX_PLANE_Y] = yv12->y_buffer; - img->planes[VPX_PLANE_U] = yv12->u_buffer; - img->planes[VPX_PLANE_V] = yv12->v_buffer; - img->planes[VPX_PLANE_ALPHA] = NULL; - img->stride[VPX_PLANE_Y] = yv12->y_stride; - img->stride[VPX_PLANE_U] = yv12->uv_stride; - img->stride[VPX_PLANE_V] = yv12->uv_stride; - img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; - img->bps = 12; - img->user_priv = user_priv; - img->img_data = yv12->buffer_alloc; - img->img_data_owner = 0; - img->self_allocd = 0; -} - static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, @@ -362,6 +334,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, oxcf.Version = 9; oxcf.postprocess = 0; oxcf.max_threads = ctx->cfg.threads; + oxcf.inv_tile_order = ctx->invert_tile_order; optr = vp9_create_decompressor(&oxcf); /* If postprocessing was enabled by the application and a @@ -424,6 +397,39 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, return res; } +static void parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], + int *count) { + uint8_t marker; + + assert(data_sz); + marker = data[data_sz - 1]; + *count = 0; + + if ((marker & 0xe0) == 0xc0) { + const int frames = (marker & 0x7) + 1; + const int mag = ((marker >> 3) & 3) + 1; + const int index_sz = 2 + mag * frames; + + if (data_sz >= index_sz && data[data_sz - index_sz] == marker) { + // found a valid superframe index + int i, j; + const uint8_t *x = data + data_sz - index_sz + 1; + + for (i = 0; i < frames; i++) { + int this_sz = 0; + + for (j = 0; j < mag; j++) + this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + + *count = frames; + } + } +} + static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, @@ -431,9 +437,43 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, long deadline) { const uint8_t *data_start = data; const uint8_t *data_end = data + data_sz; - vpx_codec_err_t res; + vpx_codec_err_t res = 0; + uint32_t sizes[8]; + int frames_this_pts, frame_count = 0; + + parse_superframe_index(data, data_sz, sizes, &frames_this_pts); do { + // Skip over the superframe index, if present + if 
(data_sz && (*data_start & 0xe0) == 0xc0) { + const uint8_t marker = *data_start; + const int frames = (marker & 0x7) + 1; + const int mag = ((marker >> 3) & 3) + 1; + const int index_sz = 2 + mag * frames; + + if (data_sz >= index_sz && data_start[index_sz - 1] == marker) { + data_start += index_sz; + data_sz -= index_sz; + if (data_start < data_end) + continue; + else + break; + } + } + + // Use the correct size for this frame, if an index is present. + if (frames_this_pts) { + uint32_t this_sz = sizes[frame_count]; + + if (data_sz < this_sz) { + ctx->base.err_detail = "Invalid frame size in index"; + return VPX_CODEC_CORRUPT_FRAME; + } + + data_sz = this_sz; + frame_count++; + } + res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); assert(data_start >= data); assert(data_start <= data_end); @@ -545,6 +585,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = yv12->y_width / 2; @@ -580,9 +622,9 @@ static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -592,13 +634,29 @@ static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx, image2yuvconfig(&frame->img, &sd); - return vp9_get_reference_dec(ctx->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, + (VP9_REFFRAME)frame->frame_type, &sd); } else return VPX_CODEC_INVALID_PARAM; } +static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + + if (data) { + YV12_BUFFER_CONFIG* fb; + + vp9_get_reference_dec(ctx->pbi, data->idx, &fb); + yuvconfig2image(&data->img, fb, NULL); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { @@ -645,9 +703,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; if (update_info) { - *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME - + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME - + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME; + *update_info = pbi->refresh_frame_flags; return VPX_CODEC_OK; } else @@ -671,9 +727,16 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } +static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t ctf_maps[] = { {VP8_SET_REFERENCE, vp9_set_reference}, - {VP8_COPY_REFERENCE, vp9_get_reference}, + {VP8_COPY_REFERENCE, vp9_copy_reference}, {VP8_SET_POSTPROC, vp8_set_postproc}, {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options}, {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options}, @@ -681,6 +744,8 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options}, {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, + {VP9_GET_REFERENCE, 
get_reference}, + {VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order}, { -1, NULL}, }; diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h new file mode 100644 index 0000000000000000000000000000000000000000..450be7dfda09f655eaa10598d68a28e11caecb31 --- /dev/null +++ b/vp9/vp9_iface_common.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VP9_VP9_IFACE_COMMON_H_ +#define VP9_VP9_IFACE_COMMON_H_ + +static void yuvconfig2image(vpx_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + img->fmt = yv12->clrtype == REG_YUV ? + VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; + img->w = yv12->y_stride; + img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15; + img->d_w = yv12->y_width; + img->d_h = yv12->y_height; + img->x_chroma_shift = 1; + img->y_chroma_shift = 1; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bps = 12; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +#endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 12d1ec4e7c78eac31571110858b2537d30ca6bfb..43dba1373d060d87cac7697df8cf6077bca8d022 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -65,7 +65,6 @@ VP9_CX_SRCS-yes += encoder/vp9_quantize.c VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_sad_c.c -VP9_CX_SRCS-yes += encoder/vp9_satd_c.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c @@ -95,21 +94,28 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm +#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm +#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += 
encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm +#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_intrinsics.c +ifeq ($(HAVE_SSE2),yes) +vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.d: CFLAGS += -msse2 +vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.o: CFLAGS += -msse2 +endif + VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 7622fc0b23ca49a386233d67142b32a7d08be0bd..239ae30b693f179d1f8d50a6ccce7dd53087fe49 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -38,5 +38,11 @@ VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c +VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_dequantize_x86.c +ifeq ($(HAVE_SSE2),yes) +vp9/decoder/x86/vp9_dequantize_x86.c.o: CFLAGS += -msse2 +vp9/decoder/x86/vp9_dequantize_x86.c.d: CFLAGS += -msse2 +endif + $(eval $(call asm_offsets_template,\ vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c)) diff --git a/vpx/vp8.h b/vpx/vp8.h index 3c313632b9e7aab35818219ccfc853277fabf35e..0b4cb1b9e6d24b9d51a25e229c29d27273f4f932 100644 --- a/vpx/vp8.h +++ b/vpx/vp8.h @@ -44,6 +44,12 @@ enum vp8_com_control_id { VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ + + /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) + * for its control ids. These should be migrated to something like the + * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI. + */ + VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ VP8_COMMON_CTRL_ID_MAX, VP8_DECODER_CTRL_ID_START = 256 }; @@ -97,6 +103,10 @@ typedef struct vpx_ref_frame { vpx_image_t img; /**< reference frame data in image format */ } vpx_ref_frame_t; +typedef struct vp9_ref_frame { + int idx; /**< frame index to get (input) */ + vpx_image_t img; /**< img structure to populate (output) */ +} vp9_ref_frame_t; /*!\brief vp8 decoder control function parameter type * @@ -110,6 +120,7 @@ VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) +VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) /*! 
@} - end defgroup vp8 */ diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 90b7169d59b73307f9e22c4dcbeac3ec6716939c..7f19dd033ae477b2853e7664dc1502521da50375 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -187,7 +187,10 @@ enum vp8e_enc_control_id { /* TODO(jkoleszar): Move to vp9cx.h */ - VP9E_SET_LOSSLESS + VP9E_SET_LOSSLESS, + VP9E_SET_TILE_COLUMNS, + VP9E_SET_TILE_ROWS, + VP9E_SET_FRAME_PARALLEL_DECODING }; /*!\brief vpx 1-D scaling mode @@ -298,6 +301,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) + VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) @@ -305,6 +311,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) /*! @} - end defgroup vp8_encoder */ #include "vpx_codec_impl_bottom.h" #endif diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index ca3d63c8fc1df86a0beefe5d022a7a15239782e8..201df88fe2b646c391d28cf5e80cdf10f0aa418c 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -69,6 +69,9 @@ enum vp8_dec_control_id { */ VP8_SET_DECRYPT_KEY, + /** For testing. */ + VP9_INVERT_TILE_DECODE_ORDER, + VP8_DECODER_CTRL_ID_MAX }; @@ -85,6 +88,7 @@ VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) VPX_CTRL_USE_TYPE(VP8_SET_DECRYPT_KEY, const unsigned char *) +VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index f04c61c04b696921cb7499532daccdbe521515c5..0ccc96cd12761eeac6dd8b1201e788e666106ad7 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -28,6 +28,8 @@ typedef unsigned int uint32_t; typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #define INT64_MAX _I64_MAX +#define INT16_MAX _I16_MAX +#define INT16_MIN _I16_MIN #endif #ifndef _UINTPTR_T_DEFINED diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h index b130da8a022e51aba79aad173ab196270fbff896..62b86bb1df8ae4db468529080b959222d101fcfb 100644 --- a/vpx_ports/mem.h +++ b/vpx_ports/mem.h @@ -11,6 +11,7 @@ #ifndef VPX_PORTS_MEM_H #define VPX_PORTS_MEM_H + #include "vpx_config.h" #include "vpx/vpx_integer.h" diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index 4cb2a41904c87d41d99f3bc66f843fe2a0436664..fc7f828814f8c9f0c85c6e1aa8b75e7ab8243552 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -35,36 +35,41 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { return 0; } -/**************************************************************************** - * - ****************************************************************************/ -int -vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border) { - /*NOTE:*/ - +int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border) { if (ybf) { - int y_stride = ((width + 2 * border) + 31) & ~31; - int yplane_size = (height + 2 * border) * y_stride; - int uv_width = width >> 1; - int uv_height = height >> 1; + int aligned_width = (width + 15) & ~15; + int aligned_height = (height + 15) & ~15; + int y_stride = ((aligned_width + 2 * border) + 31) & ~31; + int yplane_size = (aligned_height + 2 * 
border) * y_stride; + int uv_width = aligned_width >> 1; + int uv_height = aligned_height >> 1; /** There is currently a bunch of code which assumes * uv_stride == y_stride/2, so enforce this here. */ int uv_stride = y_stride >> 1; int uvplane_size = (uv_height + border) * uv_stride; + const int frame_size = yplane_size + 2 * uvplane_size; - vp8_yv12_de_alloc_frame_buffer(ybf); + if (!ybf->buffer_alloc) { + ybf->buffer_alloc = vpx_memalign(32, frame_size); + ybf->buffer_alloc_sz = frame_size; + } + + if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) + return -1; - /** Only support allocating buffers that have a height and width that - * are multiples of 16, and a border that's a multiple of 32. - * The border restriction is required to get 16-byte alignment of the - * start of the chroma rows without intoducing an arbitrary gap - * between planes, which would break the semantics of things like - * vpx_img_set_rect(). */ - if ((width & 0xf) | (height & 0xf) | (border & 0x1f)) + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; - ybf->y_width = width; - ybf->y_height = height; + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; ybf->y_stride = y_stride; ybf->uv_width = uv_width; @@ -72,21 +77,23 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = yplane_size + 2 * uvplane_size; - - ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size); - - if (ybf->buffer_alloc == NULL) - return -1; + ybf->frame_size = frame_size; ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2; ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * uv_stride) + border / 2; ybf->corrupted = 0; /* assume not currupted by errors */ - } else { - return -2; + return 0; } + return -2; +} - return 0; +int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border) { + if (ybf) { + vp8_yv12_de_alloc_frame_buffer(ybf); + return vp8_yv12_realloc_frame_buffer(ybf, width, height, border); + } + return -2; } diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c index 5a427356b846eaabb9ac3bb4221faa35ec919dc4..a322e0a2c3452f1df5fff41a8d7b7a299acd7d24 100644 --- a/vpx_scale/generic/yv12extend.c +++ b/vpx_scale/generic/yv12extend.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree.
*/ - +#include <assert.h> #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" @@ -20,180 +20,81 @@ /**************************************************************************** * ****************************************************************************/ -void -vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { +static void extend_plane(uint8_t *s, /* source */ + int sp, /* source pitch */ + int w, /* width */ + int h, /* height */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er) { /* extend right border */ int i; - unsigned char *src_ptr1, *src_ptr2; - unsigned char *dest_ptr1, *dest_ptr2; - - unsigned int Border; - int plane_stride; - int plane_height; - int plane_width; - - /***********/ - /* Y Plane */ - /***********/ - Border = ybf->border; - plane_stride = ybf->y_stride; - plane_height = ybf->y_height; - plane_width = ybf->y_width; - - /* copy the left and right most columns out */ - src_ptr1 = ybf->y_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], Border); - vpx_memset(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - /* Now copy the top and bottom source lines into each line of the respective borders */ - src_ptr1 = ybf->y_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)Border; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); - vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - - /***********/ - /* U Plane */ - /***********/ - plane_stride = ybf->uv_stride; - plane_height = ybf->uv_height; - plane_width = ybf->uv_width; - Border /= 2; + uint8_t *src_ptr1, *src_ptr2; + uint8_t *dest_ptr1, *dest_ptr2; + int linesize; /* copy the left and right most columns out */ - src_ptr1 = ybf->u_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], Border); - vpx_memset(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; + src_ptr1 = s; + src_ptr2 = s + w - 1; + dest_ptr1 = s - el; + dest_ptr2 = s + w; + + for (i = 0; i < h; i++) { + vpx_memset(dest_ptr1, src_ptr1[0], el); + vpx_memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += sp; + dest_ptr2 += sp; } - /* Now copy the top and bottom source lines into each line of the respective borders */ - src_ptr1 = ybf->u_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) { - vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); - vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = s - el; + src_ptr2 = s + sp * (h - 1) - el; + dest_ptr1 = s + sp * (-et) - el; + dest_ptr2 = s + sp * (h) - el; + linesize = el + er + 
w; + + for (i = 0; i < et; i++) { + vpx_memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += sp; } - /***********/ - /* V Plane */ - /***********/ - - /* copy the left and right most columns out */ - src_ptr1 = ybf->v_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], Border); - vpx_memset(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - /* Now copy the top and bottom source lines into each line of the respective borders */ - src_ptr1 = ybf->v_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) { - vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); - vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; + for (i = 0; i < eb; i++) { + vpx_memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += sp; } } - -static void -extend_frame_borders_yonly_c(YV12_BUFFER_CONFIG *ybf) { - int i; - unsigned char *src_ptr1, *src_ptr2; - unsigned char *dest_ptr1, *dest_ptr2; - - unsigned int Border; - int plane_stride; - int plane_height; - int plane_width; - - /***********/ - /* Y Plane */ - /***********/ - Border = ybf->border; - plane_stride = ybf->y_stride; - plane_height = ybf->y_height; - plane_width = ybf->y_width; - - /* copy the left and right most columns out */ - src_ptr1 = ybf->y_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], Border); - vpx_memset(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - /* Now copy the top and bottom source lines into each line of the respective borders */ - src_ptr1 = ybf->y_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)Border; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, plane_stride); - vpx_memcpy(dest_ptr2, src_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - plane_stride /= 2; - plane_height /= 2; - plane_width /= 2; - Border /= 2; - +void +vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ybf->border, ybf->border, + ybf->border + ybf->y_height - ybf->y_crop_height, + ybf->border + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + + extend_plane(ybf->v_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + 
ybf->y_width - ybf->y_crop_width + 1) / 2); } - /**************************************************************************** * * ROUTINE : vp8_yv12_copy_frame @@ -216,6 +117,14 @@ vp8_yv12_copy_frame_c(YV12_BUFFER_CONFIG *src_ybc, int row; unsigned char *source, *dest; +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + source = src_ybc->y_buffer; dest = dst_ybc->y_buffer; diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index 23be8f3dd78a8d6f017a82bdaa2ee9121bcf74be..14b6e278b1a052b3714684d736689aa0ff688520 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -42,6 +42,8 @@ extern "C" { typedef struct yv12_buffer_config { int y_width; int y_height; + int y_crop_width; + int y_crop_height; int y_stride; /* int yinternal_width; */ @@ -55,6 +57,7 @@ extern "C" { uint8_t *v_buffer; uint8_t *buffer_alloc; + int buffer_alloc_sz; int border; int frame_size; YUV_TYPE clrtype; @@ -65,6 +68,8 @@ extern "C" { int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border); + int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border); int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); #ifdef __cplusplus diff --git a/vpxenc.c b/vpxenc.c index eacfe5bdc16d26afbfc611768299eb224a030507..3295fd9a458ca42f47fccc930d7b8b5736706bcf 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -1028,7 +1028,8 @@ static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1, static const arg_def_t *global_args[] = { &use_yv12, &use_i420, &usage, &threads, &profile, - &width, &height, &stereo_mode, &timebase, &framerate, &error_resilient, + &width, &height, &stereo_mode, &timebase, &framerate, + &error_resilient, &lag_in_frames, NULL }; @@ -1103,7 +1104,11 @@ static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)"); static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, - "Number of token partitions to use, log2"); + "Number of token partitions to use, log2"); +static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1, + "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1, + "Number of tile rows to use, log2"); static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, @@ -1123,8 +1128,10 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, "Constrained Quality Level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); -#if CONFIG_LOSSLESS static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode"); +#if CONFIG_VP9_ENCODER +static const arg_def_t frame_parallel_decoding = ARG_DEF( + NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); #endif #if CONFIG_VP8_ENCODER @@ -1147,22 +1154,18 @@ static const int vp8_arg_ctrl_map[] = { #if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, - &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type, - &tune_ssim, &cq_level, &max_intra_rate_pct, -#if CONFIG_LOSSLESS - &lossless, -#endif + &tile_cols, &tile_rows, 
&arnr_maxframes, &arnr_strength, &arnr_type, + &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, + &frame_parallel_decoding, NULL }; static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, - VP8E_SET_TOKEN_PARTITIONS, + VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, -#if CONFIG_LOSSLESS - VP9E_SET_LOSSLESS, -#endif + VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, 0 }; #endif @@ -1479,14 +1482,16 @@ static void show_rate_histogram(struct rate_hist *hist, #define mmin(a, b) ((a) < (b) ? (a) : (b)) static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2, int yloc[2], int uloc[2], int vloc[2]) { - int match = 1; - int i, j; - yloc[0] = yloc[1] = -1; - for (i = 0, match = 1; match && i < img1->d_h; i+=32) { - for (j = 0; match && j < img1->d_w; j+=32) { + const unsigned int bsize = 64; + const unsigned int bsize2 = bsize >> 1; + unsigned int match = 1; + unsigned int i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { int k, l; - int si = mmin(i + 32, img1->d_h) - i; - int sj = mmin(j + 32, img1->d_w) - j; + int si = mmin(i + bsize, img1->d_h) - i; + int sj = mmin(j + bsize, img1->d_w) - j; for (k = 0; match && k < si; k++) for (l = 0; match && l < sj; l++) { if (*(img1->planes[VPX_PLANE_Y] + @@ -1495,18 +1500,22 @@ static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2, (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { yloc[0] = i + k; yloc[1] = j + l; + yloc[2] = *(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l); + yloc[3] = *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l); match = 0; break; } } } } - uloc[0] = uloc[1] = -1; - for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i+=16) { - for (j = 0; j < match && (img1->d_w + 1) / 2; j+=16) { + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) { + for (j = 0; match && j < (img1->d_w + 1) / 2; j += bsize2) { int k, l; - int si = mmin(i + 16, (img1->d_h + 1) / 2) - i; - int sj = mmin(j + 16, (img1->d_w + 1) / 2) - j; + int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i; + int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j; for (k = 0; match && k < si; k++) for (l = 0; match && l < sj; l++) { if (*(img1->planes[VPX_PLANE_U] + @@ -1515,18 +1524,22 @@ static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2, (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { uloc[0] = i + k; uloc[1] = j + l; + uloc[2] = *(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l); + uloc[3] = *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l); match = 0; break; } } } } - vloc[0] = vloc[1] = -1; - for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i+=16) { - for (j = 0; j < match && (img1->d_w + 1) / 2; j+=16) { + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) { + for (j = 0; match && j < (img1->d_w + 1) / 2; j += bsize2) { int k, l; - int si = mmin(i + 16, (img1->d_h + 1) / 2) - i; - int sj = mmin(j + 16, (img1->d_w + 1) / 2) - j; + int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i; + int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j; for (k = 0; match
&& k < si; k++) for (l = 0; match && l < sj; l++) { if (*(img1->planes[VPX_PLANE_V] + @@ -1535,6 +1548,10 @@ static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2, (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { vloc[0] = i + k; vloc[1] = j + l; + vloc[2] = *(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l); + vloc[3] = *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l); match = 0; break; } @@ -1546,7 +1563,7 @@ static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2, static int compare_img(vpx_image_t *img1, vpx_image_t *img2) { int match = 1; - int i; + unsigned int i; match &= (img1->fmt == img2->fmt); match &= (img1->w == img2->w); @@ -1638,8 +1655,6 @@ struct stream_state { stats_io_t stats; struct vpx_image *img; vpx_codec_ctx_t decoder; - vpx_ref_frame_t ref_enc; - vpx_ref_frame_t ref_dec; int mismatch_seen; }; @@ -2221,16 +2236,7 @@ static void initialize_encoder(struct stream_state *stream, #if CONFIG_DECODERS if (global->test_decode != TEST_DECODE_OFF) { - int width, height; - vpx_codec_dec_init(&stream->decoder, global->codec->dx_iface(), NULL, 0); - - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&stream->ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); - vpx_img_alloc(&stream->ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); - stream->ref_enc.frame_type = VP8_LAST_FRAME; - stream->ref_dec.frame_type = VP8_LAST_FRAME; } #endif } @@ -2311,6 +2317,8 @@ static void get_cx_data(struct stream_state *stream, if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { stream->frames_out++; } + if (!global->quiet) + fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz); update_rate_histogram(&stream->rate_hist, cfg, pkt); if (stream->config.write_webm) { @@ -2373,6 +2381,8 @@ static void get_cx_data(struct stream_state *stream, stream->psnr_sse_total += pkt->data.psnr.sse[0]; stream->psnr_samples_total += pkt->data.psnr.samples[0]; for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); stream->psnr_totals[i] += pkt->data.psnr.psnr[i]; } stream->psnr_count++; @@ -2411,26 +2421,59 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) { static void test_decode(struct stream_state *stream, - enum TestDecodeFatality fatal) { + enum TestDecodeFatality fatal, + const struct codec_item *codec) { + vpx_image_t enc_img, dec_img; + if (stream->mismatch_seen) return; - vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &stream->ref_enc); + /* Get the internal reference frame */ + if (codec->fourcc == VP8_FOURCC) { + struct vpx_ref_frame ref_enc, ref_dec; + int width, height; + + width = (stream->config.cfg.g_w + 15) & ~15; + height = (stream->config.cfg.g_h + 15) & ~15; + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + enc_img = ref_enc.img; + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + dec_img = ref_dec.img; + + ref_enc.frame_type = VP8_LAST_FRAME; + ref_dec.frame_type = VP8_LAST_FRAME; + vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc); + vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec); + } else { + struct vp9_ref_frame ref; + + ref.idx = 0; + vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref); + enc_img = ref.img; + vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref); + dec_img = ref.img; + } ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); - vpx_codec_control(&stream->decoder, 
VP8_COPY_REFERENCE, &stream->ref_dec); ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); - if (!compare_img(&stream->ref_enc.img, &stream->ref_dec.img)) { - int y[2], u[2], v[2]; - find_mismatch(&stream->ref_enc.img, &stream->ref_dec.img, - y, u, v); + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + find_mismatch(&enc_img, &dec_img, y, u, v); + stream->decoder.err = 1; warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, - "Stream %d: Encode/decode mismatch on frame %d" - " at Y[%d, %d], U[%d, %d], V[%d, %d]", + "Stream %d: Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", stream->index, stream->frames_out, - y[0], y[1], u[0], u[1], v[0], v[1]); + y[0], y[1], y[2], y[3], + u[0], u[1], u[2], u[3], + v[0], v[1], v[2], v[3]); stream->mismatch_seen = stream->frames_out; } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); } @@ -2544,7 +2587,6 @@ int main(int argc, const char **argv_) { " and --passes=2\n", stream->index, global.pass); }); - /* Use the frame rate from the file only if none was specified * on the command-line. */ @@ -2656,7 +2698,7 @@ int main(int argc, const char **argv_) { } if (got_data && global.test_decode != TEST_DECODE_OFF) - FOREACH_STREAM(test_decode(stream, global.test_decode)); + FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec)); } fflush(stdout); @@ -2688,8 +2730,6 @@ int main(int argc, const char **argv_) { if (global.test_decode != TEST_DECODE_OFF) { FOREACH_STREAM(vpx_codec_destroy(&stream->decoder)); - FOREACH_STREAM(vpx_img_free(&stream->ref_enc.img)); - FOREACH_STREAM(vpx_img_free(&stream->ref_dec.img)); } close_input_file(&input);
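
Note on the superframe index parsed by parse_superframe_index() above: the index trails the last frame in the buffer and is bracketed by an identical marker byte at both ends, which is what lets both the parser and the skip logic in vp9_decode() validate it from the tail of the packet alone. The sketch below writes that layout; it is illustrative only, not part of the patch, and the function name is made up.

#include <stddef.h>
#include <stdint.h>

/* Index layout assumed by the parser above:
 *   [frame 0] ... [frame n-1] [marker] [n sizes, mag bytes each, LE] [marker]
 * marker: top three bits are 110, bits 3-4 hold mag-1 (1..4 bytes per size),
 * bits 0-2 hold frames-1 (1..8 frames), so index_sz == 2 + mag * frames. */
static size_t write_superframe_index(uint8_t *dst, const uint32_t sizes[],
                                     int frames, int mag) {
  const uint8_t marker = 0xc0 | ((mag - 1) << 3) | (frames - 1);
  uint8_t *x = dst;
  int i, j;

  *x++ = marker;
  for (i = 0; i < frames; i++)
    for (j = 0; j < mag; j++)
      *x++ = (sizes[i] >> (j * 8)) & 0xff;  /* little-endian size field */
  *x++ = marker;  /* repeated marker is what vp9_decode() checks when skipping */
  return (size_t)(x - dst);                 /* == 2 + mag * frames */
}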
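
The new vp8_yv12_realloc_frame_buffer() pads the requested dimensions up to multiples of 16 and records the caller's original size in the new y_crop_width/y_crop_height fields, which is what lets vp8_yv12_extend_frame_borders_c() fold the padding rows and columns into the border extension. A small worked example of that arithmetic, with a 32-pixel border chosen purely for illustration:

#include <stdio.h>

/* Mirrors the alignment math in vp8_yv12_realloc_frame_buffer() for a
 * 1920x1080 frame: planes are padded to 1920x1088 while the crop fields
 * keep the display size. */
int main(void) {
  const int width = 1920, height = 1080, border = 32;
  const int aligned_width = (width + 15) & ~15;                    /* 1920 */
  const int aligned_height = (height + 15) & ~15;                  /* 1088 */
  const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;  /* 1984 */

  printf("crop %dx%d, aligned %dx%d, y_stride %d\n",
         width, height, aligned_width, aligned_height, y_stride);
  return 0;
}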
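
Usage sketch for the new VP9_GET_REFERENCE control, mirroring the VP9 branch of test_decode() above. The helper name is hypothetical and error handling is elided; the returned image wraps the decoder's internal buffer via yuvconfig2image() (img_data_owner is 0), so vpx_img_free() releases only the wrapper, never the frame data itself.

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

/* Borrow a zero-copy view of decoder reference buffer `idx`. */
static vpx_image_t get_vp9_ref(vpx_codec_ctx_t *decoder, int idx) {
  vp9_ref_frame_t ref;
  ref.idx = idx;   /* input: which reference buffer to wrap */
  vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref);
  return ref.img;  /* output: aliases the decoder's internal storage */
}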