diff --git a/test/convolve_test.cc b/test/convolve_test.cc index d1e7f1d06ff79c5b735856af69a8286517ada68a..99bdf82ab262c11ad0b499c93001bdd227755efa 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -21,6 +21,9 @@ #include "vpx_ports/mem.h" namespace { + +static const int kMaxDimension = 64; + typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, @@ -30,9 +33,10 @@ typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, struct ConvolveFunctions { ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg, ConvolveFunc v8, ConvolveFunc v8_avg, - ConvolveFunc hv8, ConvolveFunc hv8_avg) + ConvolveFunc hv8, ConvolveFunc hv8_avg, + int bd) : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg), - hv8_avg_(hv8_avg) {} + hv8_avg_(hv8_avg), use_high_bd_(bd) {} ConvolveFunc h8_; ConvolveFunc v8_; @@ -40,6 +44,7 @@ struct ConvolveFunctions { ConvolveFunc h8_avg_; ConvolveFunc v8_avg_; ConvolveFunc hv8_avg_; + int use_high_bd_; // 0 if high bitdepth not used, else the actual bit depth. }; typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam; @@ -66,6 +71,119 @@ void filter_block2d_8_c(const uint8_t *src_ptr, // This buffer is allocated to be big enough for the largest block type we // support. const int kInterp_Extend = 4; + const unsigned int intermediate_height = + (kInterp_Extend - 1) + output_height + kInterp_Extend; + unsigned int i, j; + + // Size of intermediate_buffer is max_intermediate_height * filter_max_width, + // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height + // + kInterp_Extend + // = 3 + 64 + 4 + // = 71 + // and filter_max_width = 64 + // + uint8_t intermediate_buffer[71 * kMaxDimension]; + const int intermediate_next_stride = 1 - intermediate_height * output_width; + + // Horizontal pass (src -> transposed intermediate). 
+ uint8_t *output_ptr = intermediate_buffer; + const int src_next_row_stride = src_stride - output_width; + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * HFilter[0]) + + (src_ptr[1] * HFilter[1]) + + (src_ptr[2] * HFilter[2]) + + (src_ptr[3] * HFilter[3]) + + (src_ptr[4] * HFilter[4]) + + (src_ptr[5] * HFilter[5]) + + (src_ptr[6] * HFilter[6]) + + (src_ptr[7] * HFilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); + ++src_ptr; + output_ptr += intermediate_height; + } + src_ptr += src_next_row_stride; + output_ptr += intermediate_next_stride; + } + + // Vertical pass (transposed intermediate -> dst). + src_ptr = intermediate_buffer; + const int dst_next_row_stride = dst_stride - output_width; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + // Apply filter... + const int temp = (src_ptr[0] * VFilter[0]) + + (src_ptr[1] * VFilter[1]) + + (src_ptr[2] * VFilter[2]) + + (src_ptr[3] * VFilter[3]) + + (src_ptr[4] * VFilter[4]) + + (src_ptr[5] * VFilter[5]) + + (src_ptr[6] * VFilter[6]) + + (src_ptr[7] * VFilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... 
+ *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); + src_ptr += intermediate_height; + } + src_ptr += intermediate_next_stride; + dst_ptr += dst_next_row_stride; + } +} + +void block2d_average_c(uint8_t *src, + unsigned int src_stride, + uint8_t *output_ptr, + unsigned int output_stride, + unsigned int output_width, + unsigned int output_height) { + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; + } + output_ptr += output_stride; + } +} + +void filter_average_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint8_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height) { + uint8_t tmp[kMaxDimension * kMaxDimension]; + + assert(output_width <= kMaxDimension); + assert(output_height <= kMaxDimension); + filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + output_width, output_height); + block2d_average_c(tmp, 64, dst_ptr, dst_stride, + output_width, output_height); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void high_filter_block2d_8_c(const uint16_t *src_ptr, + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bd) { + // Between passes, we use an intermediate buffer whose height is extended to + // have enough horizontally filtered values as input for the vertical pass. + // This buffer is allocated to be big enough for the largest block type we + // support. 
+ const int kInterp_Extend = 4; const unsigned int intermediate_height = (kInterp_Extend - 1) + output_height + kInterp_Extend; @@ -76,12 +194,12 @@ void filter_block2d_8_c(const uint8_t *src_ptr, * = 23 * and filter_max_width = 16 */ - uint8_t intermediate_buffer[71 * 64]; + uint16_t intermediate_buffer[71 * kMaxDimension]; const int intermediate_next_stride = 1 - intermediate_height * output_width; // Horizontal pass (src -> transposed intermediate). { - uint8_t *output_ptr = intermediate_buffer; + uint16_t *output_ptr = intermediate_buffer; const int src_next_row_stride = src_stride - output_width; unsigned int i, j; src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); @@ -99,7 +217,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... - *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); + *output_ptr = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd); ++src_ptr; output_ptr += intermediate_height; } @@ -110,7 +228,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, // Vertical pass (transposed intermediate -> dst). { - uint8_t *src_ptr = intermediate_buffer; + uint16_t *src_ptr = intermediate_buffer; const int dst_next_row_stride = dst_stride - output_width; unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -127,7 +245,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... 
- *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); + *dst_ptr++ = clip_pixel_high(temp >> VP9_FILTER_SHIFT, bd); src_ptr += intermediate_height; } src_ptr += intermediate_next_stride; @@ -136,12 +254,13 @@ void filter_block2d_8_c(const uint8_t *src_ptr, } } -void block2d_average_c(uint8_t *src, - unsigned int src_stride, - uint8_t *output_ptr, - unsigned int output_stride, - unsigned int output_width, - unsigned int output_height) { +void high_block2d_average_c(uint16_t *src, + unsigned int src_stride, + uint16_t *output_ptr, + unsigned int output_stride, + unsigned int output_width, + unsigned int output_height, + int bd) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { @@ -151,23 +270,25 @@ void block2d_average_c(uint8_t *src, } } -void filter_average_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - uint8_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height) { - uint8_t tmp[64 * 64]; - - assert(output_width <= 64); - assert(output_height <= 64); - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, - output_width, output_height); - block2d_average_c(tmp, 64, dst_ptr, dst_stride, - output_width, output_height); +void high_filter_average_block2d_8_c(const uint16_t *src_ptr, + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bd) { + uint16_t tmp[kMaxDimension * kMaxDimension]; + + assert(output_width <= kMaxDimension); + assert(output_height <= kMaxDimension); + high_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + output_width, output_height, bd); + high_block2d_average_c(tmp, 64, dst_ptr, dst_stride, + output_width, output_height, bd); } +#endif // CONFIG_VP9_HIGHBITDEPTH class ConvolveTest : public 
::testing::TestWithParam<ConvolveParam> { public: @@ -177,6 +298,13 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1; output_ = reinterpret_cast<uint8_t*>( vpx_memalign(kDataAlignment, kOutputBufferSize)); +#if CONFIG_VP9_HIGHBITDEPTH + input16_ = reinterpret_cast<uint16_t*>( + vpx_memalign(kDataAlignment, + (kInputBufferSize + 1) * sizeof(uint16_t))) + 1; + output16_ = reinterpret_cast<uint16_t*>( + vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t))); +#endif } static void TearDownTestCase() { @@ -184,6 +312,12 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { input_ = NULL; vpx_free(output_); output_ = NULL; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_free(input16_ - 1); + input16_ = NULL; + vpx_free(output16_); + output16_ = NULL; +#endif } protected: @@ -191,7 +325,6 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { static const int kOuterBlockSize = 256; static const int kInputStride = kOuterBlockSize; static const int kOutputStride = kOuterBlockSize; - static const int kMaxDimension = 64; static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize; static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize; @@ -212,6 +345,12 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { virtual void SetUp() { UUT_ = GET_PARAM(2); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ != 0) + mask_ = (1 << UUT_->use_high_bd_) - 1; + else + mask_ = 255; +#endif /* Set up guard blocks for an inner block centered in the outer block */ for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) @@ -222,15 +361,25 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { ::libvpx_test::ACMRandom prng; for (int i = 0; i < kInputBufferSize; ++i) { - if (i & 1) + if (i & 1) { input_[i] = 255; - else +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = mask_; +#endif + } else { input_[i] = 
prng.Rand8Extremes(); +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = prng.Rand16() & mask_; +#endif + } } } void SetConstantInput(int value) { memset(input_, value, kInputBufferSize); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_memset16(input16_, value, kInputBufferSize); +#endif } void CheckGuardBlocks() { @@ -240,20 +389,123 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { } } - uint8_t* input() const { + uint8_t *input() const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + return input_ + BorderTop() * kOuterBlockSize + BorderLeft(); + } else { + return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize + + BorderLeft()); + } +#else return input_ + BorderTop() * kOuterBlockSize + BorderLeft(); +#endif } - uint8_t* output() const { + uint8_t *output() const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + return output_ + BorderTop() * kOuterBlockSize + BorderLeft(); + } else { + return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize + + BorderLeft()); + } +#else return output_ + BorderTop() * kOuterBlockSize + BorderLeft(); +#endif + } + + uint16_t lookup(uint8_t *list, int index) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + return list[index]; + } else { + return CONVERT_TO_SHORTPTR(list)[index]; + } +#else + return list[index]; +#endif + } + + void assign_val(uint8_t *list, int index, uint16_t val) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + list[index] = (uint8_t) val; + } else { + CONVERT_TO_SHORTPTR(list)[index] = val; + } +#else + list[index] = (uint8_t) val; +#endif + } + + void wrapper_filter_average_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint8_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + filter_average_block2d_8_c(src_ptr, 
src_stride, HFilter, VFilter, + dst_ptr, dst_stride, output_width, + output_height); + } else { + high_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + HFilter, VFilter, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, + UUT_->use_high_bd_); + } +#else + filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, + dst_ptr, dst_stride, output_width, + output_height); +#endif + } + + void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint8_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0) { + filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, + dst_ptr, dst_stride, output_width, output_height); + } else { + high_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + HFilter, VFilter, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, UUT_->use_high_bd_); + } +#else + filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, + dst_ptr, dst_stride, output_width, output_height); +#endif } const ConvolveFunctions* UUT_; static uint8_t* input_; static uint8_t* output_; +#if CONFIG_VP9_HIGHBITDEPTH + static uint16_t* input16_; + static uint16_t* output16_; + int mask_; +#endif }; + uint8_t* ConvolveTest::input_ = NULL; uint8_t* ConvolveTest::output_ = NULL; +#if CONFIG_VP9_HIGHBITDEPTH +uint16_t* ConvolveTest::input16_ = NULL; +uint16_t* ConvolveTest::output16_ = NULL; +#endif TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); @@ -272,7 +524,8 @@ TEST_P(ConvolveTest, CopyHoriz) { for (int y = 0; y < Height(); ++y) for (int x = 0; x < Width(); ++x) - ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x]) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) << "(" << x << "," << y << ")"; } @@ -289,7 +542,8 @@ TEST_P(ConvolveTest, CopyVert) 
{ for (int y = 0; y < Height(); ++y) for (int x = 0; x < Width(); ++x) - ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x]) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) << "(" << x << "," << y << ")"; } @@ -306,7 +560,8 @@ TEST_P(ConvolveTest, Copy2D) { for (int y = 0; y < Height(); ++y) for (int x = 0; x < Width(); ++x) - ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x]) + ASSERT_EQ(lookup(out, y * kOutputStride + x), + lookup(in, y * kInputStride + x)) << "(" << x << "," << y << ")"; } @@ -339,8 +594,18 @@ const int16_t kInvalidFilter[8] = { 0 }; TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { uint8_t* const in = input(); uint8_t* const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t* ref; + if (UUT_->use_high_bd_ == 0) { + ref = ref8; + } else { + ref = CONVERT_TO_BYTEPTR(ref16); + } +#else uint8_t ref[kOutputStride * kMaxDimension]; - +#endif for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { const InterpKernel *filters = @@ -350,10 +615,10 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - filter_block2d_8_c(in, kInputStride, - filters[filter_x], filters[filter_y], - ref, kOutputStride, - Width(), Height()); + wrapper_filter_block2d_8_c(in, kInputStride, + filters[filter_x], filters[filter_y], + ref, kOutputStride, + Width(), Height()); if (filters == eighttap_smooth || (filter_x && filter_y)) ASM_REGISTER_STATE_CHECK( @@ -375,7 +640,8 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { for (int y = 0; y < Height(); ++y) for (int x = 0; x < Width(); ++x) - ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x]) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) << "mismatch at (" << x 
<< "," << y << "), " << "filters (" << filter_bank << "," << filter_x << "," << filter_y << ")"; @@ -387,16 +653,36 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { uint8_t* const in = input(); uint8_t* const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t* ref; + if (UUT_->use_high_bd_ == 0) { + ref = ref8; + } else { + ref = CONVERT_TO_BYTEPTR(ref16); + } +#else uint8_t ref[kOutputStride * kMaxDimension]; +#endif // Populate ref and out with some random data ::libvpx_test::ACMRandom prng; for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) { - const uint8_t r = prng.Rand8Extremes(); + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0 || UUT_->use_high_bd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif - out[y * kOutputStride + x] = r; - ref[y * kOutputStride + x] = r; + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); } } @@ -408,10 +694,10 @@ TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - filter_average_block2d_8_c(in, kInputStride, - filters[filter_x], filters[filter_y], - ref, kOutputStride, - Width(), Height()); + wrapper_filter_average_block2d_8_c(in, kInputStride, + filters[filter_x], filters[filter_y], + ref, kOutputStride, + Width(), Height()); if (filters == eighttap_smooth || (filter_x && filter_y)) ASM_REGISTER_STATE_CHECK( @@ -433,7 +719,8 @@ TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { for (int y = 0; y < Height(); ++y) for (int x = 0; x < Width(); ++x) - ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x]) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + 
lookup(out, y * kOutputStride + x)) << "mismatch at (" << x << "," << y << "), " << "filters (" << filter_bank << "," << filter_x << "," << filter_y << ")"; @@ -442,6 +729,103 @@ TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { } } +TEST_P(ConvolveTest, FilterExtremes) { + uint8_t *const in = input(); + uint8_t *const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_high_bd_ == 0) { + ref = ref8; + } else { + ref = CONVERT_TO_BYTEPTR(ref16); + } +#else + uint8_t ref[kOutputStride * kMaxDimension]; +#endif + + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_high_bd_ == 0 || UUT_->use_high_bd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } + } + + for (int axis = 0; axis < 2; axis++) { + int seed_val = 0; + while (seed_val < 256) { + for (int y = 0; y < 8; ++y) { + for (int x = 0; x < 8; ++x) { +#if CONFIG_VP9_HIGHBITDEPTH + assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1, + ((seed_val >> (axis ? y : x)) & 1) * mask_); +#else + assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1, + ((seed_val >> (axis ? 
y : x)) & 1) * 255); +#endif + if (axis) seed_val++; + } + if (axis) + seed_val-= 8; + else + seed_val++; + } + if (axis) seed_val += 8; + + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank)); + const InterpKernel *const eighttap_smooth = + vp9_get_interp_kernel(EIGHTTAP_SMOOTH); + for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { + for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { + wrapper_filter_block2d_8_c(in, kInputStride, + filters[filter_x], filters[filter_y], + ref, kOutputStride, + Width(), Height()); + if (filters == eighttap_smooth || (filter_x && filter_y)) + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_(in, kInputStride, out, kOutputStride, + filters[filter_x], 16, filters[filter_y], 16, + Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v8_(in, kInputStride, out, kOutputStride, + kInvalidFilter, 16, filters[filter_y], 16, + Width(), Height())); + else + ASM_REGISTER_STATE_CHECK( + UUT_->h8_(in, kInputStride, out, kOutputStride, + filters[filter_x], 16, kInvalidFilter, 16, + Width(), Height())); + + for (int y = 0; y < Height(); ++y) + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" << filter_bank << "," + << filter_x << "," << filter_y << ")"; + } + } + } + } + } +} + DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = { { 0, 0, 0, 0, 0, 0, 0, 128}, { 0, 0, 0, 0, 0, 0, 128}, @@ -505,7 +889,8 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { kPixelSelected + ((kInitialSubPelOffset + kFilterPeriodAdjust * kInputPixelStep) >> SUBPEL_BITS); - ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width(); + ASSERT_EQ(lookup(in, ref_x), lookup(out, x)) + << "x == " << x << "width = " << Width(); } /* Test the vertical filter. 
*/ @@ -520,7 +905,8 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { kPixelSelected + ((kInitialSubPelOffset + kFilterPeriodAdjust * kInputPixelStep) >> SUBPEL_BITS); - ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y; + ASSERT_EQ(lookup(in, ref_y * kInputStride), lookup(out, y * kInputStride)) + << "y == " << y; } /* Test the horizontal and vertical filters in combination. */ @@ -543,7 +929,8 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { + kFilterPeriodAdjustX * kInputPixelStep) >> SUBPEL_BITS); - ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x]) + ASSERT_EQ(lookup(in, ref_y * kInputStride + ref_x), + lookup(out, y * kOutputStride + x)) << "x == " << x << ", y == " << y; } } @@ -570,7 +957,8 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x]) + ASSERT_EQ(lookup(in, y * kInputStride + x), + lookup(out, y * kOutputStride + x)) << "x == " << x << ", y == " << y << ", frac == " << frac << ", step == " << step; } @@ -582,11 +970,479 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE2 && ARCH_X86_64 +void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 8); +} + +void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); 
+} + +void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 8); +} + +void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int 
filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 10); +} + +void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, 
filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 12); +} + +void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 12); +} + +void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 12); +} +#endif // HAVE_SSE2 && ARCH_X86_64 + +void 
wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 8); +} + +void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + 
int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 8); +} + +void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 10); +} + +void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + 
+void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, w, h, 10); +} + +void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + 
const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, int h) { + vp9_high_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + filter_x_stride, filter_y, filter_y_stride, + w, h, 12); +} + +const ConvolveFunctions convolve8_c( + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, + wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve8_c), + make_tuple(8, 4, &convolve8_c), + make_tuple(4, 8, &convolve8_c), + make_tuple(8, 8, &convolve8_c), + make_tuple(16, 8, &convolve8_c), + make_tuple(8, 16, &convolve8_c), + make_tuple(16, 16, &convolve8_c), + make_tuple(32, 16, &convolve8_c), + make_tuple(16, 32, &convolve8_c), + make_tuple(32, 32, &convolve8_c), + make_tuple(64, 32, &convolve8_c), + make_tuple(32, 64, &convolve8_c), + make_tuple(64, 64, &convolve8_c))); +const ConvolveFunctions convolve10_c( + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, + wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10); +INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve10_c), + make_tuple(8, 4, &convolve10_c), + make_tuple(4, 8, &convolve10_c), + make_tuple(8, 8, &convolve10_c), + make_tuple(16, 8, &convolve10_c), + make_tuple(8, 16, &convolve10_c), + make_tuple(16, 16, &convolve10_c), + make_tuple(32, 16, &convolve10_c), + make_tuple(16, 32, &convolve10_c), + make_tuple(32, 32, &convolve10_c), + make_tuple(64, 32, &convolve10_c), + make_tuple(32, 64, &convolve10_c), + 
make_tuple(64, 64, &convolve10_c))); +const ConvolveFunctions convolve12_c( + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, + wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12); +INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve12_c), + make_tuple(8, 4, &convolve12_c), + make_tuple(4, 8, &convolve12_c), + make_tuple(8, 8, &convolve12_c), + make_tuple(16, 8, &convolve12_c), + make_tuple(8, 16, &convolve12_c), + make_tuple(16, 16, &convolve12_c), + make_tuple(32, 16, &convolve12_c), + make_tuple(16, 32, &convolve12_c), + make_tuple(32, 32, &convolve12_c), + make_tuple(64, 32, &convolve12_c), + make_tuple(32, 64, &convolve12_c), + make_tuple(64, 64, &convolve12_c))); + #else + const ConvolveFunctions convolve8_c( vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c, vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c, - vp9_convolve8_c, vp9_convolve8_avg_c); + vp9_convolve8_c, vp9_convolve8_avg_c, 0); INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_c), @@ -606,11 +1462,65 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( #if HAVE_SSE2 && ARCH_X86_64 #if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sse2( + wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8, + wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8, + wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8); +INSTANTIATE_TEST_CASE_P(SSE2_8, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve8_sse2), + make_tuple(8, 4, &convolve8_sse2), + make_tuple(4, 8, &convolve8_sse2), + make_tuple(8, 8, &convolve8_sse2), + make_tuple(16, 8, &convolve8_sse2), + make_tuple(8, 16, &convolve8_sse2), + make_tuple(16, 16, &convolve8_sse2), + make_tuple(32, 16, &convolve8_sse2), + make_tuple(16, 32, &convolve8_sse2), + make_tuple(32, 32, &convolve8_sse2), + make_tuple(64, 32, &convolve8_sse2), + make_tuple(32, 64, &convolve8_sse2), + 
make_tuple(64, 64, &convolve8_sse2))); +const ConvolveFunctions convolve10_sse2( + wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10, + wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10, + wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10); +INSTANTIATE_TEST_CASE_P(SSE2_10, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve10_sse2), + make_tuple(8, 4, &convolve10_sse2), + make_tuple(4, 8, &convolve10_sse2), + make_tuple(8, 8, &convolve10_sse2), + make_tuple(16, 8, &convolve10_sse2), + make_tuple(8, 16, &convolve10_sse2), + make_tuple(16, 16, &convolve10_sse2), + make_tuple(32, 16, &convolve10_sse2), + make_tuple(16, 32, &convolve10_sse2), + make_tuple(32, 32, &convolve10_sse2), + make_tuple(64, 32, &convolve10_sse2), + make_tuple(32, 64, &convolve10_sse2), + make_tuple(64, 64, &convolve10_sse2))); +const ConvolveFunctions convolve12_sse2( + wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12, + wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12, + wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12); +INSTANTIATE_TEST_CASE_P(SSE2_12, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve12_sse2), + make_tuple(8, 4, &convolve12_sse2), + make_tuple(4, 8, &convolve12_sse2), + make_tuple(8, 8, &convolve12_sse2), + make_tuple(16, 8, &convolve12_sse2), + make_tuple(8, 16, &convolve12_sse2), + make_tuple(16, 16, &convolve12_sse2), + make_tuple(32, 16, &convolve12_sse2), + make_tuple(16, 32, &convolve12_sse2), + make_tuple(32, 32, &convolve12_sse2), + make_tuple(64, 32, &convolve12_sse2), + make_tuple(32, 64, &convolve12_sse2), + make_tuple(64, 64, &convolve12_sse2))); #else const ConvolveFunctions convolve8_sse2( vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2, vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2, - vp9_convolve8_sse2, vp9_convolve8_avg_sse2); + vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0); INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( make_tuple(4, 4, 
&convolve8_sse2), @@ -626,14 +1536,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( make_tuple(64, 32, &convolve8_sse2), make_tuple(32, 64, &convolve8_sse2), make_tuple(64, 64, &convolve8_sse2))); -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH #endif #if HAVE_SSSE3 const ConvolveFunctions convolve8_ssse3( vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3, vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3, - vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3); + vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0); INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_ssse3), @@ -655,7 +1565,7 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( const ConvolveFunctions convolve8_avx2( vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3, vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3, - vp9_convolve8_avx2, vp9_convolve8_avg_ssse3); + vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0); INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_avx2), @@ -677,7 +1587,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( const ConvolveFunctions convolve8_neon( vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, - vp9_convolve8_neon, vp9_convolve8_avg_neon); + vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_neon), @@ -699,7 +1609,7 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( const ConvolveFunctions convolve8_dspr2( vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2, vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2, - vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2); + vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0); INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_dspr2), diff --git a/vp9/common/vp9_convolve.c 
b/vp9/common/vp9_convolve.c index d8aaf32c44a12ab7d29598d67d7487e8ed20d2c8..ad70e59bf3ad13880753b5cc002949eab0eea7da 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -282,3 +282,280 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, dst += dst_stride; } } + +#if CONFIG_VP9_HIGHBITDEPTH +static void high_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void high_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO(dst[x] + + clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void high_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, 
ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel_high( + ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void high_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + + clip_pixel_high(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void high_convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. 
+ // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[64 * 135]; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + if (intermediate_height < h) + intermediate_height = h; + + high_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, CONVERT_TO_BYTEPTR(temp), 64, + x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + high_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), + 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, + w, h, bd); +} + + +void vp9_high_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + high_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void vp9_high_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + 
(void)filter_y; + (void)y_step_q4; + + high_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void vp9_high_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + high_convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void vp9_high_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + high_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void vp9_high_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + high_convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h, bd); +} + +void vp9_high_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + // Fixed size intermediate buffer places 
limits on parameters. + DECLARE_ALIGNED_ARRAY(16, uint16_t, temp, 64 * 64); + assert(w <= 64); + assert(h <= 64); + + vp9_high_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); + vp9_high_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, + NULL, 0, NULL, 0, w, h, bd); +} + +void vp9_high_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + vpx_memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} + +void vp9_high_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + } + src += src_stride; + dst += dst_stride; + } +} +#endif diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h index 6bf71fc7943013b0ec4b5107069bc764018c8c90..faf70b12aaa16e8aeb97c106ecfd45b0eb12c459 100644 --- a/vp9/common/vp9_convolve.h +++ b/vp9/common/vp9_convolve.h @@ -23,6 +23,14 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*high_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + 
uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd); +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 86ae64839130d7c8f5bb35cf71c2aa8d84694e28..28723f675750687e3305e05ef6a5bb97aa1f7a46 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -63,6 +63,53 @@ static void build_mc_border(const uint8_t *src, int src_stride, } while (--b_h); } +#if CONFIG_VP9_HIGHBITDEPTH +static void high_build_mc_border(const uint8_t *src8, int src_stride, + uint16_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, + int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + vpx_memset16(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) + vpx_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int subpel_x, @@ -97,6 +144,42 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4); } +#if CONFIG_VP9_HIGHBITDEPTH +static void high_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, + int w, int h, int ref, + const InterpKernel *kernel, + int xs, int ys, int bd) { + sf->high_predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd); +} + +void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV *src_mv, + const struct scale_factors *sf, + int w, int h, int ref, + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y, int bd) { + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? 
src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static INLINE int round_mv_comp_q4(int value) { return (value < 0 ? value - 2 : value + 2) / 4; } @@ -222,8 +305,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + (scaled_mv.col >> SUBPEL_BITS); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, + xd->bd); + } else { + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -393,16 +487,64 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; // Extend the border. 
- build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1, - x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width, +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_build_mc_border(buf_ptr1, + pre_buf->stride, + xd->mc_buf_high, + x1 - x0 + 1, + x0, + y0, + x1 - x0 + 1, + y1 - y0 + 1, + frame_width, + frame_height); + buf_stride = x1 - x0 + 1; + buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) + + y_pad * 3 * buf_stride + x_pad * 3; + } else { + build_mc_border(buf_ptr1, + pre_buf->stride, + xd->mc_buf, + x1 - x0 + 1, + x0, + y0, + x1 - x0 + 1, + y1 - y0 + 1, + frame_width, + frame_height); + buf_stride = x1 - x0 + 1; + buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; + } +#else + build_mc_border(buf_ptr1, + pre_buf->stride, + xd->mc_buf, + x1 - x0 + 1, + x0, + y0, + x1 - x0 + 1, + y1 - y0 + 1, + frame_width, frame_height); buf_stride = x1 - x0 + 1; buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; +#endif // CONFIG_VP9_HIGHBITDEPTH } } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 58c596ee87f0452127e29df6156959f8f0407393..e70cc4c99fd52872a8d65f339eaf9675d9ac4453 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -39,6 +39,17 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, enum mv_precision precision, int x, int y); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int 
dst_stride, + const MV *mv_q3, + const struct scale_factors *sf, + int w, int h, int do_avg, + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y, int bd); +#endif + static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index b75ea64f094564643fc083ef60f81a87a08ca0a5..0f52ae15c351c0d41e4ed47569ac10ae448cf417 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -606,6 +606,33 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; specialize qw/vp9_high_dc_128_predictor_32x32/; + # + # Sub Pixel Filters + # + add_proto qw/void vp9_high_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve_copy/; + + add_proto qw/void vp9_high_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve_avg/; + + add_proto qw/void vp9_high_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8_horiz/, 
"$sse2_x86_64"; + + add_proto qw/void vp9_high_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8_vert/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8_avg/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8_avg_horiz/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vp9_high_convolve8_avg_vert/, "$sse2_x86_64"; + # # dct # diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c index 2f58323aa450418471bb8d53852305141e14fbb8..63e2b5306d5660a0e737b7df7db87e4d79347c6c 100644 --- a/vp9/common/vp9_scale.c +++ b/vp9/common/vp9_scale.c @@ -43,9 +43,16 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { return res; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h, + int use_high) { +#else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { +#endif if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { sf->x_scale_fp = REF_INVALID_SCALE; sf->y_scale_fp = REF_INVALID_SCALE; @@ -111,4 +118,48 @@ void 
vp9_setup_scale_factors_for_frame(struct scale_factors *sf, // 2D subpel motion always gets filtered in both directions sf->predict[1][1][0] = vp9_convolve8; sf->predict[1][1][1] = vp9_convolve8_avg; +#if CONFIG_VP9_HIGHBITDEPTH + if (use_high) { + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { + // No scaling in either direction. + sf->high_predict[0][0][0] = vp9_high_convolve_copy; + sf->high_predict[0][0][1] = vp9_high_convolve_avg; + sf->high_predict[0][1][0] = vp9_high_convolve8_vert; + sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert; + sf->high_predict[1][0][0] = vp9_high_convolve8_horiz; + sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + sf->high_predict[0][0][0] = vp9_high_convolve8_vert; + sf->high_predict[0][0][1] = vp9_high_convolve8_avg_vert; + sf->high_predict[0][1][0] = vp9_high_convolve8_vert; + sf->high_predict[0][1][1] = vp9_high_convolve8_avg_vert; + sf->high_predict[1][0][0] = vp9_high_convolve8; + sf->high_predict[1][0][1] = vp9_high_convolve8_avg; + } + } else { + if (sf->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + sf->high_predict[0][0][0] = vp9_high_convolve8_horiz; + sf->high_predict[0][0][1] = vp9_high_convolve8_avg_horiz; + sf->high_predict[0][1][0] = vp9_high_convolve8; + sf->high_predict[0][1][1] = vp9_high_convolve8_avg; + sf->high_predict[1][0][0] = vp9_high_convolve8_horiz; + sf->high_predict[1][0][1] = vp9_high_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + sf->high_predict[0][0][0] = vp9_high_convolve8; + sf->high_predict[0][0][1] = vp9_high_convolve8_avg; + sf->high_predict[0][1][0] = vp9_high_convolve8; + sf->high_predict[0][1][1] = vp9_high_convolve8_avg; + sf->high_predict[1][0][0] = vp9_high_convolve8; + sf->high_predict[1][0][1] = vp9_high_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions. 
+ sf->high_predict[1][1][0] = vp9_high_convolve8; + sf->high_predict[1][1][1] = vp9_high_convolve8_avg; + } +#endif } diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index ad6f5d702aff533e01aad69b5ab169d7058c06b9..2e923db9c417eb911d0428434b6ea52b68235d18 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -32,13 +32,23 @@ struct scale_factors { int (*scale_value_y)(int val, const struct scale_factors *sf); convolve_fn_t predict[2][2][2]; // horiz, vert, avg +#if CONFIG_VP9_HIGHBITDEPTH + high_convolve_fn_t high_predict[2][2][2]; // horiz, vert, avg +#endif }; MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h, + int use_high); +#else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); +#endif static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { return sf->x_scale_fp != REF_INVALID_SCALE && diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index b6847b92e5470c681b993b6f823e61866d4db0a7..407573aeeba4010a32b8d54626f40c666f6a46d0 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -139,6 +139,153 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void high_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_high_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const 
int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_high_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_high_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_high_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_high_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_high_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_high_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_high_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vp9_high_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || 
filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \ + vp9_high_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7, bd); \ + vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \ + vp9_high_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1, bd); \ + vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } \ + } else { \ + vp9_high_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vp9_filter_block1d16_v8_avx2; filter8_1dfunction vp9_filter_block1d16_h8_avx2; @@ -336,4 +483,75 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_ , sse2); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +high_filter8_1dfunction vp9_high_filter_block1d16_v8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_h8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_v8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_h8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_v8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_h8_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_v8_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_h8_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_v8_avg_sse2; +high_filter8_1dfunction 
vp9_high_filter_block1d8_h8_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_v8_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_h8_avg_sse2; + +high_filter8_1dfunction vp9_high_filter_block1d16_v2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_h2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_v2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_h2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_v2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_h2_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_v2_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d16_h2_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_v2_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d8_h2_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_v2_avg_sse2; +high_filter8_1dfunction vp9_high_filter_block1d4_h2_avg_sse2; + +// void vp9_high_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void vp9_high_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void vp9_high_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vp9_high_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 
3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + sse2); + +// void vp9_high_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void vp9_high_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_ , sse2); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 #endif // HAVE_SSE2 diff --git a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm new file mode 100644 index 0000000000000000000000000000000000000000..4bdbb83f4af09b3b5238094aa6a4a17918216167 --- /dev/null +++ b/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm @@ -0,0 +1,962 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
+ +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + 
punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +;void vp9_filter_block1d4_v8_sse2 +;( +; unsigned 
char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE +sym(vp9_high_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE +sym(vp9_high_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + 
%define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE +sym(vp9_high_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + 
ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + 
+global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 1, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE +sym(vp9_high_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] 
;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE +sym(vp9_high_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int 
output_height, +; short *filter +;) +global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE +sym(vp9_high_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line 
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + 
+ ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 1, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000000000000000000000000000000000000..b7d4a61ffeca6bb13f293d24b9d5f17b250718d6 --- /dev/null +++ b/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,494 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + 
movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +global sym(vp9_high_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_high_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_high_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_high_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, 
[rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_high_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_high_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_high_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_high_filter_block1d4_h2_sse2) PRIVATE 
+sym(vp9_high_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_high_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_high_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d16_h2_sse2) PRIVATE +sym(vp9_high_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(vp9_high_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(vp9_high_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi 
+ RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_high_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_high_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index f99fa7a5806d1e41d4339c8d160af140f3494348..499fb4f71f1229fdac15d863b855b70328bf46e9 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1265,10 +1265,18 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, for (i = 0; i < REFS_PER_FRAME; ++i) { RefBuffer *const ref_buf = &cm->frame_refs[i]; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, + cm->width, cm->height, + cm->use_highbitdepth); +#else vp9_setup_scale_factors_for_frame(&ref_buf->sf, ref_buf->buf->y_crop_width, ref_buf->buf->y_crop_height, cm->width, cm->height); +#endif if (vp9_is_scaled(&ref_buf->sf)) vp9_extend_frame_borders(ref_buf->buf); } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 9545ba0f3b02c297ffce58e048e3d49ee63687c6..9a61b2a93b88db9b104d03d944f112a1947ec5b9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2767,10 +2767,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; ref_buf->buf = buf; ref_buf->idx = idx; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + buf->y_crop_width, buf->y_crop_height, + cm->width, cm->height, + (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
+ 1 : 0); +#else vp9_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width, cm->height); - +#endif if (vp9_is_scaled(&ref_buf->sf)) vp9_extend_frame_borders(buf); } diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index ff026666b3120f276a19a76e06d181e0eae63f56..6a532b24339c201fb575f4590b642a3dbb70cb02 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -454,12 +454,20 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { // In spatial svc the scaling factors might be less then 1/2. So we will use // non-normative scaling. int frame_used = 0; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + cm->use_highbitdepth); +#else vp9_setup_scale_factors_for_frame(&sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height); - +#endif for (frame = 0; frame < frames_to_blur; ++frame) { if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || cm->mi_rows * MI_SIZE != frames[frame]->y_height) { @@ -481,11 +489,20 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } } else { // ARF is produced at the native frame size and resized when coded. 
+#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&sf, + frames[0]->y_crop_width, + frames[0]->y_crop_height, + frames[0]->y_crop_width, + frames[0]->y_crop_height, + cm->use_highbitdepth); +#else vp9_setup_scale_factors_for_frame(&sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); +#endif } temporal_filter_iterate_c(cpi, frames, frames_to_blur, diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index e88060c6496e69bb27fd8dce7329bf7c5a1c6ecd..07a3be8dbb9ce54ef79354adf9c16d563549082c 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -91,6 +91,8 @@ endif ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm endif # common (c)