diff --git a/test/test.mk b/test/test.mk index b92b6da730a4a2af6e8034b5c4e4cec422115d33..abf815cc981ff371c2f5f5883f8aff29d31ba7c9 100644 --- a/test/test.mk +++ b/test/test.mk @@ -128,6 +128,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d08d9ee4563c4cafd6998a2d1c5cd44fb1f9b66 --- /dev/null +++ b/test/vp9_intrapred_test.cc @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <string> + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_pred_common.h" +#include "vpx_mem/vpx_mem.h" +#include "test/util.h" + +namespace { + +using libvpx_test::ACMRandom; + +const int count_test_block = 100000; + +// Base class for VP9 intra prediction tests. +class VP9IntraPredBase { + public: + virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); } + + protected: + virtual void Predict(PREDICTION_MODE mode) = 0; + + void CheckPrediction(int test_case_number, int *error_count) const { + // For each pixel ensure that the calculated value is the same as reference. + for (int y = 0; y < block_size_; y++) { + for (int x = 0; x < block_size_; x++) { + *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_]; + if (*error_count == 1) { + ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_]) + << " Failed on Test Case Number "<< test_case_number; + } + } + } + } + + void RunTest(uint16_t* left_col, uint16_t* above_data, + uint16_t* dst, uint16_t* ref_dst) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + left_col_ = left_col; + dst_ = dst; + ref_dst_ = ref_dst; + above_row_ = above_data + 16; + int error_count = 0; + for (int i = 0; i < count_test_block; ++i) { + // Fill edges with random data, try first with saturated values. 
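+      // Iteration 0 saturates every edge pixel to mask_ ((1 << bit_depth) - 1)
+      // to stress overflow handling; later iterations use random values
+      // masked down to the configured bit depth.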
+ for (int x = -1; x <= block_size_*2; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } + for (int y = 0; y < block_size_; y++) { + if (i == 0) { + left_col_[y] = mask_; + } else { + left_col_[y] = rnd.Rand16() & mask_; + } + } + Predict(DC_PRED); + CheckPrediction(i, &error_count); + } + ASSERT_EQ(0, error_count); + } + + int block_size_; + uint16_t *above_row_; + uint16_t *left_col_; + uint16_t *dst_; + uint16_t *ref_dst_; + ptrdiff_t stride_; + int mask_; +}; + +typedef void (*intra_pred_fn_t)( + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int bps); +typedef std::tr1::tuple<intra_pred_fn_t, + intra_pred_fn_t, int, int> intra_pred_params_t; +class VP9IntraPredTest + : public VP9IntraPredBase, + public ::testing::TestWithParam<intra_pred_params_t> { + + virtual void SetUp() { + pred_fn_ = GET_PARAM(0); + ref_fn_ = GET_PARAM(1); + block_size_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + stride_ = block_size_ * 3; + mask_ = (1 << bit_depth_) - 1; + } + + virtual void Predict(PREDICTION_MODE mode) { + const uint16_t *const_above_row = above_row_; + const uint16_t *const_left_col = left_col_; + ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_); + ASM_REGISTER_STATE_CHECK(pred_fn_(dst_, stride_, const_above_row, + const_left_col, bit_depth_)); + } + intra_pred_fn_t pred_fn_; + intra_pred_fn_t ref_fn_; + int bit_depth_; +}; + +TEST_P(VP9IntraPredTest, IntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 2*32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 2*32+32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, dst, 3 * 32 * 32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_dst, 3 * 32 * 32); + RunTest(left_col, above_data, dst, ref_dst); +} + +using std::tr1::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 8))); +#else +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 8), + 
make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 8))); +#endif +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 10))); +#else +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 10))); +#endif + +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 12))); +#else 
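+// On 32-bit x86 the 32x32 dc and the 16x16/32x32 tm SSE2 kernels are not
+// built (they need x86-64 register counts), so only the remaining
+// predictors are instantiated here.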
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 12))); +#endif +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 +} // namespace diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 5587192e8a5feb5c14a2203c41e0fc0dae2343d9..8305e7fa67289d6e95aba9ff27c179101c5c8d1a 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -65,6 +65,18 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } #if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint16_t clip_pixel_high(int val, int bd) { + switch (bd) { + case 8: + default: + return (uint16_t)clamp(val, 0, 255); + case 10: + return (uint16_t)clamp(val, 0, 1023); + case 12: + return (uint16_t)clamp(val, 0, 4095); + } +} + #define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) #define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 )) #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 471929aea091dd7e183c8fa2751bef113710ec66..0fd8d0ce36f89f1be0ee6a56a5cec52256fe4bf5 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -40,11 +40,291 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { type##_predictor(dst, stride, size, above, left); \ } +#if CONFIG_VP9_HIGHBITDEPTH +#define intra_pred_high_sized(type, size) \ + void vp9_high_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + high_##type##_predictor(dst, stride, size, above, left, bd); \ + } + +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_high_sized(type, 4) \ + intra_pred_high_sized(type, 8) \ + intra_pred_high_sized(type, 16) \ + intra_pred_high_sized(type, 32) + +#else + #define intra_pred_allsizes(type) \ intra_pred_sized(type, 4) \ intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void high_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) above; + (void) bd; + int r, c; + + // First column. + for (r = 0; r < bs - 1; ++r) { + dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); + } + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Second column. + for (r = 0; r < bs - 2; ++r) { + dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 + + left[r + 2], 2); + } + dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] + + left[bs - 1] * 3, 2); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Rest of last row. 
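+  // Columns 2..bs-1 of the last row replicate left[bs - 1]; each remaining
+  // pixel is then copied from the row below, two columns to the left.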
+ for (c = 0; c < bs - 2; ++c) + dst[(bs - 1) * stride + c] = left[bs - 1]; + + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + } +} + +static INLINE void high_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) left; + (void) bd; + int r, c; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1] * 2 + + above[r/2 + c + 2], 2) + : ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1], 1); + } + dst += stride; + } +} + +static INLINE void high_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) left; + (void) bd; + int r, c; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r + c + 2 < bs * 2 ? ROUND_POWER_OF_TWO(above[r + c] + + above[r + c + 1] * 2 + + above[r + c + 2], 2) + : above[bs * 2 - 1]; + } + dst += stride; + } +} + +static INLINE void high_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + (void) bd; + int r, c; + + // first row + for (c = 0; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst += stride; + + // second row + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + for (c = 1; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst += stride; + + // the rest of first col + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + + left[r - 1], 2); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void high_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) bd; + int r, c; + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + for (c = 1; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 2; r < bs; ++r) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + + dst += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-stride + c - 1]; + dst += stride; + } +} + +static INLINE void high_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) bd; + int r, c; + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); + for (r = 1; r < bs; r++) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst++; + + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 2; r < bs; r++) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void high_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { 
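+  // Vertical prediction: every row of the block is a copy of the row of
+  // pixels immediately above the block.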
+ (void) left; + (void) bd; + int r; + + for (r = 0; r < bs; r++) { + vpx_memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void high_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, + int bd) { + (void) above; + (void) bd; + int r; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void high_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + (void) bd; + int r, c; + int ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_high(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} + +static INLINE void high_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + (void) above; + (void) left; + int r; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void high_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + (void) above; + (void) bd; + int i, r, expected_dc, sum = 0; + + for (i = 0; i < bs; i++) + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void high_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + (void) left; + (void) bd; + int i, r, expected_dc, sum = 0; + + for (i = 0; i < bs; i++) + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void high_dc_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + (void) bd; + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { @@ -293,6 +573,14 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, static intra_pred_fn pred[INTRA_MODES][TX_SIZES]; static intra_pred_fn dc_pred[2][2][TX_SIZES]; +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][4]; +static intra_high_pred_fn dc_pred_high[2][2][4]; +#endif // CONFIG_VP9_HIGHBITDEPTH + void vp9_init_intra_predictors() { #define INIT_ALL_SIZES(p, type) \ p[TX_4X4] = vp9_##type##_predictor_4x4; \ @@ -315,9 +603,164 @@ void vp9_init_intra_predictors() { INIT_ALL_SIZES(dc_pred[1][0], dc_left); INIT_ALL_SIZES(dc_pred[1][1], dc); -#undef INIT_ALL_SIZES +#if CONFIG_VP9_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], high_v); + INIT_ALL_SIZES(pred_high[H_PRED], high_h); + INIT_ALL_SIZES(pred_high[D207_PRED], high_d207); + INIT_ALL_SIZES(pred_high[D45_PRED], high_d45); + INIT_ALL_SIZES(pred_high[D63_PRED], high_d63); + INIT_ALL_SIZES(pred_high[D117_PRED], high_d117); + INIT_ALL_SIZES(pred_high[D135_PRED], high_d135); + INIT_ALL_SIZES(pred_high[D153_PRED], high_d153); + 
INIT_ALL_SIZES(pred_high[TM_PRED], high_tm); + + INIT_ALL_SIZES(dc_pred_high[0][0], high_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], high_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], high_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], high_dc); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#undef intra_pred_allsizes } +#if CONFIG_VP9_HIGHBITDEPTH +static void build_intra_predictors_high(const MACROBLOCKD *xd, + const uint8_t *ref8, + int ref_stride, + uint8_t *dst8, + int dst_stride, + PREDICTION_MODE mode, + TX_SIZE tx_size, + int up_available, + int left_available, + int right_available, + int x, int y, + int plane, int bd) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64); + DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16); + uint16_t *above_row = above_data + 16; + const uint16_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // int base=128; + int base = 128 << (bd - 8); + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // left + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } + } else { + // TODO(Peter): this value should probably change for high bitdepth + vpx_memset16(left_col, base + 1, bs); + } + + // TODO(hkuang) do not extend 2*bs pixels for all modes. 
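+  // Build the above row from up to 2 * bs reconstructed pixels, replicating
+  // the rightmost available pixel where the block extends past the frame
+  // edge, mirroring the 8-bit build_intra_predictors() path on uint16_t data.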
+ // above + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t)); + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + } + // TODO(Peter) this value should probably change for high bitdepth + above_row[-1] = left_available ? above_ref[-1] : (base+1); + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t)); + else + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + // TODO(Peter): this value should probably change for high bitdepth + above_row[-1] = left_available ? 
above_ref[-1] : (base+1); + } + } + } else { + vpx_memset16(above_row, base - 1, bs * 2); + // TODO(Peter): this value should probably change for high bitdepth + above_row[-1] = base - 1; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, + left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col, + xd->bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, @@ -454,6 +897,14 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, const int y = loff * 4; assert(bwl >= 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode, + tx_size, have_top, have_left, have_right, + x, y, plane, xd->bd); + return; + } +#endif build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, have_top, have_left, have_right, x, y, plane); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 9733d475030babc741cd8886fd8137cbfd4f323e..b75ea64f094564643fc083ef60f81a87a08ca0a5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -447,6 +447,165 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + # + # Intra prediction + # + add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_4x4/; + + add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_4x4/; + + add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_4x4/; + + add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_4x4/; + + add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_4x4/; + + add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_4x4/; + + add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_4x4/; + + add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc"; + + add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc"; + + add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_4x4/; + + add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_4x4/; + + add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_4x4/; + + add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_8x8/; + + add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_8x8/; + + add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_8x8/; + + add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_8x8/; + + add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_8x8/; + + add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_8x8/; + + add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_8x8/; + + add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";; + + add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_8x8/; + + add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_8x8/; + + add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_8x8/; + + add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_16x16/; + + add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_16x16/; + + add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize 
qw/vp9_high_d63_predictor_16x16/; + + add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_16x16/; + + add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_16x16/; + + add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_16x16/; + + add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_16x16/; + + add_proto qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_16x16/; + + add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_16x16/; + + add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_16x16/; + + add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_32x32/; + + add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_32x32/; + + add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_32x32/; + + add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_32x32/; + + add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_32x32/; + + add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_32x32/; + + add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_32x32/; + + add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc"; 
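+  # The SSE2 tm 16x16/32x32 and dc 32x32 kernels use xmm8 and above, so they
+  # are restricted to 64-bit builds via $sse2_x86_64.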
+ + add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_32x32/; + + add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_32x32/; + + add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_32x32/; + # # dct # diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm new file mode 100644 index 0000000000000000000000000000000000000000..ff450711ec38dbf1deddbe85700126d03f817df4 --- /dev/null +++ b/vp9/common/x86/vp9_high_intrapred_sse2.asm @@ -0,0 +1,476 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_MMX sse +cglobal high_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, one + mov oned, 0x0001 + pxor m1, m1 + movd m3, oned + pshufw m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshufw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal high_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal high_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + 
paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal high_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + mova m5, [leftq] + mova m6, [leftq+16] + mova m7, [leftq+32] + mova m8, [leftq+48] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + paddw m0, m6 + paddw m0, m7 + paddw m0, m8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET +%endif + +INIT_MMX sse +cglobal high_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal high_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal high_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 
+16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal high_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + movq m0, [aboveq] + pshufw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + movd m3, oned + movd m4, bpsd + pshufw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + mova m2, m3 + psllw m3, m4 + add leftq, 8 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movq m1, [leftq+lineq*4] + movq m2, [leftq+lineq*4+2] + pshufw m1, m1, 0x0 + pshufw m2, m2, 0x0 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m1 + movq [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bpsd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal high_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m7, m7 + pxor m8, m8 + pinsrw m7, oned, 0 + pinsrw m8, bpsd, 0 + pshuflw m7, m7, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m7, m7 + mov lineq, -8 + mova m5, m7 + punpcklqdq m2, m2 + psllw m7, m8 + add leftq, 32 + psubw m7, m5 ; max possible value + pxor m8, m8 ; min possible value + psubw m0, m2 + psubw m1, m2 +.loop: + movd m2, [leftq+lineq*4] + movd m3, [leftq+lineq*4+2] + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m4, m2, m0 + paddw m5, m3, m0 + paddw m2, m1 + paddw m3, m1 + ;Clamp to the bit-depth + pminsw m4, m7 + pminsw m5, m7 + pminsw m2, m7 + pminsw m3, m7 + pmaxsw m4, m8 + pmaxsw m5, m8 + pmaxsw m2, m8 + pmaxsw m3, m8 + ;Store the values + mova [dstq ], m4 + mova [dstq+strideq*2 ], m5 + mova [dstq +16], m2 + mova [dstq+strideq*2+16], m3 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m10, m10 + pxor m11, m11 + pinsrw m10, oned, 0 + pinsrw m11, bpsd, 0 + pshuflw m10, m10, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m10, m10 
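+  ; m10 holds 1 in every word here; the psllw by the bit depth and the psubw
+  ; below turn it into the maximum pixel value, (1 << bps) - 1.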
+ mov lineq, -16 + mova m5, m10 + punpcklqdq m0, m0 + psllw m10, m11 + add leftq, 64 + psubw m10, m5 ; max possible value + pxor m11, m11 ; min possible value + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m5, [leftq+lineq*4] + movd m6, [leftq+lineq*4+2] + pshuflw m5, m5, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m5, m5 + punpcklqdq m6, m6 + paddw m7, m5, m1 + paddw m8, m5, m2 + paddw m9, m5, m3 + paddw m5, m4 + ;Clamp these values to the bit-depth + pminsw m7, m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m5, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m5, m11 + ;Store these values + mova [dstq ], m7 + mova [dstq +16], m8 + mova [dstq +32], m9 + mova [dstq +48], m5 + paddw m7, m6, m1 + paddw m8, m6, m2 + paddw m9, m6, m3 + paddw m6, m4 + ;Clamp these values to the bit-depth + pminsw m7, m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m6, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m6, m11 + ;Store these values + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m8 + mova [dstq+strideq*2+32], m9 + mova [dstq+strideq*2+48], m6 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET +%endif diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 90f03426bbcebfca987552dd8f7ea9646bd122fa..e88060c6496e69bb27fd8dce7329bf7c5a1c6ecd 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -89,6 +89,10 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm endif +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm +endif + # common (c) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c