Commit 9a480482 authored by Ronald S. Bultje, committed by Gerrit Code Review

Merge "SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance()."

parents 869d7706 1e6a32f1
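For context, the new functions follow the vp9_sub_pixel_avg_variance prototype declared in the rtcd definitions further down: the first pointer/stride pair is bilinearly filtered at the given 1/16-pel offsets and averaged with second_pred, and the variance against the second pointer/stride pair is returned, with the SSE written through the sse pointer. A minimal usage sketch, assuming the tightly packed buffer layout the unit test uses (the wrapper name and buffers here are illustrative only, not part of the patch):

#include <stdint.h>

extern "C" unsigned int vp9_sub_pixel_avg_variance16x16_c(
    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
    const uint8_t *second_pred);

// pred: (16 + 1) x (16 + 1) padded predictor, packed; src and second_pred: 16 x 16.
unsigned int example_subpel_avg_variance(const uint8_t *pred, const uint8_t *src,
                                         const uint8_t *second_pred,
                                         int xoffset, int yoffset,
                                         unsigned int *sse) {
  return vp9_sub_pixel_avg_variance16x16_c(pred, 16 + 1, xoffset, yoffset,
                                           src, 16, sse, second_pred);
}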
@@ -76,6 +76,34 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
const uint8_t *src,
const uint8_t *second_pred,
int l2w, int l2h,
int xoff, int yoff,
unsigned int *sse_ptr) {
int se = 0;
unsigned int sse = 0;
const int w = 1 << l2w, h = 1 << l2h;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
// bilinear interpolation at a 16th pel step
const int a1 = ref[(w + 1) * (y + 0) + x + 0];
const int a2 = ref[(w + 1) * (y + 0) + x + 1];
const int b1 = ref[(w + 1) * (y + 1) + x + 0];
const int b2 = ref[(w + 1) * (y + 1) + x + 1];
const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
const int r = a + (((b - a) * yoff + 8) >> 4);
int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
se += diff;
sse += diff * diff;
}
}
*sse_ptr = sse;
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
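// The return expression above is the shortcut variance formula: with
// N = 1 << (l2w + l2h) pixels, sum(d * d) - (sum d)^2 / N equals N times the
// population variance of the per-pixel differences d (se and sse hold the two sums).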
template<typename VarianceFunctionType>
class VarianceTest :
public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -174,6 +202,7 @@ class SubpelVarianceTest :
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
sec_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
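// Note on the padded size: the bilinear filter in the reference code reads a
// (width_ + 1) x (height_ + 1) region (it indexes ref[(w + 1) * (y + 1) + x + 1]),
// and (w + 1) * (h + 1) == w * h + w + h + 1 == block_size_ + width_ + height_ + 1,
// so the ref_ allocation above is exactly large enough.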
@@ -182,14 +211,16 @@ class SubpelVarianceTest :
virtual void TearDown() {
delete[] src_;
delete[] ref_;
delete[] sec_;
}
protected:
void RefTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
@@ -217,6 +248,29 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
}
}
template<>
void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
src_[j] = rnd.Rand8();
sec_[j] = rnd.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
ref_[j] = rnd.Rand8();
}
unsigned int sse1, sse2;
const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1, sec_);
const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
log2width_, log2height_,
x, y, &sse2);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
}
}
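// Note on the specialization above: the x / y loops sweep all 16 x 16 = 256
// distinct 1/16-pel offsets, and ref_ is passed with stride width_ + 1 because
// the padded reference buffer is packed with one extra pixel per row.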
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -283,10 +337,12 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
@@ -360,6 +416,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
vp9_sub_pixel_avg_variance4x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
vp9_sub_pixel_avg_variance4x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
vp9_sub_pixel_avg_variance8x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
vp9_sub_pixel_avg_variance8x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
vp9_sub_pixel_avg_variance8x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
vp9_sub_pixel_avg_variance16x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
vp9_sub_pixel_avg_variance16x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
vp9_sub_pixel_avg_variance16x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
vp9_sub_pixel_avg_variance32x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
vp9_sub_pixel_avg_variance32x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
vp9_sub_pixel_avg_variance32x64_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
vp9_sub_pixel_avg_variance64x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
vp9_sub_pixel_avg_variance64x64_c;
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
make_tuple(2, 3, subpel_avg_variance4x8_c),
make_tuple(3, 2, subpel_avg_variance8x4_c),
make_tuple(3, 3, subpel_avg_variance8x8_c),
make_tuple(3, 4, subpel_avg_variance8x16_c),
make_tuple(4, 3, subpel_avg_variance16x8_c),
make_tuple(4, 4, subpel_avg_variance16x16_c),
make_tuple(4, 5, subpel_avg_variance16x32_c),
make_tuple(5, 4, subpel_avg_variance32x16_c),
make_tuple(5, 5, subpel_avg_variance32x32_c),
make_tuple(5, 6, subpel_avg_variance32x64_c),
make_tuple(6, 5, subpel_avg_variance64x32_c),
make_tuple(6, 6, subpel_avg_variance64x64_c)));
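// Each tuple is (log2(width), log2(height), function): for example,
// make_tuple(4, 5, subpel_avg_variance16x32_c) exercises a 16x32 block,
// since 1 << 4 == 16 and 1 << 5 == 32.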
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
@@ -446,6 +544,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
vp9_sub_pixel_avg_variance4x4_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
vp9_sub_pixel_avg_variance4x8_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
vp9_sub_pixel_avg_variance8x4_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
vp9_sub_pixel_avg_variance8x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
vp9_sub_pixel_avg_variance8x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
vp9_sub_pixel_avg_variance16x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
vp9_sub_pixel_avg_variance16x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
vp9_sub_pixel_avg_variance16x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
vp9_sub_pixel_avg_variance32x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
vp9_sub_pixel_avg_variance32x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
vp9_sub_pixel_avg_variance32x64_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
vp9_sub_pixel_avg_variance64x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
vp9_sub_pixel_avg_variance64x64_sse2;
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
make_tuple(2, 3, subpel_avg_variance4x8_sse),
make_tuple(3, 2, subpel_avg_variance8x4_sse2),
make_tuple(3, 3, subpel_avg_variance8x8_sse2),
make_tuple(3, 4, subpel_avg_variance8x16_sse2),
make_tuple(4, 3, subpel_avg_variance16x8_sse2),
make_tuple(4, 4, subpel_avg_variance16x16_sse2),
make_tuple(4, 5, subpel_avg_variance16x32_sse2),
make_tuple(5, 4, subpel_avg_variance32x16_sse2),
make_tuple(5, 5, subpel_avg_variance32x32_sse2),
make_tuple(5, 6, subpel_avg_variance32x64_sse2),
make_tuple(6, 5, subpel_avg_variance64x32_sse2),
make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
@@ -490,6 +630,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
vp9_sub_pixel_avg_variance4x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
vp9_sub_pixel_avg_variance4x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
vp9_sub_pixel_avg_variance8x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
vp9_sub_pixel_avg_variance8x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
vp9_sub_pixel_avg_variance8x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
vp9_sub_pixel_avg_variance16x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
vp9_sub_pixel_avg_variance16x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
vp9_sub_pixel_avg_variance16x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
vp9_sub_pixel_avg_variance32x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
vp9_sub_pixel_avg_variance32x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
vp9_sub_pixel_avg_variance32x64_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
vp9_sub_pixel_avg_variance64x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
vp9_sub_pixel_avg_variance64x64_ssse3;
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER
@@ -269,81 +269,81 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
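In this file, prototype declares a function's signature and specialize lists the optimized versions that run-time CPU detection may substitute for the C implementation; this patch adds sse2/ssse3 (sse for the 4-wide sizes) entries for the avg variants. A hedged sketch of the dispatch pattern such a specialize line requests; the picker function and capability flags below are illustrative, not the generated rtcd code:

#include <stdint.h>

typedef unsigned int (*subp_avg_variance_fn)(
    const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
    const uint8_t *second_pred);

extern "C" unsigned int vp9_sub_pixel_avg_variance16x16_c(
    const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *,
    const uint8_t *);
extern "C" unsigned int vp9_sub_pixel_avg_variance16x16_sse2(
    const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *,
    const uint8_t *);
extern "C" unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(
    const uint8_t *, int, int, int, const uint8_t *, int, unsigned int *,
    const uint8_t *);

// Illustrative only: pick the most capable available implementation, falling
// back to the plain C version.
static subp_avg_variance_fn pick_subpel_avg_variance16x16(bool has_sse2,
                                                          bool has_ssse3) {
  subp_avg_variance_fn fn = vp9_sub_pixel_avg_variance16x16_c;
  if (has_sse2) fn = vp9_sub_pixel_avg_variance16x16_sse2;
  if (has_ssse3) fn = vp9_sub_pixel_avg_variance16x16_ssse3;
  return fn;
}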
@@ -116,7 +116,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
RET
%endmacro
%macro SUBPEL_VARIANCE 1 ; W
%macro SUBPEL_VARIANCE 1-2 0 ; W, optional avg flag (%2 == 1 emits the avg variant, default 0)
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
@@ -128,12 +128,38 @@ bilin_filter_m_ssse3: times 8 db 16, 0
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%endif
%define h heightd
%define bilin_filter sseq
%else
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
7 + 2 * ARCH_X86_64, 13, src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
sec, sec_stride, \
height, sse
%if ARCH_X86_64
%define h heightd
%define sec_str sec_strideq
%else
%define h dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
%define h heightd
%endif
%define bilin_filter bilin_filter_m
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
@@ -143,7 +169,10 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
sar heightd, 1
sar h, 1
%if %2 == 1 ; avg
shl sec_str, 1
%endif
%endif
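; Note: for widths below 16 the row loops handle two source rows per
; iteration, hence the halved iteration count above and, for the avg
; variants, the doubled second-predictor stride so [secq] stays in step.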
; FIXME(rbultje) replace by jumptable?
@@ -158,30 +187,55 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
%if %2 == 1 ; avg
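; pavgb is a rounded byte average, (a + b + 1) >> 1, matching the
; (r + second_pred[...] + 1) >> 1 step in the C reference implementation.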
pavgb m0, [secq]
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
%endif
%else ; !avg
movh m2, [srcq+src_strideq]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
%if %2 == 1 ; avg
pavgb m0, [secq]
punpcklbw m3, m5
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_zero_loop
STORE_AND_RET
@@ -196,18 +250,40 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
punpckldq m2, [srcq+src_strideq*2]
%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
%else ; mmsize == 8
punpckldq m0, m2
%endif
movh m3, [dstq+dst_strideq]
pavgb m0, m2
punpcklbw m1, m5
pavgb m0, [secq]
punpcklbw m3, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
@@ -217,12 +293,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_half_loop
STORE_AND_RET
@@ -280,13 +360,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m2, 4
psraw m0, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
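; The filtered result sits in 16-bit lanes here, but pavgb works on bytes, so
; pack back down to 8 bits, average with the second predictor, then unpack
; again for SUM_SSE.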
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
@@ -318,13 +404,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
%if %2 == 1 ; avg
; FIXME(rbultje) pipeline
packuswb m0, m2
pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
dec heightd
%endif
%if %2 == 1 ; avg
add secq, sec_str
%endif
dec h
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
@@ -345,18 +441,37 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
%if %2 == 1 ; avg
pavgb m0, [secq]
%endif
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m4, [srcq+1]
%if %2 == 1 ; avg
%if mmsize == 16
movhps m0, [srcq+src_strideq]
movhps m4, [srcq+src_strideq+1]
%else ; mmsize == 8
punpckldq m0, [srcq+src_strideq]
punpckldq m4, [srcq+src_strideq+1]
%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
pavgb m0, m4
punpcklbw m3, m5
pavgb m0, [secq]
punpcklbw m1, m5
punpckhbw m2, m0, m5
punpcklbw m0, m5
%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
@@ -367,12 +482,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
%endif
SUM_SSE m0, m1, m2, m3, m6, m7