From 521cf7e8795d38da8aa7b7356d97a6e561d02449 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway <slavarnway@google.com> Date: Tue, 29 Jul 2014 16:47:34 -0700 Subject: [PATCH] Neon version of vp9_sub_pixel_variance16x16(), vp9_variance16x16(), and vp9_get16x16var(). On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~16.7%. Change-Id: Ib163aa99f56e680194aabe00dacdd7f0899a4ecb --- test/variance_test.cc | 12 +++ vp9/common/vp9_rtcd_defs.pl | 6 +- vp9/encoder/arm/neon/vp9_variance_neon.c | 129 +++++++++++++++++++++++ vp9/vp9cx.mk | 1 + 4 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 vp9/encoder/arm/neon/vp9_variance_neon.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 9dc7c6a452..83b7435e6e 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -756,6 +756,18 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2), make_tuple(6, 6, subpel_avg_variance64x64_avx2))); #endif // HAVE_AVX2 +#if HAVE_NEON +const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9VarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_neon))); + +const vp9_subpixvariance_fn_t subpel_variance16x16_neon = + vp9_sub_pixel_variance16x16_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon))); +#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d58774924b..d3d874dfc3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -420,7 +420,7 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -435,7 +435,7 @@ add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, co specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; +specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -483,7 +483,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 0000000000..f6871188b0 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kPixelStepOne = 1 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi)); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 0ea28d313b..77b968bdaf 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -131,5 +131,6 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) -- GitLab