Commit 3fb728c7 authored by Yunqing Wang's avatar Yunqing Wang

SSE2 8-tap sub-pixel filter optimization

To ensure fast encoding/decoding on devices without ssse3 support,
SSE2 optimization of sub-pixel filters was done. Test using 1080p
clip showed the decoder speeds were ~70fps with ssse3 filters, ~60fps
with sse2 filters, and ~15fps with c filters.

Change-Id: Ie2088f87d83a889fba80a613e4d0e287aadd785c
parent 9603989c
......@@ -599,6 +599,28 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
#if HAVE_SSE2
const ConvolveFunctions convolve8_sse2(
vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_sse2),
make_tuple(8, 4, &convolve8_sse2),
make_tuple(4, 8, &convolve8_sse2),
make_tuple(8, 8, &convolve8_sse2),
make_tuple(16, 8, &convolve8_sse2),
make_tuple(8, 16, &convolve8_sse2),
make_tuple(16, 16, &convolve8_sse2),
make_tuple(32, 16, &convolve8_sse2),
make_tuple(16, 32, &convolve8_sse2),
make_tuple(32, 32, &convolve8_sse2),
make_tuple(64, 32, &convolve8_sse2),
make_tuple(32, 64, &convolve8_sse2),
make_tuple(64, 64, &convolve8_sse2)));
#endif
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
......
......@@ -247,22 +247,22 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8
specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3 neon dspr2
specialize vp9_convolve8 sse2 ssse3 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3 neon dspr2
specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3 neon dspr2
specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3 neon dspr2
specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3 neon dspr2
specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
#
# dct
......
This diff is collapsed.
This diff is collapsed.
......@@ -75,6 +75,7 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment