Commit 876c72a0 authored by levytamar82's avatar levytamar82

AVX2 Convolve Optimization

Two convolve functions were optimized for AVX2:
1. vp9_filter_block1d16_h8
2. vp9_filter_block1d16_v8
vp9_filter_block1d16_v8 was optimized for AVX2 by reducing the number of
loop strides by half, two strides were processed in parallel.
vp9_filter_block1d16_v8 was also optimized in the same way also some of the
loads were being done outside of the loop and by that preventing redundant
loads.
This Optimization gives 43% function level gain and 1.3% user level gain.
Now can be compiled in Windows

Change-Id: I2714124cfb0c14a77d7a0ce126a20db92ffbf92c
parent 20d0f2b9
......@@ -264,13 +264,13 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8
specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 sse2 ssse3 neon dspr2
specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
......
......@@ -139,7 +139,49 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
#if HAVE_AVX2
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, avx2);
#endif
#if HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
......
This diff is collapsed.
......@@ -79,6 +79,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment