AVX2 SubPixel AVG Variance Optimization

Optimizing 2 functions to process 32 elements in parallel instead of 16: 1. vp9_sub_pixel_avg_variance64x64 2. vp9_sub_pixel_avg_variance32x32 both of those function were calling vp9_sub_pixel_avg_variance16xh_ssse3 instead of calling that function, it calls vp9_sub_pixel_avg_variance32xh_avx2 that is written in avx2 and process 32 elements in parallel. This Optimization gave 80% function level gain and 2% user level gain Change-Id: Iea694654e1b7612dc6ed11e2626208c2179502c8

AVX2 SubPixel AVG Variance Optimization
Optimizing 2 functions to process 32 elements in parallel instead of 16: 1. vp9_sub_pixel_avg_variance64x64 2. vp9_sub_pixel_avg_variance32x32 both of those function were calling vp9_sub_pixel_avg_variance16xh_ssse3 instead of calling that function, it calls vp9_sub_pixel_avg_variance32xh_avx2 that is written in avx2 and process 32 elements in parallel. This Optimization gave 80% function level gain and 2% user level gain Change-Id: Iea694654e1b7612dc6ed11e2626208c2179502c8
ea149096 · levytamar82 · 5a7ac03b · ea149096 · ea149096 · ea149096
Commit ea149096 authored 11 years ago by levytamar82
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -389,7 +389,7 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int
 specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc
@@ -419,7 +419,7 @@ prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int
 specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc
+specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc

--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -54,6 +54,20 @@ unsigned int vp9_sub_pixel_variance32xh_avx2
  unsigned int *sse
 );
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  const uint8_t *sec,
+  int sec_stride,
+  int height,
+  unsigned int *sseptr
+);
 static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
                        const unsigned char *ref_ptr, int  recon_stride,
                        int  w, int  h, unsigned int *sse, int *sum,
@@ -207,3 +221,48 @@ unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
 }
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  // processing 32 elements in parallel
+  unsigned int sse;
+  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                               y_offset, dst, dst_stride,
+                                               sec, 64, 64, &sse);
+  unsigned int sse2;
+  // processing the next 32 elements in parallel
+  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                                y_offset, dst + 32, dst_stride,
+                                                sec + 32, 64, 64, &sse2);
+  se += se2;
+  sse += sse2;
+  *sseptr = sse;
+  return sse - (((int64_t)se * se) >> 12);
+}
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  // processing 32 element in parallel
+  unsigned int sse;
+  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 sec, 32, 32, &sse);
+  *sseptr = sse;
+  return sse - (((int64_t)se * se) >> 10);
+}