Commit ad8d4454, authored by Yunqing Wang; committed by Gerrit Code Review
Browse files

Merge "AVX2 SubPixel Variance Optimization"

Showing with 695 additions and 2 deletions
...@@ -386,7 +386,7 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid ...@@ -386,7 +386,7 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx $sse2_x86inc specialize vp9_variance4x4 mmx $sse2_x86inc
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
...@@ -416,7 +416,7 @@ prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, ...@@ -416,7 +416,7 @@ prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr,
specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc
......
This diff is collapsed.
...@@ -42,6 +42,18 @@ void vp9_get32x32var_avx2 ...@@ -42,6 +42,18 @@ void vp9_get32x32var_avx2
int *Sum int *Sum
); );
// Forward declaration of the 32-column-wide sub-pixel variance kernel; the
// definition is not in this file.
//
// As used by the callers in this file: `height` is the number of rows to
// process, the value written through `sse` is accumulated as a sum of squared
// errors, and the return value is stored in a signed int and treated as the
// sum of errors.
// NOTE(review): the exact filtering selected by x_offset / y_offset is defined
// by the implementation -- confirm there before relying on it.
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);
static void variance_avx2(const unsigned char *src_ptr, int source_stride, static void variance_avx2(const unsigned char *src_ptr, int source_stride,
const unsigned char *ref_ptr, int recon_stride, const unsigned char *ref_ptr, int recon_stride,
int w, int h, unsigned int *sse, int *sum, int w, int h, unsigned int *sse, int *sum,
...@@ -155,3 +167,43 @@ unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, ...@@ -155,3 +167,43 @@ unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
*sse = var; *sse = var;
return (var - (((int64_t)avg * avg) >> 11)); return (var - (((int64_t)avg * avg) >> 11));
} }
// Sub-pixel variance of a 64x64 block.
//
// The block is handled as two 32-column halves, each run through
// vp9_sub_pixel_variance32xh_avx2 over 64 rows: first the columns starting at
// src / dst, then the columns starting at src + 32 / dst + 32. The strides and
// the x/y offsets are forwarded unchanged to both calls.
//
// On return *sse_ptr holds the combined sum of squared errors of both halves.
// The return value is that SSE minus (sum * sum) >> 12, where 12 is
// log2(64 * 64); the product is formed in 64 bits before the shift.
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_left;
  unsigned int sse_right;
  // Left half: columns [0, 32).
  const int sum_left = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                                       x_offset, y_offset,
                                                       dst, dst_stride,
                                                       64, &sse_left);
  // Right half: columns [32, 64).
  const int sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                                        x_offset, y_offset,
                                                        dst + 32, dst_stride,
                                                        64, &sse_right);
  const int sum = sum_left + sum_right;
  const unsigned int sse_total = sse_left + sse_right;

  *sse_ptr = sse_total;
  return sse_total - (((int64_t)sum * sum) >> 12);
}
// Sub-pixel variance of a 32x32 block.
//
// The whole block is covered by a single call to
// vp9_sub_pixel_variance32xh_avx2 with a height of 32 rows; the strides and
// the x/y offsets are forwarded unchanged.
//
// On return *sse_ptr holds the sum of squared errors reported by that call.
// The return value is that SSE minus (sum * sum) >> 10, where 10 is
// log2(32 * 32); the product is formed in 64 bits before the shift.
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_block;
  const int sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                                  x_offset, y_offset,
                                                  dst, dst_stride,
                                                  32, &sse_block);

  *sse_ptr = sse_block;
  return sse_block - (((int64_t)sum * sum) >> 10);
}
...@@ -86,6 +86,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm ...@@ -86,6 +86,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment