Commit 0c00af12 authored by Alex Converse's avatar Alex Converse

Add vpx_highbd_convolve_{copy,avg}_sse2

single-threaded:
swanky (silvermont): ~1% faster overall
peppy (celeron,haswell): ~1.5% faster overall

Change-Id: Ib74f014374c63c9eaf2d38191cbd8e2edcc52073
parent 7e77938d
......@@ -973,6 +973,14 @@ void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
w, h, bd); \
}
#if HAVE_SSE2 && ARCH_X86_64
#if CONFIG_USE_X86INC
WRAP(convolve_copy_sse2, 8)
WRAP(convolve_avg_sse2, 8)
WRAP(convolve_copy_sse2, 10)
WRAP(convolve_avg_sse2, 10)
WRAP(convolve_copy_sse2, 12)
WRAP(convolve_avg_sse2, 12)
#endif // CONFIG_USE_X86INC
WRAP(convolve8_horiz_sse2, 8)
WRAP(convolve8_avg_horiz_sse2, 8)
WRAP(convolve8_vert_sse2, 8)
......@@ -1116,7 +1124,11 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
#if HAVE_SSE2 && ARCH_X86_64
#if CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_sse2(
#if CONFIG_USE_X86INC
wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
#else
wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
......@@ -1124,7 +1136,11 @@ const ConvolveFunctions convolve8_sse2(
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
const ConvolveFunctions convolve10_sse2(
#if CONFIG_USE_X86INC
wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
#else
wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
......@@ -1132,7 +1148,11 @@ const ConvolveFunctions convolve10_sse2(
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
const ConvolveFunctions convolve12_sse2(
#if CONFIG_USE_X86INC
wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
#else
wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
......
......@@ -434,10 +434,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Sub Pixel Filters
#
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_copy/;
specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_avg/;
specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
......
......@@ -12,20 +12,76 @@
SECTION .text
%macro convolve_fn 1
%macro convolve_fn 1-2
INIT_XMM sse2
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
%endif
mov r4d, dword wm
%ifidn %2, highbd
shl r4d, 1
shl srcq, 1
shl src_strideq, 1
shl dstq, 1
shl dst_strideq, 1
%else
cmp r4d, 4
je .w4
%endif
cmp r4d, 8
je .w8
cmp r4d, 16
je .w16
cmp r4d, 32
je .w32
%ifidn %2, highbd
cmp r4d, 64
je .w64
mov r4d, dword hm
.loop128:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [srcq+64]
movu m1, [srcq+80]
movu m2, [srcq+96]
movu m3, [srcq+112]
add srcq, src_strideq
%ifidn %1, avg
pavg m0, [dstq+64]
pavg m1, [dstq+80]
pavg m2, [dstq+96]
pavg m3, [dstq+112]
%endif
mova [dstq+64], m0
mova [dstq+80], m1
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
dec r4d
jnz .loop128
RET
%endif
.w64
mov r4d, dword hm
.loop64:
movu m0, [srcq]
......@@ -34,10 +90,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
movu m3, [srcq+48]
add srcq, src_strideq
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+16]
pavgb m2, [dstq+32]
pavgb m3, [dstq+48]
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
......@@ -57,10 +113,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
movu m3, [srcq+src_strideq+16]
lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq +16]
pavgb m2, [dstq+dst_strideq]
pavgb m3, [dstq+dst_strideq+16]
pavg m0, [dstq]
pavg m1, [dstq +16]
pavg m2, [dstq+dst_strideq]
pavg m3, [dstq+dst_strideq+16]
%endif
mova [dstq ], m0
mova [dstq +16], m1
......@@ -82,10 +138,10 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+dst_strideq]
pavgb m2, [dstq+dst_strideq*2]
pavgb m3, [dstq+r6q]
pavg m0, [dstq]
pavg m1, [dstq+dst_strideq]
pavg m2, [dstq+dst_strideq*2]
pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
......@@ -108,10 +164,10 @@ INIT_MMX sse
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+dst_strideq]
pavgb m2, [dstq+dst_strideq*2]
pavgb m3, [dstq+r6q]
pavg m0, [dstq]
pavg m1, [dstq+dst_strideq]
pavg m2, [dstq+dst_strideq*2]
pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
......@@ -122,6 +178,7 @@ INIT_MMX sse
jnz .loop8
RET
%ifnidn %2, highbd
.w4:
mov r4d, dword hm
lea r5q, [src_strideq*3]
......@@ -137,10 +194,10 @@ INIT_MMX sse
movh m5, [dstq+dst_strideq]
movh m6, [dstq+dst_strideq*2]
movh m7, [dstq+r6q]
pavgb m0, m4
pavgb m1, m5
pavgb m2, m6
pavgb m3, m7
pavg m0, m4
pavg m1, m5
pavg m2, m6
pavg m3, m7
%endif
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
......@@ -150,7 +207,12 @@ INIT_MMX sse
sub r4d, 4
jnz .loop4
RET
%endif
%endmacro
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment