Commit aac1ef7f authored by Yaowu Xu's avatar Yaowu Xu

Enable hbd_build to use SSSE3-optimized functions

This commit changes the SSSE3 assembly functions for idct32x32 to
support highbitdepth build.

On test clip fdJc1_IBKJA.248.webm, this cuts the speed difference
between the hbd and lbd builds from 3-4% to 1-2%.

Change-Id: Ic3390e0113bc1ca5bba8ec80d1795ad31b484fca
parent 8fdab8a4
...@@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { ...@@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct16x16_1_add sse2/; specialize qw/vpx_idct16x16_1_add sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/; specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2/; specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/; specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2/; specialize qw/vpx_idct32x32_1_add sse2/;
......
...@@ -765,6 +765,24 @@ idct32x32_34: ...@@ -765,6 +765,24 @@ idct32x32_34:
lea r4, [rsp + transposed_in] lea r4, [rsp + transposed_in]
idct32x32_34_transpose: idct32x32_34_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0] mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4] mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8] mova m2, [r3 + 16 * 8]
...@@ -773,6 +791,7 @@ idct32x32_34_transpose: ...@@ -773,6 +791,7 @@ idct32x32_34_transpose:
mova m5, [r3 + 16 * 20] mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24] mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28] mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
...@@ -1176,6 +1195,24 @@ idct32x32_135: ...@@ -1176,6 +1195,24 @@ idct32x32_135:
mov r7, 2 mov r7, 2
idct32x32_135_transpose: idct32x32_135_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0] mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4] mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8] mova m2, [r3 + 16 * 8]
...@@ -1184,7 +1221,7 @@ idct32x32_135_transpose: ...@@ -1184,7 +1221,7 @@ idct32x32_135_transpose:
mova m5, [r3 + 16 * 20] mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24] mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28] mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0 mova [r4 + 0], m0
...@@ -1196,14 +1233,22 @@ idct32x32_135_transpose: ...@@ -1196,14 +1233,22 @@ idct32x32_135_transpose:
mova [r4 + 16 * 6], m6 mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7 mova [r4 + 16 * 7], m7
%if CONFIG_VP9_HIGHBITDEPTH
add r3, 32
%else
add r3, 16 add r3, 16
%endif
add r4, 16 * 8 add r4, 16 * 8
dec r7 dec r7
jne idct32x32_135_transpose jne idct32x32_135_transpose
IDCT32X32_135 16*0, 16*32, 16*64, 16*96 IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8] lea stp, [stp + 16 * 8]
%if CONFIG_VP9_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32] lea inputq, [inputq + 16 * 32]
%endif
dec r6 dec r6
jnz idct32x32_135 jnz idct32x32_135
...@@ -1614,6 +1659,24 @@ idct32x32_1024: ...@@ -1614,6 +1659,24 @@ idct32x32_1024:
mov r7, 4 mov r7, 4
idct32x32_1024_transpose: idct32x32_1024_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0] mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4] mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8] mova m2, [r3 + 16 * 8]
...@@ -1622,6 +1685,7 @@ idct32x32_1024_transpose: ...@@ -1622,6 +1685,7 @@ idct32x32_1024_transpose:
mova m5, [r3 + 16 * 20] mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24] mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28] mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
...@@ -1633,8 +1697,11 @@ idct32x32_1024_transpose: ...@@ -1633,8 +1697,11 @@ idct32x32_1024_transpose:
mova [r4 + 16 * 5], m5 mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6 mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7 mova [r4 + 16 * 7], m7
%if CONFIG_VP9_HIGHBITDEPTH
add r3, 32
%else
add r3, 16 add r3, 16
%endif
add r4, 16 * 8 add r4, 16 * 8
dec r7 dec r7
jne idct32x32_1024_transpose jne idct32x32_1024_transpose
...@@ -1642,7 +1709,11 @@ idct32x32_1024_transpose: ...@@ -1642,7 +1709,11 @@ idct32x32_1024_transpose:
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8] lea stp, [stp + 16 * 8]
%if CONFIG_VP9_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32] lea inputq, [inputq + 16 * 32]
%endif
dec r6 dec r6
jnz idct32x32_1024 jnz idct32x32_1024
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment