Commit b2297108 authored by Yaowu Xu's avatar Yaowu Xu

SSSE3 idct8x8 functions for highbitdpeth build

This commit changes SSSE3 optimized idct8x8 functions to work with
highbitdepth build.

With this commit and the previous one that enabled SSSE3 idct32x32
functions, tests showed virtually no difference on decoding speed for
file fdJc1_IBKJA.248.webm for the build with -enable-vp9-highbitdpeth
option and the build without the option.

Change-Id: Ibe0634149ec70e8b921e6b30171664b8690a9c45
parent 432136ef
......@@ -762,7 +762,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add/;
......@@ -785,10 +785,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct4x4_1_add sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add sse2/;
specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add sse2/;
specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add sse2/;
......
......@@ -220,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
mova m4, [inputq + 128]
packssdw m4, [inputq + 144]
mova m5, [inputq + 160]
packssdw m5, [inputq + 176]
mova m6, [inputq + 192]
packssdw m6, [inputq + 208]
mova m7, [inputq + 224]
packssdw m7, [inputq + 240]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
......@@ -229,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m5, [inputq + 80]
mova m6, [inputq + 96]
mova m7, [inputq + 112]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
......@@ -254,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
%endif
punpcklwd m0, m1
punpcklwd m2, m3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment