Commit 9412785b authored by Jingning Han, committed by Gerrit Code Review

Merge changes I3edd4b95,I4514f974,Ie7fa4386

* changes:
  Turn on unit tests for SSSE3 8x8 forward and inverse 2D-DCT
  Change eob threshold for partial inverse 8x8 2D-DCT to 12
  SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero
parents ff3baaef b466ad5e
......@@ -340,4 +340,11 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
#endif
// Instantiates the FwdTrans8x8DCT parameterized tests for the SSSE3 forward
// 8x8 DCT paired with the SSSE3 full (64-coefficient) inverse 8x8 DCT.
// Compiled only when both HAVE_SSSE3 and ARCH_X86_64 are set.
// NOTE(review): the trailing 0 is presumably the transform type passed to the
// fixture (the SSE2 hybrid-transform cases above use other values) — confirm.
#if HAVE_SSSE3 && ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
SSSE3, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
#endif
} // namespace
......@@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_c,
TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_c,
TX_8X8, 10),
&vp9_idct8x8_12_add_c,
TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_c,
TX_8X8, 1),
......@@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_neon,
TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_neon,
TX_8X8, 10),
&vp9_idct8x8_12_add_neon,
TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_neon,
TX_8X8, 1),
......@@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct16x16_1_add_sse2,
TX_16X16, 1),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_10_add_sse2,
TX_8X8, 10),
&vp9_idct8x8_12_add_sse2,
TX_8X8, 12),
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_sse2,
TX_8X8, 1),
......@@ -190,4 +190,13 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct4x4_1_add_sse2,
TX_4X4, 1)));
#endif
// Instantiates PartialIDctTest for the SSSE3 partial inverse 8x8 DCT, pairing
// it with the C full inverse transform vp9_idct8x8_64_add_c.
// Compiled only when both HAVE_SSSE3 and ARCH_X86_64 are set.
// NOTE(review): the last tuple element (12) presumably is the number of
// leading non-zero coefficients the fixture generates — confirm against the
// PartialIDctTest fixture.
#if HAVE_SSSE3 && ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
SSSE3, PartialIDctTest,
::testing::Values(
make_tuple(&vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_ssse3,
TX_8X8, 12)));
#endif
} // namespace
......@@ -9,7 +9,7 @@
;
EXPORT |vp9_idct8x8_64_add_neon|
EXPORT |vp9_idct8x8_10_add_neon|
EXPORT |vp9_idct8x8_12_add_neon|
ARM
REQUIRE8
PRESERVE8
......@@ -310,13 +310,13 @@
bx lr
ENDP ; |vp9_idct8x8_64_add_neon|
;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
|vp9_idct8x8_10_add_neon| PROC
|vp9_idct8x8_12_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
......@@ -514,6 +514,6 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
ENDP ; |vp9_idct8x8_10_add_neon|
ENDP ; |vp9_idct8x8_12_add_neon|
END
......@@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
}
}
void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
int dest_stride) {
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
int16_t *outptr = out;
......
......@@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
}
}
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
......@@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob == 1)
// DC only DCT coefficient
vp9_idct8x8_1_add(input, dest, stride);
else if (eob <= 10)
vp9_idct8x8_10_add(input, dest, stride);
else if (eob <= 12)
vp9_idct8x8_12_add(input, dest, stride);
else
vp9_idct8x8_64_add(input, dest, stride);
}
......
......@@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
......
......@@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest, in[7]);
}
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
......
......@@ -28,6 +28,29 @@ TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102
; Packed coefficient-pair tables used as pmulhrsw memory operands.
; Each macro emits one 16-byte table of eight words: the first four words
; carry the first argument and the last four words carry the second, so one
; multiply can scale the low and high halves of a register by different
; constants. The suffix letters give the signs applied (P = as given,
; M = negated), and an 'm' in the label name marks a negated value.

; dpw_<a>_<b>: a, a, a, a, b, b, b, b
%macro PAIR_PP_COEFFS 2
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
%endmacro
; dpw_m<a>_<b>: -a, -a, -a, -a, b, b, b, b
%macro PAIR_MP_COEFFS 2
dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
%endmacro
; dpw_m<a>_m<b>: -a, -a, -a, -a, -b, -b, -b, -b
%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro
; Table instances. Every value here is exactly twice one of the
; TRANSFORM_COEFFS constants listed above (e.g. 30274 = 2 * 15137,
; 12540 = 2 * 6270).
; NOTE(review): the doubling presumably compensates for pmulhrsw's 15-bit
; rounding shift so the product is effectively scaled by 2^-14 — confirm.
; Mixed pairs: a different constant in each half of the register.
PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS 6392, 32138
PAIR_MP_COEFFS 18204, 27246
; Uniform pairs: the same constant in all eight words.
PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS 6392, 6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246
SECTION .text
%if ARCH_X86_64
......@@ -128,6 +151,7 @@ SECTION .text
%endmacro
INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
......@@ -159,4 +183,118 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
; (the C dispatcher vp9_idct8x8_add routes eob <= 12, other than the DC-only
; eob == 1 case, to vp9_idct8x8_12_add).
; Arguments: input (coefficients), output (destination pixels), stride.
; Only the low four words of the first four 16-byte input vectors are read,
; so every coefficient outside that region is treated as zero.
; NOTE(review): assumes input is a row-major 8x8 block of int16, making the
; region read the top-left 4x4 — confirm against the callers.
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
; constants; m12 is the pmulhrsw multiplier used in both transform stages
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
; r3 = 2 * stride, the step applied to output between store calls below
lea r3, [2 * strideq]
; load the first four 16-byte vectors of input
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
; interleave the low four words of those vectors (the high words are unused)
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m9, m0, m2
punpckldq m0, m2
SWAP 2, 9
; m0 -> [0], [0]
; m1 -> [1], [1]
; m2 -> [2], [2]
; m3 -> [3], [3]
; split each register into its 64-bit halves, duplicating each half across a
; full register
punpckhqdq m10, m0, m0
punpcklqdq m0, m0
punpckhqdq m9, m2, m2
punpcklqdq m2, m2
SWAP 1, 10
SWAP 3, 9
; scale by the packed constants; the dpw_a_b tables apply a different
; coefficient to the low and high halves of the register
pmulhrsw m0, m12
pmulhrsw m2, [dpw_30274_12540]
pmulhrsw m1, [dpw_6392_32138]
pmulhrsw m3, [dpw_m18204_27246]
SUM_SUB 0, 2, 9
SUM_SUB 1, 3, 9
punpcklqdq m9, m3, m3
punpckhqdq m5, m3, m9
SUM_SUB 3, 5, 9
punpckhqdq m5, m3
pmulhrsw m5, m12
punpckhqdq m9, m1, m5
punpcklqdq m1, m5
SWAP 5, 9
SUM_SUB 0, 5, 9
SUM_SUB 2, 1, 9
; NOTE(review): the unpack sequence below looks like a transpose of the
; intermediate result ahead of the second 1-D pass — confirm
punpckhqdq m3, m0, m0
punpckhqdq m4, m1, m1
punpckhqdq m6, m5, m5
punpckhqdq m7, m2, m2
punpcklwd m0, m3
punpcklwd m7, m2
punpcklwd m1, m4
punpcklwd m6, m5
punpckhdq m4, m0, m7
punpckldq m0, m7
punpckhdq m10, m1, m6
punpckldq m5, m1, m6
punpckhqdq m1, m0, m5
punpcklqdq m0, m5
punpckhqdq m3, m4, m10
punpcklqdq m2, m4, m10
; second round of scaling; here each table holds one constant in all eight
; words, and the three-operand forms keep the source register intact so it
; can be multiplied by a second constant
pmulhrsw m0, m12
pmulhrsw m6, m2, [dpw_30274_30274]
pmulhrsw m4, m2, [dpw_12540_12540]
pmulhrsw m7, m1, [dpw_32138_32138]
pmulhrsw m1, [dpw_6392_6392]
pmulhrsw m5, m3, [dpw_m18204_m18204]
pmulhrsw m3, [dpw_27246_27246]
mova m2, m0
SUM_SUB 0, 6, 9
SUM_SUB 2, 4, 9
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 2, 3, 9
SUM_SUB 4, 5, 9
SUM_SUB 6, 1, 9
; rename registers so that m0..m7 are in the order consumed by the stores
SWAP 3, 6
SWAP 1, 2
SWAP 2, 4
; m12 is no longer needed as a multiplier; clear it to zero before it is
; passed to ADD_STORE_8P_2X
pxor m12, m12
; four store calls, advancing output by 2 * stride between them
; NOTE(review): ADD_STORE_8P_2X presumably adds two result rows to the
; destination pixels and stores them — confirm against the macro definition
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
%endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment