Add an MMX implementation of vp9_fwht4x4 (x86inc builds only) and hook it into
the RTCD table, the encoder build and the Trans4x4WHT unit test.

Note: vp9_dct_mmx.asm below adds comments and a stride sign-extension relative
to blob f71181c, so the blob id on its index line describes the earlier text.

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 02458db22fafde3b78ebf1b57c8b4a0afc28a469..0a8fb9016d674ecf1540ac4e9d6c2d9be5ce5f3a 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -353,6 +353,13 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
 #endif
 
+#if CONFIG_USE_X86INC && HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 63380d69d0ad55ea12f8d926efed4d202169e80e..0414db5cc115d473bae272b1669019d332cfde25 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -693,7 +693,7 @@ add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int str
 specialize qw/vp9_fht16x16 sse2 avx2/;
 
 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/;
+specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
 
 add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct4x4 sse2 avx2/;
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
new file mode 100644
index 0000000000000000000000000000000000000000..f71181c5e91ae26385d4775980fdc6982f0a9db7
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -0,0 +1,84 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; One 1-D pass of the 4-point transform on the four 16-bit lanes of m0..m3.
+;   in:  m0 = a,  m1 = b,  m2 = c,  m3 = d
+;   out: m0 = a0, m1 = c1, m2 = d1, m3 = b1   (m4 and m5 are scratch)
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1 ; a += b
+  movq            m4,        m0
+  psubw           m3,        m2 ; d -= c
+  psubw           m4,        m3
+  psraw           m4,        1  ; e = (a - d) >> 1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1 = e - b
+  psubw           m4,        m2 ;c1 = e - c
+  psubw           m0,        m4 ; a0 = a - c1
+  paddw           m3,        m5 ; d1 = d + b1
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+; Transposes the 4x4 block of 16-bit values held one row per register in
+; m0..m3; m4 and m5 are scratch.
+%macro TRANSPOSE_4X4 0
+  movq            m4,        m0
+  movq            m5,        m2
+  punpcklwd       m4,        m1
+  punpckhwd       m0,        m1
+  punpcklwd       m5,        m3
+  punpckhwd       m2,        m3
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5 ; column 0
+  punpckhdq       m4,        m5 ; column 1
+  punpckldq       m3,        m2 ; column 2
+  punpckhdq       m0,        m2 ; column 3
+  SWAP            2, 3, 0, 1, 4 ; rename so columns 0..3 sit in m0..m3
+%endmacro
+
+; C prototype, per vp9_rtcd_defs.pl:
+;   void vp9_fwht4x4(const int16_t *input, int16_t *output, int stride)
+; Loads four rows of four 16-bit values from input (rows are stride elements
+; apart), runs TRANSFORM_COLS twice with a transpose after each pass, shifts
+; every result left by 2 and stores the 16 results contiguously at output.
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  ; stride is a 32-bit int; the upper half of its 64-bit register is not
+  ; guaranteed by the x86-64 calling conventions, so widen it before using
+  ; strideq in address arithmetic.
+  movsxdifnidn    strideq,   strided
+  lea             r3q,       [inputq + strideq*4] ; r3 = &input[2 * stride]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+
+  RET
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 5e88793c86fe5cb44cdfb629d6e2e4ca20212206..bc9a478bc1e6245cabf64808d8f174efe3346806 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -101,6 +101,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 
 ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm