Add an x86inc MMX fwht4x4.

Change-Id: Ib0a73d4863478f9b8a00976379d25d2f6ebbb197

Add an x86inc MMX fwht4x4.
Change-Id: Ib0a73d4863478f9b8a00976379d25d2f6ebbb197
b5422fab · Alex Converse · 07f9fa43 · b5422fab · b5422fab · b5422fab
Commit b5422fab authored 10 years ago by Alex Converse
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -353,6 +353,13 @@ INSTANTIATE_TEST_CASE_P(
        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
 #endif
+#if CONFIG_USE_X86INC && HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+#endif
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
    SSE2, Trans4x4DCT,

--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -693,7 +693,7 @@ add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int str
 specialize qw/vp9_fht16x16 sse2 avx2/;
 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/;
+specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
 add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct4x4 sse2 avx2/;

--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/vp9/encoder/x86/vp9_dct_mmx.asm
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+SECTION .text
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+%macro TRANSPOSE_4X4 0
+  movq            m4,        m0
+  movq            m5,        m2
+  punpcklwd       m4,        m1
+  punpckhwd       m0,        m1
+  punpcklwd       m5,        m3
+  punpckhwd       m2,        m3
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5
+  punpckhdq       m4,        m5
+  punpckldq       m3,        m2
+  punpckhdq       m0,        m2
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  psllw           m0,        2
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+  RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -101,6 +101,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm