Commit 88120481 authored by Jian Zhou

Code cleanup of tm_predictor_32x32

Reallocate the xmm register usage so that ARCH_X86_64 is no longer required.
Reduce memory accesses to the left neighbor by half.
Speed up by a single-digit percentage on big-core machines.

Change-Id: I392515ed8e8aeb02e6a717b3966b1ba13f5be990
parent 62f98626
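
For context, the TrueMotion (TM) predictor that this routine implements computes each output pixel as left[row] + above[col] - above[-1], clamped to the 8-bit range; the SSE2 code keeps the precomputed (above[col] - above[-1]) differences in xmm registers and adds a broadcast left[row] per row. A minimal scalar C sketch of that arithmetic (an illustration only, not the library's vpx_tm_predictor_32x32_c as written):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the 32x32 TM predictor: dst[r][c] is
 * left[r] + above[c] - above[-1], saturated to [0, 255]. */
static void tm_predictor_32x32_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const int top_left = above[-1];
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}
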
@@ -337,21 +337,12 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-#if ARCH_X86_64
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
                 vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_tm_predictor_32x32_sse2)
-#else
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL)
-#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
@@ -241,7 +241,7 @@ add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, con
 specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
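
The specialize change above swaps "$sse2_x86_64_x86inc" for "$sse2_x86inc", so the generated run-time dispatch now installs the SSE2 version on 32-bit x86 builds as well. Roughly, the rtcd machinery wires this up as a function pointer that defaults to the C version and is repointed when SSE2 is detected; the sketch below is illustrative (the setup helper and its flag parameter are hypothetical, not the generated vpx_dsp_rtcd.h):

#include <stddef.h>
#include <stdint.h>

void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left);
void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride,
                                 const uint8_t *above, const uint8_t *left);

/* Dispatch pointer, defaulting to the portable C implementation. */
void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above,
                               const uint8_t *left) = vpx_tm_predictor_32x32_c;

/* Hypothetical setup helper: before this commit the SSE2 hook was only
 * installed on x86_64 builds. */
static void setup_tm_predictor_32x32(int cpu_has_sse2) {
  if (cpu_has_sse2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2;
}
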
@@ -699,9 +699,8 @@ cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   mova                  m0, [aboveq]
@@ -722,31 +721,29 @@ cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
   psubw                 m5, m2
 .loop:
   movd                  m2, [leftq+lineq*2]
-  movd                  m6, [leftq+lineq*2+1]
+  pxor                  m1, m1
   punpcklbw             m2, m1
-  punpcklbw             m6, m1
+  pshuflw               m7, m2, 0x55
   pshuflw               m2, m2, 0x0
-  pshuflw               m6, m6, 0x0
   punpcklqdq            m2, m2
-  punpcklqdq            m6, m6
-  paddw                 m7, m2, m0
-  paddw                 m8, m2, m3
-  paddw                 m9, m2, m4
-  paddw                 m2, m5
-  packuswb              m7, m8
-  packuswb              m9, m2
-  paddw                 m2, m6, m0
-  paddw                 m8, m6, m3
-  mova   [dstq          ], m7
-  paddw                 m7, m6, m4
-  paddw                 m6, m5
-  mova   [dstq      +16], m9
-  packuswb              m2, m8
-  packuswb              m7, m6
-  mova   [dstq+strideq  ], m2
-  mova   [dstq+strideq+16], m7
+  punpcklqdq            m7, m7
+  paddw                 m6, m2, m3
+  paddw                 m1, m2, m0
+  packuswb              m1, m6
+  mova   [dstq          ], m1
+  paddw                 m6, m2, m5
+  paddw                 m1, m2, m4
+  packuswb              m1, m6
+  mova   [dstq+16       ], m1
+  paddw                 m6, m7, m3
+  paddw                 m1, m7, m0
+  packuswb              m1, m6
+  mova   [dstq+strideq  ], m1
+  paddw                 m6, m7, m5
+  paddw                 m1, m7, m4
+  packuswb              m1, m6
+  mova   [dstq+strideq+16], m1
   lea                   dstq, [dstq+strideq*2]
   inc                   lineq
   jnz .loop
   REP_RET
-%endif
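
Two things fall out of the new loop above. Dropping the xmm register count from 10 to 8 means the routine no longer touches xmm8/xmm9, which exist only in 64-bit mode, so the %if ARCH_X86_64 guard (and the 32-bit carve-outs in the test and rtcd files) can go. And the halved left-neighbor traffic comes from loading left[2*i] and left[2*i+1] with a single movd and extracting the second pixel via pshuflw m7, m2, 0x55 instead of issuing a second load. A rough SSE2-intrinsics rendering of that load-and-broadcast idiom (an illustrative helper, not code from the tree):

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* Broadcast left[2*i] into *row0 and left[2*i+1] into *row1 as 8 words each,
 * using one 32-bit load where the old loop used two. Like the movd in the
 * assembly, the load reads two bytes beyond the pair it actually uses. */
static void broadcast_two_left_pixels(const uint8_t *left, int i,
                                      __m128i *row0, __m128i *row1) {
  int32_t bits;
  memcpy(&bits, left + 2 * i, sizeof(bits));        /* one movd-style load  */
  __m128i px = _mm_cvtsi32_si128(bits);
  px = _mm_unpacklo_epi8(px, _mm_setzero_si128());  /* bytes -> words       */
  __m128i lo = _mm_shufflelo_epi16(px, 0x00);       /* word 0 = left[2*i]   */
  __m128i hi = _mm_shufflelo_epi16(px, 0x55);       /* word 1 = left[2*i+1] */
  *row0 = _mm_unpacklo_epi64(lo, lo);               /* fill all 8 words     */
  *row1 = _mm_unpacklo_epi64(hi, hi);
}
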