Commit 4b5e462d authored by Linfeng Zhang's avatar Linfeng Zhang

Upgrade vpx_lpf_{vertical,horizontal}_4 mmx to sse2

Followed the code style of other lpf fuctions.
These 2 functions put 2 rows of data in a single xmm register,
so they have similar but not identical filter operations,
and cannot share the same macros.

Change-Id: I3bab55a5d1a1232926ac8fd1f03251acc38302bc
parent 2240d83d
......@@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) {
using std::tr1::make_tuple;
#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
MMX, Loop8Test6Param,
::testing::Values(
make_tuple(&vpx_lpf_horizontal_4_mmx,
&vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_mmx,
&vpx_lpf_vertical_4_c, 8)));
#endif // HAVE_MMX
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
......@@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, Loop8Test6Param,
::testing::Values(
make_tuple(&vpx_lpf_horizontal_4_sse2,
&vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_horizontal_8_sse2,
&vpx_lpf_horizontal_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
&vpx_lpf_horizontal_edge_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
&vpx_lpf_horizontal_edge_16_c, 8),
make_tuple(&vpx_lpf_vertical_4_sse2,
&vpx_lpf_vertical_4_c, 8),
make_tuple(&vpx_lpf_vertical_8_sse2,
&vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_vertical_16_sse2,
......
......@@ -39,7 +39,6 @@ endif
DSP_SRCS-yes += intrapred.c
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
......
......@@ -543,7 +543,7 @@ specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
......@@ -564,7 +564,7 @@ specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
......
;
; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
align 16
tfe:
times 8 db 0xfe
t80:
times 8 db 0x80
t3:
times 8 db 0x03
t4:
times 8 db 0x04
ones:
times 4 dw 0x0001
SECTION .text
%define stkreg rsp
%define t0 0
%define t1 t0 + 16
%define p1 t1 + 16
%define p0 p1 + 16
%define q0 p0 + 16
%define q1 q0 + 16
%define lstacksize q1 + 16
%define goffsetq _limitq
;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int src_pixel_step,
; const char *blimit, const char *limit,
; const char *thresh);
INIT_MMX mmx
cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \
s, p, _blimit, _limit, _thresh, s1
movq m7, [_limitq]
GET_GOT goffsetq
%if GET_GOT_DEFINED=1
add rsp, gprsize ; restore stack
%endif
lea s1q, [sq + pq] ; s1q points to row +1
; calculate breakout conditions
movq m2, [s1q + 2 * pq] ; q3
movq m1, [ sq + 2 * pq] ; q2
movq m6, m1 ; q2
psubusb m1, m2 ; q2-=q3
psubusb m2, m6 ; q3-=q2
por m1, m2 ; abs(q3-q2)
psubusb m1, m7
movq m4, [sq + pq] ; q1
movq m3, m4 ; q1
psubusb m4, m6 ; q1-=q2
psubusb m6, m3 ; q2-=q1
por m4, m6 ; abs(q2-q1)
psubusb m4, m7
por m1, m4
movq m4, [sq] ; q0
movq m0, m4 ; q0
psubusb m4, m3 ; q0-=q1
psubusb m3, m0 ; q1-=q0
por m4, m3 ; abs(q0-q1)
movq [stkreg + t0], m4 ; save to t0
psubusb m4, m7
por m1, m4
neg pq ; negate pitch to deal with
; above border
movq m2, [ sq + 4 * pq] ; p3
movq m4, [s1q + 4 * pq] ; p2
movq m5, m4 ; p2
psubusb m4, m2 ; p2-=p3
psubusb m2, m5 ; p3-=p2
por m4, m2 ; abs(p3 - p2)
psubusb m4, m7
por m1, m4
movq m4, [sq + 2 * pq] ; p1
movq m3, m4 ; p1
psubusb m4, m5 ; p1-=p2
psubusb m5, m3 ; p2-=p1
por m4, m5 ; abs(p2 - p1)
psubusb m4, m7
por m1, m4
movq m2, m3 ; p1
movq m4, [sq + pq] ; p0
movq m5, m4 ; p0
psubusb m4, m3 ; p0-=p1
psubusb m3, m5 ; p1-=p0
por m4, m3 ; abs(p1 - p0)
movq [stkreg + t1], m4 ; save to t1
psubusb m4, m7
por m1, m4
movq m3, [s1q] ; q1
movq m4, m3 ; q1
psubusb m3, m2 ; q1-=p1
psubusb m2, m4 ; p1-=q1
por m2, m3 ; abs(p1-q1)
pand m2, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw m2, 1 ; abs(p1-q1)/2
movq m6, m5 ; p0
movq m3, [sq] ; q0
psubusb m5, m3 ; p0-=q0
psubusb m3, m6 ; q0-=p0
por m5, m3 ; abs(p0 - q0)
paddusb m5, m5 ; abs(p0-q0)*2
paddusb m5, m2 ; abs (p0 - q0) * 2 + abs(p1-q1)/2
movq m7, [_blimitq] ; blimit
psubusb m5, m7 ; abs (p0 - q0) * 2 +
; abs(p1-q1)/2 > blimit
por m1, m5
pxor m5, m5
pcmpeqb m1, m5 ; mask m1
; calculate high edge variance
movq m7, [_threshq]
movq m4, [stkreg + t0] ; get abs (q1 - q0)
psubusb m4, m7
movq m3, [stkreg + t1] ; get abs (p1 - p0)
psubusb m3, m7
paddb m4, m3 ; abs(q1 - q0) > thresh ||
; abs(p1 - p0) > thresh
pcmpeqb m4, m5
pcmpeqb m5, m5
movq m3, [GLOBAL(t80)]
pxor m4, m5
; start work on filters
movq m2, [sq + 2 * pq] ; p1
movq m7, [s1q] ; q1
pxor m2, m3 ; p1 converted to signed values
pxor m7, m3 ; q1 converted to signed values
psubsb m2, m7 ; p1 - q1
pand m2, m4 ; high var mask (hvm)(p1 - q1)
pxor m6, m3 ; p0 converted to signed values
pxor m0, m3 ; q0 converted to signed values
movq m3, m0 ; q0
psubsb m0, m6 ; q0 - p0
paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1)
paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1)
paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1)
pand m1, m2 ; mask filter values we don't
; care about
movq m2, m1
paddsb m1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
paddsb m2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
pxor m0, m0
pxor m5, m5
punpcklbw m0, m2
punpckhbw m5, m2
psraw m0, 11
psraw m5, 11
packsswb m0, m5
movq m2, m0 ; (3* (q0 - p0) + hvm(p1 - q1)
; + 3) >> 3;
pxor m0, m0
movq m5, m1 ; abcdefgh
punpcklbw m0, m1 ; e0f0g0h0
psraw m0, 11 ; sign extended shift right by 3
pxor m1, m1
punpckhbw m1, m5 ; a0b0c0d0
psraw m1, 11 ; sign extended shift right by 3
movq m5, m0 ; save results
packsswb m0, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
; + 4) >>3
paddsw m5, [GLOBAL(ones)]
paddsw m1, [GLOBAL(ones)]
psraw m5, 1
psraw m1, 1
packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
; + 4) >>4
movq m1, [GLOBAL(t80)]
pandn m4, m5 ; high edge variance additive
paddsb m6, m2 ; p0+= p0 add
pxor m6, m1 ; unoffset
movq [sq + pq], m6 ; write back
movq m6, [sq + 2 * pq] ; p1
pxor m6, m1 ; reoffset
paddsb m6, m4 ; p1+= p1 add
pxor m6, m1 ; unoffset
movq [sq + 2 * pq], m6 ; write back
psubsb m3, m0 ; q0-= q0 add
pxor m3, m1 ; unoffset
movq [sq], m3 ; write back
psubsb m7, m4 ; q1-= q1 add
pxor m7, m1 ; unoffset
movq [s1q], m7 ; write back
RET
;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int src_pixel_step,
; const char *blimit, const char *limit,
; const char *thresh);
INIT_MMX mmx
cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
s, p, _blimit, _limit, _thresh, s1
lea sq, [sq + pq * 4 - 4]
lea s1q, [sq + pq] ; s1q points to row +1
;transpose
movq m6, [ sq + 2 * pq] ; 67 66 65 64 63 62 61 60
movq m7, m6 ; 77 76 75 74 73 72 71 70
punpckhbw m7, [s1q + 2 * pq] ; 77 67 76 66 75 65 74 64
punpcklbw m6, [s1q + 2 * pq] ; 73 63 72 62 71 61 70 60
movq m4, [sq] ; 47 46 45 44 43 42 41 40
movq m5, m4 ; 47 46 45 44 43 42 41 40
punpckhbw m5, [sq + pq] ; 57 47 56 46 55 45 54 44
punpcklbw m4, [sq + pq] ; 53 43 52 42 51 41 50 40
movq m3, m5 ; 57 47 56 46 55 45 54 44
punpckhwd m5, m7 ; 77 67 57 47 76 66 56 46
punpcklwd m3, m7 ; 75 65 55 45 74 64 54 44
movq m2, m4 ; 53 43 52 42 51 41 50 40
punpckhwd m4, m6 ; 73 63 53 43 72 62 52 42
punpcklwd m2, m6 ; 71 61 51 41 70 60 50 40
neg pq
movq m6, [ sq + pq * 2] ; 27 26 25 24 23 22 21 20
movq m1, m6 ; 27 26 25 24 23 22 21 20
punpckhbw m6, [ sq + pq ] ; 37 27 36 36 35 25 34 24
punpcklbw m1, [ sq + pq ] ; 33 23 32 22 31 21 30 20
movq m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00
punpckhbw m7, [s1q + pq * 4] ; 17 07 16 06 15 05 14 04
movq m0, m7 ; 17 07 16 06 15 05 14 04
punpckhwd m7, m6 ; 37 27 17 07 36 26 16 06
punpcklwd m0, m6 ; 35 25 15 05 34 24 14 04
movq m6, m7 ; 37 27 17 07 36 26 16 06
punpckhdq m7, m5 ; 77 67 57 47 37 27 17 07 = q3
punpckldq m6, m5 ; 76 66 56 46 36 26 16 06 = q2
movq m5, m6 ; 76 66 56 46 36 26 16 06
psubusb m5, m7 ; q2-q3
psubusb m7, m6 ; q3-q2
por m7, m5; ; m7=abs (q3-q2)
movq m5, m0 ; 35 25 15 05 34 24 14 04
punpckhdq m5, m3 ; 75 65 55 45 35 25 15 05 = q1
punpckldq m0, m3 ; 74 64 54 44 34 24 15 04 = q0
movq m3, m5 ; 75 65 55 45 35 25 15 05 = q1
psubusb m3, m6 ; q1-q2
psubusb m6, m5 ; q2-q1
por m6, m3 ; m6=abs(q2-q1)
movq [stkreg + q1], m5 ; save q1
movq [stkreg + q0], m0 ; save q0
movq m3, [ sq + pq * 4] ; 07 06 05 04 03 02 01 00
punpcklbw m3, [s1q + pq * 4] ; 13 03 12 02 11 01 10 00
movq m0, m3 ; 13 03 12 02 11 01 10 00
punpcklwd m0, m1 ; 31 21 11 01 30 20 10 00
punpckhwd m3, m1 ; 33 23 13 03 32 22 12 02
movq m1, m0 ; 31 21 11 01 30 20 10 00
punpckldq m0, m2 ; 70 60 50 40 30 20 10 00 =p3
punpckhdq m1, m2 ; 71 61 51 41 31 21 11 01 =p2
movq m2, m1 ; 71 61 51 41 31 21 11 01 =p2
psubusb m2, m0 ; p2-p3
psubusb m0, m1 ; p3-p2
por m0, m2 ; m0=abs(p3-p2)
movq m2, m3 ; 33 23 13 03 32 22 12 02
punpckldq m2, m4 ; 72 62 52 42 32 22 12 02 = p1
punpckhdq m3, m4 ; 73 63 53 43 33 23 13 03 = p0
movq [stkreg + p0], m3 ; save p0
movq [stkreg + p1], m2 ; save p1
movq m5, m2 ; m5 = p1
psubusb m2, m1 ; p1-p2
psubusb m1, m5 ; p2-p1
por m1, m2 ; m1=abs(p2-p1)
movq m4, [_limitq]
GET_GOT goffsetq
%if GET_GOT_DEFINED=1
add rsp, gprsize ; restore stack
%endif
psubusb m7, m4
psubusb m0, m4
psubusb m1, m4
psubusb m6, m4
por m7, m6
por m0, m1
por m0, m7 ; abs(q3-q2) > limit ||
; abs(p3-p2) > limit ||
; abs(p2-p1) > limit ||
; abs(q2-q1) > limit
movq m1, m5 ; p1
movq m7, m3 ; m3=m7=p0
psubusb m7, m5 ; p0 - p1
psubusb m5, m3 ; p1 - p0
por m5, m7 ; abs(p1-p0)
movq [stkreg + t0], m5 ; save abs(p1-p0)
psubusb m5, m4
por m0, m5 ; m0=mask
movq m5, [stkreg + q0] ; m5=q0
movq m7, [stkreg + q1] ; m7=q1
movq m6, m5 ; m6=q0
movq m2, m7 ; q1
psubusb m5, m7 ; q0-q1
psubusb m7, m6 ; q1-q0
por m7, m5 ; abs(q1-q0)
movq [stkreg + t1], m7 ; save abs(q1-q0)
psubusb m7, m4
por m0, m7 ; mask
movq m5, m2 ; q1
psubusb m5, m1 ; q1-=p1
psubusb m1, m2 ; p1-=q1
por m5, m1 ; abs(p1-q1)
pand m5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw m5, 1 ; abs(p1-q1)/2
movq m4, [_blimitq]
movq m1, m3 ; m1=m3=p0
movq m7, m6 ; m7=m6=q0
psubusb m1, m7 ; p0-q0
psubusb m7, m3 ; q0-p0
por m1, m7 ; abs(q0-p0)
paddusb m1, m1 ; abs(q0-p0)*2
paddusb m1, m5 ; abs(p0 - q0)*2 + abs(p1-q1)/2
psubusb m1, m4 ; abs(p0 - q0)*2 + abs(p1-q1)/2
; > blimit
por m1, m0; ; mask
pxor m0, m0
pcmpeqb m1, m0
; calculate high edge variance
movq m7, [_threshq]
movq m4, [stkreg + t0] ; get abs (q1 - q0)
psubusb m4, m7
movq m3, [stkreg + t1] ; get abs (p1 - p0)
psubusb m3, m7
por m4, m3 ; abs(q1 - q0) > thresh ||
; abs(p1 - p0) > thresh
pcmpeqb m4, m0
pcmpeqb m0, m0
movq m3, [GLOBAL(t80)]
pxor m4, m0
; start work on filters
movq m2, [stkreg + p1]
movq m7, [stkreg + q1]
movq m6, [stkreg + p0]
movq m0, [stkreg + q0]
pxor m2, m3
pxor m7, m3
psubsb m2, m7 ; p1 - q1
pand m2, m4 ; high var mask (hvm)(p1 - q1)
pxor m6, m3
pxor m0, m3
movq m3, m0 ; q0
psubsb m0, m6 ; q0 - p0
paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1)
paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1)
paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1)
pand m1, m2 ; mask filter values we don't
; care about
movq m2, m1
paddsb m1, [GLOBAL(t4)] ; 3*(q0 - p0) + hvm(p1 - q1) + 4
paddsb m2, [GLOBAL(t3)] ; 3*(q0 - p0) + hvm(p1 - q1) + 3
pxor m0, m0
pxor m5, m5
punpcklbw m0, m2
punpckhbw m5, m2
psraw m0, 11
psraw m5, 11
packsswb m0, m5
movq m2, m0 ; (3*(q0 - p0) + hvm(p1 - q1)
; + 3) >> 3;
pxor m0, m0
movq m5, m1 ; abcdefgh
punpcklbw m0, m1 ; e0f0g0h0
psraw m0, 11 ; sign extended shift right by 3
pxor m1, m1
punpckhbw m1, m5 ; a0b0c0d0
psraw m1, 11 ; sign extended shift right by 3
movq m5, m0 ; save results
packsswb m0, m1 ; (3*(q0 - p0) + hvm(p1 - q1)
; + 4) >>3
paddsw m5, [GLOBAL(ones)]
paddsw m1, [GLOBAL(ones)]
psraw m5, 1
psraw m1, 1
packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
; + 4) >>4
pandn m4, m5 ; high edge variance additive
movq m5, [GLOBAL(t80)]
paddsb m6, m2 ; p0+= p0 add
pxor m6, m5 ; unoffset
; m6=p0
movq m1, [stkreg + p1]
pxor m1, m5 ; reoffset
paddsb m1, m4 ; p1+= p1 add
pxor m1, m5 ; unoffset
; m6 = p0 m1 = p1
psubsb m3, m0 ; q0-= q0 add
pxor m3, m5 ; unoffset
; m3 = q0
psubsb m7, m4 ; q1-= q1 add
pxor m7, m5 ; unoffset
; m7 = q1
; transpose and write back
; m1 = 72 62 52 42 32 22 12 02
; m6 = 73 63 53 43 33 23 13 03
; m3 = 74 64 54 44 34 24 14 04
; m7 = 75 65 55 45 35 25 15 05
movq m2, m1 ; 72 62 52 42 32 22 12 02
punpcklbw m2, m6 ; 33 32 23 22 13 12 03 02
movq m4, m3 ; 74 64 54 44 34 24 14 04
punpckhbw m1, m6 ; 73 72 63 62 53 52 43 42
punpcklbw m4, m7 ; 35 34 25 24 15 14 05 04
punpckhbw m3, m7 ; 75 74 65 64 55 54 45 44
movq m6, m2 ; 33 32 23 22 13 12 03 02
punpcklwd m2, m4 ; 15 14 13 12 05 04 03 02
punpckhwd m6, m4 ; 35 34 33 32 25 24 23 22
movq m5, m1 ; 73 72 63 62 53 52 43 42
punpcklwd m1, m3 ; 55 54 53 52 45 44 43 42
punpckhwd m5, m3 ; 75 74 73 72 65 64 63 62
; m2 = 15 14 13 12 05 04 03 02
; m6 = 35 34 33 32 25 24 23 22
; m5 = 55 54 53 52 45 44 43 42
; m1 = 75 74 73 72 65 64 63 62
movd [sq + pq * 4 + 2], m2
psrlq m2, 32
movd [s1q + pq * 4 + 2], m2
movd [sq + pq * 2 + 2], m6
psrlq m6, 32
movd [sq + pq + 2], m6
movd [sq + 2], m1
psrlq m1, 32
movd [s1q + 2], m1
neg pq
movd [s1q + pq + 2], m5
psrlq m5, 32
movd [s1q + pq * 2 + 2], m5
RET
......@@ -18,6 +18,213 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
// filter_mask and hev_mask
#define FILTER_HEV_MASK do { \
/* (abs(q1 - q0), abs(p1 - p0) */ \
__m128i flat = abs_diff(q1p1, q0p0); \
/* abs(p1 - q1), abs(p0 - q0) */ \
const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
__m128i abs_p0q0, abs_p1q1, work; \
\
/* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
hev = _mm_cmpgt_epi16(hev, thresh); \
hev = _mm_packs_epi16(hev, hev); \
\
/* const int8_t mask = filter_mask(*limit, *blimit, */ \
/* p3, p2, p1, p0, q0, q1, q2, q3); */ \
abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
/* abs(p3 - p2), abs(p2 - p1) */ \
work = abs_diff(p3p2, p2p1); \
flat = _mm_max_epu8(work, flat); \
/* abs(q3 - q2), abs(q2 - q1) */ \
work = abs_diff(q3q2, q2q1); \
flat = _mm_max_epu8(work, flat); \
flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
mask = _mm_unpacklo_epi64(mask, flat); \
mask = _mm_subs_epu8(mask, limit); \
mask = _mm_cmpeq_epi8(mask, zero); \
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
} while (0)
#define FILTER4 do { \
const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \
4, 4, 4, 4, 4, 4, 4, 4); \
const __m128i t80 = _mm_set1_epi8(0x80); \
__m128i filter, filter2filter1, work; \
\
ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
qs1qs0 = _mm_xor_si128(q1q0, t80); \
\
/* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
work = _mm_subs_epi8(ps1ps0, qs1qs0); \
filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
/* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
filter = _mm_subs_epi8(filter, work); \
filter = _mm_subs_epi8(filter, work); \
filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
filter = _mm_and_si128(filter, mask); /* & mask */ \
filter = _mm_unpacklo_epi64(filter, filter); \
\
/* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
/* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
\
/* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
filter = _mm_unpacklo_epi8(filter, filter); \
filter = _mm_srai_epi16(filter, 9); /* round */ \
filter = _mm_packs_epi16(filter, filter); \
filter = _mm_andnot_si128(hev, filter); \
\
hev = _mm_unpackhi_epi64(filter2filter1, filter); \
filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
\
/* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
/* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
} while (0)
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i limit =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
const __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
_mm_loadl_epi64((__m128i *)(s - 4 * p)));
q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
_mm_loadl_epi64((__m128i *)(s + 1 * p)));
q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
_mm_loadl_epi64((__m128i *)(s + 0 * p)));
q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
_mm_loadl_epi64((__m128i *)(s + 3 * p)));
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
FILTER_HEV_MASK;
FILTER4;
_mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
_mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
_mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
_mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
}
void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i limit =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
const __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i x0, x1, x2, x3;
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
// Transpose 8x8
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
p1p0 = _mm_unpacklo_epi16(q1q0, x1);
// 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
x0 = _mm_unpacklo_epi16(x2, x3);
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
p3p2 = _mm_unpacklo_epi32(p1p0, x0);
// 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
p1p0 = _mm_unpackhi_epi32(p1p0, x0);
p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
q1q0 = _mm_unpackhi_epi16(q1q0, x1);
// 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
x2 = _mm_unpackhi_epi16(x2, x3);
// 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
q3q2 = _mm_unpackhi_epi32(q1q0, x2);
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
q1q0 = _mm_unpacklo_epi32(q1q0, x2);
q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
FILTER_HEV_MASK;
FILTER4;
// Transpose 8x4 to 4x8
// qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37
// ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
// 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
// 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
*(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
*(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
const unsigned char *_blimit,