Commit 1700b4e6, authored Oct 29, 2013 by Diego Biurrun

x86: vp8dsp: Split loopfilter code into a separate file

parent 056fd4fe
Showing 3 changed files with 1586 additions and 1556 deletions
libavcodec/x86/Makefile                 +2    -1
libavcodec/x86/vp8dsp.asm               +0    -1555
libavcodec/x86/vp8dsp_loopfilter.asm    +1584 -0
libavcodec/x86/Makefile

...
@@ -88,4 +88,5 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
-YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
+YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o                  \
+                                          x86/vp8dsp_loopfilter.o
libavcodec/x86/vp8dsp.asm

...
@@ -143,27 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_27:    times 8 dw 27
pw_63:    times 8 dw 63
pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_4:     times 16 db 4
pb_F8:    times 16 db 0xF8
pb_FE:    times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db 9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pw_9
cextern pw_18
cextern pw_64
cextern pb_80

SECTION .text
...
@@ -1237,1544 +1223,3 @@ VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
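
; As a C-level sketch (illustrative names, not the actual C code), the
; filter applied below does, per edge pixel:
;
;   if (FFABS(p0 - q0) * 2 + FFABS(p1 - q1) / 2 <= flim) {
;       // values biased to signed range, cf. the pxor with [pb_80] below
;       int a  = clip_int8(3 * (q0 - p0) + clip_int8(p1 - q1));
;       int f1 = clip_int8(a + 4) >> 3;   // subtracted from q0
;       int f2 = clip_int8(a + 3) >> 3;   // added to p0
;       q0 -= f1;
;       p0 += f2;
;   }
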
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro
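
; For illustration: after the interleave above, each of the first four mm
; registers pairs two rows byte-wise, e.g. m%1 = A0,B0,A1,B1,A2,B2,A3,B3,
; so the TRANSPOSE4x4W in the caller turns them into p1/p0/q0/q1 columns.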
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%4, [%12+%10*4]  ; J0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%6, [%12]        ; N0-3
    punpcklbw     m%5, m%4          ; B/J
    punpcklbw     m%7, m%6          ; F/N
    punpcklbw     m%1, m%5          ; A/B/I/J interleaved
    punpcklbw     m%3, m%7          ; E/F/M/N interleaved
    movd          m%4, [%9+%11]     ; G0-3
    movd          m%6, [%12+%11]    ; O0-3
    movd          m%5, [%9+%11*2]   ; H0-3
    movd          m%7, [%12+%11*2]  ; P0-3
    punpcklbw     m%4, m%6          ; G/O
    punpcklbw     m%5, m%7          ; H/P
    punpcklbw     m%4, m%5          ; G/H/O/P interleaved
%endmacro
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd          %5d, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro
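
; For illustration: each source register holds one 16-pixel column of the
; transpose as four dwords; the movd/psrldq pairs above peel them off as,
; roughly,
;   for (n = 0; n < 4; n++) { AV_WN32(row_ptr(n), low_dword(reg)); reg >>= 32; }
; with the %10 == 8 special case spilling one dword through a GPR (%5) instead.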
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd          %3d, %1
    punpckhdq      %1, %1
    mov     [%4+%5*4], %3w
    shr            %3, 16
    add            %4, %6
    mov     [%4+%5*4], %3w

    movd          %3d, %1
    add            %4, %5
    mov     [%4+%5*2], %3w
    shr            %3, 16
    mov       [%4+%5], %3w

    movd          %3d, %2
    punpckhdq      %2, %2
    mov          [%4], %3w
    shr            %3, 16
    mov       [%4+%6], %3w

    movd          %3d, %2
    add            %4, %6
    mov       [%4+%6], %3w
    shr            %3, 16
    mov     [%4+%6*2], %3w
    add            %4, %5
%endmacro
%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw   [%3+%4], %1, 3
    pextrw      [%3], %1, 4
    pextrw      [%2], %1, 5
    pextrw   [%2+%5], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd          %2d, %1
    psrldq         %1, 4
    mov     [%3+%4*4], %2w
    shr            %2, 16
    add            %3, %5
    mov     [%3+%4*4], %2w

    movd          %2d, %1
    psrldq         %1, 4
    add            %3, %4
    mov     [%3+%4*2], %2w
    shr            %2, 16
    mov       [%3+%4], %2w

    movd          %2d, %1
    psrldq         %1, 4
    mov          [%3], %2w
    shr            %2, 16
    mov       [%3+%5], %2w

    movd          %2d, %1
    add            %3, %5
    mov       [%3+%5], %2w
    shr            %2, 16
    mov     [%3+%5*2], %2w
%endif
%endmacro
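
; For illustration: WRITE_2x4W and WRITE_8W scatter one 16-bit pair of
; filtered pixels per row (p0/q0, interleaved by the caller's SBUTTERFLY bw),
; conceptually
;   for (i = 0; i < 8; i++)
;       AV_WN16(row_i_ptr, word_i_of_source_register);
; using +/-stride addressing around the two row pointers passed in.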
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov          cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor            m0, m0
%endif
    SPLATB_REG      m7, flim, m0     ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov        strideq, mstrideq
    neg       mstrideq
%ifidn %1, h
    lea          dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova            m0, [dst1q+mstrideq*2]   ; p1
    mova            m1, [dst1q+mstrideq]     ; p0
    mova            m2, [dst1q]              ; q0
    mova            m3, [dst1q+ strideq]     ; q1
%else ; h
    lea          dst2q, [dst1q+ strideq]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova            m5, m2           ; m5=backup of q0
    mova            m6, m1           ; m6=backup of p0
    psubusb         m1, m2           ; p0-q0
    psubusb         m2, m6           ; q0-p0
    por             m1, m2           ; FFABS(p0-q0)
    paddusb         m1, m1           ; m1=FFABS(p0-q0)*2

    mova            m4, m3
    mova            m2, m0
    psubusb         m3, m0           ; q1-p1
    psubusb         m0, m4           ; p1-q1
    por             m3, m0           ; FFABS(p1-q1)
    mova            m0, [pb_80]
    pxor            m2, m0
    pxor            m4, m0
    psubsb          m2, m4           ; m2=p1-q1 (signed) backup for below
    pand            m3, [pb_FE]
    psrlq           m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb         m3, m1
    psubusb         m3, m7
    pxor            m1, m1
    pcmpeqb         m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova            m4, m5
    pxor            m5, m0
    pxor            m0, m6
    psubsb          m5, m0           ; q0-p0 (signed)
    paddsb          m2, m5
    paddsb          m2, m5
    paddsb          m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand            m2, m3           ; apply filter mask (m3)

    mova            m3, [pb_F8]
    mova            m1, m2
    paddsb          m2, [pb_4]       ; f1<<3=a+4
    paddsb          m1, [pb_3]       ; f2<<3=a+3
    pand            m2, m3
    pand            m1, m3           ; cache f2<<3

    pxor            m0, m0
    pxor            m3, m3
    pcmpgtb         m0, m2           ; which values are <0?
    psubb           m3, m2           ; -f1<<3
    psrlq           m2, 3            ; +f1
    psrlq           m3, 3            ; -f1
    pand            m3, m0
    pandn           m0, m2
    psubusb         m4, m0
    paddusb         m4, m3           ; q0-f1

    pxor            m0, m0
    pxor            m3, m3
    pcmpgtb         m0, m1           ; which values are <0?
    psubb           m3, m1           ; -f2<<3
    psrlq           m1, 3            ; +f2
    psrlq           m3, 3            ; -f2
    pand            m3, m0
    pandn           m0, m1
    paddusb         m6, m0
    psubusb         m6, m3           ; p0+f2

    ; store
%ifidn %1, v
    mova       [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc          dst1q
    SBUTTERFLY      bw, 6, 4, 0

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc          dst2q
%endif
    WRITE_8W        m6, dst2q, dst1q, mstrideq, strideq
    lea          dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc          dst3q
%endif
    WRITE_8W        m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W      m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add          dst1q, 8            ; advance 8 cols = pixels
%else ; h
    lea          dst1q, [dst1q+strideq*8-1]  ; advance 8 rows = lines
%endif
    dec          cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif

INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
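
; For illustration: each instantiation above emits one public symbol via
; x86inc's cglobal, e.g. INIT_XMM sse2 + "SIMPLE_LOOPFILTER v, 3" yields
;   void ff_vp8_v_loop_filter_simple_sse2(uint8_t *dst, int stride, int flim);
; with the macro's second argument being the number of GPRs the function uses.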
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
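
; As a C-level sketch (illustrative; the standard VP8 normal-filter gating),
; the three thresholds select which edges get filtered and how strongly:
;
;   filter = FFABS(p0-q0)*2 + FFABS(p1-q1)/2 <= flimE &&     // edge limit
;            FFABS(p3-p2) <= flimI && FFABS(p2-p1) <= flimI &&
;            FFABS(p1-p0) <= flimI && FFABS(q1-q0) <= flimI &&
;            FFABS(q2-q1) <= flimI && FFABS(q3-q2) <= flimI; // interior limit
;   hev    = FFABS(p1-p0) > hev_thr || FFABS(q1-q0) > hev_thr;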
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;               [3]=hev() result
%define stack_size mmsize * -4
%else ; h    ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor             m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG       m0, flimEq, m7   ; E
    SPLATB_REG       m1, flimIq, m7   ; I
    SPLATB_REG       m2, hevthrq, m7  ; hev_thresh

%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

    mova        m_flimE, m0
    mova        m_flimI, m1
    mova       m_hevthr, m2
%else
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_p0backup m12
%define m_q0backup m8

    ; splat function arguments
    SPLATB_REG  m_flimE, flimEq, m7   ; E
    SPLATB_REG  m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7  ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov           cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov         strideq, mstrideq
    neg        mstrideq
%ifidn %1, h
    lea           dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea           dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea           dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst1q+mstrideq*4] ; p3
    movrow           m1, [dst2q+mstrideq*4] ; p2
    movrow           m2, [dst1q+mstrideq*2] ; p1
    movrow           m5, [dst2q]            ; q1
    movrow           m6, [dst2q+ strideq*1] ; q2
    movrow           m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps           m0, [dst8q+mstrideq*4]
    movhps           m2, [dst8q+mstrideq*2]
    add           dst8q, strideq
    movhps           m1, [dst8q+mstrideq*4]
    movhps           m5, [dst8q]
    movhps           m6, [dst8q+ strideq]
    movhps           m7, [dst8q+ strideq*2]
    add           dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst1q+mstrideq*4]
    movu             m1, [dst2q+mstrideq*4]
    movu             m2, [dst1q+mstrideq*2]
    movu             m3, [dst1q+mstrideq]
    movu             m4, [dst1q]
    movu             m5, [dst2q]
    movu             m6, [dst2q+ strideq]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova     m_q0backup, m1
    movu             m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, m_q0backup
    mova     m_q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova     m_p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %2 == 16
    lea           dst8q, [dst1q+ strideq*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst1q+mstrideq*4]
    movh             m1, [dst8q+mstrideq*4]
    movh             m2, [dst1q+mstrideq*2]
    movh             m5, [dst8q+mstrideq*2]
    movh             m3, [dst1q+mstrideq]
    movh             m6, [dst8q+mstrideq]
    movh             m4, [dst1q]
    movh             m7, [dst8q]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add           dst8q, strideq
    movh             m1, [dst2q+mstrideq*4]
    movh             m6, [dst8q+mstrideq*4]
    movh             m5, [dst2q]
    movh             m7, [dst8q]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2q+ strideq]
    movh             m7, [dst8q+ strideq]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova     m_q0backup, m1
%endif
    movh             m7, [dst2q+ strideq*2]
    movh             m1, [dst8q+ strideq*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1