Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
BC
public
external
libvpx
Commits
afa1b661
Commit
afa1b661
authored
13 years ago
by
Scott LaVarnway
Committed by
Gerrit Code Review
13 years ago
Browse files
Options
Download
Plain Diff
Merge "Improved mmx/sse2 versions of iwalsh"
parents
a69810b8
9fa6132f
v1.14.0-linphone
1.4.X
eider
experimental
feature/update_to_v1.9.0-linphone
feature/uwp_nuget
forest
frame_parallel
highbitdepth
indianrunnerduck
javanwhistlingduck
khakicampbell
linphone
linphone-android
linphone-old
longtailedduck
m29-baseline
m31-baseline
m49-2623
m52-2743
m54-2840
m56-2924
m66-3359
m68-3440
mandarinduck
mcw
mcw2
nextgen
nextgenv2
pcs-2013
playground
sandbox/Jingning/experimental
sandbox/Jingning/transcode
sandbox/Jingning/vpx
sandbox/aconverse@google.com/ansbench
sandbox/debargha/playground
sandbox/hkuang/frame_parallel
sandbox/hkuang@google.com/decode
sandbox/jimbankoski@google.com/proposed-aom
sandbox/jingning@google.com/decoder_test_suite
sandbox/jingning@google.com/experimental
sandbox/jkoleszar/new-rtcd
sandbox/jkoleszar/reuse-modemv
sandbox/jzern@google.com/test
sandbox/wangch@google.com/vp9
sandbox/yaowu@google.com/mergeaom
stable-vp9-decoder
v1.12.0-linphone
v1.6.1_linphone
v1.7.0-linphone
v1.9.0-linphone
vp9-preview
v1.9.0
v1.9.0-rc1
v1.8.2
v1.8.1
v1.8.0
v1.7.0
v1.6.1
v1.6.0
v1.5.0
v1.4.0
v1.3.0
v1.2.0
v1.1.0
v1.0.0
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
vp8/common/x86/iwalsh_mmx.asm
+94
-131
vp8/common/x86/iwalsh_mmx.asm
vp8/common/x86/iwalsh_sse2.asm
+75
-115
vp8/common/x86/iwalsh_sse2.asm
with
169 additions
and
246 deletions
vp8/common/x86/iwalsh_mmx.asm
+
94
−
131
View file @
afa1b661
...
...
@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
push
rsi
push
rdi
; end prolog
mov
rax
,
3
mov
rsi
,
arg
(
0
)
mov
rdi
,
arg
(
1
)
shl
rax
,
16
mov
rdx
,
arg
(
0
)
mov
rax
,
30003h
movq
mm0
,
[
r
si
+
0
]
;ip[0]
movq
mm1
,
[
r
si
+
8
]
;ip[4]
or
rax
,
3
;00030003h
movq
mm0
,
[
r
dx
+
0
]
;ip[0]
movq
mm1
,
[
r
dx
+
8
]
;ip[4]
movd
mm7
,
rax
movq
mm2
,
[
rsi
+
16
]
;ip[8]
movq
mm3
,
[
rsi
+
24
]
;ip[12]
movq
mm2
,
[
rdx
+
16
]
;ip[8]
movq
mm3
,
[
rdx
+
24
]
;ip[12]
punpcklwd
mm7
,
mm7
;0003000300030003h
mov
rdx
,
arg
(
1
)
movq
mm
7
,
rax
movq
mm
4
,
mm
0
movq
mm
4
,
mm0
movq
mm
5
,
mm
1
p
unpcklwd
mm
7
,
mm
7
;
0003000300030003h
movq
mm5
,
mm
1
p
addw
mm
4
,
mm
3
;
ip[0] + ip[12] aka al
paddw
mm5
,
mm
2
;ip[4] + ip[8] aka bl
paddw
mm4
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm4
;temp al
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
movq
mm6
,
mm4
;temp al
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
paddw
mm1
,
mm7
paddw
mm6
,
mm7
psraw
mm1
,
3
psraw
mm6
,
3
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
paddw
mm0
,
mm7
paddw
mm5
,
mm7
psraw
mm0
,
3
psraw
mm5
,
3
;~~~~~~~~~~~~~~~~~~~~~
movq
mm3
,
mm1
; 03 02 01 00
punpcklwd
mm1
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm4
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm4
,
mm5
; 33 23 32 22
movq
mm0
,
mm1
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm1
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm4
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm4
; 33 23 13 03 aka ip[12]
paddw
mm0
,
mm7
paddw
mm1
,
mm7
paddw
mm2
,
mm7
paddw
mm3
,
mm7
psraw
mm0
,
3
psraw
mm1
,
3
psraw
mm2
,
3
psraw
mm3
,
3
; movq [rdi + 0], mm0
; movq [rdi + 8], mm1
; movq [rdi + 16], mm2
; movq [rdi + 24], mm3
movd
eax
,
mm0
psrlq
mm0
,
32
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
mm0
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
ecx
,
mm1
psrlq
mm1
,
32
mov
word
ptr
[
rdi
+
32
*
4
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
cx
movd
ecx
,
mm1
mov
word
ptr
[
rdi
+
32
*
6
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
cx
movd
eax
,
mm2
psrlq
mm2
,
32
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
mm2
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
ecx
,
mm3
psrlq
mm3
,
32
mov
word
ptr
[
rdi
+
32
*
12
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
cx
movd
ecx
,
mm3
mov
word
ptr
[
rdi
+
32
*
14
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
psrlq
mm0
,
32
psrlq
mm1
,
32
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
1
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
5
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
9
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
13
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
psrlq
mm5
,
32
psrlq
mm6
,
32
mov
word
ptr
[
rdx
+
32
*
2
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
6
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
mov
word
ptr
[
rdx
+
32
*
10
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
14
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
UNSHADOW_ARGS
pop
rbp
ret
...
...
This diff is collapsed.
Click to expand it.
vp8/common/x86/iwalsh_sse2.asm
+
75
−
115
View file @
afa1b661
...
...
@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SAVE_XMM
6
push
rsi
push
rdi
; end prolog
mov
rsi
,
arg
(
0
)
mov
rd
i
,
arg
(
1
)
mov
rax
,
3
mov
rcx
,
arg
(
0
)
mov
rd
x
,
arg
(
1
)
mov
rax
,
3
0003h
movdqa
xmm0
,
[
r
si
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
r
si
+
16
]
;ip[12] ip[8]
movdqa
xmm0
,
[
r
cx
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
r
cx
+
16
]
;ip[12] ip[8]
shl
rax
,
16
or
rax
,
3
;00030003h
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm4
,
xmm0
movdqa
xmm4
,
xmm0
punpcklqdq
xmm0
,
xmm3
;d1 a1
punpckhqdq
xmm4
,
xmm3
;c1 b1
movd
xmm6
,
eax
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;;;temp output
;; movdqu [rdi + 0], xmm4
;; movdqu [rdi + 16], xmm3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
movd
xmm0
,
eax
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
pshufd
xmm
6
,
xmm
6
,
0
;03 03 03 03 03 03 03 03
pshufd
xmm
0
,
xmm
0
,
0
;03 03 03 03 03 03 03 03
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm5
,
xmm4
movdqa
xmm5
,
xmm4
punpcklqdq
xmm4
,
xmm3
;d1 a1
punpckhqdq
xmm5
,
xmm3
;c1 b1
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm0
,
xmm5
; 13 12 11 10 03 02 01 00
punpcklwd
xmm5
,
xmm4
; 23 03 22 02 21 01 20 00
punpckhwd
xmm0
,
xmm4
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm5
; 23 03 22 02 21 01 20 00
punpcklwd
xmm5
,
xmm0
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm0
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
paddw
xmm5
,
xmm6
paddw
xmm1
,
xmm6
psraw
xmm5
,
3
psraw
xmm1
,
3
;; movdqa [rdi + 0], xmm5
;; movdqa [rdi + 16], xmm1
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
4
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
ax
movd
eax
,
xmm5
mov
word
ptr
[
rdi
+
32
*
6
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
12
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
ax
movd
eax
,
xmm1
mov
word
ptr
[
rdi
+
32
*
14
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
ax
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
paddw
xmm5
,
xmm0
paddw
xmm4
,
xmm0
psraw
xmm5
,
3
psraw
xmm4
,
3
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
2
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
6
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
10
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
14
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
1
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
5
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
mov
word
ptr
[
rdx
+
32
*
9
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
13
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
SECTION
_RODATA
align
16
x_s1sqr2:
times
4
dw
0x8A8C
align
16
x_c1sqr2less1:
times
4
dw
0x4E7B
align
16
fours:
times
4
dw
0x0004
This diff is collapsed.
Click to expand it.
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets