Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
BC
public
external
libvpx
Commits
e8f8e776
Commit
e8f8e776
authored
11 years ago
by
Yunqing Wang
Committed by
Gerrit Code Review
11 years ago
Browse files
Options
Download
Plain Diff
Merge "Fix decoder mismatch with ssse3 enabled"
parents
75673cfc
3d50da53
v1.14.0-linphone
1.4.X
feature/update_to_v1.9.0-linphone
feature/uwp_nuget
frame_parallel
highbitdepth
indianrunnerduck
javanwhistlingduck
khakicampbell
linphone
linphone-android
linphone-old
longtailedduck
m49-2623
m52-2743
m54-2840
m56-2924
m66-3359
m68-3440
mandarinduck
mcw
mcw2
nextgen
nextgenv2
playground
sandbox/Jingning/experimental
sandbox/Jingning/transcode
sandbox/Jingning/vpx
sandbox/aconverse@google.com/ansbench
sandbox/debargha/playground
sandbox/hkuang/frame_parallel
sandbox/hkuang@google.com/decode
sandbox/jimbankoski@google.com/proposed-aom
sandbox/jingning@google.com/decoder_test_suite
sandbox/jingning@google.com/experimental
sandbox/jzern@google.com/test
sandbox/wangch@google.com/vp9
sandbox/yaowu@google.com/mergeaom
v1.12.0-linphone
v1.6.1_linphone
v1.7.0-linphone
v1.9.0-linphone
v1.9.0
v1.9.0-rc1
v1.8.2
v1.8.1
v1.8.0
v1.7.0
v1.6.1
v1.6.0
v1.5.0
v1.4.0
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+37
-19
vp9/common/x86/vp9_subpixel_8t_ssse3.asm
with
37 additions
and
19 deletions
vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+
37
−
19
View file @
e8f8e776
...
...
@@ -11,17 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in horizontal direction, calculating ONE
; row each iteration to take advantage of the 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
%macro VERTx4 1
mov
rdx
,
arg
(
5
)
;filter ptr
mov
rsi
,
arg
(
0
)
;src_ptr
...
...
@@ -81,11 +70,14 @@
pmaddubsw
xmm4
,
k4k5
pmaddubsw
xmm6
,
k6k7
movdqa
xmm1
,
xmm2
paddsw
xmm0
,
xmm6
paddsw
xmm0
,
xmm2
pmaxsw
xmm2
,
xmm4
pminsw
xmm4
,
xmm1
paddsw
xmm0
,
xmm4
paddsw
xmm0
,
krd
paddsw
xmm0
,
xmm2
paddsw
xmm0
,
krd
psraw
xmm0
,
7
packuswb
xmm0
,
xmm0
...
...
@@ -538,14 +530,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa
%
2
,
%
1
pshufb
%
1
,
[
GLOBAL
(
shuf_t0t1
)]
pshufb
%
2
,
[
GLOBAL
(
shuf_t2t3
)]
pmaddubsw
%
1
,
xmm6
pmaddubsw
%
2
,
xmm
7
pmaddubsw
%
1
,
k0k1k4k5
pmaddubsw
%
2
,
k2k3k6k
7
paddsw
%
1
,
%
2
movdqa
%
2
,
%
1
movdqa
xmm4
,
%
1
movdqa
xmm5
,
%
2
psrldq
%
1
,
8
psrldq
%
2
,
8
paddsw
%
1
,
%
2
paddsw
%
1
,
xmm5
movdqa
xmm6
,
xmm5
paddsw
xmm4
,
%
2
pmaxsw
xmm5
,
%
1
pminsw
%
1
,
xmm6
paddsw
%
1
,
xmm4
paddsw
%
1
,
xmm5
paddsw
%
1
,
krd
psraw
%
1
,
7
packuswb
%
1
,
%
1
%endm
...
...
@@ -565,6 +565,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pshufhw
xmm7
,
xmm7
,
11111111b
;k2_k3_k6_k7
pshufd
xmm5
,
xmm5
,
0
;rounding
movdqa
k0k1k4k5
,
xmm6
movdqa
k2k3k6k7
,
xmm7
movdqa
krd
,
xmm5
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixels_per_line
movsxd
rdx
,
dword
ptr
arg
(
3
)
;output_pitch
movsxd
rcx
,
dword
ptr
arg
(
4
)
;output_height
...
...
@@ -826,8 +830,15 @@ sym(vp9_filter_block1d4_h8_ssse3):
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
16
*
3
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd [rsp + 16 * 2]
HORIZx4
0
add
rsp
,
16
*
3
; begin epilog
pop
rdi
pop
rsi
...
...
@@ -932,8 +943,15 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push
rdi
; end prolog
ALIGN
_STACK
16
,
rax
sub
rsp
,
16
*
3
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd [rsp + 16 * 2]
HORIZx4
1
add
rsp
,
16
*
3
; begin epilog
pop
rdi
pop
rsi
...
...
This diff is collapsed.
Click to expand it.
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets