Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
BC
public
external
libvpx
Commits
aeca5990
Commit
aeca5990
authored
13 years ago
by
Johann
Committed by
Code Review
13 years ago
Browse files
Options
Download
Plain Diff
Merge "keep values in registers during quantization"
parents
c36b6d4d
508ae1b3
v1.14.0-linphone
1.4.X
cayuga
eider
experimental
feature/update_to_v1.9.0-linphone
feature/uwp_nuget
forest
frame_parallel
highbitdepth
indianrunnerduck
javanwhistlingduck
khakicampbell
linphone
linphone-android
linphone-old
longtailedduck
m29-baseline
m31-baseline
m49-2623
m52-2743
m54-2840
m56-2924
m66-3359
m68-3440
mandarinduck
mcw
mcw2
nextgen
nextgenv2
pcs-2013
playground
sandbox/Jingning/experimental
sandbox/Jingning/transcode
sandbox/Jingning/vpx
sandbox/aconverse@google.com/ansbench
sandbox/debargha/playground
sandbox/hkuang/frame_parallel
sandbox/hkuang@google.com/decode
sandbox/jimbankoski@google.com/proposed-aom
sandbox/jingning@google.com/decoder_test_suite
sandbox/jingning@google.com/experimental
sandbox/jkoleszar/cached-multibit
sandbox/jkoleszar/new-rate-control
sandbox/jkoleszar/new-rtcd
sandbox/jkoleszar/reuse-modemv
sandbox/jzern@google.com/test
sandbox/wangch@google.com/vp9
sandbox/yaowu@google.com/mergeaom
stable-vp9-decoder
v1.12.0-linphone
v1.6.1_linphone
v1.7.0-linphone
v1.9.0-linphone
vp9-preview
v1.9.0
v1.9.0-rc1
v1.8.2
v1.8.1
v1.8.0
v1.7.0
v1.6.1
v1.6.0
v1.5.0
v1.4.0
v1.3.0
v1.2.0
v1.1.0
v1.0.0
v0.9.7
v0.9.7-p1
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
vp8/encoder/x86/quantize_sse4.asm
+254
-0
vp8/encoder/x86/quantize_sse4.asm
vp8/encoder/x86/quantize_x86.h
+13
-0
vp8/encoder/x86/quantize_x86.h
vp8/encoder/x86/x86_csystemdependent.c
+2
-0
vp8/encoder/x86/x86_csystemdependent.c
vp8/vp8cx.mk
+1
-0
vp8/vp8cx.mk
with
270 additions
and
0 deletions
vp8/encoder/x86/quantize_sse4.asm
0 → 100644
+
254
−
0
View file @
aeca5990
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"

;------------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Regular (zbin-based) quantization of one block of 16 coefficients.
; The 16 coefficients are processed as two xmm halves (words 0-7 and 8-15)
; and, on 64-bit ABIs, the quantized output is accumulated directly in
; xmm registers (xmm4/xmm8) instead of a stack buffer -- hence the commit
; title "keep values in registers during quantization".  The 32-bit ABI
; keeps a 32-byte stack scratch area (qcoeff) instead.
;
; Register roles after the prolog:
;   rdi       = BLOCK  *b
;   rsi       = BLOCKD *d
;   xmm0/xmm1 = sz   (per-word sign masks of z, halves 0 and 1)
;   xmm2/xmm3 = y    (x + round + (x*quant >> 16), halves 0 and 1)
;   xmm6/xmm7 = x - (*zbin_ptr + zbin_oq_value), halves 0 and 1
;   xmm5      = quant_shift bytes
;   rdx       = running zrun_zbin_boost pointer, rax = its start
;------------------------------------------------------------------------------
global sym(vp8_regular_quantize_b_sse4)
sym(vp8_regular_quantize_b_sse4):
%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx                         ; GOT base for GLOBAL() below
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax                     ; movdqa scratch stores need 16B alignment

    %define qcoeff      0                   ; 32 bytes of quantized-coeff scratch
    %define stack_size 32
    sub         rsp, stack_size
%else
  %ifidn __OUTPUT_FORMAT__,x64
    SAVE_XMM 8, u                           ; Win64: xmm6-xmm8 are callee-saved
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; SysV AMD64: arguments already arrive in rdi/rsi
    ;mov         rdi, rdi                   ; BLOCK *b
    ;mov         rsi, rsi                   ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value across all 8 words of xmm7
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz = per-word sign mask (0 or 0xffff)
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)  via (z ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)   (kept for the per-rc zbin test)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    ; rcx/rdx are free again; repoint them before the per-rc loop
    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; zero the qcoeff accumulators (stack on 32-bit, xmm4/xmm8 on 64-bit)
    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost: rax keeps the start so rdx can be reset on each hit
    mov         rax, rdx

;------------------------------------------------------------------------------
; ZIGZAG_LOOP rc, lane, y_reg, xminus_reg, qcoeff_reg
;   %1 = rc     : zig-zag scan index (0-15); byte index into quant_shift,
;                 word slot in the output, and suffix of the skip label
;   %2 = lane   : word lane (0-7) within this half's xmm registers
;   %3 = y for this half (xmm2 or xmm3)
;   %4 = x - (zbin + zbin_oq) for this half (xmm6 or xmm7)
;   %5 = qcoeff accumulator for this half (xmm4 or xmm8; 64-bit ABIs only)
; Quantizes one coefficient: skips it unless x >= zbin + running boost,
; downshifts y by quant_shift[rc], and on a nonzero result stores it and
; resets the boost pointer.
;------------------------------------------------------------------------------
%macro ZIGZAG_LOOP 5
    ; x (already reduced by zbin + zbin_oq)
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin (running zrun_zbin_boost)
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          rq_zigzag_loop_%1           ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc] (byte rc of xmm5)
    sar         edi, cl                     ; also sets Z bit
    je          rq_zigzag_loop_%1           ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    ; reload the quantized values from the stack scratch area
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    ; 64-bit: second half already lives in xmm8; alias it as xmm5 below
    %define     xmm5 xmm8
%endif

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz   (restore original signs)
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    ; select the last value (in zig_zag order) for EOB:
    ; build a byte mask of zero coefficients, permute it into zig-zag order,
    ; invert it, and take (highest set bit + 1); 0 when all coeffs are zero.
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm6
    packsswb    xmm4, xmm5                  ; one byte per coefficient
    pshufb      xmm4, [GLOBAL(zig_zag1d)]   ; reorder bytes into scan order
    pmovmskb    edx, xmm4                   ; 16-bit zero-coeff mask
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert: bits = nonzero coeffs
    bsr         eax, edx                    ; index of last nonzero (undef if 0)
    sub         edi, edx
    sar         edi, 31                     ; edi = -1 if any nonzero, else 0
    add         eax, 1
    and         eax, edi                    ; force eob = 0 when mask was empty

    mov         [rsi + vp8_blockd_eob], eax

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp                         ; undo ALIGN_STACK (see x86_abi_support.asm)

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5                               ; drop the xmm8 alias
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
; pshufb control: byte i of the result takes the zero-mask byte of
; coefficient zig_zag1d[i], i.e. raster -> zig-zag reordering.
zig_zag1d:
  db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
This diff is collapsed.
Click to expand it.
vp8/encoder/x86/quantize_x86.h
+
13
−
0
View file @
aeca5990
...
@@ -51,4 +51,17 @@ extern prototype_quantize_block(vp8_fast_quantize_b_ssse3);
...
@@ -51,4 +51,17 @@ extern prototype_quantize_block(vp8_fast_quantize_b_ssse3);
#endif
/* HAVE_SSSE3 */
#endif
/* HAVE_SSSE3 */
#if HAVE_SSE4_1
extern
prototype_quantize_block
(
vp8_regular_quantize_b_sse4
);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_quantb
#define vp8_quantize_quantb vp8_regular_quantize_b_sse4
#endif
/* !CONFIG_RUNTIME_CPU_DETECT */
#endif
/* HAVE_SSE4_1 */
#endif
/* QUANTIZE_X86_H */
#endif
/* QUANTIZE_X86_H */
This diff is collapsed.
Click to expand it.
vp8/encoder/x86/x86_csystemdependent.c
+
2
−
0
View file @
aeca5990
...
@@ -313,6 +313,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
...
@@ -313,6 +313,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi
->
rtcd
.
variance
.
sad8x8x8
=
vp8_sad8x8x8_sse4
;
cpi
->
rtcd
.
variance
.
sad8x8x8
=
vp8_sad8x8x8_sse4
;
cpi
->
rtcd
.
variance
.
sad4x4x8
=
vp8_sad4x4x8_sse4
;
cpi
->
rtcd
.
variance
.
sad4x4x8
=
vp8_sad4x4x8_sse4
;
cpi
->
rtcd
.
search
.
full_search
=
vp8_full_search_sadx8
;
cpi
->
rtcd
.
search
.
full_search
=
vp8_full_search_sadx8
;
cpi
->
rtcd
.
quantize
.
quantb
=
vp8_regular_quantize_b_sse4
;
}
}
#endif
#endif
...
...
This diff is collapsed.
Click to expand it.
vp8/vp8cx.mk
+
1
−
0
View file @
aeca5990
...
@@ -117,6 +117,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
...
@@ -117,6 +117,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
VP8_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/variance_impl_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSSE3)
+=
encoder/x86/quantize_ssse3.asm
VP8_CX_SRCS-$(HAVE_SSE4_1)
+=
encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(HAVE_SSE4_1)
+=
encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(HAVE_SSE4_1)
+=
encoder/x86/quantize_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64)
+=
encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64)
+=
encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64)
+=
encoder/x86/encodeopt.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64)
+=
encoder/x86/encodeopt.asm
VP8_CX_SRCS-$(ARCH_X86_64)
+=
encoder/x86/ssim_opt.asm
VP8_CX_SRCS-$(ARCH_X86_64)
+=
encoder/x86/ssim_opt.asm
...
...
This diff is collapsed.
Click to expand it.
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets