Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
BC
public
external
libvpx
Commits
4cde2ab7
Commit
4cde2ab7
authored
14 years ago
by
Johann
Committed by
Code Review
14 years ago
Browse files
Options
Download
Plain Diff
Merge "ARMv6 optimized fdct4x4"
parents
edfc93ae
a61785b6
v1.14.0-linphone
1.4.X
cayuga
eider
experimental
feature/update_to_v1.9.0-linphone
feature/uwp_nuget
forest
frame_parallel
highbitdepth
indianrunnerduck
javanwhistlingduck
khakicampbell
linphone
linphone-android
linphone-old
longtailedduck
m29-baseline
m31-baseline
m49-2623
m52-2743
m54-2840
m56-2924
m66-3359
m68-3440
mandarinduck
mcw
mcw2
nextgen
nextgenv2
pcs-2013
playground
sandbox/Jingning/experimental
sandbox/Jingning/transcode
sandbox/Jingning/vpx
sandbox/aconverse@google.com/ansbench
sandbox/awatry/initial_opencl_implementation
sandbox/debargha/playground
sandbox/hkuang/frame_parallel
sandbox/hkuang@google.com/decode
sandbox/jimbankoski@google.com/proposed-aom
sandbox/jingning@google.com/decoder_test_suite
sandbox/jingning@google.com/experimental
sandbox/jkoleszar/cached-multibit
sandbox/jkoleszar/new-rate-control
sandbox/jkoleszar/new-rtcd
sandbox/jkoleszar/reuse-modemv
sandbox/jzern@google.com/test
sandbox/wangch@google.com/vp9
sandbox/yaowu@google.com/mergeaom
stable-vp9-decoder
v1.12.0-linphone
v1.6.1_linphone
v1.7.0-linphone
v1.9.0-linphone
vp9-preview
v1.9.0
v1.9.0-rc1
v1.8.2
v1.8.1
v1.8.0
v1.7.0
v1.6.1
v1.6.0
v1.5.0
v1.4.0
v1.3.0
v1.2.0
v1.1.0
v1.0.0
v0.9.7
v0.9.7-p1
No related merge requests found
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
vp8/encoder/arm/arm_csystemdependent.c
+3
-3
vp8/encoder/arm/arm_csystemdependent.c
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+262
-0
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
vp8/encoder/arm/dct_arm.c
+24
-0
vp8/encoder/arm/dct_arm.c
vp8/encoder/arm/dct_arm.h
+10
-1
vp8/encoder/arm/dct_arm.h
vp8/vp8cx_arm.mk
+2
-0
vp8/vp8cx_arm.mk
with
301 additions
and
4 deletions
vp8/encoder/arm/arm_csystemdependent.c
+
3
−
3
View file @
4cde2ab7
...
...
@@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_
c
;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_
c;*/
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
*/
cpi
->
rtcd
.
fdct
.
fast4x4
=
vp8_fast_fdct4x4_
armv6
;
cpi
->
rtcd
.
fdct
.
fast8x4
=
vp8_fast_fdct8x4_
armv6
;
cpi
->
rtcd
.
fdct
.
walsh_short4x4
=
vp8_short_walsh4x4_armv6
;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
...
...
This diff is collapsed.
Click to expand it.
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
0 → 100644
+
262
−
0
View file @
4cde2ab7
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT
|
vp8_fast_fdct4x4_armv6
|
ARM
REQUIRE8
PRESERVE8
AREA
|
.text
|
,
CODE
,
READONLY
; void vp8_fast_fdct4x4_armv6(short *input, short *output, int pitch)
|
vp8_fast_fdct4x4_armv6
|
PROC
stmfd
sp
!
,
{
r4
-
r12
,
lr
}
; PART 1
; coeffs 0-3
ldrd
r4
,
r5
,
[
r0
]
; [i1 | i0] [i3 | i2]
ldr
r10
,
c7500
ldr
r11
,
c14500
ldr
r12
,
c0x22a453a0
; [2217*4 | 5352*4]
ldr
lr
,
c0x00080008
ror
r5
,
r5
,
#
16
; [i2 | i3]
qadd16
r6
,
r4
,
r5
; [i1+i2 | i0+i3] = [b1 | a1] without shift
qsub16
r7
,
r4
,
r5
; [i1-i2 | i0-i3] = [c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2*[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r4
,
r6
,
lr
; o0 = (i1+i2)*8 + (i0+i3)*8
smusd
r5
,
r6
,
lr
; o2 = (i1+i2)*8 - (i0+i3)*8
smlad
r6
,
r7
,
r12
,
r11
; o1 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o3 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r8
,
r9
,
[
r0
]
; [i5 | i4] [i7 | i6]
pkhbt
r3
,
r4
,
r6
,
lsl
#
4
; [o1 | o0], keep in register for PART 2
pkhbt
r6
,
r5
,
r7
,
lsl
#
4
; [o3 | o2]
str
r6
,
[
r1
,
#
4
]
; coeffs 4-7
ror
r9
,
r9
,
#
16
; [i6 | i7]
qadd16
r6
,
r8
,
r9
; [i5+i6 | i4+i7] = [b1 | a1] without shift
qsub16
r7
,
r8
,
r9
; [i5-i6 | i4-i7] = [c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r9
,
r6
,
lr
; o4 = (i5+i6)*8 + (i4+i7)*8
smusd
r8
,
r6
,
lr
; o6 = (i5+i6)*8 - (i4+i7)*8
smlad
r6
,
r7
,
r12
,
r11
; o5 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o7 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r4
,
r5
,
[
r0
]
; [i9 | i8] [i11 | i10]
pkhbt
r9
,
r9
,
r6
,
lsl
#
4
; [o5 | o4], keep in register for PART 2
pkhbt
r6
,
r8
,
r7
,
lsl
#
4
; [o7 | o6]
str
r6
,
[
r1
,
#
12
]
; coeffs 8-11
ror
r5
,
r5
,
#
16
; [i10 | i11]
qadd16
r6
,
r4
,
r5
; [i9+i10 | i8+i11]=[b1 | a1] without shift
qsub16
r7
,
r4
,
r5
; [i9-i10 | i8-i11]=[c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r2
,
r6
,
lr
; o8 = (i9+i10)*8 + (i8+i11)*8
smusd
r8
,
r6
,
lr
; o10 = (i9+i10)*8 - (i8+i11)*8
smlad
r6
,
r7
,
r12
,
r11
; o9 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o11 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r4
,
r5
,
[
r0
]
; [i13 | i12] [i15 | i14]
pkhbt
r2
,
r2
,
r6
,
lsl
#
4
; [o9 | o8], keep in register for PART 2
pkhbt
r6
,
r8
,
r7
,
lsl
#
4
; [o11 | o10]
str
r6
,
[
r1
,
#
20
]
; coeffs 12-15
ror
r5
,
r5
,
#
16
; [i14 | i15]
qadd16
r6
,
r4
,
r5
; [i13+i14 | i12+i15]=[b1|a1] without shift
qsub16
r7
,
r4
,
r5
; [i13-i14 | i12-i15]=[c1|d1] without shift
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r4
,
r6
,
lr
; o12 = (i13+i14)*8 + (i12+i15)*8
smusd
r5
,
r6
,
lr
; o14 = (i13+i14)*8 - (i12+i15)*8
smlad
r6
,
r7
,
r12
,
r11
; o13 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o15 = (d1 * 2217 - c1 * 5352 + 7500)
pkhbt
r0
,
r4
,
r6
,
lsl
#
4
; [o13 | o12], keep in register for PART 2
pkhbt
r6
,
r5
,
r7
,
lsl
#
4
; [o15 | o14]
str
r6
,
[
r1
,
#
28
]
; PART 2 -------------------------------------------------
ldr
r11
,
c12000
ldr
r10
,
c51000
ldr
lr
,
c0x00070007
qadd16
r4
,
r3
,
r0
; a1 = [i1+i13 | i0+i12]
qadd16
r5
,
r9
,
r2
; b1 = [i5+i9 | i4+i8]
qsub16
r6
,
r9
,
r2
; c1 = [i5-i9 | i4-i8]
qsub16
r7
,
r3
,
r0
; d1 = [i1-i13 | i0-i12]
qadd16
r4
,
r4
,
lr
; a1 + 7
add
r0
,
r11
,
#
0x10000
; add (d!=0)
qadd16
r2
,
r4
,
r5
; a1 + b1 + 7
qsub16
r3
,
r4
,
r5
; a1 - b1 + 7
ldr
r12
,
c0x08a914e8
; [2217 | 5352]
lsl
r8
,
r2
,
#
16
; prepare bottom halfword for scaling
asr
r2
,
r2
,
#
4
; scale top halfword
lsl
r9
,
r3
,
#
16
; prepare bottom halfword for scaling
asr
r3
,
r3
,
#
4
; scale top halfword
pkhtb
r4
,
r2
,
r8
,
asr
#
20
; pack and scale bottom halfword
pkhtb
r5
,
r3
,
r9
,
asr
#
20
; pack and scale bottom halfword
smulbt
r2
,
r6
,
r12
; [ ------ | c1*2217]
str
r4
,
[
r1
,
#
0
]
; [ o1 | o0]
smultt
r3
,
r6
,
r12
; [c1*2217 | ------ ]
str
r5
,
[
r1
,
#
16
]
; [ o9 | o8]
smlabb
r8
,
r7
,
r12
,
r2
; [ ------ | d1*5352]
smlatb
r9
,
r7
,
r12
,
r3
; [d1*5352 | ------ ]
smulbb
r2
,
r6
,
r12
; [ ------ | c1*5352]
smultb
r3
,
r6
,
r12
; [c1*5352 | ------ ]
lsls
r6
,
r7
,
#
16
; d1 != 0 ?
addeq
r8
,
r8
,
r11
; c1_b*2217+d1_b*5352+12000 + (d==0)
addne
r8
,
r8
,
r0
; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs
r6
,
r7
,
#
16
addeq
r9
,
r9
,
r11
; c1_t*2217+d1_t*5352+12000 + (d==0)
addne
r9
,
r9
,
r0
; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt
r4
,
r7
,
r12
,
r10
; [ ------ | d1*2217] + 51000
smlatt
r5
,
r7
,
r12
,
r10
; [d1*2217 | ------ ] + 51000
pkhtb
r9
,
r9
,
r8
,
asr
#
16
sub
r4
,
r4
,
r2
sub
r5
,
r5
,
r3
ldr
r3
,
[
r1
,
#
4
]
; [i3 | i2]
pkhtb
r5
,
r5
,
r4
,
asr
#
16
; [o13|o12]
str
r9
,
[
r1
,
#
8
]
; [o5 | o4]
ldr
r9
,
[
r1
,
#
12
]
; [i7 | i6]
ldr
r8
,
[
r1
,
#
28
]
; [i15|i14]
ldr
r2
,
[
r1
,
#
20
]
; [i11|i10]
str
r5
,
[
r1
,
#
24
]
; [o13|o12]
qadd16
r4
,
r3
,
r8
; a1 = [i3+i15 | i2+i14]
qadd16
r5
,
r9
,
r2
; b1 = [i7+i11 | i6+i10]
qadd16
r4
,
r4
,
lr
; a1 + 7
qsub16
r6
,
r9
,
r2
; c1 = [i7-i11 | i6-i10]
qadd16
r2
,
r4
,
r5
; a1 + b1 + 7
qsub16
r7
,
r3
,
r8
; d1 = [i3-i15 | i2-i14]
qsub16
r3
,
r4
,
r5
; a1 - b1 + 7
lsl
r8
,
r2
,
#
16
; prepare bottom halfword for scaling
asr
r2
,
r2
,
#
4
; scale top halfword
lsl
r9
,
r3
,
#
16
; prepare bottom halfword for scaling
asr
r3
,
r3
,
#
4
; scale top halfword
pkhtb
r4
,
r2
,
r8
,
asr
#
20
; pack and scale bottom halfword
pkhtb
r5
,
r3
,
r9
,
asr
#
20
; pack and scale bottom halfword
smulbt
r2
,
r6
,
r12
; [ ------ | c1*2217]
str
r4
,
[
r1
,
#
4
]
; [ o3 | o2]
smultt
r3
,
r6
,
r12
; [c1*2217 | ------ ]
str
r5
,
[
r1
,
#
20
]
; [ o11 | o10]
smlabb
r8
,
r7
,
r12
,
r2
; [ ------ | d1*5352]
smlatb
r9
,
r7
,
r12
,
r3
; [d1*5352 | ------ ]
smulbb
r2
,
r6
,
r12
; [ ------ | c1*5352]
smultb
r3
,
r6
,
r12
; [c1*5352 | ------ ]
lsls
r6
,
r7
,
#
16
; d1 != 0 ?
addeq
r8
,
r8
,
r11
; c1_b*2217+d1_b*5352+12000 + (d==0)
addne
r8
,
r8
,
r0
; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs
r6
,
r7
,
#
16
addeq
r9
,
r9
,
r11
; c1_t*2217+d1_t*5352+12000 + (d==0)
addne
r9
,
r9
,
r0
; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt
r4
,
r7
,
r12
,
r10
; [ ------ | d1*2217] + 51000
smlatt
r5
,
r7
,
r12
,
r10
; [d1*2217 | ------ ] + 51000
pkhtb
r9
,
r9
,
r8
,
asr
#
16
sub
r4
,
r4
,
r2
sub
r5
,
r5
,
r3
str
r9
,
[
r1
,
#
12
]
; [o7 | o6]
pkhtb
r5
,
r5
,
r4
,
asr
#
16
; [o15|o14]
str
r5
,
[
r1
,
#
28
]
; [o15|o14]
ldmfd
sp
!
,
{
r4
-
r12
,
pc
}
ENDP
; Used constants
c7500
DCD
7500
c14500
DCD
14500
c0x22a453a0
DCD
0x22a453a0
c0x00080008
DCD
0x00080008
c12000
DCD
12000
c51000
DCD
51000
c0x00070007
DCD
0x00070007
c0x08a914e8
DCD
0x08a914e8
END
This diff is collapsed.
Click to expand it.
vp8/encoder/arm/dct_arm.c
0 → 100644
+
24
−
0
View file @
4cde2ab7
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
"vpx_config.h"
#include
"vp8/encoder/dct.h"
#if HAVE_ARMV6
/*
 * Fast forward DCT over an 8x4 area, expressed as two back-to-back calls
 * to the 4x4 ARMv6 routine.
 *
 * input  - source samples; call k (k = 0, 1) starts reading at input + 4*k.
 * output - destination coefficients; call k starts writing at output + 16*k.
 * pitch  - forwarded unchanged to vp8_fast_fdct4x4_armv6 on both calls.
 */
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
{
    int blk;

    for (blk = 0; blk < 2; blk++)
    {
        vp8_fast_fdct4x4_armv6(input + 4 * blk, output + 16 * blk, pitch);
    }
}
#endif
/* HAVE_ARMV6 */
This diff is collapsed.
Click to expand it.
vp8/encoder/arm/dct_arm.h
+
10
−
1
View file @
4cde2ab7
...
...
@@ -14,12 +14,21 @@
#if HAVE_ARMV6
extern
prototype_fdct
(
vp8_short_walsh4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct8x4_armv6
);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
#endif
#endif
#endif
/* HAVE_ARMV6 */
#if HAVE_ARMV7
extern
prototype_fdct
(
vp8_short_fdct4x4_neon
);
...
...
This diff is collapsed.
Click to expand it.
vp8/vp8cx_arm.mk
+
2
−
0
View file @
4cde2ab7
...
...
@@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/encodemb_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.h
VP8_CX_SRCS-$(HAVE_ARMV5TE)
+=
encoder/arm/boolhuff_arm.c
...
...
@@ -34,6 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_fdct4x4_armv6
$(
ASM
)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_quantize_b_armv6
$(
ASM
)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_sad16x16_armv6
$(
ASM
)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_variance16x16_armv6
$(
ASM
)
...
...
This diff is collapsed.
Click to expand it.
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets