Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
libvpx
Commits
4cde2ab7
Commit
4cde2ab7
authored
Mar 23, 2011
by
Johann
Committed by
Code Review
Mar 23, 2011
Browse files
Merge "ARMv6 optimized fdct4x4"
parents
edfc93ae
a61785b6
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
301 additions
and
4 deletions
+301
-4
vp8/encoder/arm/arm_csystemdependent.c
vp8/encoder/arm/arm_csystemdependent.c
+3
-3
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+262
-0
vp8/encoder/arm/dct_arm.c
vp8/encoder/arm/dct_arm.c
+24
-0
vp8/encoder/arm/dct_arm.h
vp8/encoder/arm/dct_arm.h
+10
-1
vp8/vp8cx_arm.mk
vp8/vp8cx_arm.mk
+2
-0
No files found.
vp8/encoder/arm/arm_csystemdependent.c
View file @
4cde2ab7
...
...
@@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_
c
;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_
c;*/
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
*/
cpi
->
rtcd
.
fdct
.
fast4x4
=
vp8_fast_fdct4x4_
armv6
;
cpi
->
rtcd
.
fdct
.
fast8x4
=
vp8_fast_fdct8x4_
armv6
;
cpi
->
rtcd
.
fdct
.
walsh_short4x4
=
vp8_short_walsh4x4_armv6
;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
...
...
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
0 → 100644
View file @
4cde2ab7
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT
|
vp8_fast_fdct4x4_armv6
|
ARM
REQUIRE8
PRESERVE8
AREA
|
.text
|
,
CODE
,
READONLY
; void vp8_fast_fdct4x4_armv6(short *input, short *output, int pitch)
|
vp8_fast_fdct4x4_armv6
|
PROC
stmfd
sp
!
,
{
r4
-
r12
,
lr
}
; PART 1
; coeffs 0-3
ldrd
r4
,
r5
,
[
r0
]
; [i1 | i0] [i3 | i2]
ldr
r10
,
c7500
ldr
r11
,
c14500
ldr
r12
,
c0x22a453a0
; [2217*4 | 5352*4]
ldr
lr
,
c0x00080008
ror
r5
,
r5
,
#
16
; [i2 | i3]
qadd16
r6
,
r4
,
r5
; [i1+i2 | i0+i3] = [b1 | a1] without shift
qsub16
r7
,
r4
,
r5
; [i1-i2 | i0-i3] = [c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2*[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r4
,
r6
,
lr
; o0 = (i1+i2)*8 + (i0+i3)*8
smusd
r5
,
r6
,
lr
; o2 = (i1+i2)*8 - (i0+i3)*8
smlad
r6
,
r7
,
r12
,
r11
; o1 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o3 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r8
,
r9
,
[
r0
]
; [i5 | i4] [i7 | i6]
pkhbt
r3
,
r4
,
r6
,
lsl
#
4
; [o1 | o0], keep in register for PART 2
pkhbt
r6
,
r5
,
r7
,
lsl
#
4
; [o3 | o2]
str
r6
,
[
r1
,
#
4
]
; coeffs 4-7
ror
r9
,
r9
,
#
16
; [i6 | i7]
qadd16
r6
,
r8
,
r9
; [i5+i6 | i4+i7] = [b1 | a1] without shift
qsub16
r7
,
r8
,
r9
; [i5-i6 | i4-i7] = [c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r9
,
r6
,
lr
; o4 = (i5+i6)*8 + (i4+i7)*8
smusd
r8
,
r6
,
lr
; o6 = (i5+i6)*8 - (i4+i7)*8
smlad
r6
,
r7
,
r12
,
r11
; o5 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o7 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r4
,
r5
,
[
r0
]
; [i9 | i8] [i11 | i10]
pkhbt
r9
,
r9
,
r6
,
lsl
#
4
; [o5 | o4], keep in register for PART 2
pkhbt
r6
,
r8
,
r7
,
lsl
#
4
; [o7 | o6]
str
r6
,
[
r1
,
#
12
]
; coeffs 8-11
ror
r5
,
r5
,
#
16
; [i10 | i11]
qadd16
r6
,
r4
,
r5
; [i9+i10 | i8+i11]=[b1 | a1] without shift
qsub16
r7
,
r4
,
r5
; [i9-i10 | i8-i11]=[c1 | d1] without shift
add
r0
,
r0
,
r2
; update input pointer
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r2
,
r6
,
lr
; o8 = (i9+i10)*8 + (i8+i11)*8
smusd
r8
,
r6
,
lr
; o10 = (i9+i10)*8 - (i8+i11)*8
smlad
r6
,
r7
,
r12
,
r11
; o9 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o11 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd
r4
,
r5
,
[
r0
]
; [i13 | i12] [i15 | i14]
pkhbt
r2
,
r2
,
r6
,
lsl
#
4
; [o9 | o8], keep in register for PART 2
pkhbt
r6
,
r8
,
r7
,
lsl
#
4
; [o11 | o10]
str
r6
,
[
r1
,
#
20
]
; coeffs 12-15
ror
r5
,
r5
,
#
16
; [i14 | i15]
qadd16
r6
,
r4
,
r5
; [i13+i14 | i12+i15]=[b1|a1] without shift
qsub16
r7
,
r4
,
r5
; [i13-i14 | i12-i15]=[c1|d1] without shift
qadd16
r7
,
r7
,
r7
; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad
r4
,
r6
,
lr
; o12 = (i13+i14)*8 + (i12+i15)*8
smusd
r5
,
r6
,
lr
; o14 = (i13+i14)*8 - (i12+i15)*8
smlad
r6
,
r7
,
r12
,
r11
; o13 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx
r7
,
r7
,
r12
,
r10
; o15 = (d1 * 2217 - c1 * 5352 + 7500)
pkhbt
r0
,
r4
,
r6
,
lsl
#
4
; [o13 | o12], keep in register for PART 2
pkhbt
r6
,
r5
,
r7
,
lsl
#
4
; [o15 | o14]
str
r6
,
[
r1
,
#
28
]
; PART 2 -------------------------------------------------
ldr
r11
,
c12000
ldr
r10
,
c51000
ldr
lr
,
c0x00070007
qadd16
r4
,
r3
,
r0
; a1 = [i1+i13 | i0+i12]
qadd16
r5
,
r9
,
r2
; b1 = [i5+i9 | i4+i8]
qsub16
r6
,
r9
,
r2
; c1 = [i5-i9 | i4-i8]
qsub16
r7
,
r3
,
r0
; d1 = [i1-i13 | i0-i12]
qadd16
r4
,
r4
,
lr
; a1 + 7
add
r0
,
r11
,
#
0x10000
; r0 = 12000 + (1 << 16): rounding constant used when d1 != 0 (adds 1 after the >> 16)
qadd16
r2
,
r4
,
r5
; a1 + b1 + 7
qsub16
r3
,
r4
,
r5
; a1 - b1 + 7
ldr
r12
,
c0x08a914e8
; [2217 | 5352]
lsl
r8
,
r2
,
#
16
; prepare bottom halfword for scaling
asr
r2
,
r2
,
#
4
; scale top halfword
lsl
r9
,
r3
,
#
16
; prepare bottom halfword for scaling
asr
r3
,
r3
,
#
4
; scale top halfword
pkhtb
r4
,
r2
,
r8
,
asr
#
20
; pack and scale bottom halfword
pkhtb
r5
,
r3
,
r9
,
asr
#
20
; pack and scale bottom halfword
smulbt
r2
,
r6
,
r12
; [ ------ | c1*2217]
str
r4
,
[
r1
,
#
0
]
; [ o1 | o0]
smultt
r3
,
r6
,
r12
; [c1*2217 | ------ ]
str
r5
,
[
r1
,
#
16
]
; [ o9 | o8]
smlabb
r8
,
r7
,
r12
,
r2
; [ ------ | d1*5352]
smlatb
r9
,
r7
,
r12
,
r3
; [d1*5352 | ------ ]
smulbb
r2
,
r6
,
r12
; [ ------ | c1*5352]
smultb
r3
,
r6
,
r12
; [c1*5352 | ------ ]
lsls
r6
,
r7
,
#
16
; d1 != 0 ?
addeq
r8
,
r8
,
r11
; c1_b*2217+d1_b*5352+12000 + (d==0)
addne
r8
,
r8
,
r0
; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs
r6
,
r7
,
#
16
addeq
r9
,
r9
,
r11
; c1_t*2217+d1_t*5352+12000 + (d==0)
addne
r9
,
r9
,
r0
; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt
r4
,
r7
,
r12
,
r10
; [ ------ | d1*2217] + 51000
smlatt
r5
,
r7
,
r12
,
r10
; [d1*2217 | ------ ] + 51000
pkhtb
r9
,
r9
,
r8
,
asr
#
16
sub
r4
,
r4
,
r2
sub
r5
,
r5
,
r3
ldr
r3
,
[
r1
,
#
4
]
; [i3 | i2]
pkhtb
r5
,
r5
,
r4
,
asr
#
16
; [o13|o12]
str
r9
,
[
r1
,
#
8
]
; [o5 | o4]
ldr
r9
,
[
r1
,
#
12
]
; [i7 | i6]
ldr
r8
,
[
r1
,
#
28
]
; [i15|i14]
ldr
r2
,
[
r1
,
#
20
]
; [i11|i10]
str
r5
,
[
r1
,
#
24
]
; [o13|o12]
qadd16
r4
,
r3
,
r8
; a1 = [i3+i15 | i2+i14]
qadd16
r5
,
r9
,
r2
; b1 = [i7+i11 | i6+i10]
qadd16
r4
,
r4
,
lr
; a1 + 7
qsub16
r6
,
r9
,
r2
; c1 = [i7-i11 | i6-i10]
qadd16
r2
,
r4
,
r5
; a1 + b1 + 7
qsub16
r7
,
r3
,
r8
; d1 = [i3-i15 | i2-i14]
qsub16
r3
,
r4
,
r5
; a1 - b1 + 7
lsl
r8
,
r2
,
#
16
; prepare bottom halfword for scaling
asr
r2
,
r2
,
#
4
; scale top halfword
lsl
r9
,
r3
,
#
16
; prepare bottom halfword for scaling
asr
r3
,
r3
,
#
4
; scale top halfword
pkhtb
r4
,
r2
,
r8
,
asr
#
20
; pack and scale bottom halfword
pkhtb
r5
,
r3
,
r9
,
asr
#
20
; pack and scale bottom halfword
smulbt
r2
,
r6
,
r12
; [ ------ | c1*2217]
str
r4
,
[
r1
,
#
4
]
; [ o3 | o2]
smultt
r3
,
r6
,
r12
; [c1*2217 | ------ ]
str
r5
,
[
r1
,
#
20
]
; [ o11 | o10]
smlabb
r8
,
r7
,
r12
,
r2
; [ ------ | d1*5352]
smlatb
r9
,
r7
,
r12
,
r3
; [d1*5352 | ------ ]
smulbb
r2
,
r6
,
r12
; [ ------ | c1*5352]
smultb
r3
,
r6
,
r12
; [c1*5352 | ------ ]
lsls
r6
,
r7
,
#
16
; d1 != 0 ?
addeq
r8
,
r8
,
r11
; c1_b*2217+d1_b*5352+12000 + (d==0)
addne
r8
,
r8
,
r0
; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs
r6
,
r7
,
#
16
addeq
r9
,
r9
,
r11
; c1_t*2217+d1_t*5352+12000 + (d==0)
addne
r9
,
r9
,
r0
; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt
r4
,
r7
,
r12
,
r10
; [ ------ | d1*2217] + 51000
smlatt
r5
,
r7
,
r12
,
r10
; [d1*2217 | ------ ] + 51000
pkhtb
r9
,
r9
,
r8
,
asr
#
16
sub
r4
,
r4
,
r2
sub
r5
,
r5
,
r3
str
r9
,
[
r1
,
#
12
]
; [o7 | o6]
pkhtb
r5
,
r5
,
r4
,
asr
#
16
; [o15|o14]
str
r5
,
[
r1
,
#
28
]
; [o15|o14]
ldmfd
sp
!
,
{
r4
-
r12
,
pc
}
ENDP
; Used constants
c7500
DCD
7500
c14500
DCD
14500
c0x22a453a0
DCD
0x22a453a0
c0x00080008
DCD
0x00080008
c12000
DCD
12000
c51000
DCD
51000
c0x00070007
DCD
0x00070007
c0x08a914e8
DCD
0x08a914e8
END
vp8/encoder/arm/dct_arm.c
0 → 100644
View file @
4cde2ab7
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8/encoder/dct.h"
#if HAVE_ARMV6
/*
 * Fast forward DCT over two adjacent 4x4 blocks.
 *
 * Applies vp8_fast_fdct4x4_armv6 twice with the same pitch:
 *   - first on input / output,
 *   - then on the data starting 4 shorts further into input, writing the
 *     result 16 shorts further into output (i.e. directly after the 16
 *     coefficients produced by the first call).
 *
 * input  : source samples for both blocks
 * output : destination for the coefficients of both blocks
 * pitch  : passed through unchanged to the 4x4 transform
 */
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
{
    short *const second_input  = &input[4];
    short *const second_output = &output[16];

    /* First 4x4 block. */
    vp8_fast_fdct4x4_armv6(input, output, pitch);

    /* Second 4x4 block. */
    vp8_fast_fdct4x4_armv6(second_input, second_output, pitch);
}
#endif
/* HAVE_ARMV6 */
vp8/encoder/arm/dct_arm.h
View file @
4cde2ab7
...
...
@@ -14,12 +14,21 @@
#if HAVE_ARMV6
extern
prototype_fdct
(
vp8_short_walsh4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct8x4_armv6
);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
#endif
#endif
#endif
/* HAVE_ARMV6 */
#if HAVE_ARMV7
extern
prototype_fdct
(
vp8_short_fdct4x4_neon
);
...
...
vp8/vp8cx_arm.mk
View file @
4cde2ab7
...
...
@@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/encodemb_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.h
VP8_CX_SRCS-$(HAVE_ARMV5TE)
+=
encoder/arm/boolhuff_arm.c
...
...
@@ -34,6 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_fdct4x4_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_quantize_b_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_sad16x16_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_variance16x16_armv6
$(ASM)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment