Commit 56f5a9a0, authored by Johann, committed by Fritz Koenig

update arm idct functions

Jeff Muizelaar posted some changes to the IDCT/reconstruction C code.
This is the equivalent update for the ARM assembly.

This shows a good boost on v6, and a minor boost on neon.
Here are some numbers for highway in qcif, 2641 frames:
HEAD neon: ~161 fps
new neon:  ~162 fps
HEAD v6:   ~102 fps
new v6:    ~106 fps

The following functions have been updated for armv6 and neon:
vp8_dc_only_idct_add
vp8_dequant_idct_add
vp8_dequant_dc_idct_add

Conflicts:

	vp8/decoder/arm/armv6/dequantdcidct_v6.asm
	vp8/decoder/arm/armv6/dequantidct_v6.asm

Resolved by removing these files. When I rewrote the functions, I also
moved the files to dequant_dc_idct_v6.asm/dequant_idct_v6.asm

Change-Id: Ie3300df824d52474eca1a5134cf22d8b7809a5d4
parent 98fcccfe
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
;                             unsigned char *dst_ptr, int pitch, int stride)
;
; Adds the rounded DC term a1 = (input_dc + 4) >> 3 to each byte of a 4x4
; predictor block and stores the sums, clamped to 0..255, as four 4-byte
; rows at dst_ptr.
;
; In:    r0    input_dc
;        r1    pred_ptr  (consecutive rows are pitch bytes apart)
;        r2    dst_ptr   (consecutive rows are stride bytes apart)
;        r3    pitch
;        [sp]  stride    (read from [sp, #20] after the 5-register push)
; Saves: r4-r7, lr
; Temps: r12
|vp8_dc_only_idct_add_v6| PROC
    stmdb   sp!, {r4 - r7, lr}

    ; Replicate the 16-bit DC term into both halfwords of r0.
    ldr     r12, c0x0000FFFF
    add     r0, r0, #4                  ; input_dc + 4
    and     r0, r12, r0, asr #3         ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
    orr     r0, r0, r0, lsl #16         ; a1 | a1

    ; Read all four predictor rows before any destination row is written.
    ldr     r4, [r1], r3                ; row 0
    ldr     r5, [r1], r3                ; row 1
    ldr     r6, [r1], r3                ; row 2
    ldr     r7, [r1]                    ; row 3
    ldr     lr, [sp, #20]               ; lr = stride

    ; Row 0
    uxtab16 r12, r0, r4                 ; a1 + byte2 | a1 + byte0
    uxtab16 r4, r0, r4, ror #8          ; a1 + byte3 | a1 + byte1
    usat16  r12, #8, r12                ; clamp each halfword to 0..255
    usat16  r4, #8, r4
    orr     r4, r12, r4, lsl #8         ; interleave back into 4 bytes
    str     r4, [r2], lr

    ; Row 1
    uxtab16 r12, r0, r5
    uxtab16 r5, r0, r5, ror #8
    usat16  r12, #8, r12
    usat16  r5, #8, r5
    orr     r5, r12, r5, lsl #8
    str     r5, [r2], lr

    ; Row 2
    uxtab16 r12, r0, r6
    uxtab16 r6, r0, r6, ror #8
    usat16  r12, #8, r12
    usat16  r6, #8, r6
    orr     r6, r12, r6, lsl #8
    str     r6, [r2], lr

    ; Row 3 (no write-back needed on the final store)
    uxtab16 r12, r0, r7
    uxtab16 r7, r0, r7, ror #8
    usat16  r12, #8, r12
    usat16  r7, #8, r7
    orr     r7, r12, r7, lsl #8
    str     r7, [r2]

    ldmia   sp!, {r4 - r7, pc}
    ENDP    ; |vp8_dc_only_idct_add_v6|

; Constant Pool
c0x0000FFFF DCD 0x0000FFFF              ; mask keeping the low halfword of a1
END
......@@ -15,8 +15,6 @@
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
EXPORT |vp8_dc_only_idct_armv6|
AREA |.text|, CODE, READONLY
;********************************************************************************
......@@ -344,34 +342,4 @@ loop2_dual
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
; sjl added 10/17/08
;void vp8_dc_only_idct_armv6(short input_dc, short *output, int pitch)
;
; Fills four rows of four shorts with a1 = (input_dc + 4) >> 3.
;   r0  input_dc
;   r1  output (row 0)
;   r2  pitch; added unscaled to the output address to reach the next row
|vp8_dc_only_idct_armv6| PROC
    stmdb   sp!, {r4 - r6, lr}

    ; a1 replicated into both halfwords of r0
    add     r0, r0, #4                  ; round
    mov     r0, r0, asr #3              ; a1
    pkhbt   r0, r0, r0, lsl #16         ; a1 | a1

    ; row base addresses
    add     r4, r1, r2                  ; row 1 = output + pitch
    add     r5, r4, r2                  ; row 2 = output + pitch * 2
    add     r6, r5, r2                  ; row 3 = output + pitch * 3

    ; two words (four shorts) per row
    str     r0, [r1]
    str     r0, [r1, #4]
    str     r0, [r4]
    str     r0, [r4, #4]
    str     r0, [r5]
    str     r0, [r5, #4]
    str     r0, [r6]
    str     r0, [r6, #4]

    ldmia   sp!, {r4 - r6, pc}
    ENDP    ; |vp8_dc_only_idct_armv6|
END
......@@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_armv6|
EXPORT |vp8_short_inv_walsh4x4_1_armv6|
EXPORT |vp8_short_inv_walsh4x4_v6|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
......@@ -17,8 +17,8 @@
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
|vp8_short_inv_walsh4x4_armv6| PROC
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
......@@ -123,11 +123,11 @@
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_inv_walsh4x4_armv6|
ENDP ; |vp8_short_inv_walsh4x4_v6|
;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_armv6| PROC
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; [0]
add r2, r2, #3 ; [0] + 3
......@@ -145,7 +145,7 @@
str r2, [r1]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003
......
......@@ -15,8 +15,9 @@
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
......@@ -24,16 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
......@@ -43,6 +48,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
......
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
;                               unsigned char *dst_ptr, int pitch, int stride)
;
; Adds a1 = (input_dc + 4) >> 3 to a 4x4 block of predictor bytes and writes
; the unsigned-saturated bytes as four 4-byte rows at dst_ptr.
;
; In:    r0    input_dc
;        r1    pred_ptr  (rows pitch bytes apart)
;        r2    dst_ptr   (rows stride bytes apart)
;        r3    pitch
;        [sp]  stride
; Temps: r12, q0-q3
|vp8_dc_only_idct_add_neon| PROC
    ldr         r12, [sp]               ; r12 = stride
    add         r0, r0, #4
    asr         r0, r0, #3              ; a1 = (input_dc + 4) >> 3

    ; Gather the four predictor rows: rows 0-1 in d0, rows 2-3 in d1.
    vld1.32     {d0[0]}, [r1], r3
    vld1.32     {d0[1]}, [r1], r3
    vld1.32     {d1[0]}, [r1], r3
    vld1.32     {d1[1]}, [r1]

    vdup.16     q1, r0                  ; a1 in all eight 16-bit lanes

    ; Widen the bytes to 16 bits while adding a1, then narrow with
    ; unsigned saturation.
    vaddw.u8    q2, q1, d0
    vaddw.u8    q3, q1, d1
    vqmovun.s16 d0, q2
    vqmovun.s16 d1, q3

    ; Scatter the rows to the destination.
    vst1.32     {d0[0]}, [r2], r12
    vst1.32     {d0[1]}, [r2], r12
    vst1.32     {d1[0]}, [r2], r12
    vst1.32     {d1[1]}, [r2]

    bx          lr
    ENDP
END
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride, int Dc)
;
; Dequantizes a block of 16 coefficients in place (input[i] * dq[i], except
; that input[0] is overwritten with Dc), runs the two-pass transform marked
; "short_idct4x4llm_v6_dual" below, adds the rounded result to four rows of
; predictor bytes, clamps to 0..255, writes four bytes per row to dest and
; finally clears the 32-byte input block.
;
; r0 = input
; r1 = dq
; r2 = pred (advanced by pitch after each row)
; r3 = dest (kept in a stack slot; advanced by stride after each row)
; Stack arguments. First offset: after the 9-register (36-byte) push.
; Second offset: after the extra 4-byte slot for dest has been reserved.
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
; sp + 44 = Dc ; +4 = 48
|vp8_dequant_dc_idct_add_v6| PROC
    stmdb sp!, {r4-r11, lr}
    ldr r6, [sp, #44] ; r6 = Dc (read before the local slot is reserved)
    ldr r4, [r0] ; input[0..1]
    ldr r5, [r1], #4 ; dq[0..1]
    sub sp, sp, #4 ; reserve one word for the dest pointer
    str r3, [sp] ; save dest; r3 is reused as a multiplier below
    smultt r7, r4, r5 ; input[1] * dq[1] (element 0 is not multiplied)
    ldr r4, [r0, #4] ; input[2..3]
    ldr r5, [r1], #4 ; dq[2..3]
    strh r6, [r0], #2 ; input[0] = Dc
    strh r7, [r0], #2 ; input[1] = product
    smulbb r6, r4, r5 ; input[2] * dq[2]
    smultt r7, r4, r5 ; input[3] * dq[3]
    ldr r4, [r0, #4] ; input[4..5]
    ldr r5, [r1], #4 ; dq[4..5]
    strh r6, [r0], #2
    strh r7, [r0], #2
    mov r12, #3 ; 3 iterations x 4 elements = remaining 12
; Dequantize four elements per iteration, storing the low 16 bits of each
; product back over the coefficient it came from.
vp8_dequant_dc_add_loop
    smulbb r6, r4, r5
    smultt r7, r4, r5
    ldr r4, [r0, #4] ; next input pair
    ldr r5, [r1], #4 ; next dq pair
    strh r6, [r0], #2
    strh r7, [r0], #2
    smulbb r6, r4, r5
    smultt r7, r4, r5
    subs r12, r12, #1 ; flags drive the ldrne/bne below
    ldrne r4, [r0, #4] ; skipped on the last pass: nothing left to load
    ldrne r5, [r1], #4
    strh r6, [r0], #2
    strh r7, [r0], #2
    bne vp8_dequant_dc_add_loop
    sub r0, r0, #32 ; rewind to the start of the 16-short block
    mov r1, r0 ; r1 = write pointer for the first pass
; short_idct4x4llm_v6_dual
; First pass: the four words at offsets 0, 8, 16 and 24 are rows 0-3 of one
; pair of columns (two packed halfwords each). Two iterations cover all
; four columns; results are written back in place.
    ldr r3, cospi8sqrt2minus1
    ldr r4, sinpi8sqrt2
    ldr r6, [r0, #8] ; row 1 of the first column pair
    mov r5, #2
vp8_dequant_dc_idct_loop1_v6
    ldr r12, [r0, #24] ; row 3
    ldr r14, [r0, #16] ; row 2
    smulwt r9, r3, r6 ; (row1 top * cospi8sqrt2minus1) >> 16
    smulwb r7, r3, r6 ; (row1 bottom * cospi8sqrt2minus1) >> 16
    smulwt r10, r4, r6 ; (row1 top * sinpi8sqrt2) >> 16
    smulwb r8, r4, r6 ; (row1 bottom * sinpi8sqrt2) >> 16
    pkhbt r7, r7, r9, lsl #16 ; pack row1 * cospi8sqrt2minus1
    smulwt r11, r3, r12 ; (row3 top * cospi8sqrt2minus1) >> 16
    pkhbt r8, r8, r10, lsl #16 ; pack row1 * sinpi8sqrt2
    uadd16 r6, r6, r7 ; row1 + row1 * cospi8sqrt2minus1
    smulwt r7, r4, r12 ; (row3 top * sinpi8sqrt2) >> 16
    smulwb r9, r3, r12 ; (row3 bottom * cospi8sqrt2minus1) >> 16
    smulwb r10, r4, r12 ; (row3 bottom * sinpi8sqrt2) >> 16
    subs r5, r5, #1 ; flags drive the ldrne/bne below
    pkhbt r9, r9, r11, lsl #16 ; pack row3 * cospi8sqrt2minus1
    ldr r11, [r0], #4 ; row 0; step to the next column pair
    pkhbt r10, r10, r7, lsl #16 ; pack row3 * sinpi8sqrt2
    uadd16 r7, r12, r9 ; row3 + row3 * cospi8sqrt2minus1
    usub16 r7, r8, r7 ; c1 = row1 * sin term - row3 * cos term
    uadd16 r6, r6, r10 ; d1 = row1 * cos term + row3 * sin term
    uadd16 r10, r11, r14 ; a1 = row0 + row2
    usub16 r8, r11, r14 ; b1 = row0 - row2
    uadd16 r9, r10, r6 ; a1 + d1
    usub16 r10, r10, r6 ; a1 - d1
    uadd16 r6, r8, r7 ; b1 + c1
    usub16 r7, r8, r7 ; b1 - c1
    str r6, [r1, #8] ; row 1 = b1 + c1
    ldrne r6, [r0, #8] ; preload row 1 of the next column pair
    str r7, [r1, #16] ; row 2 = b1 - c1
    str r10, [r1, #24] ; row 3 = a1 - d1
    str r9, [r1], #4 ; row 0 = a1 + d1; step the write pointer
    bne vp8_dequant_dc_idct_loop1_v6
; Second pass: each iteration reads two rows (four words), transforms them
; with one row per halfword lane, adds the rounding constant, shifts right
; by 3, adds the predictor bytes and stores two rows of output.
    mov r5, #2
    sub r0, r1, #8 ; back to the start of the block
vp8_dequant_dc_idct_loop2_v6
    ldr r6, [r0], #4 ; row n, values 0-1
    ldr r7, [r0], #4 ; row n, values 2-3
    ldr r8, [r0], #4 ; row n+1, values 0-1
    ldr r9, [r0], #4 ; row n+1, values 2-3
    smulwt r1, r3, r6 ; row n value 1 * cospi8sqrt2minus1 >> 16
    smulwt r12, r4, r6 ; row n value 1 * sinpi8sqrt2 >> 16
    smulwt lr, r3, r8 ; row n+1 value 1 * cospi8sqrt2minus1 >> 16
    smulwt r10, r4, r8 ; row n+1 value 1 * sinpi8sqrt2 >> 16
    pkhbt r11, r8, r6, lsl #16 ; value 0: row n (top) | row n+1 (bottom)
    pkhbt r1, lr, r1, lsl #16 ; value 1 * cospi8sqrt2minus1, both rows
    pkhbt r12, r10, r12, lsl #16 ; value 1 * sinpi8sqrt2, both rows
    pkhtb r6, r6, r8, asr #16 ; value 1, both rows
    uadd16 r6, r1, r6 ; value 1 + value 1 * cospi8sqrt2minus1
    pkhbt lr, r9, r7, lsl #16 ; value 2, both rows
    uadd16 r10, r11, lr ; a1 = value 0 + value 2
    usub16 lr, r11, lr ; b1 = value 0 - value 2 (lr used as data)
    pkhtb r8, r7, r9, asr #16 ; value 3, both rows
    subs r5, r5, #1 ; flags drive the bne at the end of the loop
    smulwt r1, r3, r8
    smulwb r7, r3, r8
    smulwt r11, r4, r8
    smulwb r9, r4, r8
    pkhbt r1, r7, r1, lsl #16 ; value 3 * cospi8sqrt2minus1, both rows
    uadd16 r8, r1, r8 ; value 3 + value 3 * cospi8sqrt2minus1
    pkhbt r11, r9, r11, lsl #16 ; value 3 * sinpi8sqrt2, both rows
    usub16 r1, r12, r8 ; c1
    uadd16 r8, r11, r6 ; d1
    ldr r9, c0x00040004 ; rounding term 4 in each halfword
    ldr r12, [sp, #40] ; r12 = pitch
    uadd16 r6, r10, r8 ; a1 + d1 (output column 0)
    usub16 r7, r10, r8 ; a1 - d1 (output column 3)
    uadd16 r7, r7, r9 ; + 4
    uadd16 r6, r6, r9 ; + 4
    uadd16 r10, r14, r1 ; b1 + c1 (output column 1); r14 is lr
    usub16 r1, r14, r1 ; b1 - c1 (output column 2)
    uadd16 r10, r10, r9 ; + 4
    uadd16 r1, r1, r9 ; + 4
    ldr r11, [r2], r12 ; four predictor bytes; pred += pitch
    mov r8, r7, asr #3
    pkhtb r9, r8, r10, asr #19 ; top lanes >> 3: column 3 | column 1
    mov r8, r1, asr #3
    pkhtb r8, r8, r6, asr #19 ; top lanes >> 3: column 2 | column 0
    uxtb16 lr, r11, ror #8 ; predictor bytes 3 | 1
    qadd16 r9, r9, lr
    uxtb16 lr, r11 ; predictor bytes 2 | 0
    qadd16 r8, r8, lr
    usat16 r9, #8, r9 ; clamp each halfword to 0..255
    usat16 r8, #8, r8
    orr r9, r8, r9, lsl #8 ; interleave into four output bytes
    ldr r11, [r2], r12 ; predictor bytes for the second row
    ldr lr, [sp] ; lr = current dest pointer
    ldr r12, [sp, #44] ; r12 = stride
    mov r7, r7, lsl #16 ; move the bottom lanes (other row) to the top
    mov r1, r1, lsl #16
    mov r10, r10, lsl #16
    mov r6, r6, lsl #16
    mov r7, r7, asr #3
    pkhtb r7, r7, r10, asr #19 ; column 3 | column 1
    mov r1, r1, asr #3
    pkhtb r1, r1, r6, asr #19 ; column 2 | column 0
    uxtb16 r8, r11, ror #8 ; predictor bytes 3 | 1
    qadd16 r7, r7, r8
    uxtb16 r8, r11 ; predictor bytes 2 | 0
    qadd16 r1, r1, r8
    usat16 r7, #8, r7
    usat16 r1, #8, r1
    orr r1, r1, r7, lsl #8
    str r9, [lr], r12 ; first output row; dest += stride
    str r1, [lr], r12 ; second output row; dest += stride
    str lr, [sp] ; save the advanced dest pointer
    bne vp8_dequant_dc_idct_loop2_v6
; vpx_memset
; Clear the 32-byte coefficient block now that it has been consumed.
    sub r0, r0, #32 ; rewind to the start of the block
    add sp, sp, #4 ; release the dest slot
    mov r12, #0
    str r12, [r0]
    str r12, [r0, #4]
    str r12, [r0, #8]
    str r12, [r0, #12]
    str r12, [r0, #16]
    str r12, [r0, #20]
    str r12, [r0, #24]
    str r12, [r0, #28]
    ldmia sp!, {r4 - r11, pc}
    ENDP ; |vp8_dequant_dc_idct_add_v6|
; Constant Pool
; The first two are used as the 32-bit operand of smulwt/smulwb in the
; transform (20091 and 35468 decimal); the third is the per-halfword
; rounding term added before the shift right by 3.
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
;
; Dequantizes a block of 16 coefficients in place (input[i] * dq[i]), runs
; the two-pass transform marked "short_idct4x4llm_v6_dual" below, adds the
; rounded result to four rows of predictor bytes, clamps to 0..255, writes
; four bytes per row to dest and finally clears the 32-byte input block.
;
; r0 = input
; r1 = dq
; r2 = pred (advanced by pitch after each row)
; r3 = dest (kept in a stack slot; advanced by stride after each row)
; Stack arguments. First offset: after the 9-register (36-byte) push.
; Second offset: after the extra 4-byte slot for dest has been reserved.
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
|vp8_dequant_idct_add_v6| PROC
    stmdb sp!, {r4-r11, lr}
    ldr r4, [r0] ; input[0..1]
    ldr r5, [r1], #4 ; dq[0..1]
    sub sp, sp, #4 ; reserve one word for the dest pointer
    str r3, [sp] ; save dest; r3 is reused as a multiplier below
    mov r12, #4 ; 4 iterations x 4 elements = 16
; Dequantize four elements per iteration, storing the low 16 bits of each
; product back over the coefficient it came from.
vp8_dequant_add_loop
    smulbb r6, r4, r5
    smultt r7, r4, r5
    ldr r4, [r0, #4] ; next input pair
    ldr r5, [r1], #4 ; next dq pair
    strh r6, [r0], #2
    strh r7, [r0], #2
    smulbb r6, r4, r5
    smultt r7, r4, r5
    subs r12, r12, #1 ; flags drive the ldrne/bne below
    ldrne r4, [r0, #4] ; skipped on the last pass: nothing left to load
    ldrne r5, [r1], #4
    strh r6, [r0], #2
    strh r7, [r0], #2
    bne vp8_dequant_add_loop
    sub r0, r0, #32 ; rewind to the start of the 16-short block
    mov r1, r0 ; r1 = write pointer for the first pass
; short_idct4x4llm_v6_dual
; First pass: the four words at offsets 0, 8, 16 and 24 are rows 0-3 of one
; pair of columns (two packed halfwords each). Two iterations cover all
; four columns; results are written back in place.
    ldr r3, cospi8sqrt2minus1
    ldr r4, sinpi8sqrt2
    ldr r6, [r0, #8] ; row 1 of the first column pair
    mov r5, #2
vp8_dequant_idct_loop1_v6
    ldr r12, [r0, #24] ; row 3
    ldr r14, [r0, #16] ; row 2
    smulwt r9, r3, r6 ; (row1 top * cospi8sqrt2minus1) >> 16
    smulwb r7, r3, r6 ; (row1 bottom * cospi8sqrt2minus1) >> 16
    smulwt r10, r4, r6 ; (row1 top * sinpi8sqrt2) >> 16
    smulwb r8, r4, r6 ; (row1 bottom * sinpi8sqrt2) >> 16
    pkhbt r7, r7, r9, lsl #16 ; pack row1 * cospi8sqrt2minus1
    smulwt r11, r3, r12 ; (row3 top * cospi8sqrt2minus1) >> 16
    pkhbt r8, r8, r10, lsl #16 ; pack row1 * sinpi8sqrt2
    uadd16 r6, r6, r7 ; row1 + row1 * cospi8sqrt2minus1
    smulwt r7, r4, r12 ; (row3 top * sinpi8sqrt2) >> 16
    smulwb r9, r3, r12 ; (row3 bottom * cospi8sqrt2minus1) >> 16
    smulwb r10, r4, r12 ; (row3 bottom * sinpi8sqrt2) >> 16
    subs r5, r5, #1 ; flags drive the ldrne/bne below
    pkhbt r9, r9, r11, lsl #16 ; pack row3 * cospi8sqrt2minus1
    ldr r11, [r0], #4 ; row 0; step to the next column pair
    pkhbt r10, r10, r7, lsl #16 ; pack row3 * sinpi8sqrt2
    uadd16 r7, r12, r9 ; row3 + row3 * cospi8sqrt2minus1
    usub16 r7, r8, r7 ; c1 = row1 * sin term - row3 * cos term
    uadd16 r6, r6, r10 ; d1 = row1 * cos term + row3 * sin term
    uadd16 r10, r11, r14 ; a1 = row0 + row2
    usub16 r8, r11, r14 ; b1 = row0 - row2
    uadd16 r9, r10, r6 ; a1 + d1
    usub16 r10, r10, r6 ; a1 - d1
    uadd16 r6, r8, r7 ; b1 + c1
    usub16 r7, r8, r7 ; b1 - c1
    str r6, [r1, #8] ; row 1 = b1 + c1
    ldrne r6, [r0, #8] ; preload row 1 of the next column pair
    str r7, [r1, #16] ; row 2 = b1 - c1
    str r10, [r1, #24] ; row 3 = a1 - d1
    str r9, [r1], #4 ; row 0 = a1 + d1; step the write pointer
    bne vp8_dequant_idct_loop1_v6
; Second pass: each iteration reads two rows (four words), transforms them
; with one row per halfword lane, adds the rounding constant, shifts right
; by 3, adds the predictor bytes and stores two rows of output.
    mov r5, #2
    sub r0, r1, #8 ; back to the start of the block
vp8_dequant_idct_loop2_v6
    ldr r6, [r0], #4 ; row n, values 0-1
    ldr r7, [r0], #4 ; row n, values 2-3
    ldr r8, [r0], #4 ; row n+1, values 0-1
    ldr r9, [r0], #4 ; row n+1, values 2-3
    smulwt r1, r3, r6 ; row n value 1 * cospi8sqrt2minus1 >> 16
    smulwt r12, r4, r6 ; row n value 1 * sinpi8sqrt2 >> 16
    smulwt lr, r3, r8 ; row n+1 value 1 * cospi8sqrt2minus1 >> 16
    smulwt r10, r4, r8 ; row n+1 value 1 * sinpi8sqrt2 >> 16
    pkhbt r11, r8, r6, lsl #16 ; value 0: row n (top) | row n+1 (bottom)
    pkhbt r1, lr, r1, lsl #16 ; value 1 * cospi8sqrt2minus1, both rows
    pkhbt r12, r10, r12, lsl #16 ; value 1 * sinpi8sqrt2, both rows
    pkhtb r6, r6, r8, asr #16 ; value 1, both rows
    uadd16 r6, r1, r6 ; value 1 + value 1 * cospi8sqrt2minus1
    pkhbt lr, r9, r7, lsl #16 ; value 2, both rows
    uadd16 r10, r11, lr ; a1 = value 0 + value 2
    usub16 lr, r11, lr ; b1 = value 0 - value 2 (lr used as data)
    pkhtb r8, r7, r9, asr #16 ; value 3, both rows
    subs r5, r5, #1 ; flags drive the bne at the end of the loop
    smulwt r1, r3, r8
    smulwb r7, r3, r8
    smulwt r11, r4, r8
    smulwb r9, r4, r8
    pkhbt r1, r7, r1, lsl #16 ; value 3 * cospi8sqrt2minus1, both rows
    uadd16 r8, r1, r8 ; value 3 + value 3 * cospi8sqrt2minus1
    pkhbt r11, r9, r11, lsl #16 ; value 3 * sinpi8sqrt2, both rows
    usub16 r1, r12, r8 ; c1
    uadd16 r8, r11, r6 ; d1
    ldr r9, c0x00040004 ; rounding term 4 in each halfword
    ldr r12, [sp, #40] ; r12 = pitch
    uadd16 r6, r10, r8 ; a1 + d1 (output column 0)
    usub16 r7, r10, r8 ; a1 - d1 (output column 3)
    uadd16 r7, r7, r9 ; + 4
    uadd16 r6, r6, r9 ; + 4
    uadd16 r10, r14, r1 ; b1 + c1 (output column 1); r14 is lr
    usub16 r1, r14, r1 ; b1 - c1 (output column 2)
    uadd16 r10, r10, r9 ; + 4
    uadd16 r1, r1, r9 ; + 4
    ldr r11, [r2], r12 ; four predictor bytes; pred += pitch
    mov r8, r7, asr #3
    pkhtb r9, r8, r10, asr #19 ; top lanes >> 3: column 3 | column 1
    mov r8, r1, asr #3
    pkhtb r8, r8, r6, asr #19 ; top lanes >> 3: column 2 | column 0
    uxtb16 lr, r11, ror #8 ; predictor bytes 3 | 1
    qadd16 r9, r9, lr
    uxtb16 lr, r11 ; predictor bytes 2 | 0
    qadd16 r8, r8, lr
    usat16 r9, #8, r9 ; clamp each halfword to 0..255
    usat16 r8, #8, r8
    orr r9, r8, r9, lsl #8 ; interleave into four output bytes
    ldr r11, [r2], r12 ; predictor bytes for the second row
    ldr lr, [sp] ; lr = current dest pointer
    ldr r12, [sp, #44] ; r12 = stride
    mov r7, r7, lsl #16 ; move the bottom lanes (other row) to the top
    mov r1, r1, lsl #16
    mov r10, r10, lsl #16
    mov r6, r6, lsl #16
    mov r7, r7, asr #3
    pkhtb r7, r7, r10, asr #19 ; column 3 | column 1
    mov r1, r1, asr #3
    pkhtb r1, r1, r6, asr #19 ; column 2 | column 0
    uxtb16 r8, r11, ror #8 ; predictor bytes 3 | 1
    qadd16 r7, r7, r8
    uxtb16 r8, r11 ; predictor bytes 2 | 0
    qadd16 r1, r1, r8
    usat16 r7, #8, r7
    usat16 r1, #8, r1
    orr r1, r1, r7, lsl #8
    str r9, [lr], r12 ; first output row; dest += stride
    str r1, [lr], r12 ; second output row; dest += stride
    str lr, [sp] ; save the advanced dest pointer
    bne vp8_dequant_idct_loop2_v6
; vpx_memset
; Clear the 32-byte coefficient block now that it has been consumed.
    sub r0, r0, #32 ; rewind to the start of the block
    add sp, sp, #4 ; release the dest slot
    mov r12, #0
    str r12, [r0]
    str r12, [r0, #4]
    str r12, [r0, #8]
    str r12, [r0, #12]
    str r12, [r0, #16]
    str r12, [r0, #20]
    str r12, [r0, #24]
    str r12, [r0, #28]
    ldmia sp!, {r4 - r11, pc}
    ENDP ; |vp8_dequant_idct_add_v6|
; Constant Pool
; The first two are used as the 32-bit operand of smulwt/smulwb in the
; transform (20091 and 35468 decimal); the third is the per-halfword
; rounding term added before the shift right by 3.
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
|vp8_dequant_dc_idct_v6| PROC
stmdb sp!, {r4-r11, lr}
ldr r6, [sp, #36] ;load Dc
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4
str r0, [sp]
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1],