Commit 6fdc9aa7 authored by Tero Rintaluoma's avatar Tero Rintaluoma
Browse files

ARMv6 optimized subtract functions

Adds following ARMv6 optimized functions to encoder:
  - vp8_subtract_b_armv6
  - vp8_subtract_mby_armv6
  - vp8_subtract_mbuv_armv6

Gives a 1-5% speed-up depending on input sequence and encoding
parameters. The functions have one stall cycle inside the loop body
on the Cortex pipeline.

Change-Id: I19cca5408b9861b96f378e818eefeb3855238639
parent 4be062bb
......@@ -66,10 +66,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_c;
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;
cpi->rtcd.encodemb.subb = vp8_subtract_b_c;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_c;*/
cpi->rtcd.encodemb.subb = vp8_subtract_b_armv6;
cpi->rtcd.encodemb.submby = vp8_subtract_mby_armv6;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_armv6;
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
......
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_subtract_mby_armv6|
EXPORT |vp8_subtract_mbuv_armv6|
EXPORT |vp8_subtract_b_armv6|
INCLUDE asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_subtract_b_armv6(BLOCK *be, BLOCKD *bd, int pitch)
;
; Computes the 4x4 residual for one block: diff = src - pred.
; Each loop iteration handles one row: a single 32-bit load gives
; 4 source pixels and 4 prediction pixels, which are widened to
; 16-bit lanes with UXTB16, subtracted with USUB16, and repacked
; into two words of four signed 16-bit differences.
;
; r0 BLOCK *be      (source block descriptor)
; r1 BLOCKD *bd     (destination/prediction descriptor)
; r2 int pitch      (prediction stride in bytes; diff stride in shorts)
; Clobbers r0-r9 (r4-r9 saved/restored), flags.
;-----------------------------------------------------------------------
|vp8_subtract_b_armv6| PROC
stmfd sp!, {r4-r9}
ldr r4, [r0, #vp8_block_base_src]
ldr r5, [r0, #vp8_block_src]
ldr r6, [r0, #vp8_block_src_diff]
ldr r3, [r4]
ldr r7, [r0, #vp8_block_src_stride]
add r3, r3, r5 ; src = *base_src + src
ldr r8, [r1, #vp8_blockd_predictor]
mov r9, #4 ; loop count: 4 rows of 4 pixels
loop_block
ldr r0, [r3], r7 ; src word (4 pixels), advance by src_stride
ldr r1, [r8], r2 ; pred word (4 pixels), advance by pitch
uxtb16 r4, r0 ; even src bytes  -> [s2 | s0]
uxtb16 r5, r1 ; even pred bytes -> [p2 | p0]
uxtb16 r0, r0, ror #8 ; odd src bytes   -> [s3 | s1]
uxtb16 r1, r1, ror #8 ; odd pred bytes  -> [p3 | p1]
usub16 r4, r4, r5 ; per-lane 16-bit subtract -> [d2 | d0]
usub16 r5, r0, r1 ; per-lane 16-bit subtract -> [d3 | d1]
subs r9, r9, #1 ; decrement loop counter early; pkhbt/pkhtb/str/add below leave flags intact
pkhbt r0, r4, r5, lsl #16 ; repack low halves  -> [d1 | d0]
pkhtb r1, r5, r4, asr #16 ; repack high halves -> [d3 | d2]
str r0, [r6, #0] ; diff[0..1]
str r1, [r6, #4] ; diff[2..3]
add r6, r6, r2, lsl #1 ; diff is short*, so advance by pitch*2 bytes
bne loop_block
ldmfd sp!, {r4-r9}
mov pc, lr
ENDP
;-----------------------------------------------------------------------
; void vp8_subtract_mbuv_armv6(short *diff, unsigned char *usrc,
;                              unsigned char *vsrc, unsigned char *pred,
;                              int stride)
;
; Computes the 8x8 U and 8x8 V chroma residuals of a macroblock:
; diff = src - pred. Each loop iteration handles one 8-pixel row as
; two interleaved 4-pixel groups (A) and (B), scheduled to overlap
; loads with the SIMD widen/subtract/repack sequence.
;
; r0 short *diff        (residual buffer; U starts at diff + 256 shorts)
; r1 unsigned char *usrc
; r2 unsigned char *vsrc
; r3 unsigned char *pred (U prediction at pred + 256 bytes)
; stack int stride      (source stride in bytes)
; Clobbers r0-r12, lr (r4-r12, lr saved/restored), flags.
;-----------------------------------------------------------------------
|vp8_subtract_mbuv_armv6| PROC
stmfd sp!, {r4-r12, lr}
add r0, r0, #512 ; set *diff point to Cb (256 shorts = 512 bytes past luma diff)
add r3, r3, #256 ; set *pred point to Cb (past the 16x16 luma prediction)
mov r4, #8 ; loop count: 8 rows
ldr r5, [sp, #40] ; stride (10 regs saved above = 40 bytes of stack)
; Subtract U block
loop_u
ldr r6, [r1] ; src (A)
ldr r7, [r3], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r1, #4] ; src (B) loaded early to hide latency
ldr r11, [r3], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
add r1, r1, r5 ; update usrc pointer to next row
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
subs r4, r4, #1 ; update loop counter; final str leaves flags intact
str r9, [r0], #4 ; diff (B)
bne loop_u
mov r4, #8 ; loop count: 8 rows
; Subtract V block (same schedule as U, reading vsrc via r2;
; r0 and r3 continue past the U diff/pred so no re-seek is needed)
loop_v
ldr r6, [r2] ; src (A)
ldr r7, [r3], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r2, #4] ; src (B)
ldr r11, [r3], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
add r2, r2, r5 ; update vsrc pointer to next row
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
subs r4, r4, #1 ; update loop counter
str r9, [r0], #4 ; diff (B)
bne loop_v
ldmfd sp!, {r4-r12, pc} ; restore and return (pc loaded directly)
ENDP
;-----------------------------------------------------------------------
; void vp8_subtract_mby_armv6(short *diff, unsigned char *src,
;                             unsigned char *pred, int stride)
;
; Computes the 16x16 luma residual of a macroblock: diff = src - pred.
; Each loop iteration handles one 16-pixel row as four 4-pixel word
; groups (A)-(D); each group is widened to 16-bit lanes (UXTB16),
; subtracted (USUB16), repacked (PKHBT/PKHTB) and stored, with the
; next group's loads issued early to hide load latency.
;
; r0 short *diff
; r1 unsigned char *src
; r2 unsigned char *pred
; r3 int stride       (source stride in bytes)
; Clobbers r0-r11 (r4-r11 saved/restored), flags.
;-----------------------------------------------------------------------
|vp8_subtract_mby_armv6| PROC
stmfd sp!, {r4-r11}
mov r4, #16 ; loop count: 16 rows
loop
ldr r6, [r1] ; src (A)
ldr r7, [r2], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r1, #4] ; src (B) loaded early to hide latency
ldr r11, [r2], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
ldr r10, [r1, #8] ; src (C)
ldr r11, [r2], #4 ; pred (C)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
uxtb16 r8, r10 ; [s2 | s0] (C)
str r9, [r0], #4 ; diff (B)
uxtb16 r9, r11 ; [p2 | p0] (C)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
usub16 r6, r8, r9 ; [d2 | d0] (C)
usub16 r7, r10, r11 ; [d3 | d1] (C)
ldr r10, [r1, #12] ; src (D)
ldr r11, [r2], #4 ; pred (D)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
str r8, [r0], #4 ; diff (C)
uxtb16 r8, r10 ; [s2 | s0] (D)
str r9, [r0], #4 ; diff (C)
uxtb16 r9, r11 ; [p2 | p0] (D)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
usub16 r6, r8, r9 ; [d2 | d0] (D)
usub16 r7, r10, r11 ; [d3 | d1] (D)
add r1, r1, r3 ; update src pointer to next row
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
str r8, [r0], #4 ; diff (D)
subs r4, r4, #1 ; update loop counter; final str leaves flags intact
str r9, [r0], #4 ; diff (D)
bne loop
ldmfd sp!, {r4-r11}
mov pc, lr
ENDP
END
......@@ -12,6 +12,24 @@
#ifndef ENCODEMB_ARM_H
#define ENCODEMB_ARM_H
#if HAVE_ARMV6
extern prototype_subb(vp8_subtract_b_armv6);
extern prototype_submby(vp8_subtract_mby_armv6);
extern prototype_submbuv(vp8_subtract_mbuv_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_subb
#define vp8_encodemb_subb vp8_subtract_b_armv6
#undef vp8_encodemb_submby
#define vp8_encodemb_submby vp8_subtract_mby_armv6
#undef vp8_encodemb_submbuv
#define vp8_encodemb_submbuv vp8_subtract_mbuv_armv6
#endif
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
//extern prototype_berr(vp8_block_error_c);
//extern prototype_mberr(vp8_mbblock_error_c);
......
......@@ -48,6 +48,14 @@ DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
// subtract
DEFINE(vp8_block_base_src, offsetof(BLOCK, base_src));
DEFINE(vp8_block_src, offsetof(BLOCK, src));
DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
//pack tokens
DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
......
......@@ -34,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
......
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment