From 1d6dc1b7022e6966ab417f6e24d6e15e4211aacb Mon Sep 17 00:00:00 2001 From: Frank Galligan <fgalligan@google.com> Date: Fri, 21 Jun 2013 12:58:46 -0700 Subject: [PATCH] Add Neon optimized loop filter functions. - Added vp9_loop_filter_horizontal_edge_neon and vp9_loop_filter_vertical_edge_neon. - The functions are based off the vp8 loopfilter functions. - Matches x86 md5 checksum. Change-Id: Id1c4dddb03584227e5ecd29f574a6ac27738fdd0 --- vp9/common/arm/neon/vp9_loopfilter_neon.asm | 270 ++++++++++++++++++++ vp9/common/vp9_rtcd_defs.sh | 4 +- vp9/vp9_common.mk | 2 + 3 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 vp9/common/arm/neon/vp9_loopfilter_neon.asm diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm new file mode 100644 index 0000000000..5011315438 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -0,0 +1,270 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_loop_filter_horizontal_edge_neon| + EXPORT |vp9_loop_filter_vertical_edge_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. 
+;
+; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
+;                                           int p /* pitch */,
+;                                           const uint8_t *blimit,
+;                                           const uint8_t *limit,
+;                                           const uint8_t *thresh,
+;                                           int count)
+;
+; Filters a horizontal edge. For each of `count` 8-pixel-wide segments the
+; eight rows s - 4 * p .. s + 3 * p (p3..q3) are loaded, passed through
+; vp9_loop_filter_neon, and the four rows s - 2 * p .. s + p (op1..oq1) are
+; written back; s then advances by 8 bytes. Returns without touching memory
+; at s when count is 0. The @64 qualifier on the loads/stores requires every
+; row address to be 8-byte aligned.
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_horizontal_edge_neon| PROC
+    push        {r4-r6, lr}                ; 16 bytes pushed: stack args are now
+                                           ; at sp+16 (thresh) and sp+20 (count)
+
+    ldr         r12, [sp,#20]              ; load count
+    ldrb        r4, [r2]                   ; load *blimit
+    ldrb        r5, [r3]                   ; load *limit
+    cmp         r12, #0
+    beq         end_vp9_lf_h_edge          ; count == 0: nothing to filter
+
+    ldr         r3, [sp, #16]              ; load thresh
+    vdup.u8     d0, r4                     ; duplicate blimit
+    ldrb        r6, [r3]                   ; load *thresh
+    vdup.u8     d1, r5                     ; duplicate limit
+    vdup.u8     d2, r6                     ; duplicate thresh
+
+    ; The pitch must be doubled exactly once. Doubling it inside the loop
+    ; would compound the stride on every pass and make each segment after the
+    ; first address the wrong rows when count > 1.
+    add         r1, r1, r1                 ; r1 = 2 * pitch
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (p3 row)
+    add         r6, r2, r1, asr #1         ; r6 = s - 3 * pitch (p2 row); asr
+                                           ; keeps a negative pitch correct
+
+    ; r2 walks the even rows and r6 the odd rows, each stepping 2 * pitch.
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r6@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r6@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r6@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r6@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1         ; rewind to s - 2 * pitch (p1 row)
+    sub         r6, r6, r1, lsl #1         ; rewind to s - 1 * pitch (p0 row)
+
+    bl          vp9_loop_filter_neon       ; results come back in d4-d7
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r6@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r6@64], r1          ; store oq1
+
+    add         r0, r0, #8                 ; next 8-pixel segment of the edge
+    subs        r12, r12, #1
+    bne         count_lf_h_loop
+
+end_vp9_lf_h_edge
+    pop         {r4-r6, pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
+;                                         int p /* pitch */,
+;                                         const uint8_t *blimit,
+;                                         const uint8_t *limit,
+;                                         const uint8_t *thresh,
+;                                         int count)
+;
+; Filters a vertical edge. For each of `count` 8-row segments, 8 bytes are
+; loaded from s - 4 on each of 8 consecutive rows, the 8x8 block is transposed
+; so that every d register holds one pixel column (p3..q3), the block is
+; passed through vp9_loop_filter_neon, and the 4 bytes at s - 2 .. s + 1 of
+; every row (op1, op0, oq0, oq1) are written back; s then advances by
+; 8 * pitch. Returns without touching memory at s when count is 0.
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_vertical_edge_neon| PROC
+    push        {r4-r6, lr}                ; 16 bytes pushed: stack args are now
+                                           ; at sp+16 (thresh) and sp+20 (count)
+
+    ldr         r12, [sp,#20]              ; load count
+    ldrb        r4, [r2]                   ; load *blimit
+    ldrb        r5, [r3]                   ; load *limit
+    cmp         r12, #0
+    beq         end_vp9_lf_v_edge          ; count == 0: nothing to filter
+
+    ldr         r3, [sp, #16]              ; load thresh
+    vdup.u8     d0, r4                     ; duplicate blimit
+    ldrb        r6, [r3]                   ; load *thresh
+    vdup.u8     d1, r5                     ; duplicate limit
+    vdup.u8     d2, r6                     ; duplicate thresh
+
+count_lf_v_loop
+    sub         r6, r0, #4                 ; move s pointer down by 4 columns
+
+    vld1.u8     {d3}, [r6], r1             ; load s data, one row per register
+    vld1.u8     {d4}, [r6], r1
+    vld1.u8     {d5}, [r6], r1
+    vld1.u8     {d6}, [r6], r1
+    vld1.u8     {d7}, [r6], r1
+    vld1.u8     {d16}, [r6], r1
+    vld1.u8     {d17}, [r6], r1
+    vld1.u8     {d18}, [r6]
+
+    ; transpose the 8x8 block: afterwards d3..d7, d16..d18 hold the columns
+    ; p3, p2, p1, p0, q0, q1, q2, q3
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vp9_loop_filter_neon       ; results come back in d4-d7
+
+    ; Store through r6 so that r0 keeps pointing at s. Post-incrementing r0
+    ; here would leave it 7 rows down and 2 columns left, and the
+    ; "s += pitch * 8" below would then misplace every segment after the first.
+    sub         r6, r0, #2
+
+    ;store op1, op0, oq0, oq1 (lane n of d4-d7 forms the 4 bytes of row n)
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r6], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r6], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r6], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r6], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r6], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r6], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r6], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r6]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    bne         count_lf_v_loop
+
+end_vp9_lf_v_edge
+    pop         {r4-r6, pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters.
+; The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs (one pixel per 8-bit lane, 8 lanes per register):
+; r0-r3 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+;
+; Also overwritten: d3 and d16-d28 (q12 is d24/d25). Only NEON registers are
+; written, which is how r0-r3 are preserved for the callers.
+|vp9_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                ; abs(p3 - p2)
+    vabd.u8     d20, d4, d5                ; abs(p2 - p1)
+    vabd.u8     d21, d5, d6                ; abs(p1 - p0)
+    vabd.u8     d22, d16, d7               ; abs(q1 - q0)
+    vabd.u8     d3, d17, d16               ; abs(q2 - q1)
+    vabd.u8     d4, d18, d17               ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
+    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
+    vmax.u8     d3, d3, d4                 ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d23, d19, d20
+
+    vabd.u8     d17, d6, d7                ; abs(p0 - q0)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2               ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3               ; d23 = largest of the six differences
+
+    vmov.u8     d18, #0x80                 ; sign-bit constant for the
+                                           ; unsigned <-> signed conversion
+
+    vabd.u8     d28, d5, d16               ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17              ; b = abs(p0 - q0) * 2 (saturating)
+
+    ; (limit >= largest difference) * -1
+    vcge.u8     d23, d1, d23
+
+    ; filter() function
+    ; convert to signed
+    veor        d7, d7, d18                ; qs0
+    vshr.u8     d28, d28, #1               ; a = a / 2
+    veor        d6, d6, d18                ; ps0
+
+    veor        d5, d5, d18                ; ps1
+    vqadd.u8    d17, d17, d28              ; a = b + a (saturating)
+
+    veor        d16, d16, d18              ; qs1
+
+    vmov.u8     d19, #3
+
+    ; NOTE(review): this subtract wraps at 8 bits. It looks safe because a lane
+    ; that passes the blimit test below has abs(p0 - q0) <= 127 whenever
+    ; blimit < 255, and other lanes are zeroed by the mask - confirm.
+    vsub.s8     d28, d7, d6                ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17               ; (blimit >= abs(p0 - q0) * 2 +
+                                           ;  abs(p1 - q1) / 2) * -1
+
+    vqsub.s8    d27, d5, d16               ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22              ; hevmask
+
+    vmull.s8    q12, d28, d19              ; 3 * ( qs0 - ps0), widened to 16 bits
+
+    vand        d27, d27, d22              ; filter &= hev
+    vand        d23, d23, d17              ; filter_mask = limit test & blimit test
+
+    vaddw.s8    q12, q12, d27              ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12                   ; saturating narrow back to 8 bits
+
+    vand        d27, d27, d23              ; filter &= mask
+
+    vqadd.s8    d28, d27, d19              ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17              ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3               ; filter2 >>= 3
+    vshr.s8     d27, d27, #3               ; filter1 >>= 3
+
+
+    vqadd.s8    d19, d6, d28               ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27               ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter >> 1
+    vrshr.s8    d27, d27, #1               ; rounding shift: (filter1 + 1) >> 1
+    vbic        d27, d27, d22              ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27               ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27              ; u = clamp(qs1 - filter)
+
+    ; convert back to unsigned and place the results in the output registers
+    veor        d5, d19, d18               ; *op0 = u^0x80
+    veor        d6, d26, d18               ; *oq0 = u^0x80
+    veor        d4, d21, d18               ; *op1 = u^0x80
+    veor        d7, d20, d18               ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+    END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 9daf908d58..c76e8f7364 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -78,7 +78,7 @@ prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uin
 specialize vp9_mbloop_filter_vertical_edge sse2
 
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx
+specialize vp9_loop_filter_vertical_edge mmx neon
 
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
 specialize vp9_mb_lpf_horizontal_edge_w sse2
@@ -87,7 +87,7 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u
 specialize vp9_mbloop_filter_horizontal_edge sse2
 
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx
+specialize vp9_loop_filter_horizontal_edge mmx neon
 
 #
 # post proc
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 7a7483332a..7eac3a1a2a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -100,6 +100,8 @@ endif
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+
 $(eval $(call
asm_offsets_template,\ vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c)) -- GitLab