From 1d6dc1b7022e6966ab417f6e24d6e15e4211aacb Mon Sep 17 00:00:00 2001
From: Frank Galligan <fgalligan@google.com>
Date: Fri, 21 Jun 2013 12:58:46 -0700
Subject: [PATCH] Add Neon optimized loop filter functions.

- Added vp9_loop_filter_horizontal_edge_neon and
  vp9_loop_filter_vertical_edge_neon.
- The functions are based off the vp8 loopfilter
  functions.
- Matches x86 md5 checksum.

Change-Id: Id1c4dddb03584227e5ecd29f574a6ac27738fdd0
---
 vp9/common/arm/neon/vp9_loopfilter_neon.asm | 270 ++++++++++++++++++++
 vp9/common/vp9_rtcd_defs.sh                 |   4 +-
 vp9/vp9_common.mk                           |   2 +
 3 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 vp9/common/arm/neon/vp9_loopfilter_neon.asm

diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
new file mode 100644
index 0000000000..5011315438
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -0,0 +1,270 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_loop_filter_horizontal_edge_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Filters 8 pixels across a horizontal edge per iteration (the vp8 loop filter
+; works on 16 at a time). Each iteration reads the 8 rows p3..q3 around s and
+; writes back the 4 rows p1, p0, q0, q1, then advances s by 8 pixels.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
+;                                           int p /* pitch */,
+;                                           const uint8_t *blimit,
+;                                           const uint8_t *limit,
+;                                           const uint8_t *thresh,
+;                                           int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_horizontal_edge_neon| PROC
+    push        {r4-r6, lr}
+
+    ldr         r12, [sp,#20]              ; load count (past the 16 pushed bytes)
+    ldrb        r4, [r2]                   ; load *blimit
+    ldrb        r5, [r3]                   ; load *limit
+    cmp         r12, #0
+    beq         end_vp9_lf_h_edge
+
+    ldr         r3, [sp, #16]              ; load thresh
+    vdup.u8     d0, r4                     ; duplicate blimit
+    ldrb        r6, [r3]                   ; load *thresh
+    vdup.u8     d1, r5                     ; duplicate limit
+    vdup.u8     d2, r6                     ; duplicate thresh
+    add         r1, r1, r1                 ; r1 = 2 * pitch; once, not per pass
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (even rows)
+    add         r6, r2, r1, asr #1         ; r6 = s - 3 * pitch (odd rows)
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r6@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r6@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r6@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r6@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1         ; r2 = s - 2 * pitch (row p1)
+    sub         r6, r6, r1, lsl #1         ; r6 = s - 1 * pitch (row p0)
+
+    bl          vp9_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r6@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r6@64], r1          ; store oq1
+
+    add         r0, r0, #8                 ; next 8 pixels along the edge
+    subs        r12, r12, #1
+    bne         count_lf_h_loop
+
+end_vp9_lf_h_edge
+    pop         {r4-r6, pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_neon|
+
+; Filters 8 rows across a vertical edge per iteration (the vp8 loop filter
+; works on 16 at a time). Each iteration loads an 8x8 block starting 4 columns
+; left of s, transposes it, filters, and stores 4 columns starting at s - 2.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
+;                                         int p /* pitch */,
+;                                         const uint8_t *blimit,
+;                                         const uint8_t *limit,
+;                                         const uint8_t *thresh,
+;                                         int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vp9_loop_filter_vertical_edge_neon| PROC
+    push        {r4-r6, lr}
+
+    ldr         r12, [sp,#20]              ; load count (past the 16 pushed bytes)
+    ldrb        r4, [r2]                   ; load *blimit
+    ldrb        r5, [r3]                   ; load *limit
+    cmp         r12, #0
+    beq         end_vp9_lf_v_edge
+
+    ldr         r3, [sp, #16]              ; load thresh
+    vdup.u8     d0, r4                     ; duplicate blimit
+    ldrb        r6, [r3]                   ; load *thresh
+    vdup.u8     d1, r5                     ; duplicate limit
+    vdup.u8     d2, r6                     ; duplicate thresh
+
+count_lf_v_loop
+    sub         r6, r0, #4                 ; move s pointer left by 4 columns
+
+    vld1.u8     {d3}, [r6], r1             ; load s data, one row per register
+    vld1.u8     {d4}, [r6], r1
+    vld1.u8     {d5}, [r6], r1
+    vld1.u8     {d6}, [r6], r1
+    vld1.u8     {d7}, [r6], r1
+    vld1.u8     {d16}, [r6], r1
+    vld1.u8     {d17}, [r6], r1
+    vld1.u8     {d18}, [r6]
+
+    ;transpose the 8x8 block so each register holds one column (p3..q3)
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vp9_loop_filter_neon
+
+    sub         r2, r0, #2                 ; store ptr in scratch; r0 stays = s
+
+    ;store op1, op0, oq0, oq1 (lane i of d4-d7 forms the 4 bytes of row i)
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r2], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r2], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r2], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r2], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r2], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r2], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r2], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r2]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1
+    bne         count_lf_v_loop
+
+end_vp9_lf_v_edge
+    pop         {r4-r6, pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_neon|
+
+; void vp9_loop_filter_neon();
+; Helper function for the loop filters. The individual functions do the
+; necessary load, transpose (if necessary) and store. Only NEON registers are
+; written; d8-d15 are not used. Out: d4 = op1, d5 = op0, d6 = oq0, d7 = oq1.
+; Clobbers d3 and d16-d28; d0-d2 are only read.
+; r0-r3 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+|vp9_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20
+    vmax.u8     d20, d21, d22
+    vmax.u8     d3, d3, d4
+    vmax.u8     d23, d19, d20
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; largest of the six differences
+
+    vmov.u8     d18, #0x80                  ; bias used for u8 <-> s8 conversion
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2 (saturating)
+
+    ; (limit >= largest neighbouring difference) * -1
+    vcge.u8     d23, d1, d23
+
+    ; filter() function
+    ; convert to signed
+    veor        d7, d7, d18                 ; qs0
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+    ; NOTE(review): vsub.s8 does not widen; lanes with |qs0 - ps0| > 127 wrap.
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+    ; Such lanes look to be zeroed by filter_mask unless blimit is 255; confirm.
+    vcge.u8     d17, d0, d17                ; (blimit >= b + a / 2) * -1
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    ; inner taps: ps0 gains filter2, qs0 loses filter1
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter >> 1
+    vrshr.s8    d27, d27, #1
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+    END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 9daf908d58..c76e8f7364 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -78,7 +78,7 @@ prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uin
 specialize vp9_mbloop_filter_vertical_edge sse2
 
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx
+specialize vp9_loop_filter_vertical_edge mmx neon
 
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
 specialize vp9_mb_lpf_horizontal_edge_w sse2
@@ -87,7 +87,7 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u
 specialize vp9_mbloop_filter_horizontal_edge sse2
 
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx
+specialize vp9_loop_filter_horizontal_edge mmx neon
 
 #
 # post proc
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 7a7483332a..7eac3a1a2a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -100,6 +100,8 @@ endif
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
 
-- 
GitLab