@ convolve_neon.s
@/*
@ ** Copyright 2003-2010, VisualOn, Inc.
@ **
@ ** Licensed under the Apache License, Version 2.0 (the "License");
@ ** you may not use this file except in compliance with the License.
@ ** You may obtain a copy of the License at
@ **
@ **     http://www.apache.org/licenses/LICENSE-2.0
@ **
@ ** Unless required by applicable law or agreed to in writing, software
@ ** distributed under the License is distributed on an "AS IS" BASIS,
@ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ ** See the License for the specific language governing permissions and
@ ** limitations under the License.
@ */
@
@*void Convolve (
@*    Word16 x[],        /* (i)     : input vector                           */
@*    Word16 h[],        /* (i)     : impulse response                       */
@*    Word16 y[],        /* (o)     : output vector                          */
@*    Word16 L           /* (i)     : vector size                            */
@*)
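@
@ A minimal C sketch of the computation this routine performs (added for
@ reference, not part of the original source), assuming the usual AMR-WB
@ fixed-point convolution with Q15 rounding and ignoring overflow handling;
@ Word16/Word32 are the codec's 16-/32-bit integer types.  Note that the
@ NEON code below hard-codes L = 64:
@
@*    void Convolve (Word16 x[], Word16 h[], Word16 y[], Word16 L)
@*    {
@*        Word32 n, i, s;
@*        for (n = 0; n < L; n++)
@*        {
@*            s = 0;
@*            for (i = 0; i <= n; i++)           /* y[n] needs n + 1 products */
@*                s += (Word32) x[i] * h[n - i];
@*            y[n] = (Word16) ((0x8000 + (s << 1)) >> 16);  /* extract_h(s)  */
@*        }
@*    }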
@
@ r0 --- x[]
@ r1 --- h[]
@ r2 --- y[]
@ r3 --- L (ignored: the loop count below is hard-coded to 64)
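@
@ The routine is unrolled four ways: the four blocks below (LOOP .. LOOP4)
@ produce y[n] for n % 4 == 0, 1, 2 and 3 in turn.  Each block computes
@ (n + 1) % 4 products with scalar MUL/MLA, then accumulates the remaining
@ multiple-of-four products four at a time with VMLAL.S16.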

        .section  .text
        .global   Convolve_asm

Convolve_asm:

        STMFD          r13!, {r4 - r12, r14}             @ save r4-r12 and LR
        MOV            r3,  #0                           @ n = 0
        MOV            r11, #0x8000                      @ rounding constant for extract_h

LOOP:
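        @ n % 4 == 0: one product in scalar code, the rest in LOOP1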
        @MOV            r8, #0                            @ s = 0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4]                         @ *tmpH--
        SUB            r5, r5, #1                        @ one product handled here
        VMOV.S32       Q10, #0                           @ clear the NEON accumulator
        MUL            r8,  r9, r10                      @ s = x[0] * h[n]

LOOP1:
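        @ four products per pass: D0 holds x[i..i+3], D1 the matching h[]
        @ halfwords, lane-reversed so that VMLAL pairs x[i] with h[n-i]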
        CMP            r5, #0
        BLE            L1
        SUB            r4, r4, #8                        @ step tmpH back four halfwords
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ load x[i..i+3]
        VLD1.S16       D1, [r9]!                         @ load h[n-i-3..n-i]
        VREV64.16      D1, D1                            @ reverse lanes: h[n-i]..h[n-i-3]
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1                       @ s += x[] * h[], four lanes
        B              LOOP1
L1:
        VADD.S32       D20, D20, D21                     @ reduce four partial sums ...
        VPADD.S32      D20, D20, D20                     @ ... down to one word
        VMOV.S32       r5, D20[0]
        ADD            r5, r5, r8                        @ add the scalar product
        ADD            r5, r11, r5, LSL #1               @ 0x8000 + (s << 1)
        MOV            r5, r5, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @ y[n]


        @ n % 4 == 1: two products in scalar code, the rest in LOOP2
        @MOV            r8, #0                            @ s = 0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4], #-2                    @ *tmpH--
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4]

        MUL            r8, r9, r10                       @ s  = x[0] * h[n]
        SUB            r5, r5, #2                        @ two products handled here
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]

        VMOV.S32       Q10, #0                           @ clear the NEON accumulator
LOOP2:
        @ same scheme as LOOP1
        CMP            r5, #0
        BLE            L2
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        SUBS           r5, r5, #4
        VREV64.16      D1, D1
        VMLAL.S16      Q10, D0, D1
        B              LOOP2
L2:
        VADD.S32       D20, D20, D21                     @ reduce four partial sums
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ add the scalar products
        ADD            r8, r11, r8, LSL #1               @ 0x8000 + (s << 1)
        MOV            r8, r8, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @ y[n]


        @ n % 4 == 2: three products in scalar code, the rest in LOOP3
        @MOV            r8, #0                            @ s = 0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4], #-2
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4], #-2
        MUL            r8, r9, r10                       @ s  = x[0] * h[n]
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4]
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]
        SUB            r5, r5, #3                        @ three products handled here
        MLA            r8, r9, r10, r8                   @ s += x[2] * h[n-2]

        VMOV.S32       Q10, #0                           @ clear the NEON accumulator
LOOP3:
        @ same scheme as LOOP1
        CMP            r5, #0
        BLE            L3
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        VREV64.16      D1, D1
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1
        B              LOOP3

L3:
        VADD.S32       D20, D20, D21                     @ reduce four partial sums
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ add the scalar products
        ADD            r8, r11, r8, LSL #1               @ 0x8000 + (s << 1)
        MOV            r8, r8, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @ y[n]

        @ n % 4 == 3: no scalar products, all n + 1 of them handled in LOOP4
        ADD            r5, r3, #1                        @ i = n + 1
        ADD            r4, r1, r5, LSL #1                @ tmpH = &h[n+1]
        MOV            r6, r0                            @ tmpX = x
        VMOV.S32       Q10, #0                           @ clear the NEON accumulator
LOOP4:
        @ same scheme as LOOP1
        CMP            r5, #0
        BLE            L4
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        VREV64.16      D1, D1
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1
        B              LOOP4
L4:
        VADD.S32       D20, D20, D21                     @ reduce four partial sums
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5,  D20[0]                       @ s (no scalar part this time)
        ADD            r5, r11, r5, LSL #1               @ 0x8000 + (s << 1)
        MOV            r5, r5, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @ y[n]

        CMP            r3, #64                           @ loop until n == 64 (L hard-coded)
        BLT            LOOP

Convolve_asm_end:

        LDMFD          r13!, {r4 - r12, r15}             @ restore registers and return

        @ENDFUNC
        .end