Commit da63d299 authored by Johann, committed by Gerrit Code Review

Merge "VP8 encoder for ARMv8 by using NEON intrinsics 6"

Showing with 272 additions and 226 deletions
@@ -446,14 +446,12 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
 specialize qw/vp8_short_walsh4x4 sse2 media neon/;
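With neon_asm dropped in favor of neon in the specialize lines, the generated run-time dispatch binds vp8_short_fdct4x4 and vp8_short_fdct8x4 directly to their _neon implementations, so the explicit $..._neon_asm= alias lines go away. A minimal sketch of the dispatch pattern this drives (illustrative only; the real glue is generated from rtcd_defs.pl and uses its own symbol names):

/* Illustrative RTCD-style dispatch; these names are assumptions, not the
 * actual generated rtcd output. */
void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);

/* function pointer filled in once, from runtime CPU feature detection */
static void (*vp8_short_fdct4x4)(short *, short *, int) = vp8_short_fdct4x4_c;

void setup_rtcd_sketch(int cpu_has_neon) {
    if (cpu_has_neon)
        vp8_short_fdct4x4 = vp8_short_fdct4x4_neon;  /* prefer the NEON version */
}

The rest of the change deletes the hand-written assembly file below and adds the C intrinsics port that follows it.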
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
    EXPORT  |vp8_short_fdct4x4_neon|
    EXPORT  |vp8_short_fdct8x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=4

    ALIGN 16    ; enable use of @128 bit aligned loads
coeff
    DCW      5352,  5352,  5352,  5352
    DCW      2217,  2217,  2217,  2217
    DCD     14500, 14500, 14500, 14500
    DCD      7500,  7500,  7500,  7500
    DCD     12000, 12000, 12000, 12000
    DCD     51000, 51000, 51000, 51000
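; Note on the constants (an interpretation, not from the original source):
; 5352 ~= 4096*sqrt(2)*sin(3*pi/8) and 2217 ~= 4096*sqrt(2)*cos(3*pi/8) in
; Q12; 14500/7500 are the rounding biases for the >>12 row pass and
; 12000/51000 the biases for the >>16 column pass, matching the scalar
; formulas quoted in the comments below.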
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_neon| PROC

    ; Part one
    vld1.16         {d0}, [r0@64], r2
    adr             r12, coeff
    vld1.16         {d1}, [r0@64], r2
    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
    vld1.16         {d2}, [r0@64], r2
    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500, q10=7500
    vld1.16         {d3}, [r0@64], r2

    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
    vtrn.32         d0, d2
    vtrn.32         d1, d3
    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
    vtrn.16         d0, d1
    vtrn.16         d2, d3

    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]

    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
    vshl.s16        q3, q3, #3      ; (c1, d1) << 3

    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1

    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500

    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12

    ; Part two
    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    vtrn.32         d0, d2
    vtrn.32         d1, d3
    vtrn.16         d0, d1
    vtrn.16         d2, d3

    vmov.s16        d26, #7

    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
    vadd.s16        d4, d4, d26     ; a1 + 7
    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]

    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7

    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000

    vceq.s16        d4, d7, #0

    vshr.s16        d0, d0, #4
    vshr.s16        d2, d2, #4

    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000

    vmvn            d4, d4

    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16

    vst1.16         {q0, q1}, [r1@128]

    bx              lr

    ENDP
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|vp8_short_fdct8x4_neon| PROC

    ; Part one
    vld1.16         {q0}, [r0@128], r2
    adr             r12, coeff
    vld1.16         {q1}, [r0@128], r2
    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
    vld1.16         {q2}, [r0@128], r2
    vld1.32         {q9, q10}, [r12@128]!   ; q9=14500, q10=7500
    vld1.16         {q3}, [r0@128], r2

    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
    vtrn.32         q0, q2          ; [A0|B0]
    vtrn.32         q1, q3          ; [A1|B1]
    vtrn.16         q0, q1          ; [A2|B2]
    vtrn.16         q2, q3          ; [A3|B3]

    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]

    vshl.s16        q11, q11, #3    ; a1 << 3
    vshl.s16        q12, q12, #3    ; b1 << 3
    vshl.s16        q13, q13, #3    ; c1 << 3
    vshl.s16        q14, q14, #3    ; d1 << 3

    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1

    vmov.s16        q11, q9         ; 14500
    vmov.s16        q12, q10        ; 7500

    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500

    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500

    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12

    ; Part two
    vld1.32         {q9,q10}, [r12@128]     ; q9=12000, q10=51000

    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
    vtrn.32         q0, q2          ; q0=[A0 | B0]
    vtrn.32         q1, q3          ; q1=[A4 | B4]
    vtrn.16         q0, q1          ; q2=[A8 | B8]
    vtrn.16         q2, q3          ; q3=[A12|B12]

    vmov.s16        q15, #7

    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
    vadd.s16        q11, q11, q15   ; a1 + 7
    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]

    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
    vsub.s16        q1, q11, q12    ; a1 - b1 + 7

    vmov.s16        q11, q9         ; 12000
    vmov.s16        q12, q10        ; 51000

    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
    vshr.s16        d2, d2, #4      ; A[8] = (a1 - b1 + 7)>>4
    vshr.s16        d6, d3, #4      ; B[8] = (a1 - b1 + 7)>>4

    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000

    vceq.s16        q14, q14, #0

    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000

    vmvn            q14, q14

    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)

    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)

    vst1.16         {q0, q1}, [r1@128]!     ; block A
    vst1.16         {q2, q3}, [r1@128]!     ; block B

    bx              lr

    ENDP

    END
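Both routines implement the same scalar transform that the comment lines quote. For reference while reading either version, here is a scalar sketch assembled from those formula comments (the loop bookkeeping is an assumption; the formulas are exactly the ones quoted above):

/* Scalar sketch of the 4x4 forward DCT, reconstructed from the formula
 * comments in the assembly above. Loop bookkeeping is assumed. */
void short_fdct4x4_ref(short *input, short *output, int pitch) {
    int i, a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++) {  /* row pass, Q12 rounding */
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

        ip += pitch / 2;  /* pitch is in bytes */
        op += 4;
    }

    ip = output;
    op = output;
    for (i = 0; i < 4; i++) {  /* column pass, >>16 rounding */
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}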
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

void vp8_short_fdct4x4_neon(
        int16_t *input,
        int16_t *output,
        int pitch) {
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
    uint16x4_t d4u16;
    int16x8_t q0s16, q1s16;
    int32x4_t q9s32, q10s32, q11s32, q12s32;
    int16x4x2_t v2tmp0, v2tmp1;
    int32x2x2_t v2tmp2, v2tmp3;

    d16s16 = vdup_n_s16(5352);
    d17s16 = vdup_n_s16(2217);
    q9s32  = vdupq_n_s32(14500);
    q10s32 = vdupq_n_s32(7500);
    q11s32 = vdupq_n_s32(12000);
    q12s32 = vdupq_n_s32(51000);

    // Part one
    pitch >>= 1;  // pitch is given in bytes; convert to int16_t elements
    d0s16 = vld1_s16(input);
    input += pitch;
    d1s16 = vld1_s16(input);
    input += pitch;
    d2s16 = vld1_s16(input);
    input += pitch;
    d3s16 = vld1_s16(input);

    // 4x4 transpose: the results hold ip[0], ip[1], ip[2], ip[3]
    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
                      vreinterpret_s32_s16(d2s16));
    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
                      vreinterpret_s32_s16(d3s16));
    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3

    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);  // a1 = ip[0] + ip[3]
    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);  // b1 = ip[1] + ip[2]
    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);  // c1 = ip[1] - ip[2]
    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);  // d1 = ip[0] - ip[3]

    d4s16 = vshl_n_s16(d4s16, 3);
    d5s16 = vshl_n_s16(d5s16, 3);
    d6s16 = vshl_n_s16(d6s16, 3);
    d7s16 = vshl_n_s16(d7s16, 3);

    d0s16 = vadd_s16(d4s16, d5s16);  // op[0] = a1 + b1
    d2s16 = vsub_s16(d4s16, d5s16);  // op[2] = a1 - b1

    q9s32  = vmlal_s16(q9s32,  d7s16, d16s16);
    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
    q9s32  = vmlal_s16(q9s32,  d6s16, d17s16);
    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);

    d1s16 = vshrn_n_s32(q9s32, 12);   // op[1] = (c1*2217 + d1*5352 + 14500)>>12
    d3s16 = vshrn_n_s32(q10s32, 12);  // op[3] = (d1*2217 - c1*5352 +  7500)>>12

    // Part two: transpose again to process the columns
    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
                      vreinterpret_s32_s16(d2s16));
    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
                      vreinterpret_s32_s16(d3s16));
    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3

    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);  // a1 = ip[0] + ip[12]
    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);  // b1 = ip[4] + ip[8]
    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);  // c1 = ip[4] - ip[8]
    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);  // d1 = ip[0] - ip[12]

    d26s16 = vdup_n_s16(7);
    d4s16 = vadd_s16(d4s16, d26s16);  // a1 + 7

    d0s16 = vadd_s16(d4s16, d5s16);  // op[0] = a1 + b1 + 7
    d2s16 = vsub_s16(d4s16, d5s16);  // op[8] = a1 - b1 + 7

    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);

    // branchless op[4] += (d1 != 0): all-ones lanes where d1 == 0,
    // inverted, then subtracted (subtracting -1 adds 1)
    dEmptys16 = vdup_n_s16(0);
    d4u16 = vceq_s16(d7s16, dEmptys16);

    d0s16 = vshr_n_s16(d0s16, 4);
    d2s16 = vshr_n_s16(d2s16, 4);

    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);

    d4u16 = vmvn_u16(d4u16);
    d1s16 = vshrn_n_s32(q11s32, 16);  // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
    d3s16 = vshrn_n_s32(q12s32, 16);  // op[12]= (d1*2217 - c1*5352 + 51000)>>16

    q0s16 = vcombine_s16(d0s16, d1s16);
    q1s16 = vcombine_s16(d2s16, d3s16);

    vst1q_s16(output, q0s16);
    vst1q_s16(output + 8, q1s16);
    return;
}
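The vceq_s16/vmvn_u16/vsub_s16 triple above is the branchless form of the scalar op[4] += (d1 != 0): vceq yields all-ones lanes where d1 == 0, vmvn flips that to all-ones (i.e. -1) exactly where d1 != 0, and subtracting -1 adds 1. A standalone check of the idiom (test values are arbitrary):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
    const int16_t in[4] = { 0, 3, -2, 0 };        /* sample d1 lanes */
    int16x4_t d1 = vld1_s16(in);
    uint16x4_t eq = vceq_s16(d1, vdup_n_s16(0));  /* 0xFFFF where d1 == 0 */
    eq = vmvn_u16(eq);                            /* 0xFFFF (-1) where d1 != 0 */
    int16x4_t inc = vsub_s16(vdup_n_s16(0), vreinterpret_s16_u16(eq));
    int16_t out[4];
    vst1_s16(out, inc);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 0 1 1 0 */
    return 0;
}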
void vp8_short_fdct8x4_neon(
        int16_t *input,
        int16_t *output,
        int pitch) {
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
    uint16x4_t d28u16, d29u16;
    uint16x8_t q14u16;
    int16x8_t q0s16, q1s16, q2s16, q3s16;
    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
    int32x4_t q9s32, q10s32, q11s32, q12s32;
    int16x8x2_t v2tmp0, v2tmp1;
    int32x4x2_t v2tmp2, v2tmp3;

    d16s16 = vdup_n_s16(5352);
    d17s16 = vdup_n_s16(2217);
    q9s32  = vdupq_n_s32(14500);
    q10s32 = vdupq_n_s32(7500);

    // Part one: two 4x4 blocks, A in the low halves and B in the high halves
    pitch >>= 1;  // pitch is given in bytes; convert to int16_t elements
    q0s16 = vld1q_s16(input);
    input += pitch;
    q1s16 = vld1q_s16(input);
    input += pitch;
    q2s16 = vld1q_s16(input);
    input += pitch;
    q3s16 = vld1q_s16(input);

    // transpose both 4x4 blocks: the results hold ip[0], ip[1], ip[2], ip[3]
    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
                       vreinterpretq_s32_s16(q2s16));
    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
                       vreinterpretq_s32_s16(q3s16));
    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3

    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);  // a1 = ip[0] + ip[3]
    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);  // b1 = ip[1] + ip[2]
    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);  // c1 = ip[1] - ip[2]
    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);  // d1 = ip[0] - ip[3]

    q11s16 = vshlq_n_s16(q11s16, 3);
    q12s16 = vshlq_n_s16(q12s16, 3);
    q13s16 = vshlq_n_s16(q13s16, 3);
    q14s16 = vshlq_n_s16(q14s16, 3);

    q0s16 = vaddq_s16(q11s16, q12s16);  // [A0 | B0] = a1 + b1
    q2s16 = vsubq_s16(q11s16, q12s16);  // [A2 | B2] = a1 - b1

    q11s32 = q9s32;   // second accumulator pair, for block B: 14500
    q12s32 = q10s32;  // 7500

    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    q9s32  = vmlal_s16(q9s32,  d28s16, d16s16);  // A[1] = d1*5352 + 14500
    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);  // A[3] = d1*2217 + 7500
    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);  // B[1] = d1*5352 + 14500
    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);  // B[3] = d1*2217 + 7500

    q9s32  = vmlal_s16(q9s32,  d26s16, d17s16);  // A[1] += c1*2217
    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);  // A[3] -= c1*5352
    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);  // B[1] += c1*2217
    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);  // B[3] -= c1*5352

    d2s16 = vshrn_n_s32(q9s32, 12);   // A[1]
    d6s16 = vshrn_n_s32(q10s32, 12);  // A[3]
    d3s16 = vshrn_n_s32(q11s32, 12);  // B[1]
    d7s16 = vshrn_n_s32(q12s32, 12);  // B[3]
    q1s16 = vcombine_s16(d2s16, d3s16);
    q3s16 = vcombine_s16(d6s16, d7s16);

    // Part two
    q9s32  = vdupq_n_s32(12000);
    q10s32 = vdupq_n_s32(51000);

    // transpose again: the results hold ip[0], ip[4], ip[8], ip[12]
    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
                       vreinterpretq_s32_s16(q2s16));
    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
                       vreinterpretq_s32_s16(q3s16));
    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3

    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);  // a1 = ip[0] + ip[12]
    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);  // b1 = ip[4] + ip[8]
    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);  // c1 = ip[4] - ip[8]
    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);  // d1 = ip[0] - ip[12]

    q15s16 = vdupq_n_s16(7);
    q11s16 = vaddq_s16(q11s16, q15s16);  // a1 + 7

    q0s16 = vaddq_s16(q11s16, q12s16);  // a1 + b1 + 7
    q1s16 = vsubq_s16(q11s16, q12s16);  // a1 - b1 + 7

    q11s32 = q9s32;   // 12000
    q12s32 = q10s32;  // 51000

    d0s16 = vget_low_s16(q0s16);
    d1s16 = vget_high_s16(q0s16);
    d2s16 = vget_low_s16(q1s16);
    d3s16 = vget_high_s16(q1s16);

    d0s16 = vshr_n_s16(d0s16, 4);  // A[0] = (a1 + b1 + 7)>>4
    d4s16 = vshr_n_s16(d1s16, 4);  // B[0] = (a1 + b1 + 7)>>4
    d2s16 = vshr_n_s16(d2s16, 4);  // A[8] = (a1 - b1 + 7)>>4
    d6s16 = vshr_n_s16(d3s16, 4);  // B[8] = (a1 - b1 + 7)>>4

    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    q9s32  = vmlal_s16(q9s32,  d28s16, d16s16);  // A[4]  = d1*5352 + 12000
    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);  // A[12] = d1*2217 + 51000
    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);  // B[4]  = d1*5352 + 12000
    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);  // B[12] = d1*2217 + 51000

    q9s32  = vmlal_s16(q9s32,  d26s16, d17s16);  // A[4]  += c1*2217
    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);  // A[12] -= c1*5352
    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);  // B[4]  += c1*2217
    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);  // B[12] -= c1*5352

    d1s16 = vshrn_n_s32(q9s32, 16);   // A[4]
    d3s16 = vshrn_n_s32(q10s32, 16);  // A[12]
    d5s16 = vshrn_n_s32(q11s32, 16);  // B[4]
    d7s16 = vshrn_n_s32(q12s32, 16);  // B[12]

    // branchless A[4]/B[4] += (d1 != 0), as in the 4x4 version
    qEmptys16 = vdupq_n_s16(0);
    q14u16 = vceqq_s16(q14s16, qEmptys16);
    q14u16 = vmvnq_u16(q14u16);

    d28u16 = vget_low_u16(q14u16);
    d29u16 = vget_high_u16(q14u16);
    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));

    q0s16 = vcombine_s16(d0s16, d1s16);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q3s16 = vcombine_s16(d6s16, d7s16);

    vst1q_s16(output, q0s16);       // block A, coeffs 0-7
    vst1q_s16(output + 8, q1s16);   // block A, coeffs 8-15
    vst1q_s16(output + 16, q2s16);  // block B, coeffs 0-7
    vst1q_s16(output + 24, q3s16);  // block B, coeffs 8-15
    return;
}
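Both entry points take pitch in bytes, which is why each function starts with pitch >>= 1 before stepping int16_t rows. A hedged usage sketch (the contiguous 4x4 block and the printf are for illustration only; the encoder passes its own src_diff stride):

#include <arm_neon.h>
#include <stdio.h>

void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch);

int main(void) {
    int16_t src[4 * 4];  /* 4x4 residual block, rows contiguous */
    int16_t coeff[16];
    int i;
    for (i = 0; i < 16; i++) src[i] = (int16_t)(i - 8);       /* arbitrary data */
    vp8_short_fdct4x4_neon(src, coeff, 4 * sizeof(int16_t));  /* pitch in bytes */
    for (i = 0; i < 16; i++)
        printf("%d%c", coeff[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}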
@@ -37,10 +37,10 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
 # encoder
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c