Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
BC
public
external
libvpx
Commits
da63d299
Commit
da63d299
authored
10 years ago
by
Johann
Committed by
Gerrit Code Review
10 years ago
Browse files
Options
Download
Plain Diff
Merge "VP8 encoder for ARMv8 by using NEON intrinsics 6"
parents
23c88870
eed005b0
v1.14.0-linphone
1.4.X
feature/update_to_v1.9.0-linphone
feature/uwp_nuget
highbitdepth
indianrunnerduck
javanwhistlingduck
khakicampbell
linphone
linphone-android
linphone-old
longtailedduck
m49-2623
m52-2743
m54-2840
m56-2924
m66-3359
m68-3440
mandarinduck
nextgen
nextgenv2
sandbox/Jingning/experimental
sandbox/Jingning/vpx
sandbox/aconverse@google.com/ansbench
sandbox/hkuang@google.com/decode
sandbox/jimbankoski@google.com/proposed-aom
sandbox/jingning@google.com/decoder_test_suite
sandbox/jingning@google.com/experimental
sandbox/jzern@google.com/test
sandbox/wangch@google.com/vp9
sandbox/yaowu@google.com/mergeaom
v1.12.0-linphone
v1.6.1_linphone
v1.7.0-linphone
v1.9.0-linphone
v1.9.0
v1.9.0-rc1
v1.8.2
v1.8.1
v1.8.0
v1.7.0
v1.6.1
v1.6.0
v1.5.0
v1.4.0
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
vp8/common/rtcd_defs.pl
+2
-4
vp8/common/rtcd_defs.pl
vp8/encoder/arm/neon/shortfdct_neon.asm
+0
-221
vp8/encoder/arm/neon/shortfdct_neon.asm
vp8/encoder/arm/neon/shortfdct_neon.c
+269
-0
vp8/encoder/arm/neon/shortfdct_neon.c
vp8/vp8cx_arm.mk
+1
-1
vp8/vp8cx_arm.mk
with
272 additions
and
226 deletions
vp8/common/rtcd_defs.pl
+
2
−
4
View file @
da63d299
...
...
@@ -446,14 +446,12 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
# Forward DCT
#
add_proto
qw/void vp8_short_fdct4x4/
,
"
short *input, short *output, int pitch
";
specialize
qw/vp8_short_fdct4x4 mmx sse2 media neon
_asm
/
;
specialize
qw/vp8_short_fdct4x4 mmx sse2 media neon/
;
$vp8_short_fdct4x4_media
=
vp8_short_fdct4x4_armv6
;
$vp8_short_fdct4x4_neon_asm
=
vp8_short_fdct4x4_neon
;
add_proto
qw/void vp8_short_fdct8x4/
,
"
short *input, short *output, int pitch
";
specialize
qw/vp8_short_fdct8x4 mmx sse2 media neon
_asm
/
;
specialize
qw/vp8_short_fdct8x4 mmx sse2 media neon/
;
$vp8_short_fdct8x4_media
=
vp8_short_fdct8x4_armv6
;
$vp8_short_fdct8x4_neon_asm
=
vp8_short_fdct8x4_neon
;
add_proto
qw/void vp8_short_walsh4x4/
,
"
short *input, short *output, int pitch
";
specialize
qw/vp8_short_walsh4x4 sse2 media neon/
;
...
...
This diff is collapsed.
Click to expand it.
vp8/encoder/arm/neon/shortfdct_neon.asm
deleted
100644 → 0
+
0
−
221
View file @
23c88870
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT
|
vp8_short_fdct4x4_neon
|
EXPORT
|
vp8_short_fdct8x4_neon
|
ARM
REQUIRE8
PRESERVE8
AREA
||
.text
||
,
CODE
,
READONLY
,
AL
IGN
=
4
ALIGN
16
; enable use of @128 bit aligned loads
coeff
DCW
5352
,
5352
,
5352
,
5352
DCW
2217
,
2217
,
2217
,
2217
DCD
14500
,
14500
,
14500
,
14500
DCD
7500
,
7500
,
7500
,
7500
DCD
12000
,
12000
,
12000
,
12000
DCD
51000
,
51000
,
51000
,
51000
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|
vp8_short_fdct4x4_neon
|
PROC
; Part one
vld1.16
{
d0
}
,
[
r0@64
],
r2
adr
r12
,
coeff
vld1.16
{
d1
}
,
[
r0@64
],
r2
vld1.16
{
q8
}
,
[
r12@128
]
!
; d16=5352, d17=2217
vld1.16
{
d2
}
,
[
r0@64
],
r2
vld1.32
{
q9
,
q10
}
,
[
r12@128
]
!
; q9=14500, q10=7500
vld1.16
{
d3
}
,
[
r0@64
],
r2
; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32
d0
,
d2
vtrn.32
d1
,
d3
vld1.32
{
q11
,
q12
}
,
[
r12@128
]
; q11=12000, q12=51000
vtrn.16
d0
,
d1
vtrn.16
d2
,
d3
vadd.s16
d4
,
d0
,
d3
; a1 = ip[0] + ip[3]
vadd.s16
d5
,
d1
,
d2
; b1 = ip[1] + ip[2]
vsub.s16
d6
,
d1
,
d2
; c1 = ip[1] - ip[2]
vsub.s16
d7
,
d0
,
d3
; d1 = ip[0] - ip[3]
vshl.s16
q2
,
q2
,
#
3
; (a1, b1) << 3
vshl.s16
q3
,
q3
,
#
3
; (c1, d1) << 3
vadd.s16
d0
,
d4
,
d5
; op[0] = a1 + b1
vsub.s16
d2
,
d4
,
d5
; op[2] = a1 - b1
vmlal.s16
q9
,
d7
,
d16
; d1*5352 + 14500
vmlal.s16
q10
,
d7
,
d17
; d1*2217 + 7500
vmlal.s16
q9
,
d6
,
d17
; c1*2217 + d1*5352 + 14500
vmlsl.s16
q10
,
d6
,
d16
; d1*2217 - c1*5352 + 7500
vshrn.s32
d1
,
q9
,
#
12
; op[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32
d3
,
q10
,
#
12
; op[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32
d0
,
d2
vtrn.32
d1
,
d3
vtrn.16
d0
,
d1
vtrn.16
d2
,
d3
vmov.s16
d26
,
#
7
vadd.s16
d4
,
d0
,
d3
; a1 = ip[0] + ip[12]
vadd.s16
d5
,
d1
,
d2
; b1 = ip[4] + ip[8]
vsub.s16
d6
,
d1
,
d2
; c1 = ip[4] - ip[8]
vadd.s16
d4
,
d4
,
d26
; a1 + 7
vsub.s16
d7
,
d0
,
d3
; d1 = ip[0] - ip[12]
vadd.s16
d0
,
d4
,
d5
; op[0] = a1 + b1 + 7
vsub.s16
d2
,
d4
,
d5
; op[8] = a1 - b1 + 7
vmlal.s16
q11
,
d7
,
d16
; d1*5352 + 12000
vmlal.s16
q12
,
d7
,
d17
; d1*2217 + 51000
vceq.s16
d4
,
d7
,
#
0
vshr.s16
d0
,
d0
,
#
4
vshr.s16
d2
,
d2
,
#
4
vmlal.s16
q11
,
d6
,
d17
; c1*2217 + d1*5352 + 12000
vmlsl.s16
q12
,
d6
,
d16
; d1*2217 - c1*5352 + 51000
vmvn
d4
,
d4
vshrn.s32
d1
,
q11
,
#
16
; op[4] = (c1*2217 + d1*5352 + 12000)>>16
vsub.s16
d1
,
d1
,
d4
; op[4] += (d1!=0)
vshrn.s32
d3
,
q12
,
#
16
; op[12]= (d1*2217 - c1*5352 + 51000)>>16
vst1.16
{
q0
,
q1
}
,
[
r1@128
]
bx
lr
ENDP
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|
vp8_short_fdct8x4_neon
|
PROC
; Part one
vld1.16
{
q0
}
,
[
r0@128
],
r2
adr
r12
,
coeff
vld1.16
{
q1
}
,
[
r0@128
],
r2
vld1.16
{
q8
}
,
[
r12@128
]
!
; d16=5352, d17=2217
vld1.16
{
q2
}
,
[
r0@128
],
r2
vld1.32
{
q9
,
q10
}
,
[
r12@128
]
!
; q9=14500, q10=7500
vld1.16
{
q3
}
,
[
r0@128
],
r2
; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
vtrn.32
q0
,
q2
; [A0|B0]
vtrn.32
q1
,
q3
; [A1|B1]
vtrn.16
q0
,
q1
; [A2|B2]
vtrn.16
q2
,
q3
; [A3|B3]
vadd.s16
q11
,
q0
,
q3
; a1 = ip[0] + ip[3]
vadd.s16
q12
,
q1
,
q2
; b1 = ip[1] + ip[2]
vsub.s16
q13
,
q1
,
q2
; c1 = ip[1] - ip[2]
vsub.s16
q14
,
q0
,
q3
; d1 = ip[0] - ip[3]
vshl.s16
q11
,
q11
,
#
3
; a1 << 3
vshl.s16
q12
,
q12
,
#
3
; b1 << 3
vshl.s16
q13
,
q13
,
#
3
; c1 << 3
vshl.s16
q14
,
q14
,
#
3
; d1 << 3
vadd.s16
q0
,
q11
,
q12
; [A0 | B0] = a1 + b1
vsub.s16
q2
,
q11
,
q12
; [A2 | B2] = a1 - b1
vmov.s16
q11
,
q9
; 14500
vmov.s16
q12
,
q10
; 7500
vmlal.s16
q9
,
d28
,
d16
; A[1] = d1*5352 + 14500
vmlal.s16
q10
,
d28
,
d17
; A[3] = d1*2217 + 7500
vmlal.s16
q11
,
d29
,
d16
; B[1] = d1*5352 + 14500
vmlal.s16
q12
,
d29
,
d17
; B[3] = d1*2217 + 7500
vmlal.s16
q9
,
d26
,
d17
; A[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16
q10
,
d26
,
d16
; A[3] = d1*2217 - c1*5352 + 7500
vmlal.s16
q11
,
d27
,
d17
; B[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16
q12
,
d27
,
d16
; B[3] = d1*2217 - c1*5352 + 7500
vshrn.s32
d2
,
q9
,
#
12
; A[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32
d6
,
q10
,
#
12
; A[3] = (d1*2217 - c1*5352 + 7500)>>12
vshrn.s32
d3
,
q11
,
#
12
; B[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32
d7
,
q12
,
#
12
; B[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
vld1.32
{
q9
,
q10
}
,
[
r12@128
]
; q9=12000, q10=51000
; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
vtrn.32
q0
,
q2
; q0=[A0 | B0]
vtrn.32
q1
,
q3
; q1=[A4 | B4]
vtrn.16
q0
,
q1
; q2=[A8 | B8]
vtrn.16
q2
,
q3
; q3=[A12|B12]
vmov.s16
q15
,
#
7
vadd.s16
q11
,
q0
,
q3
; a1 = ip[0] + ip[12]
vadd.s16
q12
,
q1
,
q2
; b1 = ip[4] + ip[8]
vadd.s16
q11
,
q11
,
q15
; a1 + 7
vsub.s16
q13
,
q1
,
q2
; c1 = ip[4] - ip[8]
vsub.s16
q14
,
q0
,
q3
; d1 = ip[0] - ip[12]
vadd.s16
q0
,
q11
,
q12
; a1 + b1 + 7
vsub.s16
q1
,
q11
,
q12
; a1 - b1 + 7
vmov.s16
q11
,
q9
; 12000
vmov.s16
q12
,
q10
; 51000
vshr.s16
d0
,
d0
,
#
4
; A[0] = (a1 + b1 + 7)>>4
vshr.s16
d4
,
d1
,
#
4
; B[0] = (a1 + b1 + 7)>>4
vshr.s16
d2
,
d2
,
#
4
; A[8] = (a1 + b1 + 7)>>4
vshr.s16
d6
,
d3
,
#
4
; B[8] = (a1 + b1 + 7)>>4
vmlal.s16
q9
,
d28
,
d16
; A[4] = d1*5352 + 12000
vmlal.s16
q10
,
d28
,
d17
; A[12] = d1*2217 + 51000
vmlal.s16
q11
,
d29
,
d16
; B[4] = d1*5352 + 12000
vmlal.s16
q12
,
d29
,
d17
; B[12] = d1*2217 + 51000
vceq.s16
q14
,
q14
,
#
0
vmlal.s16
q9
,
d26
,
d17
; A[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16
q10
,
d26
,
d16
; A[12] = d1*2217 - c1*5352 + 51000
vmlal.s16
q11
,
d27
,
d17
; B[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16
q12
,
d27
,
d16
; B[12] = d1*2217 - c1*5352 + 51000
vmvn
q14
,
q14
vshrn.s32
d1
,
q9
,
#
16
; A[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32
d3
,
q10
,
#
16
; A[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16
d1
,
d1
,
d28
; A[4] += (d1!=0)
vshrn.s32
d5
,
q11
,
#
16
; B[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32
d7
,
q12
,
#
16
; B[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16
d5
,
d5
,
d29
; B[4] += (d1!=0)
vst1.16
{
q0
,
q1
}
,
[
r1@128
]
!
; block A
vst1.16
{
q2
,
q3
}
,
[
r1@128
]
!
; block B
bx
lr
ENDP
END
This diff is collapsed.
Click to expand it.
vp8/encoder/arm/neon/shortfdct_neon.c
0 → 100644
+
269
−
0
View file @
da63d299
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
<arm_neon.h>
void
vp8_short_fdct4x4_neon
(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
)
{
int16x4_t
d0s16
,
d1s16
,
d2s16
,
d3s16
,
d4s16
,
d5s16
,
d6s16
,
d7s16
;
int16x4_t
d16s16
,
d17s16
,
d26s16
,
dEmptys16
;
uint16x4_t
d4u16
;
int16x8_t
q0s16
,
q1s16
;
int32x4_t
q9s32
,
q10s32
,
q11s32
,
q12s32
;
int16x4x2_t
v2tmp0
,
v2tmp1
;
int32x2x2_t
v2tmp2
,
v2tmp3
;
d16s16
=
vdup_n_s16
(
5352
);
d17s16
=
vdup_n_s16
(
2217
);
q9s32
=
vdupq_n_s32
(
14500
);
q10s32
=
vdupq_n_s32
(
7500
);
q11s32
=
vdupq_n_s32
(
12000
);
q12s32
=
vdupq_n_s32
(
51000
);
// Part one
pitch
>>=
1
;
d0s16
=
vld1_s16
(
input
);
input
+=
pitch
;
d1s16
=
vld1_s16
(
input
);
input
+=
pitch
;
d2s16
=
vld1_s16
(
input
);
input
+=
pitch
;
d3s16
=
vld1_s16
(
input
);
v2tmp2
=
vtrn_s32
(
vreinterpret_s32_s16
(
d0s16
),
vreinterpret_s32_s16
(
d2s16
));
v2tmp3
=
vtrn_s32
(
vreinterpret_s32_s16
(
d1s16
),
vreinterpret_s32_s16
(
d3s16
));
v2tmp0
=
vtrn_s16
(
vreinterpret_s16_s32
(
v2tmp2
.
val
[
0
]),
// d0
vreinterpret_s16_s32
(
v2tmp3
.
val
[
0
]));
// d1
v2tmp1
=
vtrn_s16
(
vreinterpret_s16_s32
(
v2tmp2
.
val
[
1
]),
// d2
vreinterpret_s16_s32
(
v2tmp3
.
val
[
1
]));
// d3
d4s16
=
vadd_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
d5s16
=
vadd_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
d6s16
=
vsub_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
d7s16
=
vsub_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
d4s16
=
vshl_n_s16
(
d4s16
,
3
);
d5s16
=
vshl_n_s16
(
d5s16
,
3
);
d6s16
=
vshl_n_s16
(
d6s16
,
3
);
d7s16
=
vshl_n_s16
(
d7s16
,
3
);
d0s16
=
vadd_s16
(
d4s16
,
d5s16
);
d2s16
=
vsub_s16
(
d4s16
,
d5s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d7s16
,
d16s16
);
q10s32
=
vmlal_s16
(
q10s32
,
d7s16
,
d17s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d6s16
,
d17s16
);
q10s32
=
vmlsl_s16
(
q10s32
,
d6s16
,
d16s16
);
d1s16
=
vshrn_n_s32
(
q9s32
,
12
);
d3s16
=
vshrn_n_s32
(
q10s32
,
12
);
// Part two
v2tmp2
=
vtrn_s32
(
vreinterpret_s32_s16
(
d0s16
),
vreinterpret_s32_s16
(
d2s16
));
v2tmp3
=
vtrn_s32
(
vreinterpret_s32_s16
(
d1s16
),
vreinterpret_s32_s16
(
d3s16
));
v2tmp0
=
vtrn_s16
(
vreinterpret_s16_s32
(
v2tmp2
.
val
[
0
]),
// d0
vreinterpret_s16_s32
(
v2tmp3
.
val
[
0
]));
// d1
v2tmp1
=
vtrn_s16
(
vreinterpret_s16_s32
(
v2tmp2
.
val
[
1
]),
// d2
vreinterpret_s16_s32
(
v2tmp3
.
val
[
1
]));
// d3
d4s16
=
vadd_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
d5s16
=
vadd_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
d6s16
=
vsub_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
d7s16
=
vsub_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
d26s16
=
vdup_n_s16
(
7
);
d4s16
=
vadd_s16
(
d4s16
,
d26s16
);
d0s16
=
vadd_s16
(
d4s16
,
d5s16
);
d2s16
=
vsub_s16
(
d4s16
,
d5s16
);
q11s32
=
vmlal_s16
(
q11s32
,
d7s16
,
d16s16
);
q12s32
=
vmlal_s16
(
q12s32
,
d7s16
,
d17s16
);
dEmptys16
=
vdup_n_s16
(
0
);
d4u16
=
vceq_s16
(
d7s16
,
dEmptys16
);
d0s16
=
vshr_n_s16
(
d0s16
,
4
);
d2s16
=
vshr_n_s16
(
d2s16
,
4
);
q11s32
=
vmlal_s16
(
q11s32
,
d6s16
,
d17s16
);
q12s32
=
vmlsl_s16
(
q12s32
,
d6s16
,
d16s16
);
d4u16
=
vmvn_u16
(
d4u16
);
d1s16
=
vshrn_n_s32
(
q11s32
,
16
);
d1s16
=
vsub_s16
(
d1s16
,
vreinterpret_s16_u16
(
d4u16
));
d3s16
=
vshrn_n_s32
(
q12s32
,
16
);
q0s16
=
vcombine_s16
(
d0s16
,
d1s16
);
q1s16
=
vcombine_s16
(
d2s16
,
d3s16
);
vst1q_s16
(
output
,
q0s16
);
vst1q_s16
(
output
+
8
,
q1s16
);
return
;
}
void
vp8_short_fdct8x4_neon
(
int16_t
*
input
,
int16_t
*
output
,
int
pitch
)
{
int16x4_t
d0s16
,
d1s16
,
d2s16
,
d3s16
,
d4s16
,
d5s16
,
d6s16
,
d7s16
;
int16x4_t
d16s16
,
d17s16
,
d26s16
,
d27s16
,
d28s16
,
d29s16
;
uint16x4_t
d28u16
,
d29u16
;
uint16x8_t
q14u16
;
int16x8_t
q0s16
,
q1s16
,
q2s16
,
q3s16
;
int16x8_t
q11s16
,
q12s16
,
q13s16
,
q14s16
,
q15s16
,
qEmptys16
;
int32x4_t
q9s32
,
q10s32
,
q11s32
,
q12s32
;
int16x8x2_t
v2tmp0
,
v2tmp1
;
int32x4x2_t
v2tmp2
,
v2tmp3
;
d16s16
=
vdup_n_s16
(
5352
);
d17s16
=
vdup_n_s16
(
2217
);
q9s32
=
vdupq_n_s32
(
14500
);
q10s32
=
vdupq_n_s32
(
7500
);
// Part one
pitch
>>=
1
;
q0s16
=
vld1q_s16
(
input
);
input
+=
pitch
;
q1s16
=
vld1q_s16
(
input
);
input
+=
pitch
;
q2s16
=
vld1q_s16
(
input
);
input
+=
pitch
;
q3s16
=
vld1q_s16
(
input
);
v2tmp2
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
q0s16
),
vreinterpretq_s32_s16
(
q2s16
));
v2tmp3
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
q1s16
),
vreinterpretq_s32_s16
(
q3s16
));
v2tmp0
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
v2tmp2
.
val
[
0
]),
// q0
vreinterpretq_s16_s32
(
v2tmp3
.
val
[
0
]));
// q1
v2tmp1
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
v2tmp2
.
val
[
1
]),
// q2
vreinterpretq_s16_s32
(
v2tmp3
.
val
[
1
]));
// q3
q11s16
=
vaddq_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
q12s16
=
vaddq_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
q13s16
=
vsubq_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
q14s16
=
vsubq_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
q11s16
=
vshlq_n_s16
(
q11s16
,
3
);
q12s16
=
vshlq_n_s16
(
q12s16
,
3
);
q13s16
=
vshlq_n_s16
(
q13s16
,
3
);
q14s16
=
vshlq_n_s16
(
q14s16
,
3
);
q0s16
=
vaddq_s16
(
q11s16
,
q12s16
);
q2s16
=
vsubq_s16
(
q11s16
,
q12s16
);
q11s32
=
q9s32
;
q12s32
=
q10s32
;
d26s16
=
vget_low_s16
(
q13s16
);
d27s16
=
vget_high_s16
(
q13s16
);
d28s16
=
vget_low_s16
(
q14s16
);
d29s16
=
vget_high_s16
(
q14s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d28s16
,
d16s16
);
q10s32
=
vmlal_s16
(
q10s32
,
d28s16
,
d17s16
);
q11s32
=
vmlal_s16
(
q11s32
,
d29s16
,
d16s16
);
q12s32
=
vmlal_s16
(
q12s32
,
d29s16
,
d17s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d26s16
,
d17s16
);
q10s32
=
vmlsl_s16
(
q10s32
,
d26s16
,
d16s16
);
q11s32
=
vmlal_s16
(
q11s32
,
d27s16
,
d17s16
);
q12s32
=
vmlsl_s16
(
q12s32
,
d27s16
,
d16s16
);
d2s16
=
vshrn_n_s32
(
q9s32
,
12
);
d6s16
=
vshrn_n_s32
(
q10s32
,
12
);
d3s16
=
vshrn_n_s32
(
q11s32
,
12
);
d7s16
=
vshrn_n_s32
(
q12s32
,
12
);
q1s16
=
vcombine_s16
(
d2s16
,
d3s16
);
q3s16
=
vcombine_s16
(
d6s16
,
d7s16
);
// Part two
q9s32
=
vdupq_n_s32
(
12000
);
q10s32
=
vdupq_n_s32
(
51000
);
v2tmp2
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
q0s16
),
vreinterpretq_s32_s16
(
q2s16
));
v2tmp3
=
vtrnq_s32
(
vreinterpretq_s32_s16
(
q1s16
),
vreinterpretq_s32_s16
(
q3s16
));
v2tmp0
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
v2tmp2
.
val
[
0
]),
// q0
vreinterpretq_s16_s32
(
v2tmp3
.
val
[
0
]));
// q1
v2tmp1
=
vtrnq_s16
(
vreinterpretq_s16_s32
(
v2tmp2
.
val
[
1
]),
// q2
vreinterpretq_s16_s32
(
v2tmp3
.
val
[
1
]));
// q3
q11s16
=
vaddq_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
q12s16
=
vaddq_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
q13s16
=
vsubq_s16
(
v2tmp0
.
val
[
1
],
v2tmp1
.
val
[
0
]);
q14s16
=
vsubq_s16
(
v2tmp0
.
val
[
0
],
v2tmp1
.
val
[
1
]);
q15s16
=
vdupq_n_s16
(
7
);
q11s16
=
vaddq_s16
(
q11s16
,
q15s16
);
q0s16
=
vaddq_s16
(
q11s16
,
q12s16
);
q1s16
=
vsubq_s16
(
q11s16
,
q12s16
);
q11s32
=
q9s32
;
q12s32
=
q10s32
;
d0s16
=
vget_low_s16
(
q0s16
);
d1s16
=
vget_high_s16
(
q0s16
);
d2s16
=
vget_low_s16
(
q1s16
);
d3s16
=
vget_high_s16
(
q1s16
);
d0s16
=
vshr_n_s16
(
d0s16
,
4
);
d4s16
=
vshr_n_s16
(
d1s16
,
4
);
d2s16
=
vshr_n_s16
(
d2s16
,
4
);
d6s16
=
vshr_n_s16
(
d3s16
,
4
);
d26s16
=
vget_low_s16
(
q13s16
);
d27s16
=
vget_high_s16
(
q13s16
);
d28s16
=
vget_low_s16
(
q14s16
);
d29s16
=
vget_high_s16
(
q14s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d28s16
,
d16s16
);
q10s32
=
vmlal_s16
(
q10s32
,
d28s16
,
d17s16
);
q11s32
=
vmlal_s16
(
q11s32
,
d29s16
,
d16s16
);
q12s32
=
vmlal_s16
(
q12s32
,
d29s16
,
d17s16
);
q9s32
=
vmlal_s16
(
q9s32
,
d26s16
,
d17s16
);
q10s32
=
vmlsl_s16
(
q10s32
,
d26s16
,
d16s16
);
q11s32
=
vmlal_s16
(
q11s32
,
d27s16
,
d17s16
);
q12s32
=
vmlsl_s16
(
q12s32
,
d27s16
,
d16s16
);
d1s16
=
vshrn_n_s32
(
q9s32
,
16
);
d3s16
=
vshrn_n_s32
(
q10s32
,
16
);
d5s16
=
vshrn_n_s32
(
q11s32
,
16
);
d7s16
=
vshrn_n_s32
(
q12s32
,
16
);
qEmptys16
=
vdupq_n_s16
(
0
);
q14u16
=
vceqq_s16
(
q14s16
,
qEmptys16
);
q14u16
=
vmvnq_u16
(
q14u16
);
d28u16
=
vget_low_u16
(
q14u16
);
d29u16
=
vget_high_u16
(
q14u16
);
d1s16
=
vsub_s16
(
d1s16
,
vreinterpret_s16_u16
(
d28u16
));
d5s16
=
vsub_s16
(
d5s16
,
vreinterpret_s16_u16
(
d29u16
));
q0s16
=
vcombine_s16
(
d0s16
,
d1s16
);
q1s16
=
vcombine_s16
(
d2s16
,
d3s16
);
q2s16
=
vcombine_s16
(
d4s16
,
d5s16
);
q3s16
=
vcombine_s16
(
d6s16
,
d7s16
);
vst1q_s16
(
output
,
q0s16
);
vst1q_s16
(
output
+
8
,
q1s16
);
vst1q_s16
(
output
+
16
,
q2s16
);
vst1q_s16
(
output
+
24
,
q3s16
);
return
;
}
This diff is collapsed.
Click to expand it.
vp8/vp8cx_arm.mk
+
1
−
1
View file @
da63d299
...
...
@@ -37,10 +37,10 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
# encoder
VP8_CX_SRCS-$(HAVE_NEON_ASM)
+=
encoder/arm/neon/fastquantizeb_neon
$(
ASM
)
VP8_CX_SRCS-$(HAVE_NEON_ASM)
+=
encoder/arm/neon/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_NEON_ASM)
+=
encoder/arm/neon/shortfdct_neon
$(
ASM
)
VP8_CX_SRCS-$(HAVE_NEON_ASM)
+=
encoder/arm/neon/vp8_mse16x16_neon
$(
ASM
)
VP8_CX_SRCS-$(HAVE_NEON_ASM)
+=
encoder/arm/neon/vp8_memcpy_neon
$(
ASM
)
VP8_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/vp8_shortwalsh4x4_neon.c
VP8_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON)
+=
encoder/arm/neon/shortfdct_neon.c
This diff is collapsed.
Click to expand it.
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets