external / libvpx / Commits / 099bd7f0

Commit 099bd7f0
Authored Jul 22, 2016 by clang-format; committed by James Zern, Jul 25, 2016

vpx_dsp: apply clang-format

Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
parent 82070ae9
Showing 146 changed files with 21368 additions and 22562 deletions.
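The change is purely mechanical reformatting; no logic changes. As a rough illustration of the rewrites repeated throughout the hunks below (the function here is invented for illustration, and the project's exact .clang-format options are not shown on this page): arguments are packed up to the 80-column limit instead of one per line, bodies move to 2-space indentation, and binary operators gain surrounding spaces.

#include <stdint.h>

/* Before: the style common in the hand-converted SIMD files. */
void example_add_row(const uint8_t *src,
                     int src_stride,
                     uint8_t *dst) {
    dst[0] = src[0+src_stride];
}

/* After clang-format: */
void example_add_row(const uint8_t *src, int src_stride, uint8_t *dst) {
  dst[0] = src[0 + src_stride];
}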
vpx_dsp/add_noise.c  +1  -1
vpx_dsp/arm/avg_neon.c  +17  -20
vpx_dsp/arm/fwd_txfm_neon.c  +8  -8
vpx_dsp/arm/hadamard_neon.c  +12  -14
vpx_dsp/arm/idct16x16_1_add_neon.c  +41  -44
vpx_dsp/arm/idct16x16_add_neon.c  +1173  -1196
vpx_dsp/arm/idct16x16_neon.c  +35  -69
vpx_dsp/arm/idct32x32_1_add_neon.c  +111  -136
vpx_dsp/arm/idct32x32_add_neon.c  +648  -683
vpx_dsp/arm/idct4x4_1_add_neon.c  +26  -30
vpx_dsp/arm/idct4x4_add_neon.c  +133  -138
vpx_dsp/arm/idct8x8_1_add_neon.c  +45  -48
vpx_dsp/arm/idct8x8_add_neon.c  +486  -518
vpx_dsp/arm/intrapred_neon.c  +53  -57
vpx_dsp/arm/loopfilter_16_neon.c  +154  -160
vpx_dsp/arm/loopfilter_4_neon.c  +230  -247
vpx_dsp/arm/loopfilter_8_neon.c  +340  -356
vpx_dsp/arm/loopfilter_neon.c  +11  -21
vpx_dsp/arm/sad4d_neon.c  +25  -27
vpx_dsp/arm/sad_neon.c  +78  -87
vpx_dsp/arm/subpel_variance_media.c  +37  -62
vpx_dsp/arm/subpel_variance_neon.c  +23  -42
vpx_dsp/arm/subtract_neon.c  +24  -25
vpx_dsp/arm/variance_neon.c  +248  -267
vpx_dsp/arm/vpx_convolve8_avg_neon.c  +47  -65
vpx_dsp/arm/vpx_convolve8_neon.c  +49  -67
vpx_dsp/arm/vpx_convolve_avg_neon.c  +18  -21
vpx_dsp/arm/vpx_convolve_copy_neon.c  +11  -13
vpx_dsp/arm/vpx_convolve_neon.c  +15  -22
vpx_dsp/avg.c  +28  -30
vpx_dsp/bitreader.c  +12  -15
vpx_dsp/bitreader.h  +5  -11
vpx_dsp/bitreader_buffer.c  +4  -7
vpx_dsp/bitwriter.c  +6  -9
vpx_dsp/bitwriter.h  +1  -2
vpx_dsp/bitwriter_buffer.c  +4  -5
vpx_dsp/deblock.c  +41  -42
vpx_dsp/fastssim.c  +101  -125
vpx_dsp/fwd_txfm.c  +46  -57
vpx_dsp/intrapred.c  +120  -139
vpx_dsp/inv_txfm.c  +146  -170
vpx_dsp/inv_txfm.h  +6  -8
vpx_dsp/loopfilter.c  +214  -249
vpx_dsp/mips/add_noise_msa.c  +2  -2
vpx_dsp/mips/common_dspr2.h  +4  -20
vpx_dsp/mips/convolve2_avg_dspr2.c  +52  -70
vpx_dsp/mips/convolve2_avg_horiz_dspr2.c  +79  -103
vpx_dsp/mips/convolve2_dspr2.c  +648  -401
vpx_dsp/mips/convolve2_horiz_dspr2.c  +60  -85
vpx_dsp/mips/convolve2_vert_dspr2.c  +55  -77
vpx_dsp/mips/convolve8_avg_dspr2.c  +159  -196
vpx_dsp/mips/convolve8_avg_horiz_dspr2.c  +89  -117
vpx_dsp/mips/convolve8_dspr2.c  +937  -605
vpx_dsp/mips/convolve8_horiz_dspr2.c  +71  -104
vpx_dsp/mips/convolve8_vert_dspr2.c  +44  -68
vpx_dsp/mips/convolve_common_dspr2.h  +9  -10
vpx_dsp/mips/deblock_msa.c  +172  -173
vpx_dsp/mips/fwd_dct32x32_msa.c  +52  -58
vpx_dsp/mips/fwd_txfm_msa.c  +34  -34
vpx_dsp/mips/fwd_txfm_msa.h  +338  -331
vpx_dsp/mips/idct16x16_msa.c  +25  -27
vpx_dsp/mips/idct32x32_msa.c  +41  -51
vpx_dsp/mips/idct4x4_msa.c  +2  -2
vpx_dsp/mips/idct8x8_msa.c  +14  -14
vpx_dsp/mips/intrapred16_dspr2.c  +22  -27
vpx_dsp/mips/intrapred4_dspr2.c  +31  -36
vpx_dsp/mips/intrapred8_dspr2.c  +37  -42
vpx_dsp/mips/intrapred_msa.c  +21  -20
vpx_dsp/mips/inv_txfm_dspr2.h  +31  -25
vpx_dsp/mips/inv_txfm_msa.h  +375  -374
vpx_dsp/mips/itrans16_dspr2.c  +218  -256
vpx_dsp/mips/itrans32_cols_dspr2.c  +193  -220
vpx_dsp/mips/itrans32_dspr2.c  +182  -226
vpx_dsp/mips/itrans4_dspr2.c  +56  -74
vpx_dsp/mips/itrans8_dspr2.c  +69  -93
vpx_dsp/mips/loopfilter_16_msa.c  +41  -49
vpx_dsp/mips/loopfilter_4_msa.c  +15  -15
vpx_dsp/mips/loopfilter_8_msa.c  +29  -39
vpx_dsp/mips/loopfilter_filters_dspr2.c  +89  -121
vpx_dsp/mips/loopfilter_filters_dspr2.h  +139  -169
vpx_dsp/mips/loopfilter_macros_dspr2.h  +396  -439
vpx_dsp/mips/loopfilter_masks_dspr2.h  +52  -70
vpx_dsp/mips/loopfilter_mb_dspr2.c  +183  -244
vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c  +244  -319
vpx_dsp/mips/loopfilter_mb_vert_dspr2.c  +219  -302
vpx_dsp/mips/loopfilter_msa.h  +230  -226
vpx_dsp/mips/macros_msa.h  +1131  -1010
vpx_dsp/mips/sad_msa.c  +188  -187
vpx_dsp/mips/sub_pixel_variance_msa.c  +310  -474
vpx_dsp/mips/subtract_msa.c  +16  -16
vpx_dsp/mips/txfm_macros_msa.h  +76  -73
vpx_dsp/mips/variance_msa.c  +42  -46
vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c  +66  -106
vpx_dsp/mips/vpx_convolve8_avg_msa.c  +103  -160
vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c  +77  -119
vpx_dsp/mips/vpx_convolve8_horiz_msa.c  +25  -37
vpx_dsp/mips/vpx_convolve8_msa.c  +56  -62
vpx_dsp/mips/vpx_convolve8_vert_msa.c  +26  -38
vpx_dsp/mips/vpx_convolve_avg_msa.c  +28  -28
vpx_dsp/mips/vpx_convolve_copy_msa.c  +4  -4
vpx_dsp/mips/vpx_convolve_msa.h  +100  -95
vpx_dsp/prob.c  +16  -22
vpx_dsp/prob.h  +4  -6
vpx_dsp/psnr.c  +48  -63
vpx_dsp/psnr.h  +7  -13
vpx_dsp/psnrhvs.c  +81  -90
vpx_dsp/quantize.c  +52  -70
vpx_dsp/quantize.h  +8  -11
vpx_dsp/sad.c  +75  -74
vpx_dsp/ssim.c  +63  -80
vpx_dsp/ssim.h  +7  -9
vpx_dsp/subtract.c  +12  -14
vpx_dsp/txfm_common.h  +10  -10
vpx_dsp/variance.c  +276  -326
vpx_dsp/variance.h  +28  -32
vpx_dsp/vpx_convolve.c  +127  -143
vpx_dsp/vpx_convolve.h  +2  -2
vpx_dsp/vpx_dsp_common.h  +3  -6
vpx_dsp/vpx_dsp_rtcd.c  +1  -3
vpx_dsp/vpx_filter.h  +0  -1
vpx_dsp/x86/avg_intrin_sse2.c  +13  -14
vpx_dsp/x86/convolve.h  +165  -245
vpx_dsp/x86/fwd_dct32x32_impl_avx2.h  +1775  -1485
vpx_dsp/x86/fwd_dct32x32_impl_sse2.h  +1607  -1560
vpx_dsp/x86/fwd_txfm_avx2.c  +5  -5
vpx_dsp/x86/fwd_txfm_impl_sse2.h  +108  -122
vpx_dsp/x86/fwd_txfm_sse2.c  +80  -80
vpx_dsp/x86/fwd_txfm_sse2.h  +93  -176
vpx_dsp/x86/halfpix_variance_sse2.c  +7  -13
vpx_dsp/x86/highbd_loopfilter_sse2.c  +188  -236
vpx_dsp/x86/highbd_quantize_intrin_sse2.c  +24  -49
vpx_dsp/x86/highbd_variance_sse2.c  +376  -418
vpx_dsp/x86/inv_txfm_sse2.c  +1105  -1128
vpx_dsp/x86/inv_txfm_sse2.h  +40  -40
vpx_dsp/x86/loopfilter_avx2.c  +794  -860
vpx_dsp/x86/loopfilter_sse2.c  +280  -282
vpx_dsp/x86/quantize_sse2.c  +7  -7
vpx_dsp/x86/sad4d_avx2.c  +16  -20
vpx_dsp/x86/sad_avx2.c  +142  -145
vpx_dsp/x86/txfm_common_sse2.h  +3  -3
vpx_dsp/x86/variance_avx2.c  +47  -75
vpx_dsp/x86/variance_impl_avx2.c  +348  -371
vpx_dsp/x86/variance_sse2.c  +154  -180
vpx_dsp/x86/vpx_asm_stubs.c  +2  -2
vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c  +232  -259
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c  +97  -110
vpx_dsp/add_noise.c

@@ -48,7 +48,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
   // set up a 256 entry lookup that matches gaussian distribution
   for (i = -32; i < 32; ++i) {
-    const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
     if (a_i) {
       for (j = 0; j < a_i; ++j) {
         char_dist[next + j] = (char)i;
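For context, a minimal standalone sketch of what this loop builds. gaussian() below is a hypothetical stand-in for the static helper in add_noise.c (a normal density evaluated at x), and the bounds check is added only to keep the sketch safe:

#include <math.h>

/* Hypothetical stand-in for the gaussian() helper in add_noise.c. */
static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}

/* Fill a 256-entry table in which each value i in [-32, 32) appears roughly
 * 256 * N(0, sigma)(i) times; indexing char_dist with uniform random bytes
 * then yields approximately Gaussian, clamped noise samples. */
static void setup_char_dist(double sigma, char char_dist[256]) {
  int next = 0, i, j;
  for (i = -32; i < 32; ++i) {
    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    for (j = 0; j < a_i && next + j < 256; ++j) {
      char_dist[next + j] = (char)i;
    }
    next += a_i;
  }
}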
vpx_dsp/arm/avg_neon.c

@@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
   }
 }
 
-void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
-                         const uint8_t *b, int b_stride,
-                         int *min, int *max) {
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                         int b_stride, int *min, int *max) {
   // Load and concatenate.
-  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
-                                     vld1_u8(a + a_stride));
-  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
-                                     vld1_u8(a + 3 * a_stride));
-  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
-                                     vld1_u8(a + 5 * a_stride));
-  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
-                                     vld1_u8(a + 7 * a_stride));
-  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
-                                     vld1_u8(b + b_stride));
-  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
-                                     vld1_u8(b + 3 * b_stride));
-  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
-                                     vld1_u8(b + 5 * b_stride));
-  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
-                                     vld1_u8(b + 7 * b_stride));
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+  const uint8x16_t a23 =
+      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 =
+      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 =
+      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+  const uint8x16_t b23 =
+      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 =
+      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 =
+      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
 
   // Absolute difference.
   const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
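The reformatted function computes the minimum and maximum absolute difference between two 8x8 blocks. A scalar sketch of the same computation, modeled on the C reference implementation (vpx_minmax_8x8_c); the name minmax_8x8_ref is illustrative:

#include <stdint.h>
#include <stdlib.h>

static void minmax_8x8_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int *min, int *max) {
  int r, c;
  *min = 255;
  *max = 0;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int diff = abs(a[r * a_stride + c] - b[r * b_stride + c]);
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}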
vpx_dsp/arm/fwd_txfm_neon.c

@@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
     // 14 15 16 17 54 55 56 57
     // 24 25 26 27 64 65 66 67
     // 34 35 36 37 74 75 76 77
-    const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
-                                          vreinterpretq_s32_s16(out_2));
-    const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
-                                          vreinterpretq_s32_s16(out_3));
-    const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
-                                          vreinterpretq_s32_s16(out_6));
-    const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
-                                          vreinterpretq_s32_s16(out_7));
+    const int32x4x2_t r02_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+    const int32x4x2_t r13_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+    const int32x4x2_t r46_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+    const int32x4x2_t r57_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
     const int16x8x2_t r01_s16 = vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
                                           vreinterpretq_s16_s32(r13_s32.val[0]));
vpx_dsp/arm/hadamard_neon.c

@@ -12,9 +12,8 @@
 #include "./vpx_dsp_rtcd.h"
 
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
-                                 int16x8_t *a2, int16x8_t *a3,
-                                 int16x8_t *a4, int16x8_t *a5,
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                  int16x8_t *a6, int16x8_t *a7) {
   const int16x8_t b0 = vaddq_s16(*a0, *a1);
   const int16x8_t b1 = vsubq_s16(*a0, *a1);

@@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
 // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
 // reversing transpose order which may make it easier for the compiler to
 // reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
-                         int16x8_t *a2, int16x8_t *a3,
-                         int16x8_t *a4, int16x8_t *a5,
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                          int16x8_t *a6, int16x8_t *a7) {
   // Swap 64 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07

@@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
   // a1657_hi:
   // 12 13 28 29 44 45 60 61
   // 14 15 30 31 46 47 62 63
-  const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
-                                         vreinterpretq_s32_s16(a26_lo));
-  const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
-                                         vreinterpretq_s32_s16(a37_lo));
-  const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
-                                         vreinterpretq_s32_s16(a26_hi));
-  const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
-                                         vreinterpretq_s32_s16(a37_hi));
+  const int32x4x2_t a0246_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
 
   // Swap 16 bit elements resulting in:
   // b0:
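Both this file and fwd_txfm_neon.c above reflow the same vtrn-based transpose pattern: transpose8x8() builds an 8x8 transpose out of element swaps at 64-, 32-, and 16-bit granularity. A hedged scalar sketch of that decomposition (illustrative, not code from the tree): for k = 4, 2, 1, swap the off-diagonal k x k sub-blocks of every 2k x 2k tile; after the three passes the matrix is transposed.

#include <stdint.h>

/* Scalar model of the three vtrn stages: the k = 4 pass corresponds to the
 * 64-bit swaps, k = 2 to vtrnq_s32, and k = 1 to vtrnq_s16. */
static void transpose8x8_ref(int16_t m[8][8]) {
  int k, r, c;
  for (k = 4; k >= 1; k >>= 1) {
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) {
        /* (r, c) sits in the upper-right k-block of its 2k x 2k tile exactly
         * when the k bit of r is 0 and the k bit of c is 1; its partner in
         * the lower-left block is (r ^ k, c ^ k). */
        if (!(r & k) && (c & k)) {
          const int16_t tmp = m[r][c];
          m[r][c] = m[r ^ k][c ^ k];
          m[r ^ k][c ^ k] = tmp;
        }
      }
    }
  }
}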
vpx_dsp/arm/idct16x16_1_add_neon.c

@@ -13,49 +13,46 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
-void vpx_idct16x16_1_add_neon(
-        int16_t *input,
-        uint8_t *dest,
-        int dest_stride) {
-    uint8x8_t d2u8, d3u8, d30u8, d31u8;
-    uint64x1_t d2u64, d3u64, d4u64, d5u64;
-    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q0s16;
-    uint8_t *d1, *d2;
-    int16_t i, j, a1, cospi_16_64 = 11585;
-    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
-    out = dct_const_round_shift(out * cospi_16_64);
-    a1 = ROUND_POWER_OF_TWO(out, 6);
-
-    q0s16 = vdupq_n_s16(a1);
-    q0u16 = vreinterpretq_u16_s16(q0s16);
-
-    for (d1 = d2 = dest, i = 0; i < 4; i++) {
-        for (j = 0; j < 2; j++) {
-            d2u64 = vld1_u64((const uint64_t *)d1);
-            d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
-            d1 += dest_stride;
-            d4u64 = vld1_u64((const uint64_t *)d1);
-            d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
-            d1 += dest_stride;
-
-            q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-            q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-            q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-            q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
-            d2 += dest_stride;
-            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
-            d2 += dest_stride;
-        }
-    }
-    return;
-}
+void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint64x1_t d2u64, d3u64, d4u64, d5u64;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, j, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  q0s16 = vdupq_n_s16(a1);
+  q0u16 = vreinterpretq_u16_s16(q0s16);
+
+  for (d1 = d2 = dest, i = 0; i < 4; i++) {
+    for (j = 0; j < 2; j++) {
+      d2u64 = vld1_u64((const uint64_t *)d1);
+      d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+      d1 += dest_stride;
+      d4u64 = vld1_u64((const uint64_t *)d1);
+      d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+      d1 += dest_stride;
+
+      q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+      q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+      q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+      q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+      d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+      d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+      d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+      d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+      d2 += dest_stride;
+      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+      d2 += dest_stride;
+    }
+  }
+  return;
+}
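The rewrite above is still whitespace-only (4-space to 2-space indentation and a re-wrapped signature). As a scalar sketch of what the function computes, mirroring the C reference vpx_idct16x16_1_add_c (helper names here are illustrative): only the DC coefficient is transformed, and the single resulting value is added, with clamping, to all 256 destination pixels.

#include <stdint.h>

#define DCT_CONST_BITS 14

static int16_t dct_const_round_shift_ref(int input) {
  return (int16_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static uint8_t clip_pixel_ref(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

static void idct16x16_1_add_ref(const int16_t *input, uint8_t *dest,
                                int stride) {
  const int16_t cospi_16_64 = 11585; /* round(16384 * sqrt(2) / 2) */
  int16_t out = dct_const_round_shift_ref(input[0] * cospi_16_64);
  int r, c, a1;
  out = dct_const_round_shift_ref(out * cospi_16_64);
  a1 = (out + 32) >> 6; /* ROUND_POWER_OF_TWO(out, 6) */
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_ref(dest[c] + a1);
    dest += stride;
  }
}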
vpx_dsp/arm/idct16x16_add_neon.c

(This diff is collapsed.)
vpx_dsp/arm/idct16x16_neon.c

@@ -10,24 +10,16 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
-                                      int16_t *output,
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                       int output_stride);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
-                                      int16_t *output,
-                                      int16_t *pass1Output,
-                                      int16_t skip_adding,
-                                      uint8_t *dest,
-                                      int dest_stride);
-void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
-                                     int16_t *output,
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
-void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
-                                     int16_t *output,
-                                     int16_t *pass1Output,
-                                     int16_t skip_adding,
-                                     uint8_t *dest,
-                                     int dest_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride);
 
 #if HAVE_NEON_ASM
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */

@@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store);
 extern void vpx_pop_neon(int64_t *store);
 #endif  // HAVE_NEON_ASM
 
-void vpx_idct16x16_256_add_neon(const int16_t *input,
-                                uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
 #if HAVE_NEON_ASM
   int64_t store_reg[8];
 #endif
-  int16_t pass1_output[16 * 16] = {0};
-  int16_t row_idct_output[16 * 16] = {0};
+  int16_t pass1_output[16 * 16] = { 0 };
+  int16_t row_idct_output[16 * 16] = { 0 };
 
 #if HAVE_NEON_ASM
   // save d8-d15 register values.

@@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input+1,
-                                   row_idct_output,
-                                   pass1_output,
-                                   0,
-                                   dest,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                   dest, dest_stride);
 
   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
-                                   row_idct_output+8,
-                                   pass1_output,
-                                   0,
-                                   dest,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
+                                   pass1_output, 0, dest, dest_stride);
 
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the

@@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                   row_idct_output,
-                                   pass1_output,
-                                   1,
-                                   dest,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);
 
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
-                                   row_idct_output+8,
-                                   pass1_output,
-                                   1,
-                                   dest+8,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);
 
 #if HAVE_NEON_ASM
   // restore d8-d15 register values.

@@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
   return;
 }
 
-void vpx_idct16x16_10_add_neon(const int16_t *input,
-                               uint8_t *dest, int dest_stride) {
+void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
 #if HAVE_NEON_ASM
   int64_t store_reg[8];
 #endif
-  int16_t pass1_output[16 * 16] = {0};
-  int16_t row_idct_output[16 * 16] = {0};
+  int16_t pass1_output[16 * 16] = { 0 };
+  int16_t row_idct_output[16 * 16] = { 0 };
 
 #if HAVE_NEON_ASM
   // save d8-d15 register values.

@@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_10_add_neon_pass2(input+1,
-                                  row_idct_output,
-                                  pass1_output,
-                                  0,
-                                  dest,
-                                  dest_stride);
+  vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
+                                  dest, dest_stride);
 
   /* Skip Parallel idct on the lower 8 rows as they are all 0s */

@@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
-                                   row_idct_output,
-                                   pass1_output,
-                                   1,
-                                   dest,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
+                                   pass1_output, 1, dest, dest_stride);
 
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
-                                   row_idct_output+8,
-                                   pass1_output,
-                                   1,
-                                   dest+8,
-                                   dest_stride);
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
+                                   row_idct_output + 8, pass1_output, 1,
+                                   dest + 8, dest_stride);
 
 #if HAVE_NEON_ASM
   // restore d8-d15 register values.
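The pass1/pass2 call pairs above implement a separable 2-D inverse transform: 1-D transforms over the rows into an intermediate buffer, then over the columns, with each 1-D transform split into an even-coefficient pass and an odd-coefficient pass. A hedged structural sketch of that row/column organization (idct16_1d below is a placeholder stub, not the real 1-D transform):

#include <stdint.h>
#include <string.h>

/* Placeholder for a 1-D 16-point inverse DCT; the NEON version further
 * splits it into an even pass (pass1) and an odd pass (pass2). */
static void idct16_1d(const int16_t in[16], int16_t out[16]) {
  memcpy(out, in, 16 * sizeof(*in)); /* stub: identity */
}

static void idct16x16_rows_then_cols(const int16_t *input, int16_t *output) {
  int16_t rows[16 * 16];
  int16_t col_in[16], col_out[16];
  int i, j;
  /* Pass over the 16 rows, like the first set of pass1/pass2 calls. */
  for (i = 0; i < 16; ++i) idct16_1d(&input[i * 16], &rows[i * 16]);
  /* Pass over the 16 columns, like the second set of calls. */
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) col_in[i] = rows[i * 16 + j];
    idct16_1d(col_in, col_out);
    for (i = 0; i < 16; ++i) output[i * 16 + j] = col_out[i];
  }
}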
vpx_dsp/arm/idct32x32_1_add_neon.c

@@ -15,151 +15,126 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
-static INLINE void LD_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vld1q_u8(d);
-    d += d_stride;
-    *q9u8 = vld1q_u8(d);
-    d += d_stride;
-    *q10u8 = vld1q_u8(d);
-    d += d_stride;
-    *q11u8 = vld1q_u8(d);
-    d += d_stride;
-    *q12u8 = vld1q_u8(d);
-    d += d_stride;
-    *q13u8 = vld1q_u8(d);
-    d += d_stride;
-    *q14u8 = vld1q_u8(d);
-    d += d_stride;
-    *q15u8 = vld1q_u8(d);
-    return;
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
+  *q8u8 = vld1q_u8(d);
+  d += d_stride;
+  *q9u8 = vld1q_u8(d);
+  d += d_stride;
+  *q10u8 = vld1q_u8(d);
+  d += d_stride;
+  *q11u8 = vld1q_u8(d);
+  d += d_stride;
+  *q12u8 = vld1q_u8(d);
+  d += d_stride;
+  *q13u8 = vld1q_u8(d);
+  d += d_stride;
+  *q14u8 = vld1q_u8(d);
+  d += d_stride;
+  *q15u8 = vld1q_u8(d);
+  return;
 }
 
-static INLINE void ADD_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
-    *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
-    *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
-    *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
-    *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
-    *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
-    *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
-    *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
-    return;
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+  *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+  *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+  *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+  *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+  *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+  *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+  *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+  return;
 }
 
-static INLINE void SUB_DIFF_16x8(
-        uint8x16_t qdiffu8,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {
-    *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
-    *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
-    *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
-    *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
-    *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
-    *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
-    *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
-    *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-    return;
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+  *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+  *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+  *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+  *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+  *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+  *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+  *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+  return;
 }
 
-static INLINE void ST_16x8(
-        uint8_t *d,
-        int d_stride,
-        uint8x16_t *q8u8,
-        uint8x16_t *q9u8,
-        uint8x16_t *q10u8,
-        uint8x16_t *q11u8,
-        uint8x16_t *q12u8,
-        uint8x16_t *q13u8,
-        uint8x16_t *q14u8,
-        uint8x16_t *q15u8) {