libvpx commit 2d12a52f

Merge "Add high bitdepth 8x8 idct NEON intrinsics"

Authored Jan 06, 2017 by Linfeng Zhang; committed by Gerrit Code Review, Jan 06, 2017.
Parents: 90f889a5, 9b187954

Showing 7 changed files with 1018 additions and 256 deletions:
  test/partial_idct_test.cc              +24   -0
  vpx_dsp/arm/highbd_idct8x8_add_neon.c  +614  -0
  vpx_dsp/arm/idct8x8_add_neon.c         +0    -238
  vpx_dsp/arm/idct_neon.h                +245  -0
  vpx_dsp/arm/transpose_neon.h           +131  -16
  vpx_dsp/vpx_dsp.mk                     +1    -0
  vpx_dsp/vpx_dsp_rtcd_defs.pl           +3    -2
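The vpx_dsp.mk and vpx_dsp_rtcd_defs.pl hunks are not rendered on this page. Judging from the +1/-0 and +3/-2 line counts, they presumably add the new NEON source file to the build and mark the high-bitdepth 8x8 idcts as having NEON specializations. For orientation, run-time CPU dispatch in libvpx resolves each such symbol to a function pointer, roughly as in this sketch (the generated vpx_dsp_rtcd.h is not part of this diff; the signature is copied from the vpx_idct8x8_64_add_neon hunk below, and tran_low_t/have_neon are stand-ins):

  #include <stdint.h>

  /* Sketch of RTCD-style dispatch; tran_low_t stands in for libvpx's
   * coefficient type, have_neon for the CPU-feature probe. */
  typedef int32_t tran_low_t;

  void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
  void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride);

  static void (*vpx_idct8x8_64_add)(const tran_low_t *, uint8_t *, int);

  static void setup_rtcd(int have_neon) {
    vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; /* portable fallback */
    if (have_neon) vpx_idct8x8_64_add = vpx_idct8x8_64_add_neon;
  }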
test/partial_idct_test.cc
...
@@ -446,6 +446,30 @@ INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,

#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
const PartialInvTxfmParam neon_partial_idct_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 8,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 10,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 12,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_neon>, TX_8X8, 12, 8,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_neon>, TX_8X8, 12, 10,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_12_add_neon>, TX_8X8, 12, 12,
             2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_neon>, TX_8X8, 1, 8, 2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_neon>, TX_8X8, 1, 10, 2),
  make_tuple(&vpx_highbd_fdct8x8_c,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
             &highbd_wrapper<vpx_highbd_idct8x8_1_add_neon>, TX_8X8, 1, 12, 2),
  make_tuple(&vpx_highbd_fdct4x4_c,
             &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
             &highbd_wrapper<vpx_highbd_idct4x4_16_add_neon>, TX_4X4, 16, 8,
             2),
...
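Reading the tuples against the INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, ...) context in the hunk header, the PartialInvTxfmParam fields appear to be: forward transform, reference inverse transform, inverse transform under test, transform size, count of nonzero coefficients exercised (matching the _64/_12/_1 function suffixes), bit depth (8/10/12), and bytes per pixel (2, i.e. uint16_t pixels on the high-bitdepth path). The array is presumably registered in the truncated region that follows, along these lines (hypothetical; the page cuts off before this point):

  INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
                          ::testing::ValuesIn(neon_partial_idct_tests));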
vpx_dsp/arm/highbd_idct8x8_add_neon.c (new file, mode 100644)
[614-line diff collapsed on this page]
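The collapsed file carries the bulk of the commit's additions. Whatever its exact contents, a high-bitdepth reconstruction stage differs from the 8-bit one in that results are added to uint16_t pixels and clamped to [0, (1 << bd) - 1] rather than saturated to a byte. A generic illustration of that step (this is not the collapsed file's code; the helper name is invented):

  #include <arm_neon.h>

  /* Add idct residuals to high-bitdepth pixels and clip to the bd-bit range.
   * max should hold (1 << bd) - 1 in every lane, e.g. vdupq_n_s16(1023) for
   * 10-bit content. */
  static inline uint16x8_t highbd_add_clip(const int16x8_t res,
                                           const uint16x8_t dst,
                                           const int16x8_t max) {
    int16x8_t sum = vaddq_s16(res, vreinterpretq_s16_u16(dst));
    sum = vmaxq_s16(sum, vdupq_n_s16(0)); /* clip residual undershoot */
    sum = vminq_s16(sum, max);            /* clip overshoot to (1 << bd) - 1 */
    return vreinterpretq_u16_s16(sum);
  }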
vpx_dsp/arm/idct8x8_add_neon.c
...
@@ -16,132 +16,6 @@ (126 lines removed; this helper moves to vpx_dsp/arm/idct_neon.h, below)
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io0, int16x8_t *const io1,
                                     int16x8_t *const io2, int16x8_t *const io3,
                                     int16x8_t *const io4, int16x8_t *const io5,
                                     int16x8_t *const io6,
                                     int16x8_t *const io7) {
  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s16(*io1);
  input_1h = vget_high_s16(*io1);
  input_3l = vget_low_s16(*io3);
  input_3h = vget_high_s16(*io3);
  input_5l = vget_low_s16(*io5);
  input_5h = vget_high_s16(*io5);
  input_7l = vget_low_s16(*io7);
  input_7h = vget_high_s16(*io7);
  step1l[0] = vget_low_s16(*io0);
  step1h[0] = vget_high_s16(*io0);
  step1l[1] = vget_low_s16(*io2);
  step1h[1] = vget_high_s16(*io2);
  step1l[2] = vget_low_s16(*io4);
  step1h[2] = vget_high_s16(*io4);
  step1l[3] = vget_low_s16(*io6);
  step1h[3] = vget_high_s16(*io6);

  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  t16[4] = vrshrn_n_s32(t32[4], 14);
  t16[5] = vrshrn_n_s32(t32[5], 14);
  t16[6] = vrshrn_n_s32(t32[6], 14);
  t16[7] = vrshrn_n_s32(t32[7], 14);
  step1[4] = vcombine_s16(t16[0], t16[1]);
  step1[5] = vcombine_s16(t16[2], t16[3]);
  step1[6] = vcombine_s16(t16[4], t16[5]);
  step1[7] = vcombine_s16(t16[6], t16[7]);

  // stage 2
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  t16[4] = vrshrn_n_s32(t32[4], 14);
  t16[5] = vrshrn_n_s32(t32[5], 14);
  t16[6] = vrshrn_n_s32(t32[6], 14);
  t16[7] = vrshrn_n_s32(t32[7], 14);
  step2[0] = vcombine_s16(t16[0], t16[1]);
  step2[1] = vcombine_s16(t16[2], t16[3]);
  step2[2] = vcombine_s16(t16[4], t16[5]);
  step2[3] = vcombine_s16(t16[6], t16[7]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *io0 = vaddq_s16(step1[0], step2[7]);
  *io1 = vaddq_s16(step1[1], step1[6]);
  *io2 = vaddq_s16(step1[2], step1[5]);
  *io3 = vaddq_s16(step1[3], step2[4]);
  *io4 = vsubq_s16(step1[3], step2[4]);
  *io5 = vsubq_s16(step1[2], step1[5]);
  *io6 = vsubq_s16(step1[1], step1[6]);
  *io7 = vsubq_s16(step1[0], step2[7]);
}

static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
                          int16x8_t a3, int16x8_t a4, int16x8_t a5,
                          int16x8_t a6, int16x8_t a7, uint8_t *dest,
...
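Every multiplication in the kernel above follows the same fixed-point idiom: widen to 32 bits with vmull/vmlal/vmlsl against a lane of the Q14 cosine vectors, then narrow back with vrshrn_n_s32(., 14) (14 being libvpx's DCT_CONST_BITS). A scalar rendering of one butterfly leg, for reference (illustrative helper, not part of the diff):

  #include <stdint.h>

  /* out = round((a * c0 - b * c1) / 2^14), with c0/c1 Q14 constants such as
   * the kCospi entries. Mirrors vmull_lane_s16 + vmlsl_lane_s16 + vrshrn. */
  static int16_t butterfly_sub(int16_t a, int16_t c0, int16_t b, int16_t c1) {
    const int32_t t = (int32_t)a * c0 - (int32_t)b * c1; /* vmull + vmlsl */
    return (int16_t)((t + (1 << 13)) >> 14);             /* vrshrn_n_s32(t, 14) */
  }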
@@ -229,118 +103,6 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, (112 lines removed; the idct8x8_12 pass helpers also move to vpx_dsp/arm/idct_neon.h)
  add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride);
}

static INLINE void idct8x8_12_pass1_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0,
    const int16x4_t cospisd1, int16x4_t *const io0, int16x4_t *const io1,
    int16x4_t *const io2, int16x4_t *const io3, int16x4_t *const io4,
    int16x4_t *const io5, int16x4_t *const io6, int16x4_t *const io7) {
  int16x4_t step1[8], step2[8];
  int32x4_t t32[2];

  transpose_s16_4x4d(io0, io1, io2, io3);

  // stage 1
  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);

  // stage 2
  step2[0] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);
  step2[4] = vadd_s16(step1[4], step1[5]);
  step2[5] = vsub_s16(step1[4], step1[5]);
  step2[6] = vsub_s16(step1[7], step1[6]);
  step2[7] = vadd_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vadd_s16(step2[0], step2[3]);
  step1[1] = vadd_s16(step2[0], step2[2]);
  step1[2] = vsub_s16(step2[0], step2[2]);
  step1[3] = vsub_s16(step2[0], step2[3]);
  t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
  step1[5] = vrshrn_n_s32(t32[0], 14);
  step1[6] = vrshrn_n_s32(t32[1], 14);

  // stage 4
  *io0 = vadd_s16(step1[0], step2[7]);
  *io1 = vadd_s16(step1[1], step1[6]);
  *io2 = vadd_s16(step1[2], step1[5]);
  *io3 = vadd_s16(step1[3], step2[4]);
  *io4 = vsub_s16(step1[3], step2[4]);
  *io5 = vsub_s16(step1[2], step1[5]);
  *io6 = vsub_s16(step1[1], step1[6]);
  *io7 = vsub_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_12_pass2_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0,
    const int16x4_t cospisd1, const int16x4_t input0, const int16x4_t input1,
    const int16x4_t input2, const int16x4_t input3, const int16x4_t input4,
    const int16x4_t input5, const int16x4_t input6, const int16x4_t input7,
    int16x8_t *const output0, int16x8_t *const output1,
    int16x8_t *const output2, int16x8_t *const output3,
    int16x8_t *const output4, int16x8_t *const output5,
    int16x8_t *const output6, int16x8_t *const output7) {
  int16x8_t in[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
                    input7, &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
  step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
  step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
  step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);

  // stage 2
  step2[0] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
  step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
  step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[0], step2[2]);
  step1[2] = vsubq_s16(step2[0], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *output0 = vaddq_s16(step1[0], step2[7]);
  *output1 = vaddq_s16(step1[1], step1[6]);
  *output2 = vaddq_s16(step1[2], step1[5]);
  *output3 = vaddq_s16(step1[3], step2[4]);
  *output4 = vsubq_s16(step1[3], step2[4]);
  *output5 = vsubq_s16(step1[2], step1[5]);
  *output6 = vsubq_s16(step1[1], step1[6]);
  *output7 = vsubq_s16(step1[0], step2[7]);
}

void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const int16x8_t cospis = vld1q_s16(kCospi);
...
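In the _12 pass functions above, only the top-left 4x4 of coefficients is nonzero, so each stage-1/2 product has a single variable operand; that lets vqrdmulh_lane_s16 replace the widen/narrow pair, using pre-doubled constants (the cospisd vectors, prepared by the caller outside the lines shown here). Why the doubling is exact, as a short derivation (assuming the Q14 kCospi values):

  #include <stdint.h>

  /* Scalar model of vqrdmulh_s16 (saturation omitted): */
  static int16_t vqrdmulh_model(int16_t a, int16_t d) {
    return (int16_t)((2 * (int32_t)a * d + (1 << 15)) >> 16);
  }
  /* With d = 2 * c for a Q14 constant c (e.g. c = 11585, cospi_16_64):
   *   (2 * a * (2 * c) + (1 << 15)) >> 16 == ((int32_t)a * c + (1 << 13)) >> 14
   * i.e. exactly the vmull_lane_s16 + vrshrn_n_s32(., 14) result used in the
   * full 8x8 kernel, obtained in one non-widening instruction. */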
vpx_dsp/arm/idct_neon.h
...
@@ -24,6 +24,13 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = {
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
};

DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
  16384 /* cospi_0_64 */,   15137 /* cospi_8_64 */,
  11585 /* cospi_16_64 */,  6270 /* cospi_24_64 */,
  16069 /* cospi_4_64 */,   13623 /* cospi_12_64 */,
  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
};

//------------------------------------------------------------------------------
// Helper functions used to load tran_low_t into int16, narrowing if necessary.
...
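For reference, these are the usual libvpx DCT constants, cospi_k_64 = round(2^14 * cos(k*pi/64)), with the k = 20 entry stored negated. A standalone check (not part of the diff):

  #include <math.h>
  #include <stdio.h>

  int main(void) {
    /* The k indices behind the eight kCospi32 entries, in table order. */
    const int k[8] = { 0, 8, 16, 24, 4, 12, 20, 28 };
    int i;
    for (i = 0; i < 8; ++i) {
      const double c = 16384.0 * cos(k[i] * 3.14159265358979323846 / 64.0);
      printf("cospi_%d_64 = %.0f\n", k[i], floor(c + 0.5));
    }
    return 0; /* prints 16384 15137 11585 6270 16069 13623 9102 3196 */
  }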
@@ -217,4 +224,242 @@ static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
  *a1 = vsubq_s16(d0, d1);
}

static INLINE void idct8x8_12_pass1_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0,
    const int16x4_t cospisd1, int16x4_t *const io0, int16x4_t *const io1,
    int16x4_t *const io2, int16x4_t *const io3, int16x4_t *const io4,
    int16x4_t *const io5, int16x4_t *const io6, int16x4_t *const io7) {
  int16x4_t step1[8], step2[8];
  int32x4_t t32[2];

  transpose_s16_4x4d(io0, io1, io2, io3);

  // stage 1
  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);
  step2[4] = vadd_s16(step1[4], step1[5]);
  step2[5] = vsub_s16(step1[4], step1[5]);
  step2[6] = vsub_s16(step1[7], step1[6]);
  step2[7] = vadd_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vadd_s16(step2[1], step2[3]);
  step1[1] = vadd_s16(step2[1], step2[2]);
  step1[2] = vsub_s16(step2[1], step2[2]);
  step1[3] = vsub_s16(step2[1], step2[3]);
  t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
  step1[5] = vrshrn_n_s32(t32[0], 14);
  step1[6] = vrshrn_n_s32(t32[1], 14);

  // stage 4
  *io0 = vadd_s16(step1[0], step2[7]);
  *io1 = vadd_s16(step1[1], step1[6]);
  *io2 = vadd_s16(step1[2], step1[5]);
  *io3 = vadd_s16(step1[3], step2[4]);
  *io4 = vsub_s16(step1[3], step2[4]);
  *io5 = vsub_s16(step1[2], step1[5]);
  *io6 = vsub_s16(step1[1], step1[6]);
  *io7 = vsub_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_12_pass2_bd8(
    const int16x4_t cospis0, const int16x4_t cospisd0,
    const int16x4_t cospisd1, const int16x4_t input0, const int16x4_t input1,
    const int16x4_t input2, const int16x4_t input3, const int16x4_t input4,
    const int16x4_t input5, const int16x4_t input6, const int16x4_t input7,
    int16x8_t *const output0, int16x8_t *const output1,
    int16x8_t *const output2, int16x8_t *const output3,
    int16x8_t *const output4, int16x8_t *const output5,
    int16x8_t *const output6, int16x8_t *const output7) {
  int16x8_t in[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
                    input7, &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
  step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
  step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
  step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);

  // stage 2
  step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
  step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
  step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[1], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[1], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *output0 = vaddq_s16(step1[0], step2[7]);
  *output1 = vaddq_s16(step1[1], step1[6]);
  *output2 = vaddq_s16(step1[2], step1[5]);
  *output3 = vaddq_s16(step1[3], step2[4]);
  *output4 = vsubq_s16(step1[3], step2[4]);
  *output5 = vsubq_s16(step1[2], step1[5]);
  *output6 = vsubq_s16(step1[1], step1[6]);
  *output7 = vsubq_s16(step1[0], step2[7]);
}

static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io0, int16x8_t *const io1,
                                     int16x8_t *const io2, int16x8_t *const io3,
                                     int16x8_t *const io4, int16x8_t *const io5,
                                     int16x8_t *const io6,
                                     int16x8_t *const io7) {
  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];
  int16x4_t t16[8];

  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s16(*io1);
  input_1h = vget_high_s16(*io1);
  input_3l = vget_low_s16(*io3);
  input_3h = vget_high_s16(*io3);
  input_5l = vget_low_s16(*io5);
  input_5h = vget_high_s16(*io5);
  input_7l = vget_low_s16(*io7);
  input_7h = vget_high_s16(*io7);
  step1l[0] = vget_low_s16(*io0);
  step1h[0] = vget_high_s16(*io0);
  step1l[1] = vget_low_s16(*io2);
  step1h[1] = vget_high_s16(*io2);
  step1l[2] = vget_low_s16(*io4);
  step1h[2] = vget_high_s16(*io4);
  step1l[3] = vget_low_s16(*io6);
  step1h[3] = vget_high_s16(*io6);

  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  t16[4] = vrshrn_n_s32(t32[4], 14);
  t16[5] = vrshrn_n_s32(t32[5], 14);
  t16[6] = vrshrn_n_s32(t32[6], 14);
  t16[7] = vrshrn_n_s32(t32[7], 14);
  step1[4] = vcombine_s16(t16[0], t16[1]);
  step1[5] = vcombine_s16(t16[2], t16[3]);
  step1[6] = vcombine_s16(t16[4], t16[5]);
  step1[7] = vcombine_s16(t16[6], t16[7]);

  // stage 2
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  t16[4] = vrshrn_n_s32(t32[4], 14);
  t16[5] = vrshrn_n_s32(t32[5], 14);
  t16[6] = vrshrn_n_s32(t32[6], 14);
  t16[7] = vrshrn_n_s32(t32[7], 14);
  step2[0] = vcombine_s16(t16[0], t16[1]);
  step2[1] = vcombine_s16(t16[2], t16[3]);
  step2[2] = vcombine_s16(t16[4], t16[5]);
  step2[3] = vcombine_s16(t16[6], t16[7]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t16[0] = vrshrn_n_s32(t32[0], 14);
  t16[1] = vrshrn_n_s32(t32[1], 14);
  t16[2] = vrshrn_n_s32(t32[2], 14);
  t16[3] = vrshrn_n_s32(t32[3], 14);
  step1[5] = vcombine_s16(t16[0], t16[1]);
  step1[6] = vcombine_s16(t16[2], t16[3]);

  // stage 4
  *io0 = vaddq_s16(step1[0], step2[7]);
  *io1 = vaddq_s16(step1[1], step1[6]);
  *io2 = vaddq_s16(step1[2], step1[5]);
  *io3 = vaddq_s16(step1[3], step2[4]);
  *io4 = vsubq_s16(step1[3], step2[4]);
  *io5 = vsubq_s16(step1[2], step1[5]);
  *io6 = vsubq_s16(step1[1], step1[6]);
  *io7 = vsubq_s16(step1[0], step2[7]);
}

#endif  // VPX_DSP_ARM_IDCT_NEON_H_
vpx_dsp/arm/transpose_neon.h
...
@@ -21,7 +21,7 @@
 //
 // b0.val[0]: 00 01 02 03 16 17 18 19
 // b0.val[1]: 04 05 06 07 20 21 22 23
-static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
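The rename is descriptive only: the routine transposes at 64-bit granularity but consumes int32x4_t and produces int16x8x2_t, and the new _to_s16 suffix records that output element type. Consistent with the 00..03/16..19 layout comment above, the body presumably pairs and reinterprets the register halves as below (a sketch; the hunk shown changes only the signature line):

  #include <arm_neon.h>

  static inline int16x8x2_t vtrnq_s64_to_s16_sketch(const int32x4_t a0,
                                                    const int32x4_t a1) {
    int16x8x2_t b0;
    /* Pair low 64-bit halves, then high halves, viewing each as 4 x int16. */
    b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                             vreinterpret_s16_s32(vget_low_s32(a1)));
    b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                             vreinterpret_s16_s32(vget_high_s32(a1)));
    return b0;
  }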