Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
libvpx
Commits
90f889a5
Commit
90f889a5
authored
Jan 06, 2017
by
Linfeng Zhang
Committed by
Gerrit Code Review
Jan 06, 2017
Browse files
Merge "Clean DC only idct NEON intrinsics"
parents
72746c07
911bb980
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
156 additions
and
202 deletions
+156
-202
vpx_dsp/arm/idct16x16_1_add_neon.c
vpx_dsp/arm/idct16x16_1_add_neon.c
+56
-39
vpx_dsp/arm/idct32x32_1_add_neon.c
vpx_dsp/arm/idct32x32_1_add_neon.c
+32
-111
vpx_dsp/arm/idct4x4_1_add_neon.c
vpx_dsp/arm/idct4x4_1_add_neon.c
+19
-15
vpx_dsp/arm/idct8x8_1_add_neon.c
vpx_dsp/arm/idct8x8_1_add_neon.c
+43
-37
vpx_dsp/arm/idct_neon.h
vpx_dsp/arm/idct_neon.h
+6
-0
No files found.
vpx_dsp/arm/idct16x16_1_add_neon.c
View file @
90f889a5
...
...
@@ -11,49 +11,66 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void
vpx_idct16x16_1_add_neon
(
const
tran_low_t
*
input
,
uint8_t
*
dest
,
int
stride
)
{
uint8x8_t
d2u8
,
d3u8
,
d30u8
,
d31u8
;
uint64x1_t
d2u64
,
d3u64
,
d4u64
,
d5u64
;
uint16x8_t
q0u16
,
q9u16
,
q10u16
,
q11u16
,
q12u16
;
int16x8_t
q0s16
;
uint8_t
*
d1
,
*
d2
;
int16_t
i
,
j
,
a1
;
int16_t
out
=
dct_const_round_shift
(
input
[
0
]
*
cospi_16_64
);
out
=
dct_const_round_shift
(
out
*
cospi_16_64
);
a1
=
ROUND_POWER_OF_TWO
(
out
,
6
);
q0s16
=
vdupq_n_s16
(
a1
);
q0u16
=
vreinterpretq_u16_s16
(
q0s16
);
for
(
d1
=
d2
=
dest
,
i
=
0
;
i
<
4
;
i
++
)
{
for
(
j
=
0
;
j
<
2
;
j
++
)
{
d2u64
=
vld1_u64
((
const
uint64_t
*
)
d1
);
d3u64
=
vld1_u64
((
const
uint64_t
*
)(
d1
+
8
));
d1
+=
stride
;
d4u64
=
vld1_u64
((
const
uint64_t
*
)
d1
);
d5u64
=
vld1_u64
((
const
uint64_t
*
)(
d1
+
8
));
d1
+=
stride
;
// Add the (non-negative, pre-saturated) DC value in |res| to one 16-pixel
// row at *dest with unsigned saturation, then advance *dest by |stride|.
static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t row = vld1q_u8(*dest);
  const uint8x16_t sum = vqaddq_u8(row, res);
  vst1q_u8(*dest, sum);
  *dest += stride;
}
q9u16
=
vaddw_u8
(
q0u16
,
vreinterpret_u8_u64
(
d2u64
));
q10u16
=
vaddw_u8
(
q0u16
,
vreinterpret_u8_u64
(
d3u64
));
q11u16
=
vaddw_u8
(
q0u16
,
vreinterpret_u8_u64
(
d4u64
));
q12u16
=
vaddw_u8
(
q0u16
,
vreinterpret_u8_u64
(
d5u64
));
// Subtract the (pre-negated, pre-saturated) DC magnitude in |res| from one
// 16-pixel row at *dest with unsigned saturation, then advance *dest.
static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t row = vld1q_u8(*dest);
  const uint8x16_t diff = vqsubq_u8(row, res);
  vst1q_u8(*dest, diff);
  *dest += stride;
}
d2u8
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
q9u16
));
d3u8
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
q10u16
));
d30u8
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
q11u16
));
d31u8
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
q12u16
));
// DC-only 16x16 inverse transform: derive the single DC term from input[0]
// and add it to every pixel of the 16x16 destination block.
// NOTE(review): this span was diff residue interleaving pre- and post-commit
// code; reconstructed here as the post-commit function, with the 16
// duplicated kernel calls collapsed into loops (behavior unchanged).
void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int i;
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  // Scale the doubly-rotated DC term back to pixel range (>> 6 for 16x16).
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

  if (a1 >= 0) {
    const uint8x16_t dc = create_dcq(a1);
    for (i = 0; i < 16; i++) {
      idct16x16_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {
    const uint8x16_t dc = create_dcq(-a1);
    for (i = 0; i < 16; i++) {
      idct16x16_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}
vpx_dsp/arm/idct32x32_1_add_neon.c
View file @
90f889a5
...
...
@@ -10,127 +10,48 @@
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static
INLINE
void
LD_16x8
(
uint8_t
*
d
,
int
d_stride
,
uint8x16_t
*
q8u8
,
uint8x16_t
*
q9u8
,
uint8x16_t
*
q10u8
,
uint8x16_t
*
q11u8
,
uint8x16_t
*
q12u8
,
uint8x16_t
*
q13u8
,
uint8x16_t
*
q14u8
,
uint8x16_t
*
q15u8
)
{
*
q8u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q9u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q10u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q11u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q12u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q13u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q14u8
=
vld1q_u8
(
d
);
d
+=
d_stride
;
*
q15u8
=
vld1q_u8
(
d
);
// Add the pre-saturated DC value in |res| to one 32-pixel row at *dest
// (handled as two 16-byte halves) with unsigned saturation, then advance
// *dest by |stride|.
static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t left = vld1q_u8(*dest);
  const uint8x16_t right = vld1q_u8(*dest + 16);
  vst1q_u8(*dest, vqaddq_u8(left, res));
  vst1q_u8(*dest + 16, vqaddq_u8(right, res));
  *dest += stride;
}
static
INLINE
void
ADD_DIFF_16x8
(
uint8x16_t
qdiffu8
,
uint8x16_t
*
q8u8
,
uint8x16_t
*
q9u8
,
uint8x16_t
*
q10u8
,
uint8x16_t
*
q11u8
,
uint8x16_t
*
q12u8
,
uint8x16_t
*
q13u8
,
uint8x16_t
*
q14u8
,
uint8x16_t
*
q15u8
)
{
*
q8u8
=
vqaddq_u8
(
*
q8u8
,
qdiffu8
);
*
q9u8
=
vqaddq_u8
(
*
q9u8
,
qdiffu8
);
*
q10u8
=
vqaddq_u8
(
*
q10u8
,
qdiffu8
);
*
q11u8
=
vqaddq_u8
(
*
q11u8
,
qdiffu8
);
*
q12u8
=
vqaddq_u8
(
*
q12u8
,
qdiffu8
);
*
q13u8
=
vqaddq_u8
(
*
q13u8
,
qdiffu8
);
*
q14u8
=
vqaddq_u8
(
*
q14u8
,
qdiffu8
);
*
q15u8
=
vqaddq_u8
(
*
q15u8
,
qdiffu8
);
}
static
INLINE
void
SUB_DIFF_16x8
(
uint8x16_t
qdiffu8
,
uint8x16_t
*
q8u8
,
uint8x16_t
*
q9u8
,
uint8x16_t
*
q10u8
,
uint8x16_t
*
q11u8
,
uint8x16_t
*
q12u8
,
uint8x16_t
*
q13u8
,
uint8x16_t
*
q14u8
,
uint8x16_t
*
q15u8
)
{
*
q8u8
=
vqsubq_u8
(
*
q8u8
,
qdiffu8
);
*
q9u8
=
vqsubq_u8
(
*
q9u8
,
qdiffu8
);
*
q10u8
=
vqsubq_u8
(
*
q10u8
,
qdiffu8
);
*
q11u8
=
vqsubq_u8
(
*
q11u8
,
qdiffu8
);
*
q12u8
=
vqsubq_u8
(
*
q12u8
,
qdiffu8
);
*
q13u8
=
vqsubq_u8
(
*
q13u8
,
qdiffu8
);
*
q14u8
=
vqsubq_u8
(
*
q14u8
,
qdiffu8
);
*
q15u8
=
vqsubq_u8
(
*
q15u8
,
qdiffu8
);
}
static
INLINE
void
ST_16x8
(
uint8_t
*
d
,
int
d_stride
,
uint8x16_t
*
q8u8
,
uint8x16_t
*
q9u8
,
uint8x16_t
*
q10u8
,
uint8x16_t
*
q11u8
,
uint8x16_t
*
q12u8
,
uint8x16_t
*
q13u8
,
uint8x16_t
*
q14u8
,
uint8x16_t
*
q15u8
)
{
vst1q_u8
(
d
,
*
q8u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q9u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q10u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q11u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q12u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q13u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q14u8
);
d
+=
d_stride
;
vst1q_u8
(
d
,
*
q15u8
);
// Subtract the pre-negated, pre-saturated DC magnitude in |res| from one
// 32-pixel row at *dest (two 16-byte halves) with unsigned saturation,
// then advance *dest by |stride|.
static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
  const uint8x16_t left = vld1q_u8(*dest);
  const uint8x16_t right = vld1q_u8(*dest + 16);
  vst1q_u8(*dest, vqsubq_u8(left, res));
  vst1q_u8(*dest + 16, vqsubq_u8(right, res));
  *dest += stride;
}
// DC-only 32x32 inverse transform: derive the single DC term from input[0]
// and add it to every pixel of the 32x32 destination block.
// NOTE(review): this span was diff residue interleaving pre- and post-commit
// code; reconstructed here as the post-commit function.
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int i;
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  // Scale the doubly-rotated DC term back to pixel range (>> 6 for 32x32).
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

  if (a1 >= 0) {
    const uint8x16_t dc = create_dcq(a1);
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {
    const uint8x16_t dc = create_dcq(-a1);
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}
vpx_dsp/arm/idct4x4_1_add_neon.c
View file @
90f889a5
...
...
@@ -14,28 +14,32 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
// Add the widened DC value in |res| to two 4-pixel rows at *dest.
// |d| is caller-owned scratch that carries both rows as 32-bit lanes;
// advances *dest past both rows.
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
                                        const int16x8_t res,
                                        uint32x2_t *const d) {
  uint16x8_t sum;
  uint8x8_t out;

  // Load row 0 into lane 0 and row 1 into lane 1 of the scratch register.
  *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
  *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
  // Widen-add the DC term, then narrow back with unsigned saturation.
  sum = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
  out = vqmovun_s16(vreinterpretq_s16_u16(sum));
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(out), 0);
  *dest += stride;
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(out), 1);
  *dest += stride;
}
// DC-only 4x4 inverse transform: derive the single DC term from input[0]
// and add it to every pixel of the 4x4 destination block.
// NOTE(review): this span was diff residue interleaving pre- and post-commit
// code; reconstructed here as the post-commit function.
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  // Scale the doubly-rotated DC term back to pixel range (>> 4 for 4x4).
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint32x2_t d = vdup_n_u32(0);

  // The 32-bit lane loads/stores in the kernel require 4-byte alignment.
  assert(!((intptr_t)dest % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  // Each kernel call handles two rows.
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
}
vpx_dsp/arm/idct8x8_1_add_neon.c
View file @
90f889a5
...
...
@@ -12,47 +12,53 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void
vpx_idct8x8_1_add_neon
(
const
tran_low_t
*
input
,
uint8_t
*
dest
,
int
stride
)
{
int
i
;
const
int16_t
out0
=
dct_const_round_shift
(
input
[
0
]
*
cospi_16_64
);
const
int16_t
out1
=
dct_const_round_shift
(
out0
*
cospi_16_64
);
const
int16_t
out2
=
ROUND_POWER_OF_TWO
(
out1
,
5
);
const
int16x8_t
dc
=
vdupq_n_s16
(
out2
);
const
uint16x8_t
dc_u16
=
vreinterpretq_u16_s16
(
dc
);
const
uint8_t
*
dst
=
dest
;
uint8x8_t
d0
,
d1
,
d2
,
d3
;
uint16x8_t
d0_u16
,
d1_u16
,
d2_u16
,
d3_u16
;
// Broadcast |dc| into all 8 byte lanes, saturating it to [0, 255] on the
// way (vqmovun_s16 performs the unsigned-saturating narrow).
static INLINE uint8x8_t create_dcd(const int16_t dc) {
  const int16x8_t v = vdupq_n_s16(dc);
  return vqmovun_s16(v);
}
for
(
i
=
0
;
i
<
2
;
i
++
)
{
d0
=
vld1_u8
(
dst
);
dst
+=
stride
;
d1
=
vld1_u8
(
dst
);
dst
+=
stride
;
d2
=
vld1_u8
(
dst
);
dst
+=
stride
;
d3
=
vld1_u8
(
dst
);
dst
+=
stride
;
// Add the pre-saturated DC value in |res| to one 8-pixel row at *dest with
// unsigned saturation, then advance *dest by |stride|.
static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
                                            const uint8x8_t res) {
  const uint8x8_t row = vld1_u8(*dest);
  const uint8x8_t sum = vqadd_u8(row, res);
  vst1_u8(*dest, sum);
  *dest += stride;
}
d0_u16
=
vaddw_u8
(
dc_u16
,
d0
);
d1_u16
=
vaddw_u8
(
dc_u16
,
d1
);
d2_u16
=
vaddw_u8
(
dc_u16
,
d2
);
d3_u16
=
vaddw_u8
(
dc_u16
,
d3
);
// Subtract the pre-negated, pre-saturated DC magnitude in |res| from one
// 8-pixel row at *dest with unsigned saturation, then advance *dest.
static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
                                            const uint8x8_t res) {
  const uint8x8_t row = vld1_u8(*dest);
  const uint8x8_t diff = vqsub_u8(row, res);
  vst1_u8(*dest, diff);
  *dest += stride;
}
d0
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
d0_u16
));
d1
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
d1_u16
));
d2
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
d2_u16
));
d3
=
vqmovun_s16
(
vreinterpretq_s16_u16
(
d3_u16
));
// DC-only 8x8 inverse transform: derive the single DC term from input[0]
// and add it to every pixel of the 8x8 destination block.
// NOTE(review): this span was diff residue interleaving pre- and post-commit
// code; reconstructed here as the post-commit function, with the 8
// duplicated kernel calls collapsed into loops (behavior unchanged).
void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int i;
  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  // Scale the doubly-rotated DC term back to pixel range (>> 5 for 8x8).
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);

  if (a1 >= 0) {
    const uint8x8_t dc = create_dcd(a1);
    for (i = 0; i < 8; i++) {
      idct8x8_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {
    const uint8x8_t dc = create_dcd(-a1);
    for (i = 0; i < 8; i++) {
      idct8x8_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}
vpx_dsp/arm/idct_neon.h
View file @
90f889a5
...
...
@@ -181,6 +181,12 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
vst1_u8
(
b
,
b7
);
}
// Broadcast |dc| into all 16 byte lanes, clamping it to [0, 255] first.
static INLINE uint8x16_t create_dcq(const int16_t dc) {
  // Clip both sides and gcc may compile to assembly 'usat'.
  int16_t t = dc;
  if (t < 0) t = 0;
  if (t > 255) t = 255;
  return vdupq_n_u8((uint8_t)t);
}
static
INLINE
void
idct4x4_16_kernel_bd8
(
const
int16x4_t
cospis
,
int16x8_t
*
const
a0
,
int16x8_t
*
const
a1
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment