Commit eb1d0f8d authored by James Zern's avatar James Zern

add vp9_satd_neon

~60-65% faster at the function level across block sizes

Change-Id: Iaf8cbe95731c43fdcbf68256e44284ba51a93893
parent 60760f71
......@@ -385,6 +385,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
INSTANTIATE_TEST_CASE_P(
NEON, SatdTest,
::testing::Values(
make_tuple(16, &vp9_satd_neon),
make_tuple(64, &vp9_satd_neon),
make_tuple(256, &vp9_satd_neon),
make_tuple(1024, &vp9_satd_neon)));
#endif
#if HAVE_MSA
......
......@@ -210,7 +210,7 @@ add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride,
specialize qw/vp9_hadamard_16x16 sse2/;
add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
specialize qw/vp9_satd sse2/;
specialize qw/vp9_satd sse2 neon/;
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2 neon/;
......
......@@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
return (horizontal_add_u16x8(v_sum) + 32) >> 6;
}
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int vp9_satd_neon(const int16_t *coeff, int length) {
const int16x4_t zero = vdup_n_s16(0);
int32x4_t accum = vdupq_n_s32(0);
do {
const int16x8_t src0 = vld1q_s16(coeff);
const int16x8_t src8 = vld1q_s16(coeff + 8);
accum = vabal_s16(accum, vget_low_s16(src0), zero);
accum = vabal_s16(accum, vget_high_s16(src0), zero);
accum = vabal_s16(accum, vget_low_s16(src8), zero);
accum = vabal_s16(accum, vget_high_s16(src8), zero);
length -= 16;
coeff += 16;
} while (length != 0);
{
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'.
const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
vreinterpret_s32_s64(vget_high_s64(s0)));
const int satd = vget_lane_s32(s1, 0);
return satd;
}
}
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
const int ref_stride, const int height) {
int i;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment