Commit 6e44bf20 authored by James Zern

vp9_reconintra_neon: add DC 4x4 predictors

~85-89% faster over 20M pixels

Change-Id: I3812e8adfffe5255034da88dfe6546e12f4d10ee
parent 79fb3a01
......@@ -208,9 +208,11 @@ INTRA_PRED_TEST(DSPR2, TestIntraPred4, vp9_dc_predictor_4x4_dspr2, NULL, NULL,
#endif // HAVE_DSPR2
#if HAVE_NEON
INTRA_PRED_TEST(NEON, TestIntraPred4, NULL, NULL, NULL, NULL,
vp9_v_predictor_4x4_neon, vp9_h_predictor_4x4_neon, NULL, NULL,
NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
vp9_h_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL, NULL,
vp9_tm_predictor_4x4_neon)
#endif // HAVE_NEON
#if HAVE_MSA
......
......@@ -14,6 +14,75 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
//------------------------------------------------------------------------------
// DC 4x4
// 'do_above' and 'do_left' facilitate branch removal when inlined.
// Writes a single DC value to every pixel of the 4x4 block at 'dst', whose
// rows are 'stride' bytes apart. The value is the rounded average of the
// enabled border(s), or 128 when neither 'do_above' nor 'do_left' is set.
// A border pointer is dereferenced only when its flag is nonzero.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t above_total;  // assigned only when do_above is set
  uint16x8_t left_total;   // assigned only when do_left is set
  uint8x8_t dc_lanes;
  uint8x8_t fill;
  int row;

  // vld1_u8() fetches 8 bytes per border. After the two pairwise additions
  // lane 0 holds the sum of the first four bytes, which is the only lane
  // consumed further down.
  // NOTE(review): this means 8 bytes must be readable at each enabled border
  // pointer -- confirm that callers guarantee it.
  if (do_above) {
    const uint16x4_t pair_sums = vpaddl_u8(vld1_u8(above));
    const uint16x4_t quad_sums = vpadd_u16(pair_sums, pair_sums);
    above_total = vcombine_u16(quad_sums, quad_sums);
  }
  if (do_left) {
    const uint16x4_t pair_sums = vpaddl_u8(vld1_u8(left));
    const uint16x4_t quad_sums = vpadd_u16(pair_sums, pair_sums);
    left_total = vcombine_u16(quad_sums, quad_sums);
  }

  if (!do_above && !do_left) {
    // No border available: use the constant 128.
    dc_lanes = vdup_n_u8(0x80);
  } else if (!do_left) {
    // Top border only: rounded sum / 4.
    dc_lanes = vrshrn_n_u16(above_total, 2);
  } else if (!do_above) {
    // Left border only: rounded sum / 4.
    dc_lanes = vrshrn_n_u16(left_total, 2);
  } else {
    // Both borders: rounded (left + top) / 8.
    dc_lanes = vrshrn_n_u16(vaddq_u16(left_total, above_total), 3);
  }

  // Replicate lane 0 across the vector, then emit its low 32 bits (four
  // pixels) once per row.
  fill = vdup_lane_u8(dc_lanes, 0);
  for (row = 0; row < 4; ++row) {
    vst1_lane_u32((uint32_t *)(dst + row * stride),
                  vreinterpret_u32_u8(fill), 0);
  }
}
// 4x4 DC predictor that averages both the 'above' and 'left' borders.
// Forwards to dc_4x4() with both border flags enabled.
void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 1;
  dc_4x4(dst, stride, above, left, use_above, use_left);
}
// 4x4 DC predictor that averages the 'left' border only.
// 'above' is ignored; dc_4x4() receives NULL for it with its flag cleared.
void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  const int use_above = 0;
  const int use_left = 1;
  (void)above;
  dc_4x4(dst, stride, NULL, left, use_above, use_left);
}
// 4x4 DC predictor that averages the 'above' border only.
// 'left' is ignored; dc_4x4() receives NULL for it with its flag cleared.
void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above,
                                   const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 0;
  (void)left;
  dc_4x4(dst, stride, above, NULL, use_above, use_left);
}
// 4x4 DC predictor that uses neither border: with both flags cleared,
// dc_4x4() fills the block with the constant 128.
void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above,
                                   const uint8_t *left) {
  const int use_above = 0;
  const int use_left = 0;
  (void)left;
  (void)above;
  dc_4x4(dst, stride, NULL, NULL, use_above, use_left);
}
//------------------------------------------------------------------------------
// DC 8x8
......
......@@ -84,16 +84,16 @@ add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons
specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment