Commit d331e7a1 authored by Linfeng Zhang

Remove get_filter_base() and get_filter_offset() in convolve

so that the convolve functions are independent of table alignment.

Change-Id: Ieab132a30d72c6e75bbe9473544fbe2cf51541ee
parent d49a1a53
......@@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64;
typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h);
typedef void (*WrapperFilterBlock2d8Func)(
const uint8_t *src_ptr, const unsigned int src_stride,
......@@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
width, height);
}
vpx_usec_timer_mark(&timer);
......@@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
width, height);
}
vpx_usec_timer_mark(&timer);
......@@ -585,7 +585,7 @@ TEST_P(ConvolveTest, Copy) {
uint8_t *const out = output();
ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
NULL, 0, NULL, 0, Width(), Height()));
NULL, 0, 0, 0, 0, Width(), Height()));
CheckGuardBlocks();
......@@ -604,7 +604,7 @@ TEST_P(ConvolveTest, Avg) {
CopyOutputToRef();
ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
NULL, 0, NULL, 0, Width(), Height()));
NULL, 0, 0, 0, 0, Width(), Height()));
CheckGuardBlocks();
......@@ -621,12 +621,10 @@ TEST_P(ConvolveTest, Avg) {
TEST_P(ConvolveTest, CopyHoriz) {
uint8_t *const in = input();
uint8_t *const out = output();
DECLARE_ALIGNED(256, const int16_t,
filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,
filter8, 16, filter8, 16, Width(),
Height()));
vp9_filter_kernels[0], 0, 16, 0, 16,
Width(), Height()));
CheckGuardBlocks();
......@@ -641,12 +639,10 @@ TEST_P(ConvolveTest, CopyHoriz) {
TEST_P(ConvolveTest, CopyVert) {
uint8_t *const in = input();
uint8_t *const out = output();
DECLARE_ALIGNED(256, const int16_t,
filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,
filter8, 16, filter8, 16, Width(),
Height()));
vp9_filter_kernels[0], 0, 16, 0, 16,
Width(), Height()));
CheckGuardBlocks();
......@@ -661,12 +657,10 @@ TEST_P(ConvolveTest, CopyVert) {
TEST_P(ConvolveTest, Copy2D) {
uint8_t *const in = input();
uint8_t *const out = output();
DECLARE_ALIGNED(256, const int16_t,
filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,
filter8, 16, filter8, 16, Width(),
Height()));
vp9_filter_kernels[0], 0, 16, 0, 16,
Width(), Height()));
CheckGuardBlocks();
......@@ -702,7 +696,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
}
}
const int16_t kInvalidFilter[8] = { 0 };
const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {
wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c
};
......@@ -755,21 +748,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
Width(), Height(), UUT_->use_highbd_);
if (filter_x && filter_y)
ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](
in, kInputStride, out, kOutputStride, filters[filter_x], 16,
filters[filter_y], 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,
filter_x, 16, filter_y, 16, Width(), Height()));
else if (filter_y)
ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](
in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
filters[filter_y], 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,
16, filter_y, 16, Width(), Height()));
else if (filter_x)
ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](
in, kInputStride, out, kOutputStride, filters[filter_x], 16,
kInvalidFilter, 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
filter_x, 16, 0, 16, Width(), Height()));
else
ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](
in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
kInvalidFilter, 0, Width(), Height()));
ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,
kOutputStride, NULL, 0, 0,
0, 0, Width(), Height()));
CheckGuardBlocks();
......@@ -853,21 +846,21 @@ TEST_P(ConvolveTest, FilterExtremes) {
filters[filter_y], ref, kOutputStride,
Width(), Height(), UUT_->use_highbd_);
if (filter_x && filter_y)
ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](
in, kInputStride, out, kOutputStride, filters[filter_x], 16,
filters[filter_y], 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,
filter_x, 16, filter_y, 16, Width(), Height()));
else if (filter_y)
ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](
in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
filters[filter_y], 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,
16, filter_y, 16, Width(), Height()));
else if (filter_x)
ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](
in, kInputStride, out, kOutputStride, filters[filter_x], 16,
kInvalidFilter, 16, Width(), Height()));
ASM_REGISTER_STATE_CHECK(
UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
filter_x, 16, 0, 16, Width(), Height()));
else
ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](
in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
kInvalidFilter, 0, Width(), Height()));
ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,
kOutputStride, NULL, 0, 0,
0, 0, Width(), Height()));
for (int y = 0; y < Height(); ++y) {
for (int x = 0; x < Width(); ++x)
......@@ -897,8 +890,8 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
for (int step = 1; step <= 32; ++step) {
/* Test the horizontal and vertical filters in combination. */
ASM_REGISTER_STATE_CHECK(
UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],
step, eighttap[frac], step, Width(), Height()));
UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac,
step, frac, step, Width(), Height()));
CheckGuardBlocks();
......@@ -917,14 +910,14 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
#define WRAP(func, bd) \
void wrap_##func##_##bd( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \
const int16_t *filter_y, int filter_y_stride, int w, int h) { \
vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \
reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \
filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
#define WRAP(func, bd) \
void wrap_##func##_##bd( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \
reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
}
#if HAVE_SSE2 && ARCH_X86_64
......
......@@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
const struct scale_factors *sf, int w, int h,
int ref, const InterpKernel *kernel, int xs,
int ys) {
sf->predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
ys, w, h);
sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,
dst_stride, kernel, subpel_x,
xs, subpel_y, ys, w, h);
}
#if CONFIG_VP9_HIGHBITDEPTH
......@@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor(
const int subpel_x, const int subpel_y, const struct scale_factors *sf,
int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
ys, w, h, bd);
src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,
h, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
......
......@@ -390,12 +390,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
}
if (decision == FILTER_BLOCK) {
vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0,
NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,
0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0,
NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,
0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
}
*denoiser_decision = decision;
......
......@@ -2645,15 +2645,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor, bd);
CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,
x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
16 * src_h / dst_h, 16 / factor, 16 / factor,
bd);
} else {
vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
16 / factor);
vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
16 * src_h / dst_h, 16 / factor, 16 / factor);
}
}
}
......
......@@ -43,10 +43,9 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
(x / factor) * src_w / dst_w;
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
16 / factor);
vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
16 * src_h / dst_h, 16 / factor, 16 / factor);
}
}
}
......
......@@ -2162,15 +2162,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
vpx_highbd_convolve_copy(
CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh, xd->bd);
NULL, 0, 0, 0, 0, bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL,
0, NULL, 0, bw, bh);
0, 0, 0, 0, bw, bh);
#else
vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride, NULL, 0,
NULL, 0, bw, bh);
0, 0, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
best_pred = this_mode_pred;
}
......@@ -2264,14 +2264,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cm->use_highbitdepth)
vpx_highbd_convolve_copy(
CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0,
CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,
bw, bh, xd->bd);
else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
#else
vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
......
......@@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
32, NULL, 0, NULL, 0, bs, bs, xd->bd);
32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
} else {
......@@ -623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
recon = CONVERT_TO_BYTEPTR(recon16);
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
switch (tx_size) {
case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
......
......@@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (x_step_q4 != 16) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
} else {
const int16x8_t filters = vld1q_s16(filter_x);
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
......@@ -337,15 +336,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
int w, int h, int bd) {
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h, int bd) {
if (x_step_q4 != 16) {
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
} else {
const int16x8_t filters = vld1q_s16(filter_x);
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
uint16x8_t t0, t1, t2, t3;
......@@ -566,15 +565,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, // unused
int x_step_q4, // unused
const int16_t *filter_y, int y_step_q4,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h, bd);
} else {
const int16x8_t filters = vld1q_s16(filter_y);
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
assert(!((intptr_t)dst & 3));
......@@ -732,15 +730,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, // unused
int x_step_q4, // unused
const int16_t *filter_y, int y_step_q4,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
if (y_step_q4 != 16) {
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
} else {
const int16x8_t filters = vld1q_s16(filter_y);
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
assert(!((intptr_t)dst & 3));
......
......@@ -15,13 +15,14 @@
void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
(void)filter_x;
(void)filter_x_stride;
(void)filter_y;
(void)filter_y_stride;
(void)filter;
(void)x0_q4;
(void)x_step_q4;
(void)y0_q4;
(void)y_step_q4;
(void)bd;
if (w < 8) { // avg4
......
......@@ -15,13 +15,14 @@
void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
(void)filter_x;
(void)filter_x_stride;
(void)filter_y;
(void)filter_y_stride;
(void)filter;
(void)x0_q4;
(void)x_step_q4;
(void)y0_q4;
(void)y_step_q4;
(void)bd;
if (w < 8) { // copy4
......
......@@ -15,10 +15,9 @@
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h, int bd) {
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
uint16_t temp[64 * 136];
const int intermediate_height =
......@@ -29,20 +28,19 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
* buffer which has lots of extra room and is subsequently discarded this is
* safe if somewhat less than ideal. */
vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
filter_x, x_step_q4, filter_y, y_step_q4, w,
filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height, bd);
/* Step into the temp buffer 3 lines to get the actual frame data */
vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
}
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
uint16_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h, int bd) {
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
uint16_t temp[64 * 136];
const int intermediate_height =
......@@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
* to average the values after both passes.
*/
vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
filter_x, x_step_q4, filter_y, y_step_q4, w,
filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
intermediate_height, bd);
vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
}
......@@ -42,10 +42,11 @@
; r1 int src_stride
; r2 uint8_t *dst
; r3 int dst_stride
; sp[]const int16_t *filter_x
; sp[]int x_step_q4
; sp[]const int16_t *filter_y ; unused
; sp[]int y_step_q4 ; unused
; sp[]const int16_t *filter
; sp[]int x0_q4
; sp[]int x_step_q4 ; unused
; sp[]int y0_q4
; sp[]int y_step_q4 ; unused
; sp[]int w
; sp[]int h
......@@ -54,11 +55,11 @@
sub r0, r0, #3 ; adjust for taps
ldr r5, [sp, #32] ; filter_x
ldr r6, [sp, #48] ; w
ldr r7, [sp, #52] ; h
ldrd r4, r5, [sp, #32] ; filter, x0_q4
add r4, r5, lsl #4
ldrd r6, r7, [sp, #52] ; w, h
vld1.s16 {q0}, [r5] ; filter_x
vld1.s16 {q0}, [r4] ; filter
sub r8, r1, r1, lsl #2 ; -src_stride * 3
add r8, r8, #4 ; -src_stride * 3 + 4
......@@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz
sub r2, r2, r3, lsl #2 ; reset for store
; src[] * filter_x
; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
......@@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz
sub r0, r0, r1
sub r0, r0, r1, lsl #1
ldr r4, [sp, #32] ; filter_y
ldr r6, [sp, #40] ; w
ldr lr, [sp, #44] ; h
ldr r4, [sp, #24] ; filter
ldr r5, [sp, #36] ; y0_q4
add r4, r5, lsl #4
ldr r6, [sp, #44] ; w
ldr lr, [sp, #48] ; h
vld1.s16 {q0}, [r4] ; filter_y
vld1.s16 {q0}, [r4] ; filter
lsl r1, r1, #1
lsl r3, r3, #1
......@@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert
pld [r7]
pld [r4]
; src[] * filter_y
; src[] * filter
MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
pld [r7, r1]
......
......@@ -125,11 +125,10 @@ static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
int w, int h) {
const int16x8_t filters = vld1q_s16(filter_x);
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
assert(!((intptr_t)dst & 3));
......@@ -137,8 +136,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
(void)x_step_q4;
(void)y0_q4;
(void)y_step_q4;
(void)filter_y;
src -= 3;
......@@ -390,11 +389,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int16x8_t filters = vld1q_s16(filter_x);
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
uint8x8_t t0, t1, t2, t3;
assert(!((intptr_t)dst & 3));
......@@ -402,8 +400,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
assert(x_step_q4 == 16);
(void)x_step_q4;
(void)y0_q4;
(void)y_step_q4;
(void)filter_y;
src -= 3;
......@@ -692,19 +690,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, // unused
int x_step_q4, // unused
const int16_t *filter_y, int y_step_q4, int w,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
const int16x8_t filters = vld1q_s16(filter_y);
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
assert(!((intptr_t)dst & 3));
assert(!(dst_stride & 3));
assert(y_step_q4 == 16);
(void)x0_q4;
(void)x_step_q4;
(void)y_step_q4;
(void)filter_x;
src -= 3 * src_stride;
......@@ -864,19 +861,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,