Commit 099bd7f0 authored by clang-format, committed by James Zern

vpx_dsp: apply clang-format

Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
parent 82070ae9
@@ -48,7 +48,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
// set up a 256 entry lookup that matches gaussian distribution
for (i = -32; i < 32; ++i) {
const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
if (a_i) {
for (j = 0; j < a_i; ++j) {
char_dist[next + j] = (char)i;
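For context: the loop above fills a 256-entry char_dist table in which each value i in [-32, 32) appears roughly 256 * gaussian(sigma, 0, i) times, so uniform sampling of the table approximates zero-mean Gaussian noise. A minimal sketch of the normal-density helper the loop relies on (the real gaussian() lives in add_noise.c and its constants may differ slightly):

#include <math.h>

/* Normal probability density with mean mu and standard deviation sigma. */
static double gaussian(double sigma, double mu, double x) {
  return 1.0 / (sigma * sqrt(2.0 * 3.14159265)) *
         exp(-(x - mu) * (x - mu) / (2.0 * sigma * sigma));
}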
@@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
}
}
void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int *min, int *max) {
void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int *min, int *max) {
// Load and concatenate.
const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
vld1_u8(a + a_stride));
const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
vld1_u8(a + 3 * a_stride));
const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
vld1_u8(a + 5 * a_stride));
const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
vld1_u8(a + 7 * a_stride));
const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
vld1_u8(b + b_stride));
const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
vld1_u8(b + 3 * b_stride));
const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
vld1_u8(b + 5 * b_stride));
const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
vld1_u8(b + 7 * b_stride));
const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
const uint8x16_t a23 =
vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
const uint8x16_t a45 =
vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
const uint8x16_t a67 =
vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
const uint8x16_t b23 =
vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
const uint8x16_t b45 =
vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
const uint8x16_t b67 =
vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
// Absolute difference.
const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
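For reference, the scalar operation this NEON routine accelerates is the minimum and maximum absolute difference between two 8x8 blocks. A sketch in the spirit of the C reference (vpx_minmax_8x8_c; written from memory, details may differ):

#include <stdint.h>
#include <stdlib.h>

static void minmax_8x8_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, a += a_stride, b += b_stride) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(a[j] - b[j]); /* absolute pixel difference */
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}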
@@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
// 14 15 16 17 54 55 56 57
// 24 25 26 27 64 65 66 67
// 34 35 36 37 74 75 76 77
const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
vreinterpretq_s32_s16(out_2));
const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
vreinterpretq_s32_s16(out_3));
const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
vreinterpretq_s32_s16(out_6));
const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
vreinterpretq_s32_s16(out_7));
const int32x4x2_t r02_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
const int32x4x2_t r13_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
const int32x4x2_t r46_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
const int32x4x2_t r57_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
const int16x8x2_t r01_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
vreinterpretq_s16_s32(r13_s32.val[0]));
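The vtrnq_s32 pairs above, together with the vtrnq_s16 stage that follows, transpose the eight out_* rows entirely in registers. A plain-C reference for what those interleaves accomplish (illustration only, not part of the patch):

#include <stdint.h>

/* Reference 8x8 transpose: element (r, c) swaps with element (c, r). */
static void transpose_8x8_ref(int16_t m[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = r + 1; c < 8; ++c) {
      const int16_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}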
@@ -12,9 +12,8 @@
#include "./vpx_dsp_rtcd.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
const int16x8_t b0 = vaddq_s16(*a0, *a1);
const int16x8_t b1 = vsubq_s16(*a0, *a1);
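hadamard8x8_one_pass is built from pairwise sum/difference ("butterfly") steps like the vaddq_s16/vsubq_s16 pair above, each applied across eight lanes at once. A scalar sketch of one such step (hypothetical helper, for illustration only):

#include <stdint.h>

/* One butterfly: emit the sum and difference of an input pair. */
static void butterfly_pair(int16_t a, int16_t b, int16_t *sum, int16_t *diff) {
  *sum = (int16_t)(a + b);
  *diff = (int16_t)(a - b);
}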
@@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves.
static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
// Swap 64 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
@@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
// a1657_hi:
// 12 13 28 29 44 45 60 61
// 14 15 30 31 46 47 62 63
const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
vreinterpretq_s32_s16(a26_lo));
const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
vreinterpretq_s32_s16(a37_lo));
const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
vreinterpretq_s32_s16(a26_hi));
const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
vreinterpretq_s32_s16(a37_hi));
const int32x4x2_t a0246_lo =
vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
const int32x4x2_t a1357_lo =
vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
const int32x4x2_t a0246_hi =
vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
const int32x4x2_t a1357_hi =
vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
// Swap 16 bit elements resulting in:
// b0:
@@ -13,49 +13,46 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
void vpx_idct16x16_1_add_neon(
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
int16_t i, j, a1, cospi_16_64 = 11585;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
q0s16 = vdupq_n_s16(a1);
q0u16 = vreinterpretq_u16_s16(q0s16);
for (d1 = d2 = dest, i = 0; i < 4; i++) {
for (j = 0; j < 2; j++) {
d2u64 = vld1_u64((const uint64_t *)d1);
d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += dest_stride;
d4u64 = vld1_u64((const uint64_t *)d1);
d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += dest_stride;
q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
d2 += dest_stride;
}
void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
uint8x8_t d2u8, d3u8, d30u8, d31u8;
uint64x1_t d2u64, d3u64, d4u64, d5u64;
uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
int16x8_t q0s16;
uint8_t *d1, *d2;
int16_t i, j, a1, cospi_16_64 = 11585;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
q0s16 = vdupq_n_s16(a1);
q0u16 = vreinterpretq_u16_s16(q0s16);
for (d1 = d2 = dest, i = 0; i < 4; i++) {
for (j = 0; j < 2; j++) {
d2u64 = vld1_u64((const uint64_t *)d1);
d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += dest_stride;
d4u64 = vld1_u64((const uint64_t *)d1);
d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
d1 += dest_stride;
q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
d2 += dest_stride;
}
return;
}
return;
}
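The a1 computed above is the DC-only result of the inverse transform: input[0] is scaled twice by cospi_16_64 (11585, roughly 16384 * cos(pi/4)) with 14-bit rounding shifts, rounded down to pixel scale, and that single constant is then added to every pixel of the 16x16 block. A sketch of the rounding helpers involved, assumed to mirror vpx_dsp's inv_txfm.h (check the header for the authoritative definitions):

/* Assumed to match the library's round-to-nearest right shift by n bits. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define DCT_CONST_BITS 14 /* fixed-point precision of the cospi constants */

static int dct_const_round_shift_ref(int x) { /* int used for brevity */
  return ROUND_POWER_OF_TWO(x, DCT_CONST_BITS);
}
/* Example: input[0] == 64 gives dct_const_round_shift_ref(64 * 11585) == 45,
 * dct_const_round_shift_ref(45 * 11585) == 32, and
 * ROUND_POWER_OF_TWO(32, 6) == 1, so a1 == 1. */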
@@ -10,24 +10,16 @@
#include "vpx_dsp/vpx_dsp_common.h"
void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
int16_t *output,
void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
int16_t *output,
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *pass1Output, int16_t skip_adding,
uint8_t *dest, int dest_stride);
void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
int16_t *output,
int16_t *pass1Output,
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *pass1Output, int16_t skip_adding,
uint8_t *dest, int dest_stride);
#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
@@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store);
extern void vpx_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM
void vpx_idct16x16_256_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
int16_t pass1_output[16 * 16] = { 0 };
int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM
// save d8-d15 register values.
@@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
dest, dest_stride);
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
dest,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
pass1_output, 0, dest, dest_stride);
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
pass1_output, 1, dest, dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
dest + 8, dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
@@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input,
return;
}
void vpx_idct16x16_10_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
int dest_stride) {
#if HAVE_NEON_ASM
int64_t store_reg[8];
#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
int16_t pass1_output[16 * 16] = { 0 };
int16_t row_idct_output[16 * 16] = { 0 };
#if HAVE_NEON_ASM
// save d8-d15 register values.
@@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
vpx_idct16x16_10_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
dest,
dest_stride);
vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
dest, dest_stride);
/* Skip Parallel idct on the lower 8 rows as they are all 0s */
@@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input,
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
dest,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
pass1_output, 1, dest, dest_stride);
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
dest+8,
dest_stride);
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
row_idct_output + 8, pass1_output, 1,
dest + 8, dest_stride);
#if HAVE_NEON_ASM
// restore d8-d15 register values.
@@ -15,151 +15,126 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
static INLINE void LD_16x8(
uint8_t *d,
int d_stride,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vld1q_u8(d);
d += d_stride;
*q9u8 = vld1q_u8(d);
d += d_stride;
*q10u8 = vld1q_u8(d);
d += d_stride;
*q11u8 = vld1q_u8(d);
d += d_stride;
*q12u8 = vld1q_u8(d);
d += d_stride;
*q13u8 = vld1q_u8(d);
d += d_stride;
*q14u8 = vld1q_u8(d);
d += d_stride;
*q15u8 = vld1q_u8(d);
return;
static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vld1q_u8(d);
d += d_stride;
*q9u8 = vld1q_u8(d);
d += d_stride;
*q10u8 = vld1q_u8(d);
d += d_stride;
*q11u8 = vld1q_u8(d);
d += d_stride;
*q12u8 = vld1q_u8(d);
d += d_stride;
*q13u8 = vld1q_u8(d);
d += d_stride;
*q14u8 = vld1q_u8(d);
d += d_stride;
*q15u8 = vld1q_u8(d);
return;
}
static INLINE void ADD_DIFF_16x8(
uint8x16_t qdiffu8,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqaddq_u8(*q8u8, qdiffu8);
*q9u8 = vqaddq_u8(*q9u8, qdiffu8);
*q10u8 = vqaddq_u8(*q10u8, qdiffu8);
*q11u8 = vqaddq_u8(*q11u8, qdiffu8);
*q12u8 = vqaddq_u8(*q12u8, qdiffu8);
*q13u8 = vqaddq_u8(*q13u8, qdiffu8);
*q14u8 = vqaddq_u8(*q14u8, qdiffu8);
*q15u8 = vqaddq_u8(*q15u8, qdiffu8);
return;
static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqaddq_u8(*q8u8, qdiffu8);
*q9u8 = vqaddq_u8(*q9u8, qdiffu8);
*q10u8 = vqaddq_u8(*q10u8, qdiffu8);
*q11u8 = vqaddq_u8(*q11u8, qdiffu8);
*q12u8 = vqaddq_u8(*q12u8, qdiffu8);
*q13u8 = vqaddq_u8(*q13u8, qdiffu8);
*q14u8 = vqaddq_u8(*q14u8, qdiffu8);
*q15u8 = vqaddq_u8(*q15u8, qdiffu8);
return;
}
static INLINE void SUB_DIFF_16x8(
uint8x16_t qdiffu8,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqsubq_u8(*q8u8, qdiffu8);
*q9u8 = vqsubq_u8(*q9u8, qdiffu8);
*q10u8 = vqsubq_u8(*q10u8, qdiffu8);
*q11u8 = vqsubq_u8(*q11u8, qdiffu8);
*q12u8 = vqsubq_u8(*q12u8, qdiffu8);
*q13u8 = vqsubq_u8(*q13u8, qdiffu8);
*q14u8 = vqsubq_u8(*q14u8, qdiffu8);
*q15u8 = vqsubq_u8(*q15u8, qdiffu8);
return;
static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
*q8u8 = vqsubq_u8(*q8u8, qdiffu8);
*q9u8 = vqsubq_u8(*q9u8, qdiffu8);
*q10u8 = vqsubq_u8(*q10u8, qdiffu8);
*q11u8 = vqsubq_u8(*q11u8, qdiffu8);
*q12u8 = vqsubq_u8(*q12u8, qdiffu8);
*q13u8 = vqsubq_u8(*q13u8, qdiffu8);
*q14u8 = vqsubq_u8(*q14u8, qdiffu8);
*q15u8 = vqsubq_u8(*q15u8, qdiffu8);
return;
}
static INLINE void ST_16x8(
uint8_t *d,
int d_stride,
uint8x16_t *q8u8,
uint8x16_t *q9u8,
uint8x16_t *q10u8,
uint8x16_t *q11u8,
uint8x16_t *q12u8,
uint8x16_t *q13u8,
uint8x16_t *q14u8,
uint8x16_t *q15u8) {
vst1q_u8(d, *q8u8);
d += d_stride;
vst1q_u8(d, *q9u8);
d += d_stride;
vst1q_u8(d, *q10u8);
d += d_stride;
vst1q_u8(d, *q11u8);
d += d_stride;
vst1q_u8(d, *q12u8);
d += d_stride;
vst1q_u8(d, *q13u8);
d += d_stride;
vst1q_u8(d, *q14u8);
d += d_stride;
vst1q_u8(d, *q15u8);
return;
static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
uint8x16_t *q9u8, uint8x16_t *q10u8,
uint8x16_t *q11u8, uint8x16_t *q12u8,
uint8x16_t *q13u8, uint8x16_t *q14u8,
uint8x16_t *q15u8) {
vst1q_u8(d, *q8u8);
d += d_stride;
vst1q_u8(d, *q9u8);
d += d_stride;
vst1q_u8(d, *q10u8);
d += d_stride;
vst1q_u8(d, *q11u8);
d += d_stride;
vst1q_u8(d, *q12u8);
d += d_stride;
vst1q_u8(d, *q13u8);
d += d_stride;
vst1q_u8(d, *q14u8);
d += d_stride;
vst1q_u8(d, *q15u8);
return;
}
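The function below handles the DC-only 32x32 case: a single constant a1 is derived from input[0] and applied to the whole 32x32 destination block, with saturating adds when a1 is non-negative and saturating subtracts of |a1| when it is negative. A scalar sketch of the equivalent per-pixel operation (reference only; the name is hypothetical):

#include <stdint.h>

static void idct32x32_1_add_ref(int a1, uint8_t *dest, int stride) {
  int r, c;
  for (r = 0; r < 32; ++r, dest += stride) {
    for (c = 0; c < 32; ++c) {
      const int v = dest[c] + a1;                          /* may go out of range */
      dest[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clamp to [0, 255] */
    }
  }
}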
void vpx_idct32x32_1_add_neon(
int16_t *input,
uint8_t *dest,
int dest_stride) {
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8;
uint8_t *d;
int16_t a1, cospi_16_64 = 11585;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int i, j, dest_stride8;
uint8_t *d;
int16_t a1, cospi_16_64 = 11585;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
dest_stride8 = dest_stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8(a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
d += dest_stride8;
}
}
} else { // diff_negative_32_32
a1 = -a1;
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8(a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
&q12u8, &q13u8, &q14u8, &q15u8);
d += dest_stride8;
}
}
dest_stride8 = dest_stride * 8;
if (a1 >= 0) { // diff_positive_32_32
a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
q0u8 = vdupq_n_u8(a1);
for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
d = dest;
for (j = 0; j < 4; j++) {
LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,