Commit 691585f6, authored by James Zern, committed by Gerrit Code Review
Browse files

Merge changes If59743aa,Ib046fe28,Ia2345752

* changes:
  Remove the unnecessary cast of (int16_t)cospi_{1...31}_64
  Remove the unnecessary upcasts of (int)cospi_{1...31}_64
  Change cospi_{1...31}_64 from tran_high_t to tran_coef_t
parents 10bab1ec d586cdb4
......@@ -72,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) {
}
static void fdct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
......@@ -194,7 +194,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
......@@ -709,7 +709,7 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
static void fdct8_sse2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
......@@ -861,7 +861,7 @@ static void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
......@@ -1142,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
static void fdct16_8col(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
......@@ -1489,8 +1489,8 @@ static void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
......
......@@ -31,7 +31,7 @@ void vp9_fdct8x8_quant_ssse3(
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
......
......@@ -169,8 +169,8 @@ static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
// fdct_round_shift(a * c0 +/- b * c1)
static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
const tran_high_t c0,
const tran_high_t c1, int16x8_t *add,
const tran_coef_t c0,
const tran_coef_t c1, int16x8_t *add,
int16x8_t *sub) {
const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
......
......@@ -214,8 +214,8 @@ static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
// fdct_round_shift(a * c0 +/- b * c1)
static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
const tran_high_t constant0,
const tran_high_t constant1,
const tran_coef_t constant0,
const tran_coef_t constant1,
int16x8_t *add, int16x8_t *sub) {
const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
......@@ -590,19 +590,14 @@ static INLINE void butterfly_one_coeff_s16_s32(
// Like butterfly_one_coeff, but with s32.
static INLINE void butterfly_one_coeff_s32(
const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
const int32x4_t b_hi, const tran_high_t constant, int32x4_t *add_lo,
const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
// TODO(johannkoenig): Strangely there is only a conversion warning on int64_t
// to int32_t (const tran_high_t (aka const long long)) but not for int64_t to
// int16_t. The constants fit in int16_t. Investigate using int16_t for the
// constants to avoid bouncing between types.
const int32_t constant_s32 = (int32_t)constant;
const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant_s32);
const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant_s32);
const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant_s32);
const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant_s32);
const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant_s32);
const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant_s32);
const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
*add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
*add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
*sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
......@@ -621,19 +616,17 @@ static INLINE void butterfly_one_coeff_s32(
// Like butterfly_two_coeff, but with s32.
static INLINE void butterfly_two_coeff_s32(
const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
const int32x4_t b_hi, const tran_high_t constant0,
const tran_high_t constant1, int32x4_t *add_lo, int32x4_t *add_hi,
int32x4_t *sub_lo, int32x4_t *sub_hi) {
const int32_t constant0_s32 = (int32_t)constant0;
const int32_t constant1_s32 = (int32_t)constant1;
const int32x4_t a0 = vmulq_n_s32(a_lo, constant0_s32);
const int32x4_t a1 = vmulq_n_s32(a_hi, constant0_s32);
const int32x4_t a2 = vmulq_n_s32(a_lo, constant1_s32);
const int32x4_t a3 = vmulq_n_s32(a_hi, constant1_s32);
const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0_s32);
const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0_s32);
const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1_s32);
const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1_s32);
const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
int32x4_t *sub_hi) {
const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
*add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
*add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
*sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
......
......@@ -50,8 +50,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
// fdct_round_shift
int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
......@@ -59,13 +59,11 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
// s_3 * cospi_8_64 + s_2 * cospi_24_64
// s_3 * cospi_24_64 - s_2 * cospi_8_64
const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
const int32x4_t temp3 =
vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
const int32x4_t temp4 =
vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
// fdct_round_shift
int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
......
......@@ -48,18 +48,18 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
......@@ -77,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
// Stage 2
v_x0 = vsubq_s16(v_s6, v_s5);
v_x1 = vaddq_s16(v_s6, v_s5);
v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
......@@ -95,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
v_x3 = vaddq_s16(v_s7, cd);
}
// Stage 4
v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
......
......@@ -1410,10 +1410,10 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const tran_low_t out0 = HIGHBD_WRAPLOW(
dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
const tran_low_t out1 = HIGHBD_WRAPLOW(
dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
int i;
......
......@@ -61,10 +61,10 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const tran_low_t out0 = HIGHBD_WRAPLOW(
dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
const tran_low_t out1 = HIGHBD_WRAPLOW(
dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
int i;
......
......@@ -54,10 +54,10 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const tran_low_t out0 = HIGHBD_WRAPLOW(
dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
const tran_low_t out1 = HIGHBD_WRAPLOW(
dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
const int16x8_t dc = vdupq_n_s16(a1);
......
......@@ -38,10 +38,10 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const tran_low_t out0 = HIGHBD_WRAPLOW(
dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
const tran_low_t out1 = HIGHBD_WRAPLOW(
dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
const int16x8_t dc = vdupq_n_s16(a1);
......
This diff is collapsed.
......@@ -25,37 +25,37 @@
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
static const tran_high_t cospi_1_64 = 16364;
static const tran_high_t cospi_2_64 = 16305;
static const tran_high_t cospi_3_64 = 16207;
static const tran_high_t cospi_4_64 = 16069;
static const tran_high_t cospi_5_64 = 15893;
static const tran_high_t cospi_6_64 = 15679;
static const tran_high_t cospi_7_64 = 15426;
static const tran_high_t cospi_8_64 = 15137;
static const tran_high_t cospi_9_64 = 14811;
static const tran_high_t cospi_10_64 = 14449;
static const tran_high_t cospi_11_64 = 14053;
static const tran_high_t cospi_12_64 = 13623;
static const tran_high_t cospi_13_64 = 13160;
static const tran_high_t cospi_14_64 = 12665;
static const tran_high_t cospi_15_64 = 12140;
static const tran_high_t cospi_16_64 = 11585;
static const tran_high_t cospi_17_64 = 11003;
static const tran_high_t cospi_18_64 = 10394;
static const tran_high_t cospi_19_64 = 9760;
static const tran_high_t cospi_20_64 = 9102;
static const tran_high_t cospi_21_64 = 8423;
static const tran_high_t cospi_22_64 = 7723;
static const tran_high_t cospi_23_64 = 7005;
static const tran_high_t cospi_24_64 = 6270;
static const tran_high_t cospi_25_64 = 5520;
static const tran_high_t cospi_26_64 = 4756;
static const tran_high_t cospi_27_64 = 3981;
static const tran_high_t cospi_28_64 = 3196;
static const tran_high_t cospi_29_64 = 2404;
static const tran_high_t cospi_30_64 = 1606;
static const tran_high_t cospi_31_64 = 804;
static const tran_coef_t cospi_1_64 = 16364;
static const tran_coef_t cospi_2_64 = 16305;
static const tran_coef_t cospi_3_64 = 16207;
static const tran_coef_t cospi_4_64 = 16069;
static const tran_coef_t cospi_5_64 = 15893;
static const tran_coef_t cospi_6_64 = 15679;
static const tran_coef_t cospi_7_64 = 15426;
static const tran_coef_t cospi_8_64 = 15137;
static const tran_coef_t cospi_9_64 = 14811;
static const tran_coef_t cospi_10_64 = 14449;
static const tran_coef_t cospi_11_64 = 14053;
static const tran_coef_t cospi_12_64 = 13623;
static const tran_coef_t cospi_13_64 = 13160;
static const tran_coef_t cospi_14_64 = 12665;
static const tran_coef_t cospi_15_64 = 12140;
static const tran_coef_t cospi_16_64 = 11585;
static const tran_coef_t cospi_17_64 = 11003;
static const tran_coef_t cospi_18_64 = 10394;
static const tran_coef_t cospi_19_64 = 9760;
static const tran_coef_t cospi_20_64 = 9102;
static const tran_coef_t cospi_21_64 = 8423;
static const tran_coef_t cospi_22_64 = 7723;
static const tran_coef_t cospi_23_64 = 7005;
static const tran_coef_t cospi_24_64 = 6270;
static const tran_coef_t cospi_25_64 = 5520;
static const tran_coef_t cospi_26_64 = 4756;
static const tran_coef_t cospi_27_64 = 3981;
static const tran_coef_t cospi_28_64 = 3196;
static const tran_coef_t cospi_29_64 = 2404;
static const tran_coef_t cospi_30_64 = 1606;
static const tran_coef_t cospi_31_64 = 804;
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
static const tran_coef_t sinpi_1_9 = 5283;
......
......@@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
const __m256i k__cospi_p16_m16 =
pair256_set_epi16(+cospi_16_64, -cospi_16_64);
const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
......
......@@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
......
......@@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
......@@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
......
......@@ -56,20 +56,20 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
// stage 2
highbd_butterfly_sse2(io[1], io[15], (int)cospi_30_64, (int)cospi_2_64,
&step2[8], &step2[15]);
highbd_butterfly_sse2(io[9], io[7], (int)cospi_14_64, (int)cospi_18_64,
&step2[9], &step2[14]);
highbd_butterfly_sse2(io[5], io[11], (int)cospi_22_64, (int)cospi_10_64,
&step2[10], &step2[13]);
highbd_butterfly_sse2(io[13], io[3], (int)cospi_6_64, (int)cospi_26_64,
&step2[11], &step2[12]);
highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
&step2[15]);
highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
&step2[14]);
highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
&step2[13]);
highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
&step2[12]);
// stage 3
highbd_butterfly_sse2(io[2], io[14], (int)cospi_28_64, (int)cospi_4_64,
&step1[4], &step1[7]);
highbd_butterfly_sse2(io[10], io[6], (int)cospi_12_64, (int)cospi_20_64,
&step1[5], &step1[6]);
highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
&step1[7]);
highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
&step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
......@@ -81,11 +81,11 @@ static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
// stage 4
highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
highbd_butterfly_sse2(io[4], io[12], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
&step2[9], &step2[14]);
highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
&step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
&step2[14]);
highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
&step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
......@@ -106,20 +106,20 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
__m128i temp1[2], sign[2];
// stage 2
highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
&step2[8], &step2[15]);
highbd_partial_butterfly_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64,
&step2[9], &step2[14]);
highbd_partial_butterfly_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64,
&step2[10], &step2[13]);
highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
&step2[11], &step2[12]);
highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
&step2[15]);
highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
&step2[14]);
highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
&step2[13]);
highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
&step2[12]);
// stage 3
highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
&step1[4], &step1[7]);
highbd_partial_butterfly_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64,
&step1[5], &step1[6]);
highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
&step1[7]);
highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
&step1[6]);
step1[8] = _mm_add_epi32(step2[8], step2[9]);
step1[9] = _mm_sub_epi32(step2[8], step2[9]);
step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10]
......@@ -131,13 +131,13 @@ static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
// stage 4
abs_extend_64bit_sse2(io[0], temp1, sign);
step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
step2[1] = step2[0];
highbd_partial_butterfly_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64,
&step2[2], &step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
&step2[9], &step2[14]);
highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
&step2[3]);
highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
&step2[14]);
highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
&step2[13], &step2[10]);
step2[5] = _mm_sub_epi32(step1[4], step1[5]);
step1[4] = _mm_add_epi32(step1[4], step1[5]);
......@@ -158,14 +158,14 @@ static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
__m128i temp[2], sign[2];
// stage 2
highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
&step2[8], &step2[15]);
highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
&step2[11], &step2[12]);
highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
&step2[15]);
highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
&step2[12]);
// stage 3
highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
&step1[4], &step1[7]);