diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 615e07ba9d348f6dd4fc671ff3cdc1281d8b4c2e..104db6aebdfec77e75e10cdaf8d473d9733bfaad 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,7 +701,7 @@ prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_b
 specialize vp9_quantize_b $ssse3_x86_64
 
 prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 $ssse3_x86_64
+specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on this before enabled
 
 #
 # Structured Similarity (SSIM)
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 3112dadcda67d67c84e422ad9b28d1ecd19f6604..4f4ad04b3a68619dd8b5360464eb5faee03308f5 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1077,6 +1077,44 @@ static void dct32_1d(int *input, int *output, int round) {
   output[30] = step[30];
   output[31] = step[31];
 
+  // scale the magnitude down by 4, hence the intermediate values are within
+  // the range of 16 bits.
+  if (round) {
+    output[0] = half_round_shift(output[0]);
+    output[1] = half_round_shift(output[1]);
+    output[2] = half_round_shift(output[2]);
+    output[3] = half_round_shift(output[3]);
+    output[4] = half_round_shift(output[4]);
+    output[5] = half_round_shift(output[5]);
+    output[6] = half_round_shift(output[6]);
+    output[7] = half_round_shift(output[7]);
+    output[8] = half_round_shift(output[8]);
+    output[9] = half_round_shift(output[9]);
+    output[10] = half_round_shift(output[10]);
+    output[11] = half_round_shift(output[11]);
+    output[12] = half_round_shift(output[12]);
+    output[13] = half_round_shift(output[13]);
+    output[14] = half_round_shift(output[14]);
+    output[15] = half_round_shift(output[15]);
+
+    output[16] = half_round_shift(output[16]);
+    output[17] = half_round_shift(output[17]);
+    output[18] = half_round_shift(output[18]);
+    output[19] = half_round_shift(output[19]);
+    output[20] = half_round_shift(output[20]);
+    output[21] = half_round_shift(output[21]);
+    output[22] = half_round_shift(output[22]);
+    output[23] = half_round_shift(output[23]);
+    output[24] = half_round_shift(output[24]);
+    output[25] = half_round_shift(output[25]);
+    output[26] = half_round_shift(output[26]);
+    output[27] = half_round_shift(output[27]);
+    output[28] = half_round_shift(output[28]);
+    output[29] = half_round_shift(output[29]);
+    output[30] = half_round_shift(output[30]);
+    output[31] = half_round_shift(output[31]);
+  }
+
   // Stage 3
   step[0] = output[0] + output[(8 - 1)];
   step[1] = output[1] + output[(8 - 2)];
@@ -1112,44 +1150,6 @@ static void dct32_1d(int *input, int *output, int round) {
   step[30] = output[30] + output[25];
   step[31] = output[31] + output[24];
 
-  // dump the magnitude by half, hence the intermediate values are within
-  // the range of 16 bits.
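The hunk above moves this scaling from after stage 3 (the old block, deleted here and continuing below) to right after stage 2, so the stage 3 additions already operate on scaled values. For reference, a scalar sketch of what half_round_shift() computes; the helper is defined elsewhere in vp9_dct.c, so this body is inferred from the SSE2 sequence later in this patch rather than copied from the tree:

    /* Sketch (not the in-tree definition): divide by 4, rounding to
     * nearest with ties toward zero.  The (x < 0) term compensates the
     * arithmetic shift's rounding toward negative infinity; the bias of
     * 1 rather than 2 is presumably the "half" in the name. */
    static int half_round_shift_model(int x) {
      return (x + 1 + (x < 0)) >> 2;
    }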
-  if (round) {
-    step[0] = half_round_shift(step[0]);
-    step[1] = half_round_shift(step[1]);
-    step[2] = half_round_shift(step[2]);
-    step[3] = half_round_shift(step[3]);
-    step[4] = half_round_shift(step[4]);
-    step[5] = half_round_shift(step[5]);
-    step[6] = half_round_shift(step[6]);
-    step[7] = half_round_shift(step[7]);
-    step[8] = half_round_shift(step[8]);
-    step[9] = half_round_shift(step[9]);
-    step[10] = half_round_shift(step[10]);
-    step[11] = half_round_shift(step[11]);
-    step[12] = half_round_shift(step[12]);
-    step[13] = half_round_shift(step[13]);
-    step[14] = half_round_shift(step[14]);
-    step[15] = half_round_shift(step[15]);
-
-    step[16] = half_round_shift(step[16]);
-    step[17] = half_round_shift(step[17]);
-    step[18] = half_round_shift(step[18]);
-    step[19] = half_round_shift(step[19]);
-    step[20] = half_round_shift(step[20]);
-    step[21] = half_round_shift(step[21]);
-    step[22] = half_round_shift(step[22]);
-    step[23] = half_round_shift(step[23]);
-    step[24] = half_round_shift(step[24]);
-    step[25] = half_round_shift(step[25]);
-    step[26] = half_round_shift(step[26]);
-    step[27] = half_round_shift(step[27]);
-    step[28] = half_round_shift(step[28]);
-    step[29] = half_round_shift(step[29]);
-    step[30] = half_round_shift(step[30]);
-    step[31] = half_round_shift(step[31]);
-  }
-
   // Stage 4
   output[0] = step[0] + step[3];
   output[1] = step[1] + step[2];
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index ce65859e40dc5bf4154b0621311eb12a58ddbc00..95ae2664390d7da5b5f5ce2dae1144ea06dab758 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -403,6 +403,148 @@ void FDCT32x32_2D(int16_t *input,
       step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
       step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
     }
+
+#if !FDCT32x32_HIGH_PRECISION
+    // scale the magnitude down by 4, hence the intermediate values are within
+    // the range of 16 bits.
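The if (1 == pass) block that follows is the lane-parallel version of the same scaling, and the idiom deserves a gloss: _mm_cmplt_epi16 returns 0xFFFF (that is, -1) in every lane holding a negative value, so subtracting the comparison mask adds 1 to exactly the negative lanes; adding kOne and arithmetic-shifting right by 2 then reproduces (x + 1 + (x < 0)) >> 2 per lane. A minimal self-contained sketch (helper name hypothetical):

    #include <emmintrin.h>  /* SSE2 */

    static __m128i scale_down_by_4_epi16(__m128i x) {
      const __m128i kZero = _mm_setzero_si128();
      const __m128i kOne = _mm_set1_epi16(1);
      const __m128i sign = _mm_cmplt_epi16(x, kZero);  /* 0xFFFF where x < 0 */
      x = _mm_sub_epi16(x, sign);                      /* x += (x < 0) */
      x = _mm_add_epi16(x, kOne);                      /* rounding bias */
      return _mm_srai_epi16(x, 2);                     /* arithmetic >> 2 */
    }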
+ if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0); + step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0); + step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0); + step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0); + step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0); + step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0); + step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0); + step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0); + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); + step2[10] = _mm_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm_sub_epi16(step1[31], s3_31_0); + + step2[ 0] = _mm_add_epi16(step2[ 0], kOne); + step2[ 1] = _mm_add_epi16(step2[ 1], kOne); + step2[ 2] = _mm_add_epi16(step2[ 2], kOne); + step2[ 3] = _mm_add_epi16(step2[ 3], kOne); + step2[ 4] = 
_mm_add_epi16(step2[ 4], kOne); + step2[ 5] = _mm_add_epi16(step2[ 5], kOne); + step2[ 6] = _mm_add_epi16(step2[ 6], kOne); + step2[ 7] = _mm_add_epi16(step2[ 7], kOne); + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[ 0] = _mm_srai_epi16(step2[ 0], 2); + step2[ 1] = _mm_srai_epi16(step2[ 1], 2); + step2[ 2] = _mm_srai_epi16(step2[ 2], 2); + step2[ 3] = _mm_srai_epi16(step2[ 3], 2); + step2[ 4] = _mm_srai_epi16(step2[ 4], 2); + step2[ 5] = _mm_srai_epi16(step2[ 5], 2); + step2[ 6] = _mm_srai_epi16(step2[ 6], 2); + step2[ 7] = _mm_srai_epi16(step2[ 7], 2); + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif // Stage 3 { step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); @@ -469,144 +611,6 @@ void FDCT32x32_2D(int16_t *input, step3[31] = _mm_add_epi16(step2[24], step1[31]); } -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. 
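A structural note before the deleted block below: FDCT32x32_2D is compiled in two precision configurations, and the unbalanced-looking #if FDCT32x32_HIGH_PRECISION / if (pass == 0) { / #endif added above opens a brace that is only closed much later in the function. The shape this patch appears to set up (a sketch; the matching close is outside the hunks shown here):

    #if FDCT32x32_HIGH_PRECISION
        if (pass == 0) {
    #endif
          /* 16-bit stages 3..7: the only path of the low-precision
           * build, and the first pass of the high-precision build. */
    #if FDCT32x32_HIGH_PRECISION
        } else {
          /* second pass of the high-precision build: the 32-bit
           * lstep1/lstep2/lstep3 stages added in the later hunks. */
        }
    #endif

The hunk that follows then deletes the old copy of the 16-bit scaling block, which used to run after stage 3 rather than before it.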
- if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); - step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); - step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); - step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); - step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); - step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0); - step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); - step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); - step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); - step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); - step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); - step3[10] = _mm_sub_epi16(step3[10], s3_10_0); - step3[11] = _mm_sub_epi16(step3[11], s3_11_0); - step3[12] = _mm_sub_epi16(step3[12], s3_12_0); - step3[13] = _mm_sub_epi16(step3[13], s3_13_0); - step2[14] = _mm_sub_epi16(step2[14], s2_14_0); - step2[15] = _mm_sub_epi16(step2[15], s2_15_0); - step3[16] = _mm_sub_epi16(step3[16], s3_16_0); - step3[17] = _mm_sub_epi16(step3[17], s3_17_0); - step3[18] = _mm_sub_epi16(step3[18], s3_18_0); - step3[19] = _mm_sub_epi16(step3[19], s3_19_0); - step3[20] = _mm_sub_epi16(step3[20], s3_20_0); - step3[21] = _mm_sub_epi16(step3[21], s3_21_0); - step3[22] = _mm_sub_epi16(step3[22], s3_22_0); - step3[23] = _mm_sub_epi16(step3[23], s3_23_0); - step3[24] = _mm_sub_epi16(step3[24], s3_24_0); - step3[25] = _mm_sub_epi16(step3[25], s3_25_0); - step3[26] = _mm_sub_epi16(step3[26], s3_26_0); - step3[27] = _mm_sub_epi16(step3[27], s3_27_0); - step3[28] = _mm_sub_epi16(step3[28], s3_28_0); - step3[29] = _mm_sub_epi16(step3[29], s3_29_0); - step3[30] = _mm_sub_epi16(step3[30], s3_30_0); - step3[31] = _mm_sub_epi16(step3[31], s3_31_0); - step3[ 0] = _mm_add_epi16(step3[ 0], kOne); - step3[ 1] = _mm_add_epi16(step3[ 1], kOne); - step3[ 2] = _mm_add_epi16(step3[ 2], kOne); - step3[ 3] = _mm_add_epi16(step3[ 3], kOne); - step3[ 4] = 
_mm_add_epi16(step3[ 4], kOne);
-      step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
-      step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
-      step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
-      step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
-      step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
-      step3[10] = _mm_add_epi16(step3[10], kOne);
-      step3[11] = _mm_add_epi16(step3[11], kOne);
-      step3[12] = _mm_add_epi16(step3[12], kOne);
-      step3[13] = _mm_add_epi16(step3[13], kOne);
-      step2[14] = _mm_add_epi16(step2[14], kOne);
-      step2[15] = _mm_add_epi16(step2[15], kOne);
-      step3[16] = _mm_add_epi16(step3[16], kOne);
-      step3[17] = _mm_add_epi16(step3[17], kOne);
-      step3[18] = _mm_add_epi16(step3[18], kOne);
-      step3[19] = _mm_add_epi16(step3[19], kOne);
-      step3[20] = _mm_add_epi16(step3[20], kOne);
-      step3[21] = _mm_add_epi16(step3[21], kOne);
-      step3[22] = _mm_add_epi16(step3[22], kOne);
-      step3[23] = _mm_add_epi16(step3[23], kOne);
-      step3[24] = _mm_add_epi16(step3[24], kOne);
-      step3[25] = _mm_add_epi16(step3[25], kOne);
-      step3[26] = _mm_add_epi16(step3[26], kOne);
-      step3[27] = _mm_add_epi16(step3[27], kOne);
-      step3[28] = _mm_add_epi16(step3[28], kOne);
-      step3[29] = _mm_add_epi16(step3[29], kOne);
-      step3[30] = _mm_add_epi16(step3[30], kOne);
-      step3[31] = _mm_add_epi16(step3[31], kOne);
-      step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
-      step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
-      step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
-      step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
-      step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
-      step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
-      step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
-      step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
-      step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
-      step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
-      step3[10] = _mm_srai_epi16(step3[10], 2);
-      step3[11] = _mm_srai_epi16(step3[11], 2);
-      step3[12] = _mm_srai_epi16(step3[12], 2);
-      step3[13] = _mm_srai_epi16(step3[13], 2);
-      step2[14] = _mm_srai_epi16(step2[14], 2);
-      step2[15] = _mm_srai_epi16(step2[15], 2);
-      step3[16] = _mm_srai_epi16(step3[16], 2);
-      step3[17] = _mm_srai_epi16(step3[17], 2);
-      step3[18] = _mm_srai_epi16(step3[18], 2);
-      step3[19] = _mm_srai_epi16(step3[19], 2);
-      step3[20] = _mm_srai_epi16(step3[20], 2);
-      step3[21] = _mm_srai_epi16(step3[21], 2);
-      step3[22] = _mm_srai_epi16(step3[22], 2);
-      step3[23] = _mm_srai_epi16(step3[23], 2);
-      step3[24] = _mm_srai_epi16(step3[24], 2);
-      step3[25] = _mm_srai_epi16(step3[25], 2);
-      step3[26] = _mm_srai_epi16(step3[26], 2);
-      step3[27] = _mm_srai_epi16(step3[27], 2);
-      step3[28] = _mm_srai_epi16(step3[28], 2);
-      step3[29] = _mm_srai_epi16(step3[29], 2);
-      step3[30] = _mm_srai_epi16(step3[30], 2);
-      step3[31] = _mm_srai_epi16(step3[31], 2);
-    }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
-    if (pass == 0) {
-#endif
     // Stage 4
     {
       step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
@@ -1158,25 +1162,146 @@ void FDCT32x32_2D(int16_t *input,
     const __m128i mask16 = _mm_set1_epi32(0x80008000);
     const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
     // start using 32-bit operations
+    // stage 3
+    {
+      // expanding to 32-bit length prior to addition operations
+      lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
+      lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
+      lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
+      lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
+      lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
+      lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
+      lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
+      lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
+      lstep2[ 8] =
k_cvtlo_epi16(step2[ 4], mask16, kZero); + lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); + lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); + lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); + lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); + lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); + lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); + lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); + + lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); + lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); + lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); + lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); + lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); + lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); + lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); + lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); + lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); + lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); + lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); + lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); + lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); + lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); + lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); + lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); + lstep2[46] = 
k_cvtlo_epi16(step2[23], mask16, kZero); + lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); + lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); + lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); + lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); + lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); + lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); + lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); + lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); + lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); + + lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); + lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); + lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); + lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); + lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); + lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); + lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); + lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); + lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); + lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); + lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); + lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); + lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); + lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); + lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); + lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); + + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); + } + // stage 4 { // expanding to 32-bit length priori to addition operations - lstep3[ 0] = k_cvtlo_epi16(step3[ 0], mask16, kZero); - lstep3[ 1] = k_cvthi_epi16(step3[ 0], mask16, kZero); - lstep3[ 2] = k_cvtlo_epi16(step3[ 1], mask16, kZero); - lstep3[ 3] = k_cvthi_epi16(step3[ 1], mask16, kZero); - lstep3[ 4] = k_cvtlo_epi16(step3[ 2], mask16, kZero); - lstep3[ 5] = 
k_cvthi_epi16(step3[ 2], mask16, kZero); - lstep3[ 6] = k_cvtlo_epi16(step3[ 3], mask16, kZero); - lstep3[ 7] = k_cvthi_epi16(step3[ 3], mask16, kZero); - lstep3[20] = k_cvtlo_epi16(step3[10], mask16, kZero); - lstep3[21] = k_cvthi_epi16(step3[10], mask16, kZero); - lstep3[22] = k_cvtlo_epi16(step3[11], mask16, kZero); - lstep3[23] = k_cvthi_epi16(step3[11], mask16, kZero); - lstep3[24] = k_cvtlo_epi16(step3[12], mask16, kZero); - lstep3[25] = k_cvthi_epi16(step3[12], mask16, kZero); - lstep3[26] = k_cvtlo_epi16(step3[13], mask16, kZero); - lstep3[27] = k_cvthi_epi16(step3[13], mask16, kZero); lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); @@ -1212,88 +1337,150 @@ void FDCT32x32_2D(int16_t *input, lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); } { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - lstep1[10] = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. 
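For readers new to the 32-bit helpers used from here on (their bodies live earlier in this file and are not part of this diff), a scalar model of the unpack / k_madd_epi32 / k_packs_epi64 / round / shift pipeline below; names suffixed _model are hypothetical, and DCT_CONST_BITS is assumed to be vp9's usual 14:

    #include <stdint.h>

    #define DCT_CONST_BITS 14  /* assumed: vp9's cospi_*_64 precision */

    /* One output lane: _mm_unpack*_epi32 pairs lane i of two inputs,
     * k_madd_epi32 takes the 64-bit dot product of that pair with a
     * pair_set_epi32 coefficient pair, k_packs_epi64 narrows two such
     * results back to 32-bit lanes, and the add/srai pair applies
     * dct_const_round_shift. */
    static int32_t madd_round_shift_model(int32_t x, int32_t y,
                                          int32_t c0, int32_t c1) {
      const int64_t sum = (int64_t)x * c0 + (int64_t)y * c1;
      const int64_t rounded = sum + (1 << (DCT_CONST_BITS - 1));
      return (int32_t)(rounded >> DCT_CONST_BITS);
    }

So, modulo lane bookkeeping, lstep1[10] below models as madd_round_shift_model(l12, l10, cospi_16_64, -cospi_16_64) for corresponding lanes l12, l10 of lstep3[12] and lstep3[10], the same rotation the deleted 16-bit code computed with _mm_madd_epi16.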
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16); + v[ 1] = k_madd_epi32(u[1], k32_p16_m16); + v[ 2] = k_madd_epi32(u[2], k32_p16_m16); + v[ 3] = k_madd_epi32(u[3], k32_p16_m16); + v[ 4] = k_madd_epi32(u[0], k32_p16_p16); + v[ 5] = k_madd_epi32(u[1], k32_p16_p16); + v[ 6] = k_madd_epi32(u[2], k32_p16_p16); + v[ 7] = k_madd_epi32(u[3], k32_p16_p16); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); } { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const 
__m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - lstep1[36] = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - lstep1[54] = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - lstep1[55] = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - lstep1[56] = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - lstep1[57] = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - lstep1[58] = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - lstep1[59] = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); + v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); + v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); + v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); + v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); + v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); + v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); + v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); + v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); + v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[ 8], k32_m08_p24); + v[21] = k_madd_epi32(u[ 9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[ 4], k32_p24_p08); + v[25] = k_madd_epi32(u[ 5], k32_p24_p08); + v[26] = k_madd_epi32(u[ 6], k32_p24_p08); + v[27] = k_madd_epi32(u[ 7], k32_p24_p08); + v[28] = k_madd_epi32(u[ 0], k32_p24_p08); + v[29] = k_madd_epi32(u[ 1], k32_p24_p08); + v[30] = k_madd_epi32(u[ 2], k32_p24_p08); 
+ v[31] = k_madd_epi32(u[ 3], k32_p24_p08); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); } // stage 5 { - lstep3[ 8] = k_cvtlo_epi16(step3[4], mask16, kZero); - lstep3[ 9] = k_cvthi_epi16(step3[4], mask16, kZero); - lstep3[14] = k_cvtlo_epi16(step3[7], mask16, kZero); - lstep3[15] = k_cvthi_epi16(step3[7], mask16, kZero); - lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); @@ -1465,23 +1652,6 @@ void FDCT32x32_2D(int16_t *input, lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); } { - lstep3[32] = k_cvtlo_epi16(step3[16], mask16, kZero); - lstep3[33] = k_cvthi_epi16(step3[16], mask16, kZero); - lstep3[34] = k_cvtlo_epi16(step3[17], mask16, kZero); - lstep3[35] = k_cvthi_epi16(step3[17], mask16, kZero); - lstep3[44] = k_cvtlo_epi16(step3[22], mask16, kZero); - lstep3[45] = k_cvthi_epi16(step3[22], mask16, kZero); - lstep3[46] = k_cvtlo_epi16(step3[23], mask16, kZero); - lstep3[47] = k_cvthi_epi16(step3[23], mask16, kZero); - lstep3[48] = k_cvtlo_epi16(step3[24], mask16, kZero); - lstep3[49] = k_cvthi_epi16(step3[24], mask16, kZero); - lstep3[50] = k_cvtlo_epi16(step3[25], mask16, kZero); - lstep3[51] = 
k_cvthi_epi16(step3[25], mask16, kZero); - lstep3[60] = k_cvtlo_epi16(step3[30], mask16, kZero); - lstep3[61] = k_cvthi_epi16(step3[30], mask16, kZero); - lstep3[62] = k_cvtlo_epi16(step3[31], mask16, kZero); - lstep3[63] = k_cvthi_epi16(step3[31], mask16, kZero); - lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
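Finally, a note on the widening helpers this change leans on throughout: k_cvtlo_epi16 and k_cvthi_epi16 (called with mask16 = 0x80008000 and kZero) sign-extend the low and high four int16 lanes of a register to int32. Their bodies are not shown in this diff; an equivalent standard SSE2 idiom for the same conversion:

    #include <emmintrin.h>

    /* Equivalent sketches, not the in-tree bodies: interleaving a
     * register with itself puts each 16-bit lane in the top half of a
     * 32-bit lane; an arithmetic shift right then sign-extends it. */
    static __m128i cvtlo_epi16_sketch(__m128i in) {
      return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
    }
    static __m128i cvthi_epi16_sketch(__m128i in) {
      return _mm_srai_epi32(_mm_unpackhi_epi16(in, in), 16);
    }

(With SSE4.1 the low half is simply _mm_cvtepi16_epi32, but this file targets plain SSE2.)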