Commit a591ac9e authored by Yunqing Wang's avatar Yunqing Wang Committed by Gerrit Code Review
Browse files

Merge "Fix decoder mismatch in sub-pixel AVX2 intrinsic filters"

Showing with 16 additions and 16 deletions
...@@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, ...@@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// filter the source buffer // filter the source buffer
srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
// add and saturate the results together // add and saturate the results together
srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
// filter the source buffer // filter the source buffer
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
// add and saturate the results together // add and saturate the results together
...@@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, ...@@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// filter the source buffer // filter the source buffer
srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
// add and saturate the results together // add and saturate the results together
srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
// filter the source buffer // filter the source buffer
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
// add and saturate the results together // add and saturate the results together
...@@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, ...@@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
_mm256_castsi256_si128(filt1Reg)); _mm256_castsi256_si128(filt1Reg));
srcRegFilt2 = _mm_shuffle_epi8(srcReg1, srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
_mm256_castsi256_si128(filt2Reg)); _mm256_castsi256_si128(filt4Reg));
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
_mm256_castsi256_si128(firstFilters)); _mm256_castsi256_si128(firstFilters));
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
_mm256_castsi256_si128(secondFilters)); _mm256_castsi256_si128(forthFilters));
// add and saturate the results together // add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
// filter the source buffer // filter the source buffer
srcRegFilt3= _mm_shuffle_epi8(srcReg1, srcRegFilt3= _mm_shuffle_epi8(srcReg1,
_mm256_castsi256_si128(filt4Reg)); _mm256_castsi256_si128(filt2Reg));
srcRegFilt2= _mm_shuffle_epi8(srcReg1, srcRegFilt2= _mm_shuffle_epi8(srcReg1,
_mm256_castsi256_si128(filt3Reg)); _mm256_castsi256_si128(filt3Reg));
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
_mm256_castsi256_si128(forthFilters)); _mm256_castsi256_si128(secondFilters));
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
_mm256_castsi256_si128(thirdFilters)); _mm256_castsi256_si128(thirdFilters));
...@@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, ...@@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
_mm256_castsi256_si128(filt1Reg)); _mm256_castsi256_si128(filt1Reg));
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
_mm256_castsi256_si128(filt2Reg)); _mm256_castsi256_si128(filt4Reg));
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
_mm256_castsi256_si128(firstFilters)); _mm256_castsi256_si128(firstFilters));
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
_mm256_castsi256_si128(secondFilters)); _mm256_castsi256_si128(forthFilters));
// add and saturate the results together // add and saturate the results together
srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
// filter the source buffer // filter the source buffer
srcRegFilt3 = _mm_shuffle_epi8(srcReg2, srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
_mm256_castsi256_si128(filt4Reg)); _mm256_castsi256_si128(filt2Reg));
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
_mm256_castsi256_si128(filt3Reg)); _mm256_castsi256_si128(filt3Reg));
// multiply 2 adjacent elements with the filter and add the result // multiply 2 adjacent elements with the filter and add the result
srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
_mm256_castsi256_si128(forthFilters)); _mm256_castsi256_si128(secondFilters));
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
_mm256_castsi256_si128(thirdFilters)); _mm256_castsi256_si128(thirdFilters));
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment