Commit ae35425a authored by Kyle Siefring

Optimize convolve8 SSSE3 and AVX2 intrinsics

Changed the intrinsics to perform summation similar to the way the assembly does.

The new code diverges from the assembly by preferring unsaturated additions; a scalar sketch of the new summation order follows the commit metadata below.

Results for Haswell:

SSSE3
Horiz/Vert  Size  Speedup
Horiz       x4    ~32%
Horiz       x8    ~6%
Vert        x8    ~4%

AVX2
Horiz/Vert  Size  Speedup
Horiz       x16   ~16%
Vert        x16   ~14%

BUG=webm:1471

Change-Id: I7ad98ea688c904b1ba324adf8eb977873c8b8668
parent b3a36f79
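
A minimal scalar sketch of the new summation order, assuming x0..x3 are the
four maddubs partial sums (two adjacent filter taps each); the helper name is
hypothetical, and the real code below operates on whole SSE/AVX registers:

#include <stdint.h>

/* Scalar model of the new summation order used by the intrinsics. */
static int16_t convolve8_sum_model(int16_t x0, int16_t x1, int16_t x2,
                                   int16_t x3) {
  /* unsaturated adds: pairing (x0, x2) and (x1, x3) keeps both
   * partial sums inside int16 range for all vp9 filters */
  int16_t sum1 = (int16_t)(x0 + x2);
  const int16_t sum2 = (int16_t)(x1 + x3);
  int32_t t;

  /* fold in the rounding offset (64 = 1 << 6) while headroom remains,
   * so only one saturating add is needed */
  sum1 = (int16_t)(sum1 + 64);

  /* the single saturating add, mirroring _mm_adds_epi16 */
  t = (int32_t)sum1 + sum2;
  if (t > INT16_MAX) t = INT16_MAX;
  if (t < INT16_MIN) t = INT16_MIN;

  /* arithmetic shift by 7: taps sum to 128, the offset rounds to nearest */
  return (int16_t)(t >> 7);
}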
@@ -603,6 +603,75 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
          UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
 }
 
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
 TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
   const uint8_t *const in = input();
   uint8_t *const out = output();
...
@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
   const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
   const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
   const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
-  // add and saturate the results together
-  const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);
-  const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);
-  __m256i temp = _mm256_adds_epi16(x0, x3);
-  temp = _mm256_adds_epi16(temp, min_x2x1);
-  temp = _mm256_adds_epi16(temp, max_x2x1);
+  __m256i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm256_add_epi16(x0, x2);
+  sum2 = _mm256_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm256_add_epi16(sum1, k_64);
+  sum1 = _mm256_adds_epi16(sum1, sum2);
   // round and shift by 7 bit each 16 bit
-  temp = _mm256_adds_epi16(temp, k_64);
-  temp = _mm256_srai_epi16(temp, 7);
-  return temp;
+  sum1 = _mm256_srai_epi16(sum1, 7);
+  return sum1;
 }
 
 static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
                                        _mm256_castsi256_si128(f[2]));
   const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
                                        _mm256_castsi256_si128(f[3]));
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
 }
 
 #undef MM256_BROADCASTSI128_SI256
...
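
The "only order that prevents outranges" comment can be sanity-checked with
worst-case bounds. The sketch below uses hypothetical sharp-style taps chosen
for illustration (the real kernels live in vp9/common/vp9_filter.c); x1 and x2
each contain one of the two large center taps, so pairing x0 with x2 and x1
with x3 splits them across the unsaturated sums, while the x1 + x2 pairing can
exceed int16:

#include <stdio.h>

/* Worst-case bounds of one maddubs pair sum: pixels are in [0, 255]. */
static int pair_max(int t0, int t1) {
  return 255 * ((t0 > 0 ? t0 : 0) + (t1 > 0 ? t1 : 0));
}
static int pair_min(int t0, int t1) {
  return 255 * ((t0 < 0 ? t0 : 0) + (t1 < 0 ? t1 : 0));
}

int main(void) {
  /* hypothetical sharp-style 8-tap kernel; taps sum to 128 */
  const int t[8] = { -3, 7, -17, 119, 28, -11, 5, 0 };
  const int hi[4] = { pair_max(t[0], t[1]), pair_max(t[2], t[3]),
                      pair_max(t[4], t[5]), pair_max(t[6], t[7]) };
  const int lo[4] = { pair_min(t[0], t[1]), pair_min(t[2], t[3]),
                      pair_min(t[4], t[5]), pair_min(t[6], t[7]) };

  /* both chosen partial sums stay inside int16 (max 32767) */
  printf("x0+x2 in [%d, %d]\n", lo[0] + lo[2], hi[0] + hi[2]);
  printf("x1+x3 in [%d, %d]\n", lo[1] + lo[3], hi[1] + hi[3]);
  /* the two large center taps together can overflow int16 */
  printf("x1+x2 in [%d, %d]\n", lo[1] + lo[2], hi[1] + hi[2]);
  return 0;
}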
@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
   const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
   const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
   const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
 }
 
 static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
...
@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
   __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  __m128i srcRegFilt1, srcRegFilt2;
+  __m128i addFilterReg64, filtersReg, srcReg;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
 
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+    // sum the results together, saturating only on the final step
+    // the specific order of the additions prevents outranges
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
+    // extract the higher half of the register
+    srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
+    // add the rounding offset early to avoid another saturated add
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
 
     // shift by 7 bit each 16 bits
     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
...
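
In this 4-wide horizontal path the same trick applies across register halves:
after the two maddubs results are combined, lane i and lane i + 4 hold the two
halves of output i's filtered sum (this is what the 8-byte srli fold implies).
A scalar sketch under that assumption:

#include <stdint.h>

/* Scalar model of the 4-wide fold: lanes[] is the register after the
 * unsaturated add of the two maddubs results. */
static void fold4_model(const int16_t lanes[8], int16_t out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    /* add the rounding offset while headroom remains (unsaturated) */
    int32_t sum = (int16_t)(lanes[i] + 64);
    /* single saturating add of the high half, as _mm_adds_epi16 does */
    sum += lanes[i + 4];
    if (sum > INT16_MAX) sum = INT16_MAX;
    if (sum < INT16_MIN) sum = INT16_MIN;
    /* shift by 7 bits to finish the rounding */
    out[i] = (int16_t)(sum >> 7);
  }
}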