Commit ae35425a authored by Kyle Siefring's avatar Kyle Siefring

Optimize convolve8 SSSE3 and AVX2 intrinsics

Changed the intrinsics to perform summation similiar to the way the assembly does.

The new code diverges from the assembly by preferring unsaturated additions.

Results for haswell

SSSE3
Horiz/Vert  Size  Speedup
Horiz       x4    ~32%
Horiz       x8    ~6%
Vert        x8    ~4%

AVX2
Horiz/Vert  Size  Speedup
Horiz       x16   ~16%
Vert        x16   ~14%

BUG=webm:1471

Change-Id: I7ad98ea688c904b1ba324adf8eb977873c8b8668
parent b3a36f79
......@@ -603,6 +603,75 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
const uint8_t *const in = input();
uint8_t *const out = output();
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
const int kNumTests = 5000000;
const int width = Width();
const int height = Height();
vpx_usec_timer timer;
SetConstantInput(127);
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
width, height);
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
printf("convolve8_%dx%d_%d: %d us\n", width, height,
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
const uint8_t *const in = input();
uint8_t *const out = output();
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
const int kNumTests = 5000000;
const int width = Width();
const int height = Height();
vpx_usec_timer timer;
SetConstantInput(127);
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
width, height);
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
const uint8_t *const in = input();
uint8_t *const out = output();
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
const int kNumTests = 5000000;
const int width = Width();
const int height = Height();
vpx_usec_timer timer;
SetConstantInput(127);
vpx_usec_timer_start(&timer);
for (int n = 0; n < kNumTests; ++n) {
UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
width, height);
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
const uint8_t *const in = input();
uint8_t *const out = output();
......
......@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
// add and saturate the results together
const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);
const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);
__m256i temp = _mm256_adds_epi16(x0, x3);
temp = _mm256_adds_epi16(temp, min_x2x1);
temp = _mm256_adds_epi16(temp, max_x2x1);
__m256i sum1, sum2;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1 = _mm256_add_epi16(x0, x2);
sum2 = _mm256_add_epi16(x1, x3);
// add the rounding offset early to avoid another saturated add
sum1 = _mm256_add_epi16(sum1, k_64);
sum1 = _mm256_adds_epi16(sum1, sum2);
// round and shift by 7 bit each 16 bit
temp = _mm256_adds_epi16(temp, k_64);
temp = _mm256_srai_epi16(temp, 7);
return temp;
sum1 = _mm256_srai_epi16(sum1, 7);
return sum1;
}
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
......@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
_mm256_castsi256_si128(f[2]));
const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
_mm256_castsi256_si128(f[3]));
// add and saturate the results together
const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
__m128i temp = _mm_adds_epi16(x0, x3);
temp = _mm_adds_epi16(temp, min_x2x1);
temp = _mm_adds_epi16(temp, max_x2x1);
// round and shift by 7 bit each 16 bit
temp = _mm_adds_epi16(temp, k_64);
temp = _mm_srai_epi16(temp, 7);
return temp;
__m128i sum1, sum2;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1 = _mm_add_epi16(x0, x2);
sum2 = _mm_add_epi16(x1, x3);
// add the rounding offset early to avoid another saturated add
sum1 = _mm_add_epi16(sum1, k_64);
sum1 = _mm_adds_epi16(sum1, sum2);
// shift by 7 bit each 16 bit
sum1 = _mm_srai_epi16(sum1, 7);
return sum1;
}
#undef MM256_BROADCASTSI128_SI256
......
......@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
// add and saturate the results together
const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
__m128i temp = _mm_adds_epi16(x0, x3);
temp = _mm_adds_epi16(temp, min_x2x1);
temp = _mm_adds_epi16(temp, max_x2x1);
// round and shift by 7 bit each 16 bit
temp = _mm_adds_epi16(temp, k_64);
temp = _mm_srai_epi16(temp, 7);
return temp;
__m128i sum1, sum2;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1 = _mm_add_epi16(x0, x2);
sum2 = _mm_add_epi16(x1, x3);
// add the rounding offset early to avoid another saturated add
sum1 = _mm_add_epi16(sum1, k_64);
sum1 = _mm_adds_epi16(sum1, sum2);
// shift by 7 bit each 16 bit
sum1 = _mm_srai_epi16(sum1, 7);
return sum1;
}
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
......
......@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
__m128i srcRegFilt1, srcRegFilt2;
__m128i addFilterReg64, filtersReg, srcReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
......@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// extract the higher half of the lane
srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
// sum the results together, saturating only on the final step
// the specific order of the additions prevents outranges
srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
// extract the higher half of the register
srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
// add and saturate all the results together
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
// add the rounding offset early to avoid another saturated add
srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
// shift by 7 bit each 16 bits
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment