Commit 580d3224 authored by Linfeng Zhang

Add 4 to 3 scaling SSSE3 optimization

Note: this change enables the specialized 4:3 C version on SSSE3 builds,
which generates different scaled output.

It is about 2x as fast as the version calling vpx_scaled_2d_ssse3().

Change-Id: I17fff122cd0a5ac8aa451d84daa606582da8e194
parent 1fa3ec30
@@ -28,7 +28,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
  const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
  int x, y, i;

-#if HAVE_NEON
+#if HAVE_SSSE3 || HAVE_NEON
  // TODO(linfengz): The 4:3 specialized C code is disabled by default since
  // it's much slower than the general version which calls vpx_scaled_2d() even
  // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference
......
This diff is collapsed.
@@ -11,6 +11,7 @@
#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
#include <assert.h>
#include <tmmintrin.h> // SSSE3
#include "./vpx_config.h"
@@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
}
static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
                                            __m128i *const f) {
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // Pack and duplicate the filter values. This uses the fact that the high
  // byte of filter[3] is always 0 to clear half of f[0] and f[4].
  assert(filter[3] >= 0 && filter[3] < 256);
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
}
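
As an aside, here is a minimal stand-alone sketch of what this packing produces (the kernel values are illustrative, not an actual vp9_filter_kernels[] entry). The eight 16-bit taps end up duplicated across five registers as signed byte pairs (0, f0), (f1, f2), (f3, f4), (f5, f6), (f7, 0), with the known-zero high byte of filter[3] supplying the zero padding at both ends; shuffle_filter_ssse3() above packs the even-offset layout (f0, f1) ... (f6, f7) into four registers the same way.

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  // SSSE3

int main(void) {
  // Illustrative 8-tap kernel; sums to 128 like the vpx kernels.
  const int16_t filter[8] = { -1, 3, -10, 122, 18, -6, 3, -1 };
  // _mm_loadu_si128() for simplicity here; the library version requires a
  // 16-byte aligned load.
  const __m128i f_values = _mm_loadu_si128((const __m128i *)filter);
  __m128i f[5];
  int8_t b[16];
  int i;
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
  for (i = 0; i < 5; i++) {
    _mm_storeu_si128((__m128i *)b, f[i]);
    printf("f[%d] byte pair: (%4d, %4d)\n", i, b[0], b[1]);
  }
  // Prints (0, -1), (3, -10), (122, 18), (-6, 3), (-1, 0).
  return 0;
}
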
static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
                                        const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
@@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
  return temp;
}
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
                                                    const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  // compensate for the subtracted 64 in f[1]. x4 is always non-negative.
  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
  // add and saturate the results together
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x4);
  // round and shift each 16-bit lane right by 7 bits
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}
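
The compensation here works around two limits of _mm_maddubs_epi16(): the filter taps must fit in signed bytes (the center tap of a vpx kernel can be as large as 128), and the signed 16-bit sum of each adjacent product pair saturates. The caller is therefore expected to subtract 64 from both bytes packed into f[1], which x4 (64 times the corresponding pixel pair sum) adds back. A minimal sketch of the arithmetic on one 16-bit lane, with illustrative pixel and tap values:

#include <stdio.h>
#include <tmmintrin.h>  // SSSE3

int main(void) {
  // One pixel pair (pa, pb) = (100, 200) in every 16-bit lane.
  const __m128i s1 = _mm_set1_epi16((short)0xC864);  // bytes (100, 200)
  // Tap pair (0, 128): 128 does not fit in a signed byte, so pack
  // (0 - 64, 128 - 64) = (-64, 64) instead.
  const __m128i f1 = _mm_set1_epi16((short)0x40C0);  // bytes (-64, 64)
  const __m128i x1 = _mm_maddubs_epi16(s1, f1);  // -64 * 100 + 64 * 200
  const __m128i x4 =
      _mm_maddubs_epi16(s1, _mm_set1_epi8(64));  // 64 * (100 + 200)
  const __m128i sum = _mm_adds_epi16(x1, x4);    // = 0 * 100 + 128 * 200
  printf("%d (expect %d)\n", _mm_extract_epi16(sum, 0), 128 * 200);
  return 0;
}
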
static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
                                                   const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
  // compensate for the subtracted 64 in f[2]. x5 is always non-negative.
  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
  __m128i temp;
  // add and saturate the results together
  temp = _mm_adds_epi16(x0, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x3);
  temp = _mm_adds_epi16(temp, x4);
  temp = _mm_adds_epi16(temp, x5);
  // round and shift each 16-bit lane right by 7 bits
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}
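
Putting shuffle_filter_odd_ssse3() and this routine together, the scalar sketch below models what one output pixel works out to. It assumes f[2] arrives with 64 subtracted from both of its packed taps (which is what the x5 term implies) and ignores the 16-bit saturation that the SIMD partial sums are subject to; the helper name and the 10-pixel window are illustrative:

#include <stdint.h>

static int convolve8_odd_offset_scalar(const uint8_t *p, const int16_t *k) {
  // p[0] sits under the leading zero tap, so k[i] multiplies p[i + 1];
  // p[9] is weighted by the trailing zero tap.
  int sum = 0 * p[0] + k[0] * p[1];                //  x0: pair (0, k0)
  sum += k[1] * p[2] + k[2] * p[3];                //  x1: pair (k1, k2)
  sum += (k[3] - 64) * p[4] + (k[4] - 64) * p[5];  //  x2: pair (k3-64, k4-64)
  sum += k[5] * p[6] + k[6] * p[7];                //  x3: pair (k5, k6)
  sum += k[7] * p[8] + 0 * p[9];                   //  x4: pair (k7, 0)
  sum += 64 * (p[4] + p[5]);                       //  x5: compensation
  return (sum + 64) >> 7;                          //  round, shift by 7
}

The even-offset variant is the same sum without the zero padding: four pairs (k0, k1) ... (k6, k7), with the compensation applied to the (k2, k3) pair.
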
#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_
@@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}
static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
}
#endif // VPX_DSP_X86_MEM_SSE2_H_