Commit af10457e authored by levytamar82, committed by Gerrit Code Review

Fix bug 806

In the functions sad32x32x4d and sad64x64x4d the source is aligned
to 16 bytes, not 32, so a 32-byte aligned load can fault; the loads
are now unaligned.

Change-Id: I922fdba56d0936b5cf72e4503519f185645a168c
parent 65234504
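For context on the fix (a minimal sketch, not part of the commit): _mm256_load_si256 compiles to VMOVDQA, which raises a general-protection fault when its address is not 32-byte aligned, while _mm256_loadu_si256 (VMOVDQU) accepts any address. Because the encoder only guarantees 16-byte alignment for this source buffer, the aligned form can fault whenever the pointer lands on an odd 16-byte boundary. The buffer and function names below are illustrative, not from libvpx:

  #include <immintrin.h>
  #include <stdint.h>

  /* Illustrative buffer: 16-byte aligned, as the encoder guarantees,
   * but not necessarily 32-byte aligned. */
  static uint8_t src_buf[64] __attribute__((aligned(16)));

  static __m256i load_src_row(const uint8_t *src) {
    /* _mm256_load_si256((const __m256i *)src) would fault whenever
     * (uintptr_t)src % 32 == 16; the unaligned load is safe for any
     * address, typically at no extra cost on aligned data. */
    return _mm256_loadu_si256((const __m256i *)src);
  }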
@@ -640,19 +640,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
 #if HAVE_AVX2
 #if CONFIG_VP9_ENCODER
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_ptr[], int ref_stride,
-                          unsigned int *sad_array);
-void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_ptr[], int ref_stride,
-                          unsigned int *sad_array);
-}
 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
 const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
     make_tuple(32, 32, sad_32x32x4d_avx2),
     make_tuple(64, 64, sad_64x64x4d_avx2)));
 #endif  // CONFIG_VP9_ENCODER
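A note added for clarity (not part of the commit): googletest compiles but skips any test or instantiation whose name begins with the DISABLED_ prefix, so dropping that prefix is what re-enables the AVX2 SADx4 cases. The hand-written extern "C" prototypes can be deleted because the rtcd change below makes the generated vp9_rtcd.h declare vp9_sad32x32x4d_avx2 and vp9_sad64x64x4d_avx2 again, exactly as the removed TODO anticipated.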
@@ -653,7 +653,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const
 specialize qw/vp9_sad4x4x8 sse4/;
 add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2/;
 add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad32x64x4d sse2/;
@@ -668,7 +668,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co
 specialize qw/vp9_sad16x32x4d sse2/;
 add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2/;
 add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad16x16x4d sse2/;
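For readers unfamiliar with the rtcd layer (a hedged sketch, assuming the usual shape of libvpx's generated header rather than quoting it): each specialize line tells the runtime-CPU-detection generator which SIMD variants of a prototype exist; the generated vp9_rtcd.h then declares every variant plus a function pointer that setup code aims at the best version the CPU supports. Roughly:

  /* Sketch of what the generator emits for one prototype; simplified,
   * and the exact macros and names may differ. */
  void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_ptr[], int ref_stride,
                         unsigned int *sad_array);
  void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_ptr[], int ref_stride,
                            unsigned int *sad_array);
  void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_ptr[], int ref_stride,
                            unsigned int *sad_array);
  /* Set once at init to the fastest supported variant; re-adding avx2
   * above is what routes encoder calls back to the AVX2 code. */
  RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr,
                                      int src_stride,
                                      const uint8_t *const ref_ptr[],
                                      int ref_stride,
                                      unsigned int *sad_array);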
@@ -31,7 +31,7 @@ void vp9_sad32x32x4d_avx2(uint8_t *src,
   sum_ref3 = _mm256_set1_epi16(0);
   for (i = 0; i < 32 ; i++) {
     // load src and all refs
-    src_reg = _mm256_load_si256((__m256i *)(src));
+    src_reg = _mm256_loadu_si256((__m256i *)(src));
     ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
     ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
     ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
@@ -103,8 +103,8 @@ void vp9_sad64x64x4d_avx2(uint8_t *src,
   sum_ref3 = _mm256_set1_epi16(0);
   for (i = 0; i < 64 ; i++) {
     // load 64 bytes from src and all refs
-    src_reg = _mm256_load_si256((__m256i *)(src));
-    srcnext_reg = _mm256_load_si256((__m256i *)(src + 32));
+    src_reg = _mm256_loadu_si256((__m256i *)(src));
+    srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32));
     ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
     ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32));
     ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
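To make the loop bodies above easier to follow, here is a minimal self-contained sketch (my reconstruction under assumptions, not the file's exact code) of how one 32-byte row contributes to a running SAD: the source and reference rows are loaded unaligned, _mm256_sad_epu8 (VPSADBW) sums absolute byte differences into four 64-bit lanes, and those partial sums accumulate across rows:

  #include <immintrin.h>
  #include <stdint.h>

  /* Sketch: add one 32-byte row's SAD against a single reference into
   * a running per-lane accumulator. All loads unaligned, per the fix. */
  static __m256i sad_row32(const uint8_t *src, const uint8_t *ref,
                           __m256i sum) {
    const __m256i s = _mm256_loadu_si256((const __m256i *)src);
    const __m256i r = _mm256_loadu_si256((const __m256i *)ref);
    /* VPSADBW: |r[i] - s[i]| summed within each 64-bit lane. */
    return _mm256_add_epi64(sum, _mm256_sad_epu8(r, s));
  }

The real functions do this for ref0..ref3 on each iteration, advance src and the four ref pointers by their strides, and finally reduce each accumulator's four lanes into the corresponding sad_array entry.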