Commit 38bc1d0f authored by James Zern's avatar James Zern

vpx_fdct16x16_1_sse2: improve load pattern

load the full row rather than doing 2 8-wide columns

Change-Id: I7a1c0cba06b0dc1ae86046410922b1efccb95c95
parent eb64ea3e
......@@ -91,40 +91,39 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
int i;
for (i = 0; i < 2; ++i) {
input += 8 * i;
in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
......@@ -132,6 +131,7 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
input += 8 * stride;
}
u0 = _mm_setzero_si128();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment