diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8199021fd63428aa9006f33a7c13bdf290f7c113..85fbd79bb932ba752f2402f2f3edeefdda0c9400 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -84,7 +84,7 @@ prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t specialize vp9_mb_lpf_vertical_edge_w sse2 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge +specialize vp9_mbloop_filter_vertical_edge sse2 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_loop_filter_vertical_edge mmx @@ -93,7 +93,7 @@ prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_ specialize vp9_mb_lpf_horizontal_edge_w sse2 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge +specialize vp9_mbloop_filter_horizontal_edge sse2 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_loop_filter_horizontal_edge mmx diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 7d5cae6577242905ce6f71995c6efc550cbe9841..50f890ab8b57ffc8f3b63bc3dcc69ab1f1e88e5f 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -487,7 +487,8 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh) { + const unsigned char *_thresh, + int count) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -507,14 +508,15 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + (void)count; + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -570,8 +572,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, { const __m128i four = _mm_set1_epi16(4); unsigned char *src = s; - int i = 0; - do { + { __m128i workp_a, workp_b, workp_shft; p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); @@ -586,40 +587,38 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i*8], + _mm_storel_epi64((__m128i *)&flat_op2[0], _mm_packus_epi16(workp_shft, workp_shft)); workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i*8], + _mm_storel_epi64((__m128i *)&flat_op1[0], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i*8], + _mm_storel_epi64((__m128i *)&flat_op0[0], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i*8], + _mm_storel_epi64((__m128i *)&flat_oq0[0], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i*8], + _mm_storel_epi64((__m128i *)&flat_oq1[0], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i*8], + _mm_storel_epi64((__m128i *)&flat_oq2[0], _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); + } } // lp filter { @@ -631,13 +630,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; @@ -679,47 +678,47 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_andnot_si128(hev, filt); work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } } @@ -766,7 +765,7 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u, /* Loop filtering */ vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit, - _thresh); + _thresh, 1); /* Store result */ _mm_storel_epi64((__m128i *)(u - 3 * p), @@ -929,18 +928,20 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p, const unsigned char *blimit, const unsigned char *limit, - const unsigned char *thresh) { + const unsigned char *thresh, + int count) { DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); unsigned char *src[2]; unsigned char *dst[2]; + (void)count; /* Transpose 16x16 */ transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16); /* Loop filtering */ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh); + thresh, 1); src[0] = t_dst + 3 * 16; src[1] = t_dst + 3 * 16 + 8; @@ -999,7 +1000,7 @@ void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u, /* Loop filtering */ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh); + thresh, 1); src[0] = t_dst + 3 * 16; src[1] = t_dst + 3 * 16 + 8;