diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 8aab0ff0395867607502b2aa0d26fdd410b87d96..c0467cd843d0c215150a4187c33d32b0c5893145 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -54,9 +54,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_mb_v = + vp8_loop_filter_simple_vertical_edge_armv6; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_mb_h = + vp8_loop_filter_simple_horizontal_edge_armv6; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm index c7441b055cb738c88981e4fa4bc675cf517ae2e0..1cbbbcdef5e2533b43a7095b9588d07b474d939b 100644 --- a/vp8/common/arm/armv6/loopfilter_v6.asm +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -53,14 +53,11 @@ count RN r5 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, +;r2 const char *blimit, ;r3 const char *limit, ;stack const char *thresh, ;stack int count -;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. 
- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- @@ -72,14 +69,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Hnext8| ; vp8_filter_mask() function @@ -275,14 +276,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBHnext8| @@ -584,15 +589,19 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 
|Vnext8| @@ -855,18 +864,22 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit pld [src, #23] ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit pld [src, #23] ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 pld [src, #23] ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBVnext8| ; vp8_filter_mask() function @@ -906,6 +919,7 @@ count RN r5 str lr, [sp, #8] ldr lr, [src], pstep + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ldr lr, [sp, #8] ; load back (f)limit accumulator @@ -954,6 +968,7 @@ count RN r5 beq mbvskip_filter ; skip filtering + ;vp8_hevmask() function ;calculate high edge variance @@ -1121,6 +1136,7 @@ count RN r5 smlabb r8, r6, lr, r7 smlatb r6, r6, lr, r7 smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 ssat r8, #8, r8, asr #7 ssat r6, #8, r6, asr #7 diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 40a71f49db7257c8654cc8e6a79afe398f150ee2..5e00cf01bbd7296ea6bd9455470c1f2d2baa617b 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -45,35 +45,28 @@ MEND + src RN r0 pstep RN r1 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -; All 16 elements in flimit are equal. So, in the code, only one load is needed -; for flimit. Same applies to limit. 
thresh is not used in simple looopfilter +;r2 const char *blimit ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r3] ; limit + ldrb r12, [r2] ; blimit ldr r3, [src, -pstep, lsl #1] ; p1 ldr r4, [src, -pstep] ; p0 ldr r5, [src] ; q0 ldr r6, [src, pstep] ; q1 - ldr r7, [r2] ; flimit + orr r12, r12, r12, lsl #8 ; blimit ldr r2, c0x80808080 - ldr r9, [sp, #40] ; count for 8-in-parallel - uadd8 r7, r7, r7 ; flimit * 2 - mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time - uadd8 r12, r7, r12 ; flimit * 2 + limit + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; double the count. we're doing 4 at a time mov lr, #0 ; need 0 in a couple places |simple_hnext8| @@ -148,34 +141,32 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r2] ; r12: flimit + ldrb r12, [r2] ; r12: blimit ldr r2, c0x80808080 - ldr r7, [r3] ; limit + orr r12, r12, r12, lsl #8 ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] pld [src, #23] ; preload for next block ldrh r4, [src], pstep - uadd8 r12, r12, r12 ; flimit * 2 + orr r12, r12, r12, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] pld [src, #23] ldrh r4, [src], pstep - ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - mov r11, r11, lsl #1 ; 4-in-parallel + mov r11, #4 ; double the count. 
we're doing 4 at a time |simple_vnext8| ; vp8_simple_filter_mask() function diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 6d1caa485c8a4f3486e7a8fa7cc41f870343b035..c841d455a0c6963ad01fd24c6bee3a4561db1549 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -9,30 +9,34 @@ */ -#include "vpx_ports/config.h" -#include <math.h> +#include "vpx_config.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" +#if HAVE_ARMV6 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); - -extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon); -extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon); - -extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; +#endif +#if HAVE_ARMV7 +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon 
vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; + +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif #if HAVE_ARMV6 /*ARMV6 loopfilter functions*/ @@ -40,96 +44,72 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, 
lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, 
lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, 
lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); } #endif @@ -139,83 +119,58 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - -void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, 
y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); -void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride); -} + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + 
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); } /* Vertical B Filtering */ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; - if (u_ptr) - vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); -} + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); -void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) 
uv_stride; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h index cd62207d7012961cba8b9aa7109d266cc4b9d609..390a547b0776143bf7023043bcf38856a52df523 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -12,15 +12,17 @@ #ifndef LOOPFILTER_ARM_H #define LOOPFILTER_ARM_H +#include "vpx_config.h" + #if HAVE_ARMV6 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -36,28 +38,29 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v 
vp8_loop_filter_bvs_armv6 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6 #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ #if HAVE_ARMV7 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon); extern prototype_loopfilter_block(vp8_loop_filter_bv_neon); extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon); extern prototype_loopfilter_block(vp8_loop_filter_bh_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon); #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon -#endif -#endif +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV7 */ -#endif +#endif /* LOOPFILTER_ARM_H */ diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index e73dd6401a6ae17048473349004785ef602a6dcf..e44be0a1e34d2199c20401aabc68315f2be2cb35 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -14,109 +14,97 @@ EXPORT |vp8_loop_filter_vertical_edge_y_neon| EXPORT |vp8_loop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. 
-; All 16 elements in these variables are equal. - -; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) ; r0 unsigned char *src ; r1 int pitch -; r2 const signed char *flimit -; r3 const signed char *limit -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_loop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r2], r1 ; p3 - vld1.u8 {q4}, [r2], r1 ; p2 - vld1.u8 {q5}, [r2], r1 ; p1 - vld1.u8 {q6}, [r2], r1 ; p0 - vld1.u8 {q7}, [r2], r1 ; q0 - vld1.u8 {q8}, [r2], r1 ; q1 - vld1.u8 {q9}, [r2], r1 ; q2 - vld1.u8 {q10}, [r2] ; q3 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - sub r0, r0, r1, lsl #1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 + + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 bl vp8_loop_filter_neon - vst1.u8 {q5}, [r0], r1 ; store op1 - vst1.u8 {q6}, [r0], r1 ; store op0 - vst1.u8 {q7}, [r0], r1 ; store oq0 - vst1.u8 {q8}, [r0], r1 ; store oq1 + vst1.u8 {q5}, [r2@128], r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; 
|vp8_loop_filter_horizontal_edge_y_neon| -; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) + ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.u8 {d6}, [r3], r1 ; p3 - vld1.u8 {d8}, [r3], r1 ; p2 - vld1.u8 {d10}, [r3], r1 ; p1 - vld1.u8 {d12}, [r3], r1 ; p0 - vld1.u8 {d14}, [r3], r1 ; q0 - vld1.u8 {d16}, [r3], r1 ; q1 - vld1.u8 {d18}, [r3], r1 ; q2 - vld1.u8 {d20}, [r3] ; q3 - - ldr r3, [sp, #4] ; load thresh pointer - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - vld1.u8 {d7}, [r12], r1 ; p3 - vld1.u8 {d9}, [r12], r1 ; p2 - vld1.u8 {d11}, [r12], r1 ; p1 - vld1.u8 {d13}, [r12], r1 ; p0 - vld1.u8 {d15}, [r12], r1 ; q0 - vld1.u8 {d17}, [r12], r1 ; q1 - vld1.u8 {d19}, [r12], r1 ; q2 - vld1.u8 {d21}, [r12] ; q3 - vld1.s8 {d4[], d5[]}, [r3] ; thresh + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + 
vld1.u8 {d21}, [r12@64] ; q3 bl vp8_loop_filter_neon sub r0, r0, r1, lsl #1 sub r2, r2, r1, lsl #1 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r2], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r2], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r2], r1 ; store v oq0 - vst1.u8 {d16}, [r0] ; store u oq1 - vst1.u8 {d17}, [r2] ; store v oq1 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, @@ -124,39 +112,38 @@ ; const signed char *limit, ; const signed char *thresh, ; int count) -; r0 unsigned char *src, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + |vp8_loop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - sub r2, r0, #4 ; src ptr down by 4 columns - sub r0, r0, #2 ; dst ptr - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {d6}, [r2], r1 ; load first 8-line src data - vld1.u8 {d8}, [r2], r1 + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 + + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r2], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r2], r1 
+ vld1.u8 {d16}, [r12], r1 vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r2], r1 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + vld1.u8 {d20}, [r12], r1 vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r2], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r2], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d19}, [r2], r1 - vld1.u8 {d21}, [r2] + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -164,6 +151,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r3 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -178,28 +167,34 @@ vswp d12, d11 vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + vswp d14, d12 vswp d16, d15 + add r12, r0, r1, asr #1 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1 - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0] + vst4.8 {d14[5], d15[5], 
d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_y_neon| ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch @@ -209,38 +204,36 @@ ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v |vp8_loop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r12, r0, #4 ; move u pointer down by 4 columns - vld1.s8 {d0[], d1[]}, [r2] ; flimit - vld1.s8 {d2[], d3[]}, [r3] ; limit - + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns ldr r2, [sp, #8] ; load v ptr - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d20}, [r12] - + vdup.u8 q1, r3 ; duplicate limit sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] vld1.u8 {d21}, [r3] - ldr r12, [sp, #4] ; load thresh pointer + ldr r12, [sp, #4] ; load thresh ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -248,6 +241,8 @@ vtrn.32 q5, q9 vtrn.32 q6, q10 + vdup.u8 q2, r12 ; duplicate thresh + vtrn.16 q3, q5 vtrn.16 q4, q6 vtrn.16 q7, q9 @@ -258,18 +253,16 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - bl vp8_loop_filter_neon - sub r0, r0, 
#2 - sub r2, r2, #2 - vswp d12, d11 vswp d16, d13 vswp d14, d12 vswp d16, d15 + sub r0, r0, #2 + sub r2, r2, #2 + ;store op1, op0, oq0, oq1 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 @@ -288,7 +281,7 @@ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| ; void vp8_loop_filter_neon(); @@ -316,42 +309,44 @@ vabd.u8 q14, q8, q7 ; abs(q1 - q0) vabd.u8 q3, q9, q8 ; abs(q2 - q1) vabd.u8 q4, q10, q9 ; abs(q3 - q2) - vabd.u8 q9, q6, q7 ; abs(p0 - q0) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 vmax.u8 q3, q3, q4 vmax.u8 q15, q11, q12 + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 vmax.u8 q15, q15, q3 - vadd.u8 q0, q0, q0 ; flimit * 2 - vadd.u8 q0, q0, q1 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 + vmov.u8 q10, #0x80 ; 0x80 vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - vshr.u8 q2, q2, #1 ; a = a / 2 - vqadd.u8 q9, q9, q2 ; a = b + a - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vmov.u8 q0, #0x80 ; 0x80 + vcge.u8 q15, q1, q15 ; vp8_filter() function ; convert to signed - veor q7, q7, q0 ; qs0 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - veor q8, q8, q0 ; qs1 + veor q7, q7, q10 ; qs0 + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 + vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 + vmovl.u8 q4, d20 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) @@ -378,19 +373,20 @@ vshr.s8 q2, q2, #3 ; Filter2 >>= 3 vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) ; outer tap adjustments: ++vp8_filter 
>> 1 vrshr.s8 q1, q1, #1 vbic q1, q1, q14 ; vp8_filter &= ~hev - + vmov.u8 q0, #0x80 ; 0x80 vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - veor q5, q13, q0 ; *op1 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 veor q8, q12, q0 ; *oq1 = u^0x80 bx lr diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 7c5ea3644a8b130f43903abc35f1d9a69b82a40a..adf848b9c347966ecd5205b8f9a8f0a4cd46f9c2 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -9,99 +9,109 @@ ; - EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp8_loop_filter_bhs_neon| + EXPORT |vp8_loop_filter_mbhs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. 
-; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_horizontal_edge_neon| PROC - sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld1.u8 {q6}, [r0], r1 ; p0 - vmov.u8 q0, #0x80 ; 0x80 - vld1.u8 {q7}, [r0], r1 ; q0 - vmov.u8 q10, #0x03 ; 0x03 - vld1.u8 {q8}, [r0] ; q1 + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p1 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p0 - ;vp8_filter_mask() function vabd.u8 q15, q6, q7 ; abs(p0 - q0) vabd.u8 q14, q5, q8 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - ;vp8_filter() function veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 -;;;;;;;;;; - ;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0) vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q3, d15, d13 vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - ;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0) - vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q12, q3, q3 + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + vmov.u8 q10, #0x03 ; 0x03 
vmov.u8 q9, #0x04 ; 0x04 - vadd.s16 q2, q2, q11 - vadd.s16 q3, q3, q12 - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q3, q3, d9 - ;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d9, q3 -;;;;;;;;;;;;; - vand q4, q4, q15 ; vp8_filter &= mask + vand q14, q4, q15 ; vp8_filter &= mask - vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q4, #3 ; Filter1 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - sub r0, r0, r1, lsl #1 + sub r0, r0, r1 ;calculate output vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) - add r3, r0, r1 - veor q6, q11, q0 ; *op0 = u^0x80 veor q7, q10, q0 ; *oq0 = u^0x80 - vst1.u8 {q6}, [r0] ; store op0 - vst1.u8 {q7}, [r3] ; store oq0 + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 bx lr ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| -;----------------- +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp8_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 
const unsigned char *blimit + +|vp8_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp8_loop_filter_mbhs_neon| END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index a7f7b690e3265fb6cffd96be3745f0f35c9ffa6b..e690df2f7de9d8e3e9cd502f78c24fd70c5c6241 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -9,59 +9,54 @@ ; - EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| ARM - REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit -;are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. 
-; r0 unsigned char *s, -; r1 int p, //pitch -; r2 const signed char *flimit, -; r3 const signed char *limit, -; stack(r4) const signed char *thresh (unused) -; //stack(r5) int count --unused + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE |vp8_loop_filter_simple_vertical_edge_neon| PROC sub r0, r0, #2 ; move src pointer down by 2 columns - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1 - vld1.s8 {d2[], d3[]}, [r2] ; flimit - vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vmov.u8 q0, #0x80 ; 0x80 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vmov.u8 q11, #0x03 ; 0x03 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vmov.u8 q12, #0x04 ; 0x04 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 
{d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] vswp d7, d10 vswp d12, d9 - ;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6 ;vp8_filter_mask() function ;vp8_hevmask() function sub r0, r0, r1, lsl #4 vabd.u8 q15, q5, q4 ; abs(p0 - q0) vabd.u8 q14, q3, q6 ; abs(p1 - q1) + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value @@ -69,80 +64,91 @@ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - vadd.u8 q1, q1, q1 ; flimit * 2 - vadd.u8 q1, q1, q13 ; flimit * 2 + limit vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - ;vp8_filter() function -;;;;;;;;;; - ;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0) vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) vsubl.s8 q13, d9, d11 - vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 - ;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0) - vadd.s16 q14, q13, q13 - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q14 + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 - ;vqadd.s8 q1, q1, q2 - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 - vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d3, q13 + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 add r0, r0, #1 - add r2, r0, r1 
-;;;;;;;;;;; + add r3, r0, r1 - vand q1, q1, q15 ; vp8_filter &= mask + vand q14, q14, q15 ; vp8_filter &= mask - vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 ;calculate output - vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1) vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - veor q7, q10, q0 ; *oq0 = u^0x80 veor q6, q11, q0 ; *op0 = u^0x80 - - add r3, r2, r1 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 vswp d13, d14 - add r12, r3, r1 ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0] - vst2.8 {d12[1], d13[1]}, [r2] - vst2.8 {d12[2], d13[2]}, [r3] - vst2.8 {d12[3], d13[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d12[4], d13[4]}, [r12] - vst2.8 {d12[5], d13[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d12[6], d13[6]}, [r0] - vst2.8 {d12[7], d13[7]}, [r2], r1 - add r3, r2, r1 - vst2.8 {d14[0], d15[0]}, [r2] - vst2.8 {d14[1], d15[1]}, [r3], r1 - add r12, r3, r1 - vst2.8 {d14[2], d15[2]}, [r3] - vst2.8 {d14[3], d15[3]}, [r12], r1 - add r0, r12, r1 - vst2.8 {d14[4], d15[4]}, [r12] - vst2.8 {d14[5], d15[5]}, [r0], r1 - add r2, r0, r1 - vst2.8 {d14[6], d15[6]}, [r0] - vst2.8 {d14[7], d15[7]}, [r2] + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, [r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, [r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, 
[r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] bx lr ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| -;----------------- - +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp8_loop_filter_simple_vertical_edge_neon + ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp8_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_mbvs_neon| END diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 72f0f92712fc57ddee934ff294292b31f08e4eb4..f41c156df8b27783c36ef81ba0f1cada5f666e2c 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -14,155 +14,143 @@ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| ARM - REQUIRE8 - PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -; flimit, limit, and thresh should be positive numbers. -; All 16 elements in these variables are equal. 
- ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_horizontal_edge_y_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r12, [sp, #4] ; load thresh pointer - - vld1.u8 {q3}, [r0], r1 ; p3 - vld1.s8 {d2[], d3[]}, [r3] ; limit - vld1.u8 {q4}, [r0], r1 ; p2 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.u8 {q5}, [r0], r1 ; p1 - vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q8}, [r0], r1 ; q1 - vld1.u8 {q9}, [r0], r1 ; q2 - vld1.u8 {q10}, [r0], r1 ; q3 + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line + + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 {q10}, [r12@128], r1 ; q3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - add r0, r0, r1 - add r2, r0, r1 - add r3, r2, r1 - - vst1.u8 {q4}, [r0] ; store op2 - vst1.u8 {q5}, [r2] ; store op1 - vst1.u8 {q6}, [r3], r1 ; store op0 - add r12, r3, r1 - vst1.u8 {q7}, [r3] ; store oq0 - vst1.u8 {q8}, [r12], r1 ; store oq1 - vst1.u8 {q9}, [r12] ; store oq2 - - ldmia sp!, {pc} + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 + + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store 
op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, ; sp+4 unsigned char *v + |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0], r1 ; p3 - vld1.u8 {d7}, [r3], r1 ; p3 - vld1.u8 {d8}, [r0], r1 ; p2 - vld1.u8 {d9}, [r3], r1 ; p2 - vld1.u8 {d10}, [r0], r1 ; p1 - vld1.u8 {d11}, [r3], r1 ; p1 - vld1.u8 {d12}, [r0], r1 ; p0 - vld1.u8 {d13}, [r3], r1 ; p0 - vld1.u8 {d14}, [r0], r1 ; q0 - vld1.u8 {d15}, [r3], r1 ; q0 - vld1.u8 {d16}, [r0], r1 ; q1 - vld1.u8 {d17}, [r3], r1 ; q1 - vld1.u8 {d18}, [r0], r1 ; q2 - vld1.u8 {d19}, [r3], r1 ; q2 - vld1.u8 {d20}, [r0], r1 ; q3 - vld1.u8 {d21}, [r3], r1 ; q3 - - vld1.s8 {d4[], d5[]}, [r12] ; thresh + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], 
r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 bl vp8_mbloop_filter_neon sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 + sub r12, r12, r1, lsl #3 add r0, r0, r1 - add r3, r3, r1 - - vst1.u8 {d8}, [r0], r1 ; store u op2 - vst1.u8 {d9}, [r3], r1 ; store v op2 - vst1.u8 {d10}, [r0], r1 ; store u op1 - vst1.u8 {d11}, [r3], r1 ; store v op1 - vst1.u8 {d12}, [r0], r1 ; store u op0 - vst1.u8 {d13}, [r3], r1 ; store v op0 - vst1.u8 {d14}, [r0], r1 ; store u oq0 - vst1.u8 {d15}, [r3], r1 ; store v oq0 - vst1.u8 {d16}, [r0], r1 ; store u oq1 - vst1.u8 {d17}, [r3], r1 ; store v oq1 - vst1.u8 {d18}, [r0], r1 ; store u oq2 - vst1.u8 {d19}, [r3], r1 ; store v oq2 - - ldmia sp!, {pc} + add r12, r12, r1 + + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v oq2 + + pop {pc} ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) ; r0 unsigned char *src, ; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char 
*thresh, -; sp+4 int count (unused) +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, |vp8_mbloop_filter_vertical_edge_y_neon| PROC - stmdb sp!, {lr} + push {lr} + ldr r12, [sp, #4] ; load thresh sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - ldr r12, [sp, #4] ; load thresh pointer + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data vld1.u8 {d8}, [r0], r1 - sub sp, sp, #32 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - - vld1.u8 {d7}, [r0], r1 ; load second 8-line src data - vld1.u8 {d9}, [r0], r1 - vld1.u8 {d11}, [r0], r1 - vld1.u8 {d13}, [r0], r1 - vld1.u8 {d15}, [r0], r1 - vld1.u8 {d17}, [r0], r1 - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d21}, [r0], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -180,29 +168,17 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - vld1.s8 {d2[], d3[]}, [r3] ; limit - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #4 - - add r2, r0, r1 - - add r3, r2, r1 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! 
+ sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 vtrn.32 q4, q8 vtrn.32 q5, q9 vtrn.32 q6, q10 - add r12, r3, r1 vtrn.16 q3, q5 vtrn.16 q4, q6 @@ -215,36 +191,30 @@ vtrn.8 q9, q10 ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0] - vst1.8 {d8}, [r2] - vst1.8 {d10}, [r3] - vst1.8 {d12}, [r12], r1 - add r0, r12, r1 - vst1.8 {d14}, [r12] - vst1.8 {d16}, [r0], r1 - add r2, r0, r1 - vst1.8 {d18}, [r0] - vst1.8 {d20}, [r2], r1 - add r3, r2, r1 - vst1.8 {d7}, [r2] - vst1.8 {d9}, [r3], r1 - add r12, r3, r1 - vst1.8 {d11}, [r3] + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 vst1.8 {d13}, [r12], r1 - add r0, r12, r1 - vst1.8 {d15}, [r12] - vst1.8 {d17}, [r0], r1 - add r2, r0, r1 - vst1.8 {d19}, [r0] - vst1.8 {d21}, [r2] - - ldmia sp!, {pc} + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| ; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, ; unsigned char *v) ; r0 unsigned char *u, ; r1 int pitch, @@ -253,30 +223,29 @@ ; sp const signed char *thresh, ; sp+4 unsigned char *v |vp8_mbloop_filter_vertical_edge_uv_neon| PROC - stmdb sp!, {lr} - sub r0, r0, #4 ; move src pointer down by 4 columns - vld1.s8 {d2[], d3[]}, [r3] ; limit - ldr r3, [sp, #8] ; load v ptr - ldr r12, [sp, #4] ; load thresh pointer - - sub r3, r3, #4 ; move v pointer down by 4 columns + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, 
r12, #4 ; move v pointer down by 4 columns vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d7}, [r12], r1 ;load v data vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r3], r1 + vld1.u8 {d9}, [r12], r1 vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r3], r1 + vld1.u8 {d11}, [r12], r1 vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r3], r1 + vld1.u8 {d13}, [r12], r1 vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r3], r1 + vld1.u8 {d15}, [r12], r1 vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r3], r1 + vld1.u8 {d17}, [r12], r1 vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r3], r1 + vld1.u8 {d19}, [r12], r1 vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r3], r1 + vld1.u8 {d21}, [r12], r1 ;transpose to 8x16 matrix vtrn.32 q3, q7 @@ -294,19 +263,11 @@ vtrn.8 q7, q8 vtrn.8 q9, q10 - sub sp, sp, #32 - vld1.s8 {d4[], d5[]}, [r12] ; thresh - mov r12, sp - vst1.u8 {q3}, [r12]! - vst1.u8 {q10}, [r12]! + sub r0, r0, r1, lsl #3 bl vp8_mbloop_filter_neon - sub r0, r0, r1, lsl #3 - sub r3, r3, r1, lsl #3 - - vld1.u8 {q3}, [sp]! - vld1.u8 {q10}, [sp]! 
+ sub r12, r12, r1, lsl #3 ;transpose to 16x8 matrix vtrn.32 q3, q7 @@ -326,23 +287,23 @@ ;store op2, op1, op0, oq0, oq1, oq2 vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r3], r1 + vst1.8 {d7}, [r12], r1 vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r3], r1 + vst1.8 {d9}, [r12], r1 vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r3], r1 + vst1.8 {d11}, [r12], r1 vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r3], r1 + vst1.8 {d13}, [r12], r1 vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r3], r1 + vst1.8 {d15}, [r12], r1 vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r3], r1 + vst1.8 {d17}, [r12], r1 vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r3], r1 - vst1.8 {d20}, [r0], r1 - vst1.8 {d21}, [r3], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] - ldmia sp!, {pc} + pop {pc} ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| ; void vp8_mbloop_filter_neon() @@ -350,26 +311,19 @@ ; functions do the necessary load, transpose (if necessary), preserve (if ; necessary) and store. -; TODO: -; The vertical filter writes p3/q3 back out because two 4 element writes are -; much simpler than ordering and writing two 3 element sets (or three 2 elements -; sets, or whichever other combinations are possible). -; If we can preserve q3 and q10, the vertical filter will be able to avoid -; storing those values on the stack and reading them back after the filter. 
- ; r0,r1 PRESERVE -; r2 flimit -; r3 PRESERVE -; q1 limit +; r2 mblimit +; r3 limit + ; q2 thresh -; q3 p3 +; q3 p3 PRESERVE ; q4 p2 ; q5 p1 ; q6 p0 ; q7 q0 ; q8 q1 ; q9 q2 -; q10 q3 +; q10 q3 PRESERVE |vp8_mbloop_filter_neon| PROC @@ -378,12 +332,12 @@ vabd.u8 q12, q4, q5 ; abs(p2 - p1) vabd.u8 q13, q5, q6 ; abs(p1 - p0) vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) vabd.u8 q0, q10, q9 ; abs(q3 - q2) vmax.u8 q11, q11, q12 vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q0 + vmax.u8 q1, q1, q0 vmax.u8 q15, q11, q12 vabd.u8 q12, q6, q7 ; abs(p0 - q0) @@ -391,44 +345,46 @@ ; vp8_hevmask vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q3 + vmax.u8 q15, q15, q1 - vld1.s8 {d4[], d5[]}, [r2] ; flimit + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit vmov.u8 q0, #0x80 ; 0x80 - vadd.u8 q2, q2, q2 ; flimit * 2 - vadd.u8 q2, q2, q1 ; flimit * 2 + limit vcge.u8 q15, q1, q15 vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vshr.u8 q1, q1, #1 ; a = a / 2 - vqadd.u8 q12, q12, q1 ; a = b + a - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vmov.u16 q11, #3 ; #3 ; vp8_filter ; convert to signed veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 veor q6, q6, q0 ; ps0 veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + veor q8, q8, q0 ; qs1 veor q4, q4, q0 ; ps2 veor q9, q9, q0 ; qs2 vorr q14, q13, q14 ; vp8_hevmask + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + vsubl.s8 q2, d14, d12 ; qs0 - ps0 vsubl.s8 q13, d15, d13 vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0) - vadd.s16 q11, q13, q13 + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vand q15, q15, q12 ; vp8_filter_mask - vadd.s16 q2, q2, q10 - vadd.s16 q13, q13, q11 + vmul.i16 q13, q13, q11 vmov.u8 q12, #3 ; #3 @@ -447,23 +403,19 @@ vand q13, q1, q14 ; Filter2 &= hev - vmov.u8 d7, #9 ; #9 - 
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vmov.u8 d6, #18 ; #18 + vmov q0, q15 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - vmov q10, q15 + vmov q11, q15 vmov q12, q15 vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vmov.u8 d5, #27 ; #27 - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) vbic q1, q1, q14 ; vp8_filter &= ~hev @@ -471,35 +423,43 @@ ; roughly 1/7th difference across boundary ; roughly 2/7th difference across boundary ; roughly 3/7th difference across boundary - vmov q11, q15 + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + vmov q13, q15 vmov q14, q15 - vmlal.s8 q10, d2, d7 ; Filter2 * 9 - vmlal.s8 q11, d3, d7 - vmlal.s8 q12, d2, d6 ; Filter2 * 18 - vmlal.s8 q13, d3, d6 - vmlal.s8 q14, d2, d5 ; Filter2 * 27 + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 vmlal.s8 q15, d3, d5 - vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d21, q11, #7 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) vqshrn.s16 d25, q13, #7 vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) vqshrn.s16 d29, q15, #7 - vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u) - vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u) + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - veor q9, q11, q0 ; *oq2 = s^0x80 - veor q4, q10, q0 ; *op2 = s^0x80 - veor q8, q13, q0 ; *oq1 = s^0x80 - veor q5, q12, q0 ; *op2 = s^0x80 - veor q7, q15, q0 ; *oq0 = s^0x80 - veor q6, q14, q0 ; *op0 = s^0x80 + + veor q9, q11, q1 ; 
*oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op1 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 bx lr ENDP ; |vp8_mbloop_filter_neon| diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index d9d439cf8ab520ba6d7cd89ef396796a76f75199..47b13c7556e7ecbca4fdccaf79622729bd603830 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -108,9 +108,9 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c; #if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS) diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index a3242716f2407d1d15d7e4198c59ae1adfccce4d..be3f53593000ded44aef34f74167f4ba56856727 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -9,152 +9,149 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "loopfilter.h" #include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" typedef unsigned char uc; - prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); prototype_loopfilter(vp8_loop_filter_vertical_edge_c); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); + +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); 
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c); /* Horizontal MB filtering */ -void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ -void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 
1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - -void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ -void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned 
char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ -void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + 
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); } -void vp8_init_loop_filter(VP8_COMMON *cm) +static void lf_init_lut(loop_filter_info_n *lfi) { - loop_filter_info *lfi = cm->lf_info; - LOOPFILTERTYPE lft = cm->filter_type; - int sharpness_lvl = cm->sharpness_level; - int frame_type = cm->frame_type; - int i, j; + int filt_lvl; - int block_inside_limit = 0; - int HEVThresh; - - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. 
*/ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) { - int filt_lvl = i; - - if (frame_type == KEY_FRAME) + if (filt_lvl >= 40) { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } + else if (filt_lvl >= 20) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } + else if (filt_lvl >= 15) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; } else { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; + +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) +{ + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + int block_inside_limit = 0; /* Set loop filter paramaeters that control sharpness. 
*/ block_inside_limit = filt_lvl >> (sharpness_lvl > 0); @@ -169,119 +166,120 @@ void vp8_init_loop_filter(VP8_COMMON *cm) if (block_inside_limit < 1) block_inside_limit = 1; - for (j = 0; j < 16; j++) - { - lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl + 2; - lfi[i].flim[j] = filt_lvl; - lfi[i].thr[j] = HEVThresh; - } - + vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), + SIMD_WIDTH); + vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); } +} - /* Set up the function pointers depending on the type of loop filtering selected */ - if (lft == NORMAL_LOOPFILTER) - { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h); - } - else +void vp8_loop_filter_init(VP8_COMMON *cm) +{ + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for(i = 0; i < 4 ; i++) { - cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v); - cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v); - cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h); - cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h); + vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } } -/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding - * each frame. Check last_frame_type to skip the function most of times. 
- */ -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type) +void vp8_loop_filter_frame_init(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl, + int sharpness_lvl) { - int HEVThresh; - int i, j; + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ - /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if(cm->last_sharpness_level != sharpness_lvl) { - int filt_lvl = i; + vp8_loop_filter_update_sharpness(lfi, sharpness_lvl); + cm->last_sharpness_level = sharpness_lvl; + } - if (frame_type == KEY_FRAME) - { - if (filt_lvl >= 40) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; - } - else + for(seg = 0; seg < MAX_MB_SEGMENTS; seg++) + { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) { - if (filt_lvl >= 40) - HEVThresh = 3; - else if (filt_lvl >= 20) - HEVThresh = 2; - else if (filt_lvl >= 15) - HEVThresh = 1; - else - HEVThresh = 0; + /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + else /* Delta Value */ + { + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 
63: lvl_seg) : 0; + } } - for (j = 0; j < 16; j++) + if (!mbd->mode_ref_lf_delta_enabled) { - /*lfi[i].lim[j] = block_inside_limit; - lfi[i].mbflim[j] = filt_lvl+2;*/ - /*lfi[i].flim[j] = filt_lvl;*/ - lfi[i].thr[j] = HEVThresh; + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 ); + continue; } - } -} + lvl_ref = lvl_seg; -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level) -{ - MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; + /* INTRA_FRAME */ + ref = INTRA_FRAME; - if (mbd->mode_ref_lf_delta_enabled) - { /* Apply delta for reference frame */ - filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; + lvl_ref += mbd->ref_lf_deltas[ref]; - /* Apply delta for mode */ - if (mbmi->ref_frame == INTRA_FRAME) - { - /* Only the split mode BPRED has a further special case */ - if (mbmi->mode == B_PRED) - filter_level += mbd->mode_lf_deltas[0]; - } - else + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 
63 : lvl_ref) : 0; /* clamp */ + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for(ref = 1; ref < MAX_REF_FRAMES; ref++) { - /* Zero motion mode */ - if (mbmi->mode == ZEROMV) - filter_level += mbd->mode_lf_deltas[1]; + int lvl_ref = lvl_seg; - /* Split MB motion mode */ - else if (mbmi->mode == SPLITMV) - filter_level += mbd->mode_lf_deltas[3]; + /* Apply delta for reference frame */ + lvl_ref += mbd->ref_lf_deltas[ref]; - /* All other inter motion modes (Nearest, Near, New) */ - else - filter_level += mbd->mode_lf_deltas[2]; - } + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) + { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ - /* Range check */ - if (filter_level > MAX_LOOP_FILTER) - filter_level = MAX_LOOP_FILTER; - else if (filter_level < 0) - filter_level = 0; + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } } - return filter_level; } - void vp8_loop_filter_frame ( VP8_COMMON *cm, @@ -290,49 +288,23 @@ void vp8_loop_filter_frame ) { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - loop_filter_info *lfi = cm->lf_info; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; int mb_row; int mb_col; - - int baseline_filter_level[MAX_MB_SEGMENTS]; int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; - int i; unsigned char *y_ptr, *u_ptr, *v_ptr; - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ - - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? 
((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, cm->sharpness_level); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -344,51 +316,79 @@ void vp8_loop_filter_frame { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Distance of Mb to the various image edges. 
- * These specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); 
+ + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; u_ptr += 8; v_ptr += 8; - mbd->mode_info_context++; /* step to next MB */ + mode_info_context++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; u_ptr += post->uv_stride * 8 - post->uv_width; v_ptr += post->uv_stride * 8 - post->uv_width; - mbd->mode_info_context++; /* Skip border mb */ + mode_info_context++; /* Skip border mb */ } } - void vp8_loop_filter_frame_yonly ( VP8_COMMON *cm, @@ -399,49 +399,28 @@ void vp8_loop_filter_frame_yonly { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - loop_filter_info *lfi = cm->lf_info; - int baseline_filter_level[MAX_MB_SEGMENTS]; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + int filter_level; - int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; - /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ + sharpness_lvl = cm->sharpness_level; - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? 
baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif /* Initialize the loop filter for this frame. */ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, sharpness_lvl); /* Set up the buffer pointers */ y_ptr = post->y_buffer; @@ -451,44 +430,75 @@ void vp8_loop_filter_frame_yonly { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; - /* Apply any context driven MB level adjustment */ - filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == 
NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context ++; /* step to next MB */ + mode_info_context ++; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context ++; /* Skip border mb */ + mode_info_context ++; /* Skip border mb */ } } - void vp8_loop_filter_partial_frame ( VP8_COMMON *cm, @@ -500,25 +510,32 @@ void vp8_loop_filter_partial_frame { YV12_BUFFER_CONFIG *post = cm->frame_to_show; - int i; unsigned char *y_ptr; int mb_row; int mb_col; - /*int mb_rows = post->y_height >> 4;*/ int mb_cols = post->y_width >> 4; - int linestocopy; + int linestocopy, i; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; - loop_filter_info *lfi = cm->lf_info; - int 
baseline_filter_level[MAX_MB_SEGMENTS]; int filter_level; int alt_flt_enabled = mbd->segmentation_enabled; FRAME_TYPE frame_type = cm->frame_type; - (void) sharpness_lvl; + const MODE_INFO *mode_info_context; + + int lvl_seg[MAX_MB_SEGMENTS]; + + sharpness_lvl = cm->sharpness_level; - /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */ - mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */ +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); linestocopy = (post->y_height >> (4 + Fraction)); @@ -531,29 +548,24 @@ void vp8_loop_filter_partial_frame if (alt_flt_enabled) { for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ + { /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + { + lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + } /* Delta Value */ else { - baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ + lvl_seg[i] = default_filt_lvl + + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + lvl_seg[i] = (lvl_seg[i] > 0) ? + ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0; } } } else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - baseline_filter_level[i] = default_filt_lvl; - } + lvl_seg[0] = default_filt_lvl; - /* Initialize the loop filter for this frame. 
*/ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); /* Set up the buffer pointers */ y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; @@ -563,32 +575,64 @@ void vp8_loop_filter_partial_frame { for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; - int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && - mbd->mode_info_context->mbmi.mode != SPLITMV && - mbd->mode_info_context->mbmi.mb_skip_coeff); + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); - filter_level = baseline_filter_level[Segment]; + if (alt_flt_enabled) + filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; + else + filter_level = lvl_seg[0]; if (filter_level) { - if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - - if (!skip_lf) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h) + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h) + (y_ptr, 0, 0, 
post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h) + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h) + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } } y_ptr += 16; - mbd->mode_info_context += 1; /* step to next MB */ + mode_info_context += 1; /* step to next MB */ } y_ptr += post->y_stride * 16 - post->y_width; - mbd->mode_info_context += 1; /* Skip border mb */ + mode_info_context += 1; /* Skip border mb */ } } diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h index ca136b3a43b3ab88317778c1f1ad317e2ce0e605..2d6dad306ba2e935f507dd65fc410b80ef8315e6 100644 --- a/vp8/common/loopfilter.h +++ b/vp8/common/loopfilter.h @@ -13,6 +13,7 @@ #define loopfilter_h #include "vpx_ports/mem.h" +#include "vpx_config.h" #define MAX_LOOP_FILTER 63 @@ -22,27 +23,46 @@ typedef enum SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; -/* FRK - * Need to align this structure so when it is declared and +#if ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and * passed it can be loaded into vector registers. 
*/ typedef struct { - DECLARE_ALIGNED(16, signed char, lim[16]); - DECLARE_ALIGNED(16, signed char, flim[16]); - DECLARE_ALIGNED(16, signed char, thr[16]); - DECLARE_ALIGNED(16, signed char, mbflim[16]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct +{ + const unsigned char * mblim; + const unsigned char * blim; + const unsigned char * lim; + const unsigned char * hev_thr; } loop_filter_info; #define prototype_loopfilter(sym) \ - void sym(unsigned char *src, int pitch, const signed char *flimit,\ - const signed char *limit, const signed char *thresh, int count) + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) #define prototype_loopfilter_block(sym) \ - void sym(unsigned char *y, unsigned char *u, unsigned char *v,\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ int ystride, int uv_stride, loop_filter_info *lfi) +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + #if ARCH_X86 || ARCH_X86_64 #include "x86/loopfilter_x86.h" #endif @@ -71,38 +91,39 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h); #endif extern prototype_loopfilter_block(vp8_lf_normal_b_h); - #ifndef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_v); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_v); #ifndef vp8_lf_simple_b_v #define 
vp8_lf_simple_b_v vp8_loop_filter_bvs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_v); +extern prototype_simple_loopfilter(vp8_lf_simple_b_v); #ifndef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_mb_h); +extern prototype_simple_loopfilter(vp8_lf_simple_mb_h); #ifndef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_c #endif -extern prototype_loopfilter_block(vp8_lf_simple_b_h); +extern prototype_simple_loopfilter(vp8_lf_simple_b_h); typedef prototype_loopfilter_block((*vp8_lf_block_fn_t)); +typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t)); + typedef struct { vp8_lf_block_fn_t normal_mb_v; vp8_lf_block_fn_t normal_b_v; vp8_lf_block_fn_t normal_mb_h; vp8_lf_block_fn_t normal_b_h; - vp8_lf_block_fn_t simple_mb_v; - vp8_lf_block_fn_t simple_b_v; - vp8_lf_block_fn_t simple_mb_h; - vp8_lf_block_fn_t simple_b_h; + vp8_slf_block_fn_t simple_mb_v; + vp8_slf_block_fn_t simple_b_v; + vp8_slf_block_fn_t simple_mb_h; + vp8_slf_block_fn_t simple_b_h; } vp8_loopfilter_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT @@ -115,9 +136,9 @@ typedef void loop_filter_uvfunction ( unsigned char *u, /* source pointer */ int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, unsigned char *v ); diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 6940529241c77a3cdd313a5c37e6151e77959ed3..10228ae09b5453fab681f3239a4054ad5a351382 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -24,8 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t) /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_filter_mask(signed char limit, signed char flimit, - 
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3) +static __inline signed char vp8_filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) { signed char mask = 0; mask |= (abs(p3 - p2) > limit) * -1; @@ -34,13 +35,13 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = ~mask; return mask; } /* is there high variance internal edge ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { signed char hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; @@ -48,7 +49,8 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, return hev; } -static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1) +static __inline void vp8_filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) { signed char ps0, qs0; @@ -98,9 +100,9 @@ void vp8_loop_filter_horizontal_edge_c ( unsigned char *s, int p, /* pitch */ - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -113,11 +115,11 @@ void vp8_loop_filter_horizontal_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); @@ -130,9 +132,9 @@ void 
vp8_loop_filter_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -145,10 +147,10 @@ void vp8_loop_filter_vertical_edge_c */ do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); @@ -157,7 +159,7 @@ void vp8_loop_filter_vertical_edge_c while (++i < count * 8); } -static __inline void vp8_mbfilter(signed char mask, signed char hev, +static __inline void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) { signed char s, u; @@ -216,9 +218,9 @@ void vp8_mbloop_filter_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -232,11 +234,11 @@ void vp8_mbloop_filter_horizontal_edge_c do { - mask = vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4*p], s[-3*p], s[-2*p], s[-1*p], s[0*p], s[1*p], s[2*p], s[3*p]); - hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); @@ -251,9 +253,9 @@ void vp8_mbloop_filter_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count ) { @@ -264,10 +266,10 @@ void vp8_mbloop_filter_vertical_edge_c do { - mask = 
vp8_filter_mask(limit[i], flimit[i], + mask = vp8_filter_mask(limit[0], blimit[0], s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); - hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); @@ -278,13 +280,13 @@ void vp8_mbloop_filter_vertical_edge_c } /* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1) +static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) { /* Why does this cause problems for win32? * error C2143: syntax error : missing ';' before 'type' * (void) limit; */ - signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1; + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; return mask; } @@ -317,47 +319,37 @@ void vp8_loop_filter_simple_horizontal_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } - while (++i < count * 8); + while (++i < 16); } void vp8_loop_filter_simple_vertical_edge_c ( unsigned char *s, int p, - const signed char *flimit, - const signed char *limit, - const signed char *thresh, - int count + const unsigned char *blimit ) { signed char mask = 0; int i = 0; - (void) thresh; do { - /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/ - mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], 
s[1]); + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); s += p; } - while (++i < count * 8); + while (++i < 16); } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 7bbe5676c778f7795cd0554fe4a36e70d66dfd35..0615262e28e8dc751c820c88d5dcc59f6e6630f5 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -83,6 +83,7 @@ typedef struct VP8_COMMON_RTCD } VP8_COMMON_RTCD; typedef struct VP8Common + { struct vpx_internal_error_info error; @@ -107,7 +108,8 @@ typedef struct VP8Common YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */ + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; int show_frame; @@ -149,11 +151,9 @@ typedef struct VP8Common INTERPOLATIONFILTERTYPE mcomp_filter_type; LOOPFILTERTYPE last_filter_type; LOOPFILTERTYPE filter_type; - loop_filter_info lf_info[MAX_LOOP_FILTER+1]; - prototype_loopfilter_block((*lf_mbv)); - prototype_loopfilter_block((*lf_mbh)); - prototype_loopfilter_block((*lf_bv)); - prototype_loopfilter_block((*lf_bh)); + + loop_filter_info_n lf_info; + int filter_level; int last_sharpness_level; int sharpness_level; @@ -206,10 +206,9 @@ typedef struct VP8Common struct postproc_state postproc_state; } VP8_COMMON; - -int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level); -void vp8_init_loop_filter(VP8_COMMON *cm); -void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type); -extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); +void vp8_loop_filter_init(VP8_COMMON *cm); +void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, + int default_filt_lvl, int sharpness_lvl); +void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val); #endif diff --git 
a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm index c6c215c3c6fcaef13c4328534d58fec0927aa16c..ad47284cf9b6c202e7f9a41a7cd1809d11f0fab9 100644 --- a/vp8/common/x86/loopfilter_mmx.asm +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -16,7 +16,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -122,12 +122,10 @@ next8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 @@ -230,7 +228,7 @@ next8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -406,9 +404,9 @@ next8_v: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -419,10 +417,7 @@ next8_v: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -603,7 +598,7 @@ next8_v: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -719,17 +714,15 
@@ next8_mbh: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit - movq mm2, [rdx] ; flimit mm2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm7, mm2 ; flimit * 2 + limit (less than 255) + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm5 pxor mm5, mm5 pcmpeqb mm1, mm5 ; mask mm1 - ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) ; mm6 = p0, ; calculate high edge variance @@ -922,7 +915,7 @@ next8_mbh: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1108,9 +1101,9 @@ next8_mbv: pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw mm5, 1 ; abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; + mov rdx, arg(2) ;blimit ; - movq mm2, [rdx] ;flimit mm2 + movq mm4, [rdx] ;blimit movq mm1, mm3 ; mm1=mm3=p0 movq mm7, mm6 ; mm7=mm6=q0 @@ -1121,10 +1114,7 @@ next8_mbv: paddusb mm1, mm1 ; abs(q0-p0)*2 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - paddb mm2, mm2 ; flimit*2 (less than 255) - paddb mm4, mm2 ; flimit * 2 + limit (less than 255) - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por mm1, mm0; ; mask pxor mm0, mm0 @@ -1392,16 +1382,13 @@ next8_mbv: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_horizontal_edge_mmx) sym(vp8_loop_filter_simple_horizontal_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1410,14 +1397,10 @@ 
sym(vp8_loop_filter_simple_horizontal_edge_mmx): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm3, [rdx] ; - paddb mm3, mm3 ; flimit*2 (less than 255) - paddb mm3, mm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1445,7 +1428,7 @@ nexts8_h: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm3, mm3 pcmpeqb mm5, mm3 @@ -1515,16 +1498,13 @@ nexts8_h: ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit ;) global sym(vp8_loop_filter_simple_vertical_edge_mmx) sym(vp8_loop_filter_simple_vertical_edge_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 GET_GOT rbx push rsi push rdi @@ -1539,7 +1519,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
lea rsi, [rsi + rax*4- 2]; ; - movsxd rcx, dword ptr arg(5) ;count + mov rcx, 2 ; count nexts8_v: lea rdi, [rsi + rax]; @@ -1602,14 +1582,10 @@ nexts8_v: paddusb mm5, mm5 ; abs(p0-q0)*2 paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit ; get blimit movq mm7, [rdx] - mov rdx, arg(3) ; get limit - movq mm6, [rdx] - paddb mm7, mm7 ; flimit*2 (less than 255) - paddb mm7, mm6 ; flimit * 2 + limit (less than 255) - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor mm7, mm7 pcmpeqb mm5, mm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index c2ce1a10627b14d8aa772499752c29f72e6e2189..4efff7eb584d227a16b9dabe66094ffbf0814b68 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -110,7 +110,7 @@ psubusb xmm6, xmm5 ; p1-=p0 por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get flimit + mov rdx, arg(2) ; get blimit movdqa t1, xmm6 ; save to t1 @@ -123,7 +123,7 @@ psubusb xmm1, xmm7 por xmm2, xmm3 ; abs(p1-q1) - movdqa xmm4, XMMWORD PTR [rdx] ; flimit + movdqa xmm7, XMMWORD PTR [rdx] ; blimit movdqa xmm3, xmm0 ; q0 pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero @@ -134,13 +134,11 @@ psrlw xmm2, 1 ; abs(p1-q1)/2 psubusb xmm5, xmm3 ; p0-=q0 - paddb xmm4, xmm4 ; flimit*2 (less than 255) psubusb xmm3, xmm6 ; q0-=p0 por xmm5, xmm3 ; abs(p0 - q0) paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255) movdqa xmm4, t0 ; hev get abs (q1 - q0) @@ -150,7 +148,7 @@ movdqa xmm2, XMMWORD PTR [rdx] ; hev - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit psubusb xmm4, xmm2 ; hev psubusb xmm3, xmm2 ; hev @@ -278,7 +276,7 @@ ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; 
const char *limit, ; const char *thresh, ; int count @@ -328,7 +326,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -574,7 +572,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -624,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -904,7 +902,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): movdqa xmm4, XMMWORD PTR [rdx]; limit pmaxub xmm0, xmm7 - mov rdx, arg(2) ; flimit + mov rdx, arg(2) ; blimit psubusb xmm0, xmm4 movdqa xmm5, xmm2 ; q1 @@ -921,12 +919,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): psrlw xmm5, 1 ; abs(p1-q1)/2 psubusb xmm6, xmm3 ; q0-p0 - movdqa xmm2, XMMWORD PTR [rdx]; flimit + movdqa xmm4, XMMWORD PTR [rdx]; blimit mov rdx, arg(4) ; get thresh por xmm1, xmm6 ; abs(q0-p0) - paddb xmm2, xmm2 ; flimit*2 (less than 255) movdqa xmm6, t0 ; get abs (q1 - q0) @@ -939,10 +936,9 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh por xmm1, xmm0 ; mask @@ -1014,7 +1010,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1081,7 
+1077,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1239,7 +1235,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; int count @@ -1308,7 +1304,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): ;( ; unsigned char *u, ; int src_pixel_step, -; const char *flimit, +; const char *blimit, ; const char *limit, ; const char *thresh, ; unsigned char *v @@ -1376,16 +1372,13 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_horizontal_edge_sse2) sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx push rsi @@ -1394,13 +1387,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): mov rsi, arg(0) ;src_ptr movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- mov rdx, arg(2) ;flimit ; get flimit + mov rdx, arg(2) ;blimit movdqa xmm3, XMMWORD PTR [rdx] - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - paddb xmm3, xmm3 ; flimit*2 (less than 255) - paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255) mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -1428,7 +1416,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm3, xmm3 pcmpeqb xmm5, xmm3 @@ -1493,16 +1481,13 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): ;( ; unsigned char *src_ptr, ; int src_pixel_step, -; const char *flimit, -; const char *limit, -; const char *thresh, -; int count +; const char *blimit, ;) global sym(vp8_loop_filter_simple_vertical_edge_sse2) sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. 
- SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 3 SAVE_XMM 7 GET_GOT rbx ; save callee-saved reg push rsi @@ -1607,14 +1592,10 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): paddusb xmm5, xmm5 ; abs(p0-q0)*2 paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - mov rdx, arg(2) ;flimit + mov rdx, arg(2) ;blimit movdqa xmm7, XMMWORD PTR [rdx] - mov rdx, arg(3) ; get limit - movdqa xmm6, XMMWORD PTR [rdx] - paddb xmm7, xmm7 ; flimit*2 (less than 255) - paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255) - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit pxor xmm7, xmm7 pcmpeqb xmm5, xmm7 ; mm5 = mask diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index a52420c988f97f6df43402fcffc4a71fc887e8ea..9360ac17c012581930ab17ba5389b19971cefc4e 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -9,30 +9,18 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vp8/common/loopfilter.h" -prototype_loopfilter(vp8_loop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_vertical_edge_c); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c); -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); - prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2); prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2); 
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2); -prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); -prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); -prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2); extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; @@ -44,23 +32,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -68,23 +46,13 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if 
(u_ptr) - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1); -} - - -void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } @@ -92,27 +60,23 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * 
uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -120,27 +84,23 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); if (v_ptr) - 
vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1); + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); } #endif @@ -150,20 +110,10 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } 
@@ -171,20 +121,10 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr); -} - - -void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); } @@ -192,24 +132,20 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 
4 * uv_stride); + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); } -void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); } @@ -217,36 +153,20 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); if (u_ptr) - vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4); + 
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); } -void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); } #endif - -#if 0 -void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr, - int y_stride, - loop_filter_info *lfi) -{ - - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); -} -#endif diff --git a/vp8/common/x86/loopfilter_x86.h b/vp8/common/x86/loopfilter_x86.h index 80dbebc8db25e6548868afbe5f415e0da76b76c3..1ed6c213f2f4184ff11cff26e3a75d841f769ff1 100644 --- a/vp8/common/x86/loopfilter_x86.h +++ b/vp8/common/x86/loopfilter_x86.h @@ -24,10 +24,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx); -extern 
prototype_loopfilter_block(vp8_loop_filter_bvs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx); #if !CONFIG_RUNTIME_CPU_DETECT @@ -44,13 +44,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx); #define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx @@ -63,10 +63,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2); extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2); #if !CONFIG_RUNTIME_CPU_DETECT @@ -83,13 +83,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2); #define 
vp8_lf_normal_b_h vp8_loop_filter_bh_sse2 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2 #undef vp8_lf_simple_b_h #define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2 diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 87374f3c6d2a9ab4a871450465f6c5df47b4a476..33a984b792165a3d83c2012c1c106cfa7cbe0f40 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -9,7 +9,7 @@ */ -#include "vpx_ports/config.h" +#include "vpx_config.h" #include "vpx_ports/x86.h" #include "vp8/common/g_common.h" #include "vp8/common/subpixel.h" @@ -63,9 +63,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_mmx; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_mmx; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_mmx; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_mmx; #if CONFIG_POSTPROC @@ -101,9 +101,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_sse2; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_sse2; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2; + rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_sse2; - 
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2; + rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_sse2; #if CONFIG_POSTPROC diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 485aba05311396843c006cd0297f226d645a06c5..fe5a427cd9582043f2cd46d58159a6c99d86d6d8 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -180,11 +180,11 @@ static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) return (MB_PREDICTION_MODE)i; } -static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) +static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) { const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p); - return (MB_PREDICTION_MODE)i; + return (B_PREDICTION_MODE)i; } #ifdef VPX_MODE_COUNT @@ -376,7 +376,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, abovemv.as_int = above_block_mv(mi, k, mis); mv_contz = vp8_mv_cont(&leftmv, &abovemv); - switch ((B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/ + switch (sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/ { case NEW4X4: read_mv(bc, &blockmv.as_mv, (const MV_CONTEXT *) mvc); diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 92d8580efb3bd2ca186c7c95399816ba3a676283..f6052b4372b70148fb3d235ce50f2cde74a7f8bb 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -94,7 +94,7 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) { VP8_COMMON *cm = &pbi->common; - vp8_init_loop_filter(cm); + vp8_loop_filter_init(cm); cm->last_frame_type = KEY_FRAME; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index a7af9acfb4850ad3ad09ea109bf0bf40fafd5856..0c21689c099597e27fbdcb4a4c43815c99422947 100644 --- a/vp8/decoder/threading.c +++ 
b/vp8/decoder/threading.c @@ -274,9 +274,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride; int filter_level; - loop_filter_info *lfi = pc->lf_info; - int alt_flt_enabled = xd->segmentation_enabled; - int Segment; + loop_filter_info_n *lfi_n = &pc->lf_info; pbi->mb_row_di[ithread].mb_row = mb_row; pbi->mb_row_di[ithread].mbd.current_bc = &pbi->mbc[mb_row%num_part]; @@ -362,7 +360,16 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) if (pbi->common.filter_level) { - int skip_lf; + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if( mb_row != pc->mb_rows-1 ) { /* Save decoded MB last row data for next-row decoding */ @@ -388,35 +395,57 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) } } - /* update loopfilter info */ - Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; - skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && - xd->mode_info_context->mbmi.mode != SPLITMV && - xd->mode_info_context->mbmi.mb_skip_coeff); - - filter_level = pbi->mt_baseline_filter_level[Segment]; - /* Distance of Mb to the various image edges. - * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(xd, filter_level); - /* loopfilter on this macroblock. 
*/ if (filter_level) { - if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - if (!skip_lf) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - if (!skip_lf) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if(pc->filter_type == NORMAL_LOOPFILTER) + { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + 
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + } } + } recon_yoffset += 16; @@ -681,53 +710,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) } } - -static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl) -{ - VP8_COMMON *cm = &pbi->common; - MACROBLOCKD *mbd = &pbi->mb; - /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/ /*frame_to_show;*/ - loop_filter_info *lfi = cm->lf_info; - FRAME_TYPE frame_type = cm->frame_type; - - /*int mb_row; - int mb_col; - int baseline_filter_level[MAX_MB_SEGMENTS];*/ - int alt_flt_enabled = mbd->segmentation_enabled; - - int i; - /*unsigned char *y_ptr, *u_ptr, *v_ptr;*/ - - /* Note the baseline filter values for each segment */ - if (alt_flt_enabled) - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) - pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - /* Delta Value */ - else - { - pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; - pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ - } - } - } - else - { - for (i = 0; i < MAX_MB_SEGMENTS; i++) - pbi->mt_baseline_filter_level[i] = default_filt_lvl; - } - - /* Initialize the loop filter for this frame. 
*/ - if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) - vp8_init_loop_filter(cm); - else if (frame_type != cm->last_frame_type) - vp8_frame_init_loop_filter(lfi, frame_type); -} - - void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) { int mb_row; @@ -738,12 +720,10 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) volatile int *last_row_current_mb_col = NULL; int nsync = pbi->sync_range; - int filter_level; - loop_filter_info *lfi = pc->lf_info; - int alt_flt_enabled = xd->segmentation_enabled; - int Segment; + int filter_level = pc->filter_level; + loop_filter_info_n *lfi_n = &pc->lf_info; - if(pbi->common.filter_level) + if (filter_level) { /* Set above_row buffer to 127 for decoding first MB row */ vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5); @@ -764,7 +744,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8); vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8); } - lpf_init(pbi, pc->filter_level); + + /* Initialize the loop filter for this frame. 
*/ + vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level, pc->sharpness_level); } setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count); @@ -774,7 +756,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { - xd->current_bc = &pbi->mbc[mb_row%num_part]; /* vp8_decode_mb_row(pbi, pc, mb_row, xd); */ @@ -875,7 +856,16 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) if (pbi->common.filter_level) { - int skip_lf; + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + /* Save decoded MB last row data for next-row decoding */ if(mb_row != pc->mb_rows-1) { @@ -901,36 +891,58 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) } } - /* update loopfilter info */ - Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; - skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && - xd->mode_info_context->mbmi.mode != SPLITMV && - xd->mode_info_context->mbmi.mb_skip_coeff); - filter_level = pbi->mt_baseline_filter_level[Segment]; - /* Distance of Mb to the various image edges. - * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units - * Apply any context driven MB level adjustment - */ - filter_level = vp8_adjust_mb_lf_value(xd, filter_level); - /* loopfilter on this macroblock. 
*/ if (filter_level) { - if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - if (!skip_lf) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - - if (!skip_lf) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); + if(pc->filter_type == NORMAL_LOOPFILTER) + { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h) + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + } + else + { + if (mb_col > 0) + LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + 
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h) + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + } } - } + } recon_yoffset += 16; recon_uvoffset += 8; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 2ec087ee66fddee36c6a4a06d71b4d6be9fc53d9..7b1bd32de0b4583da573350bbebf6ef1bd62d8e4 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2170,7 +2170,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) //when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame. vp8cx_init_quantizer(cpi); { - vp8_init_loop_filter(cm); + vp8_loop_filter_init(cm); cm->last_frame_type = KEY_FRAME; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index 1c2656e0a854d178b922dc115af3ca94e072bafd..19913a9b19736c7681346b2d44ddb6c4a6cab8a1 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -262,10 +262,19 @@ static void vp8_temporal_filter_iterate_c for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED - // Reduced search extent by 3 for 6-tap filter & smaller UMV border - cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19)); + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS) + // A 6 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - 3. The UV blocks are half the size of the Y and + // therefore only extended by 8. The largest mv that a UV block + // can support is 8 - 3. A UV mv is half of a Y mv. + // (16 - 3) >> 1 == 6 which is greater than 8 - 3. 
+ // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - 5. + cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5)); cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16) - + (VP8BORDERINPIXELS - 19); + + (16 - 5); #endif for (mb_col = 0; mb_col < mb_cols; mb_col++) @@ -277,10 +286,9 @@ static void vp8_temporal_filter_iterate_c vpx_memset(count, 0, 384*sizeof(unsigned short)); #if ALT_REF_MC_ENABLED - // Reduced search extent by 3 for 6-tap filter & smaller UMV border - cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19)); + cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5)); cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16) - + (VP8BORDERINPIXELS - 19); + + (16 - 5); #endif for (frame = 0; frame < frame_count; frame++) diff --git a/vpx_mem/include/nds/vpx_mem_nds.h b/vpx_mem/include/nds/vpx_mem_nds.h deleted file mode 100644 index e54f54d9b1f0a277b4c3690230922b45417aa2e2..0000000000000000000000000000000000000000 --- a/vpx_mem/include/nds/vpx_mem_nds.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef __VPX_MEM_NDS_H__ -#define __VPX_MEM_NDS_H__ - -#if defined(__cplusplus) -extern "C" { -#endif - -#include <nitro.h> -#include <nitro/os.h> - - void *vpx_mem_nds_alloc(osarena_id id, osheap_handle handle, size_t size, size_t align); - void vpx_mem_nds_free(osarena_id id, osheap_handle handle, void *mem); - int vpx_nds_alloc_heap(osarena_id id, u32 size); - -#if defined(__cplusplus) -} -#endif - -#endif /*__VPX_MEM_NDS_H__*/ diff --git a/vpx_mem/vpx_mem_tracker.c b/vpx_mem/vpx_mem_tracker.c index 938ad0716cdc3250e45e384dab67c3ab7747dbae..9e8623a9a30d2363eb0cd386e2fe778a62c4910e 100644 --- a/vpx_mem/vpx_mem_tracker.c +++ b/vpx_mem/vpx_mem_tracker.c @@ -36,9 +36,6 @@ # include <winbase.h> #elif defined(VXWORKS) # include <sem_lib.h> -#elif defined(NDS_NITRO) -# include <nitro.h> -# include <nitro/os.h> #endif #include <stdio.h> @@ -112,8 +109,6 @@ struct memory_tracker HANDLE mutex; #elif defined(VXWORKS) SEM_ID mutex; -#elif defined(NDS_NITRO) - OSMutex mutex; #elif defined(NO_MUTEX) #else #error "No mutex type defined for this platform!" 
@@ -193,9 +188,6 @@ int vpx_memory_tracker_init(int padding_size, int pad_value) memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/ SEM_FULL); /*SEM_FULL initial state is unlocked*/ ret = !memtrack.mutex; -#elif defined(NDS_NITRO) - os_init_mutex(&memtrack.mutex); - ret = 0; #elif defined(NO_MUTEX) ret = 0; #endif @@ -251,9 +243,7 @@ void vpx_memory_tracker_destroy() if (!g_logging.type && g_logging.file && g_logging.file != stderr) { -#if !defined(NDS_NITRO) fclose(g_logging.file); -#endif g_logging.file = NULL; } @@ -368,15 +358,12 @@ int vpx_memory_tracker_set_log_type(int type, char *option) g_logging.file = stderr; ret = 0; } - -#if !defined(NDS_NITRO) else { if ((g_logging.file = fopen((char *)option, "w"))) ret = 0; } -#endif break; #if defined(WIN32) && !defined(_WIN32_WCE) case 1: @@ -506,12 +493,6 @@ static void memory_tracker_dump() p->addr, i, p->size, p->file, p->line); -#ifdef NDS_NITRO - - if (!(i % 20)) os_sleep(500); - -#endif - p = p->next; ++i; } @@ -719,9 +700,6 @@ static int memory_tracker_lock_mutex() ret = WaitForSingleObject(memtrack.mutex, INFINITE); #elif defined(VXWORKS) ret = sem_take(memtrack.mutex, WAIT_FOREVER); -#elif defined(NDS_NITRO) - os_lock_mutex(&memtrack.mutex); - ret = 0; #endif if (ret) @@ -754,9 +732,6 @@ static int memory_tracker_unlock_mutex() ret = !ReleaseMutex(memtrack.mutex); #elif defined(VXWORKS) ret = sem_give(memtrack.mutex); -#elif defined(NDS_NITRO) - os_unlock_mutex(&memtrack.mutex); - ret = 0; #endif if (ret) diff --git a/vpx_scale/arm/nds/yv12extend.c b/vpx_scale/arm/nds/yv12extend.c deleted file mode 100644 index 48c0dfb337a5cea18363e6438cf67e81ec0a737b..0000000000000000000000000000000000000000 --- a/vpx_scale/arm/nds/yv12extend.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/**************************************************************************** -* -* Module Title : yv12extend.c -* -* Description : -* -***************************************************************************/ - -/**************************************************************************** -* Header Files -****************************************************************************/ -#include "vpx_scale/yv12config.h" -#include "vpx_mem/vpx_mem.h" -#include <nitro.h> -#include <nitro/mi.h> -#include <nitro/itcm_begin.h> - -//---- DMA Number -#define DMA_NO 3 - -/**************************************************************************** -* Exports -****************************************************************************/ - -/**************************************************************************** -* -****************************************************************************/ -void -vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf) -{ - int i; - unsigned char *src_ptr1, *src_ptr2; - unsigned char *dest_ptr1, *dest_ptr2; - - unsigned int Border; - int plane_stride; - int plane_height; - int plane_width; - - /***********/ - /* Y Plane */ - /***********/ - Border = ybf->border; - plane_stride = ybf->y_stride; - plane_height = ybf->y_height; - plane_width = ybf->y_width; - - // copy the left and right most columns out - src_ptr1 = ybf->y_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // Now copy the top and bottom source lines into each 
line of the respective borders - src_ptr1 = ybf->y_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)Border; i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - plane_stride /= 2; - plane_height /= 2; - plane_width /= 2; - Border /= 2; - - /***********/ - /* U Plane */ - /***********/ - - // copy the left and right most columns out - src_ptr1 = ybf->u_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // Now copy the top and bottom source lines into each line of the respective borders - src_ptr1 = ybf->u_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - /***********/ - /* V Plane */ - /***********/ - - // copy the left and right most columns out - src_ptr1 = ybf->v_buffer; - src_ptr2 = src_ptr1 + plane_width - 1; - dest_ptr1 = src_ptr1 - Border; - dest_ptr2 = src_ptr2 + 1; - - for (i = 0; i < plane_height; i++) - { - mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border); - mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border); - src_ptr1 += plane_stride; - src_ptr2 += plane_stride; - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } - - // 
Now copy the top and bottom source lines into each line of the respective borders - src_ptr1 = ybf->v_buffer - Border; - src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride; - dest_ptr1 = src_ptr1 - (Border * plane_stride); - dest_ptr2 = src_ptr2 + plane_stride; - - for (i = 0; i < (int)(Border); i++) - { - mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride); - mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride); - dest_ptr1 += plane_stride; - dest_ptr2 += plane_stride; - } -} - - - -/**************************************************************************** -* -* ROUTINE : vp8_yv12_copy_frame -* -* INPUTS : -* -* OUTPUTS : None. -* -* RETURNS : void -* -* FUNCTION : Copies the source image into the destination image and -* updates the destination's UMV borders. -* -* SPECIAL NOTES : The frames are assumed to be identical in size. -* -****************************************************************************/ -void -vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) -{ - int yplane_size = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride); - int mem_size = (yplane_size * 3 / 2) + (src_ybc->y_stride * 2); - - mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, mem_size); - - /* unsigned char *src_y, *dst_y; - unsigned char *src_u, *dst_u; - unsigned char *src_v, *dst_v; - - int yheight, uv_height; - int ystride, uv_stride; - int border; - int yoffset, uvoffset; - - border = src_ybc->border; - yheight = src_ybc->y_height; - uv_height = src_ybc->uv_height; - - ystride = src_ybc->y_stride; - uv_stride = src_ybc->uv_stride; - - yoffset = border * (ystride + 1); - uvoffset = border/2 * (uv_stride + 1); - - src_y = src_ybc->y_buffer - yoffset; - dst_y = dst_ybc->y_buffer - yoffset; - src_u = src_ybc->u_buffer - uvoffset; - dst_u = dst_ybc->u_buffer - uvoffset; - src_v = src_ybc->v_buffer - uvoffset; - dst_v = dst_ybc->v_buffer - uvoffset; - - mi_cpu_copy_fast (src_y, dst_y, ystride * (yheight + 2 * 
border)); - mi_cpu_copy_fast (src_u, dst_u, uv_stride * (uv_height + border)); - mi_cpu_copy_fast (src_v, dst_v, uv_stride * (uv_height + border)); - */ -} - -#include <nitro/itcm_end.h> diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index cb0ab9466b0499837c5c23ce25593a8ec302a031..d02cde28f9f92825cda76755d64b9af085c2cffb 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -24,9 +24,12 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { - duck_free(ybf->buffer_alloc); + vpx_free(ybf->buffer_alloc); - ybf->buffer_alloc = 0; + /* buffer_alloc isn't accessed by most functions. Rather y_buffer, + u_buffer and v_buffer point to buffer_alloc and are used. Clear out + all of this so that a freed pointer isn't inadvertently used */ + vpx_memset (ybf, 0, sizeof (YV12_BUFFER_CONFIG)); } else { @@ -44,38 +47,37 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int { /*NOTE:*/ - int yplane_size = (height + 2 * border) * (width + 2 * border); - int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border); - if (ybf) { + int uv_width = width >> 1; + int uv_height = height >> 1; + int yplane_size = (height + 2 * border) * (width + 2 * border); + int uvplane_size = (uv_height + border) * (uv_width + border); + vp8_yv12_de_alloc_frame_buffer(ybf); + /* only support allocating buffers that have + a height and width that are multiples of 16 */ + if ((width & 0xf) | (height & 0xf)) + return -3; + ybf->y_width = width; ybf->y_height = height; ybf->y_stride = width + 2 * border; - ybf->uv_width = (1 + width) / 2; - ybf->uv_height = (1 + height) / 2; - ybf->uv_stride = ybf->uv_width + border; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_width + border; ybf->border = border; ybf->frame_size = yplane_size + 2 * uvplane_size; - /* Added 2 extra lines to framebuffer so that copy12x12 doesn't fail - * when we have a large motion vector 
in V on the last v block. - * Note : We never use these pixels anyway so this doesn't hurt. - */ - ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0); + ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size); if (ybf->buffer_alloc == NULL) return -1; ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; - - if (yplane_size & 0xf) - yplane_size += 16 - (yplane_size & 0xf); - ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2;