From 1746b2adc6a9f1e3e0f70a1a6a992f1af6851b5d Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Thu, 2 Aug 2012 11:58:09 -0700
Subject: [PATCH] Added row based loopfilter

Interleaved loopfiltering with decode.  For 1080p clips, up to 1%
performance gain.  For 4k clips, up to 10% seen.  This patch is required
for better "frame-based" multithreading.

Change-Id: Ic834cf32297cc04f27e8205652fb9f70cbe290db
---
 vp8/common/blockd.h      |   2 +-
 vp8/common/loopfilter.c  | 108 +++++++++++++++++++++++++++++++++++++++
 vp8/common/loopfilter.h  |  12 +++++
 vp8/decoder/decodframe.c |  55 ++++++++++++++++----
 vp8/decoder/onyxd_if.c   |   8 ---
 5 files changed, 167 insertions(+), 18 deletions(-)

diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index acef8caa2e..f7ff577635 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -161,7 +161,7 @@ typedef struct
     uint8_t segment_id;                  /* Which set of segmentation parameters should be used for this MB */
 } MB_MODE_INFO;
 
-typedef struct
+typedef struct modeinfo
 {
     MB_MODE_INFO mbmi;
     union b_mode_info bmi[16];
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index b9ac0ff3e8..41b4f1214d 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -196,6 +196,114 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
     }
 }
 
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+            lfi.mblim = lfi_n->mblim[filter_level];
+            lfi.blim = lfi_n->blim[filter_level];
+            lfi.lim = lfi_n->lim[filter_level];
+            lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+            if (mb_col > 0)
+                vp8_loop_filter_mbv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_mbh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            if (mb_col > 0)
+                vp8_loop_filter_simple_mbv
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bv
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_simple_mbh
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bh
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
 void vp8_loop_filter_frame(VP8_COMMON *cm,
                            MACROBLOCKD *mbd,
                            int frame_type)
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index 0497271b02..b3af2d6500 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -69,6 +69,7 @@ typedef void loop_filter_uvfunction
 /* assorted loopfilter functions which get used elsewhere */
 struct VP8Common;
 struct macroblockd;
+struct modeinfo;
 
 void vp8_loop_filter_init(struct VP8Common *cm);
 
@@ -90,4 +91,15 @@ void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
 void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                       int sharpness_lvl);
 
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
 #endif
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 4a33f5aef6..2d497b9404 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -311,6 +311,8 @@ static void decode_mb_rows(VP8D_COMP *pbi)
     VP8_COMMON *const pc = & pbi->common;
     MACROBLOCKD *const xd  = & pbi->mb;
 
+    MODE_INFO *lf_mic = xd->mode_info_context;
+
     int ibc = 0;
     int num_part = 1 << pc->multi_token_partition;
 
@@ -323,6 +325,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
 
     unsigned char *ref_buffer[MAX_REF_FRAMES][3];
     unsigned char *dst_buffer[3];
+    unsigned char *lf_dst[3];
     int i;
     int ref_fb_index[MAX_REF_FRAMES];
     int ref_fb_corrupted[MAX_REF_FRAMES];
@@ -342,12 +345,17 @@ static void decode_mb_rows(VP8D_COMP *pbi)
         ref_fb_corrupted[i] = pc->yv12_fb[ref_fb_index[i]].corrupted;
     }
 
-    dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
-    dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
-    dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
+    /* Set up the buffer pointers */
+    lf_dst[0] = dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
+    lf_dst[1] = dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
+    lf_dst[2] = dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
 
     xd->up_available = 0;
 
+    /* Initialize the loop filter for this frame. */
+    if(pc->filter_level)
+        vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
     /* Decode the individual macro block */
     for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
     {
@@ -449,26 +457,55 @@ static void decode_mb_rows(VP8D_COMP *pbi)
             xd->recon_left[1] += 8;
             xd->recon_left[2] += 8;
 
-
             recon_yoffset += 16;
             recon_uvoffset += 8;
 
             ++xd->mode_info_context;  /* next mb */
 
             xd->above_context++;
-
         }
 
         /* adjust to the next row of mbs */
-        vp8_extend_mb_row(
-            &pc->yv12_fb[dst_fb_idx],
-            xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
-        );
+        vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
         ++xd->mode_info_context;      /* skip prediction column */
         xd->up_available = 1;
 
+        if(pc->filter_level)
+        {
+            if(mb_row > 0)
+            {
+                if (pc->filter_type == NORMAL_LOOPFILTER)
+                    vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                else
+                    vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                lf_dst[0] += recon_y_stride  * 16;
+                lf_dst[1] += recon_uv_stride *  8;
+                lf_dst[2] += recon_uv_stride *  8;
+                lf_mic += pc->mb_cols;
+                lf_mic++;         /* Skip border mb */
+            }
+        }
+    }
+
+    if(pc->filter_level)
+    {
+        if (pc->filter_type == NORMAL_LOOPFILTER)
+            vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+        else
+            vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
     }
+
+    vp8_yv12_extend_frame_borders(&pc->yv12_fb[dst_fb_idx]);
 }
 
 static unsigned int read_partition_size(const unsigned char *cx_size)
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 1c67e71d75..6cb7be6bf2 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -467,16 +467,8 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, unsigned long size, const unsi
             pbi->num_fragments = 0;
             return -1;
         }
-
-        if(cm->filter_level)
-        {
-            /* Apply the loop filter if appropriate. */
-            vp8_loop_filter_frame(cm, &pbi->mb, cm->frame_type);
-        }
-        vp8_yv12_extend_frame_borders(cm->frame_to_show);
     }
 
-
     vp8_clear_system_state();
 
 #if CONFIG_ERROR_CONCEALMENT
-- 
GitLab