From 60cb39da860c91adc45f51f2dfa193d19598a801 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" <rbultje@google.com> Date: Tue, 6 Dec 2011 11:53:02 -0800 Subject: [PATCH] Dual 16x16 inter prediction. This patch introduces the concept of dual inter16x16 prediction. A 16x16 inter-predicted macroblock can use 2 references instead of 1, where both references use the same mvmode (new, near/est, zero). In the case of newmv, this means that two MVs are coded instead of one. The frame can be encoded in 3 ways: all MBs single-prediction, all MBs dual prediction, or per-MB single/dual prediction selection ("hybrid"), in which case a single bit is coded per-MB to indicate whether the MB uses single or dual inter prediction. In the future, we can (maybe?) get further gains by mixing this with Adrian's 32x32 work, per-segment dual prediction settings, or adding support for dual splitmv/8x8mv inter prediction. Gain (on derf-set, CQ mode) is ~2.8% (SSIM) or ~3.6% (glb PSNR). Most gain is at medium/high bitrates, but there are minor gains at low bitrates also. Output was confirmed to match between encoder and decoder. Note for optimization people: this patch introduces a 2nd version of 16x16/8x8 sixtap/bilin functions, which does an avg instead of a store. They may want to look and make sure this is implemented to their satisfaction so we can optimize it best in the future. 
Change-ID: I59dc84b07cbb3ccf073ac0f756d03d294cb19281 --- configure | 1 + vp8/common/alloccommon.c | 3 + vp8/common/blockd.h | 13 ++ vp8/common/filter.c | 214 ++++++++++++++++++++ vp8/common/generic/systemdependent.c | 28 ++- vp8/common/onyxc_int.h | 13 ++ vp8/common/recon.h | 16 ++ vp8/common/reconinter.c | 127 ++++++++++++ vp8/common/reconinter.h | 6 + vp8/common/subpixel.h | 32 +++ vp8/decoder/decodemv.c | 60 +++++- vp8/decoder/decodframe.c | 35 ++++ vp8/decoder/onyxd_int.h | 3 + vp8/decoder/threading.c | 4 + vp8/encoder/bitstream.c | 73 ++++++- vp8/encoder/encodeframe.c | 177 ++++++++++++++++- vp8/encoder/ethreading.c | 8 + vp8/encoder/onyx_if.c | 187 ++++++++++++++++++ vp8/encoder/onyx_int.h | 38 +++- vp8/encoder/pickinter.c | 6 + vp8/encoder/rdopt.c | 285 ++++++++++++++++++++++++++- vp8/encoder/rdopt.h | 4 +- 22 files changed, 1306 insertions(+), 27 deletions(-) diff --git a/configure b/configure index 9aa1fbfd3a..7942af9baa 100755 --- a/configure +++ b/configure @@ -217,6 +217,7 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" + dualpred extend_qrange segmentation segfeatures diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c index 2535a78fad..61bb317770 100644 --- a/vp8/common/alloccommon.c +++ b/vp8/common/alloccommon.c @@ -193,6 +193,9 @@ void vp8_create_common(VP8_COMMON *oci) vp8_default_bmode_probs(oci->fc.bmode_prob); oci->mb_no_coeff_skip = 1; +#if CONFIG_DUALPRED + oci->dual_pred_mode = HYBRID_PREDICTION; +#endif /* CONFIG_DUALPRED */ oci->no_lpf = 0; oci->filter_type = NORMAL_LOOPFILTER; oci->use_bilinear_mc_filter = 0; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index d51e881b42..751f58f791 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -184,6 +184,10 @@ typedef struct TX_SIZE txfm_size; #endif int_mv mv; +#if CONFIG_DUALPRED + MV_REFERENCE_FRAME second_ref_frame; + int_mv second_mv; +#endif unsigned char partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need 
decode tokens */ unsigned char need_to_clamp_mvs; @@ -236,6 +240,11 @@ typedef struct MacroBlockD int fullpixel_mask; YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ +#if CONFIG_DUALPRED + struct { + uint8_t *y_buffer, *u_buffer, *v_buffer; + } second_pre; +#endif /* CONFIG_DUALPRED */ YV12_BUFFER_CONFIG dst; #if CONFIG_NEWNEAR @@ -305,6 +314,10 @@ typedef struct MacroBlockD vp8_subpix_fn_t subpixel_predict8x4; vp8_subpix_fn_t subpixel_predict8x8; vp8_subpix_fn_t subpixel_predict16x16; +#if CONFIG_DUALPRED + vp8_subpix_fn_t subpixel_predict_avg8x8; + vp8_subpix_fn_t subpixel_predict_avg16x16; +#endif /* CONFIG_DUALPRED */ void *current_bc; diff --git a/vp8/common/filter.c b/vp8/common/filter.c index ae59529522..256ba47367 100644 --- a/vp8/common/filter.c +++ b/vp8/common/filter.c @@ -128,6 +128,61 @@ static void filter_block2d_second_pass } } +#if CONFIG_DUALPRED +/* + * The only functional difference between filter_block2d_second_pass() + * and this function is that filter_block2d_second_pass() does a sixtap + * filter on the input and stores it in the output. This function + * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, + * and then averages that with the content already present in the output + * ((filter_result + dest + 1) >> 1) and stores that in the output. 
+ */ +static void filter_block2d_second_pass_avg +( + int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char) ((output_ptr[j] + Temp + 1) >> 1); + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} +#endif /* CONFIG_DUALPRED */ static void filter_block2d ( @@ -193,6 +248,32 @@ void vp8_sixtap_predict8x8_c } +#if CONFIG_DUALPRED +void vp8_sixtap_predict_avg8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter); + + /* then filter verticaly... 
*/ + filter_block2d_second_pass_avg(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); +} +#endif /* CONFIG_DUALPRED */ + void vp8_sixtap_predict8x4_c ( unsigned char *src_ptr, @@ -245,6 +326,33 @@ void vp8_sixtap_predict16x16_c } +#if CONFIG_DUALPRED +void vp8_sixtap_predict_avg16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[21*24]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, + src_pixels_per_line, 1, 21, 16, HFilter); + + /* then filter verticaly... */ + filter_block2d_second_pass_avg(FData + 32, dst_ptr, dst_pitch, + 16, 16, 16, 16, VFilter); +} +#endif /* CONFIG_DUALPRED */ /**************************************************************************** * @@ -349,6 +457,46 @@ static void filter_block2d_bil_second_pass } } +#if CONFIG_DUALPRED +/* + * As before for filter_block2d_second_pass_avg(), the functional difference + * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg() + * is that filter_block2d_bil_second_pass() does a bilinear filter on input + * and stores the result in output; filter_block2d_bil_second_pass_avg(), + * instead, does a bilinear filter on input, averages the resulting value + * with the values already present in the output and stores the result of + * that back into the output ((filter_result + dest + 1) >> 1). 
+ */ +static void filter_block2d_bil_second_pass_avg +( + unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(((Temp >> VP8_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} +#endif /* CONFIG_DUALPRED */ /**************************************************************************** * @@ -395,6 +543,28 @@ static void filter_block2d_bil filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); } +#if CONFIG_DUALPRED +static void filter_block2d_bil_avg +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + unsigned short FData[17*16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically... 
*/ + filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} +#endif /* CONFIG_DUALPRED */ void vp8_bilinear_predict4x4_c ( @@ -454,6 +624,28 @@ void vp8_bilinear_predict8x8_c } +#if CONFIG_DUALPRED +void vp8_bilinear_predict_avg8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter, 8, 8); +} +#endif /* CONFIG_DUALPRED */ + void vp8_bilinear_predict8x4_c ( unsigned char *src_ptr, @@ -492,3 +684,25 @@ void vp8_bilinear_predict16x16_c filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); } + +#if CONFIG_DUALPRED +void vp8_bilinear_predict_avg16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter, 16, 16); +} +#endif /* CONFIG_DUALPRED */ diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 68ed8aab0d..9619163140 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -84,6 +84,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #endif rtcd->recon.copy16x16 = vp8_copy_mem16x16_c; rtcd->recon.copy8x8 = vp8_copy_mem8x8_c; +#if CONFIG_DUALPRED + rtcd->recon.avg16x16 = vp8_avg_mem16x16_c; + rtcd->recon.avg8x8 = vp8_avg_mem8x8_c; +#endif /* CONFIG_DUALPRED */ rtcd->recon.copy8x4 = vp8_copy_mem8x4_c; rtcd->recon.recon = vp8_recon_b_c; #if CONFIG_I8X8 @@ -112,14 +116,22 @@ void 
vp8_machine_specific_config(VP8_COMMON *ctx) #endif - rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; - rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; - rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c; - rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c; - rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c; - rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c; - rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c; - rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c; + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; +#if CONFIG_DUALPRED + rtcd->subpix.sixtap_avg16x16 = vp8_sixtap_predict_avg16x16_c; + rtcd->subpix.sixtap_avg8x8 = vp8_sixtap_predict_avg8x8_c; +#endif /* CONFIG_DUALPRED */ + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_c; + rtcd->subpix.sixtap4x4 = vp8_sixtap_predict_c; + rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c; + rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_c; +#if CONFIG_DUALPRED + rtcd->subpix.bilinear_avg16x16 = vp8_bilinear_predict_avg16x16_c; + rtcd->subpix.bilinear_avg8x8 = vp8_bilinear_predict_avg8x8_c; +#endif /* CONFIG_DUALPRED */ + rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_c; + rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_c; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c; rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c; diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 7329e38697..08fc795012 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -73,6 +73,16 @@ typedef enum BILINEAR = 1 } INTERPOLATIONFILTERTYPE; +#if CONFIG_DUALPRED +typedef enum +{ + SINGLE_PREDICTION_ONLY = 0, + DUAL_PREDICTION_ONLY = 1, + HYBRID_PREDICTION = 2, + NB_PREDICTION_TYPES = 3, +} DUALPREDMODE_TYPE; +#endif /* CONFIG_DUALPRED */ + typedef struct VP8_COMMON_RTCD { #if CONFIG_RUNTIME_CPU_DETECT @@ -130,6 +140,9 @@ typedef struct VP8Common /* profile settings */ int experimental; int mb_no_coeff_skip; +#if 
CONFIG_DUALPRED + DUALPREDMODE_TYPE dual_pred_mode; +#endif /* CONFIG_DUALPRED */ int no_lpf; int use_bilinear_mc_filter; int full_pixel; diff --git a/vp8/common/recon.h b/vp8/common/recon.h index f459922e78..b82e9a720c 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -49,6 +49,18 @@ extern prototype_copy_block(vp8_recon_copy16x16); #endif extern prototype_copy_block(vp8_recon_copy8x8); +#if CONFIG_DUALPRED +#ifndef vp8_recon_avg16x16 +#define vp8_recon_avg16x16 vp8_avg_mem16x16_c +#endif +extern prototype_copy_block(vp8_recon_avg16x16); + +#ifndef vp8_recon_avg8x8 +#define vp8_recon_avg8x8 vp8_avg_mem8x8_c +#endif +extern prototype_copy_block(vp8_recon_avg8x8); +#endif /* CONFIG_DUALPRED */ + #ifndef vp8_recon_copy8x4 #define vp8_recon_copy8x4 vp8_copy_mem8x4_c #endif @@ -157,6 +169,10 @@ typedef struct vp8_recon_rtcd_vtable { vp8_copy_block_fn_t copy16x16; vp8_copy_block_fn_t copy8x8; +#if CONFIG_DUALPRED + vp8_copy_block_fn_t avg16x16; + vp8_copy_block_fn_t avg8x8; +#endif /* CONFIG_DUALPRED */ vp8_copy_block_fn_t copy8x4; vp8_recon_fn_t recon; #if CONFIG_I8X8 diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 064a8355ce..f677362e35 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -62,6 +62,30 @@ void vp8_copy_mem16x16_c( } +#if CONFIG_DUALPRED +void vp8_avg_mem16x16_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 16; r++) + { + int n; + + for (n = 0; n < 16; n++) + { + dst[n] = (dst[n] + src[n] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } +} +#endif /* CONFIG_DUALPRED */ + void vp8_copy_mem8x8_c( unsigned char *src, int src_stride, @@ -92,6 +116,30 @@ void vp8_copy_mem8x8_c( } +#if CONFIG_DUALPRED +void vp8_avg_mem8x8_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 8; r++) + { + int n; + + for (n = 0; n < 8; n++) + { + dst[n] = (dst[n] + src[n] + 1) >> 1; + } + + 
src += src_stride; + dst += dst_stride; + } +} +#endif /* CONFIG_DUALPRED */ + void vp8_copy_mem8x4_c( unsigned char *src, int src_stride, @@ -388,6 +436,74 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, } +#if CONFIG_DUALPRED +/* + * This function should be called after an initial call to + * vp8_build_inter16x16_predictors_mb() or _mby()/_mbuv(). + * It will run a second sixtap filter on a (different) ref + * frame and average the result with the output of the + * first sixtap filter. The second reference frame is stored + * in x->second_pre (the reference frame index is in + * x->mode_info_context->mbmi.second_ref_frame). The second + * motion vector is x->mode_info_context->mbmi.second_mv. + * + * This allows blending prediction from two reference frames + * which sometimes leads to better prediction than from a + * single reference framer. + */ +void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) +{ + int offset; + unsigned char *ptr; + unsigned char *uptr, *vptr; + + int mv_row = x->mode_info_context->mbmi.second_mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.second_mv.as_mv.col; + + unsigned char *ptr_base = x->second_pre.y_buffer; + int pre_stride = x->block[0].pre_stride; + + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict_avg16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, dst_ystride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, avg16x16)(ptr, pre_stride, dst_y, dst_ystride); + } + + /* calc uv motion vectors */ + mv_row = (mv_row + (mv_row > 0)) >> 1; + mv_col = (mv_col + (mv_col > 0)) >> 1; + + mv_row &= x->fullpixel_mask; + mv_col &= x->fullpixel_mask; + + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->second_pre.u_buffer + offset; + vptr = x->second_pre.v_buffer + offset; + + if ((mv_row | mv_col) & 
7) + { + x->subpixel_predict_avg8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, dst_u, dst_uvstride); + x->subpixel_predict_avg8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, dst_v, dst_uvstride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, avg8x8)(uptr, pre_stride, dst_u, dst_uvstride); + RECON_INVOKE(&x->rtcd->recon, avg8x8)(vptr, pre_stride, dst_v, dst_uvstride); + } +} +#endif /* CONFIG_DUALPRED */ + static void build_inter4x4_predictors_mb(MACROBLOCKD *x) { int i; @@ -490,6 +606,17 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x) { vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256], &x->predictor[320], 16, 8); +#if CONFIG_DUALPRED + if (x->mode_info_context->mbmi.second_ref_frame) + { + /* 256 = offset of U plane in Y+U+V buffer; + * 320 = offset of V plane in Y+U+V buffer. + * (256=16x16, 320=16x16+8x8). */ + vp8_build_2nd_inter16x16_predictors_mb(x, x->predictor, + &x->predictor[256], + &x->predictor[320], 16, 8); + } +#endif /* CONFIG_DUALPRED */ } else { diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 456812ecdd..c8e2f803b7 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -19,6 +19,12 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_v, int dst_ystride, int dst_uvstride); +extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x); diff --git a/vp8/common/subpixel.h b/vp8/common/subpixel.h index acdeec3bcb..33d4b355de 100644 --- a/vp8/common/subpixel.h +++ b/vp8/common/subpixel.h @@ -34,6 +34,18 @@ extern prototype_subpixel_predict(vp8_subpix_sixtap16x16); #endif extern prototype_subpixel_predict(vp8_subpix_sixtap8x8); +#if CONFIG_DUALPRED +#ifndef vp8_subpix_sixtap_avg16x16 +#define vp8_subpix_sixtap_avg16x16 vp8_sixtap_predict_avg16x16_c +#endif +extern 
prototype_subpixel_predict(vp8_subpix_sixtap_avg16x16); + +#ifndef vp8_subpix_sixtap_avg8x8 +#define vp8_subpix_sixtap_avg8x8 vp8_sixtap_predict_avg8x8_c +#endif +extern prototype_subpixel_predict(vp8_subpix_sixtap_avg8x8); +#endif /* CONFIG_DUALPRED */ + #ifndef vp8_subpix_sixtap8x4 #define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_c #endif @@ -54,6 +66,18 @@ extern prototype_subpixel_predict(vp8_subpix_bilinear16x16); #endif extern prototype_subpixel_predict(vp8_subpix_bilinear8x8); +#if CONFIG_DUALPRED +#ifndef vp8_subpix_bilinear_avg16x16 +#define vp8_subpix_bilinear_avg16x16 vp8_bilinear_predict_avg16x16_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear_avg16x16); + +#ifndef vp8_subpix_bilinear_avg8x8 +#define vp8_subpix_bilinear_avg8x8 vp8_bilinear_predict_avg8x8_c +#endif +extern prototype_subpixel_predict(vp8_subpix_bilinear_avg8x8); +#endif /* CONFIG_DUALPRED */ + #ifndef vp8_subpix_bilinear8x4 #define vp8_subpix_bilinear8x4 vp8_bilinear_predict8x4_c #endif @@ -69,10 +93,18 @@ typedef struct { vp8_subpix_fn_t sixtap16x16; vp8_subpix_fn_t sixtap8x8; +#if CONFIG_DUALPRED + vp8_subpix_fn_t sixtap_avg16x16; + vp8_subpix_fn_t sixtap_avg8x8; +#endif /* CONFIG_DUALPRED */ vp8_subpix_fn_t sixtap8x4; vp8_subpix_fn_t sixtap4x4; vp8_subpix_fn_t bilinear16x16; vp8_subpix_fn_t bilinear8x8; +#if CONFIG_DUALPRED + vp8_subpix_fn_t bilinear_avg16x16; + vp8_subpix_fn_t bilinear_avg8x8; +#endif /* CONFIG_DUALPRED */ vp8_subpix_fn_t bilinear8x4; vp8_subpix_fn_t bilinear4x4; } vp8_subpix_rtcd_vtable_t; diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c index 7e3137fd26..df2b85349e 100644 --- a/vp8/decoder/decodemv.c +++ b/vp8/decoder/decodemv.c @@ -392,6 +392,17 @@ static void mb_mode_mv_init(VP8D_COMP *pbi) pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8); pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8); pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8); +#if CONFIG_DUALPRED + pbi->common.dual_pred_mode = vp8_read(bc, 128); + if 
(pbi->common.dual_pred_mode) + pbi->common.dual_pred_mode += vp8_read(bc, 128); + if (pbi->common.dual_pred_mode == HYBRID_PREDICTION) + { + pbi->prob_dualpred[0] = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_dualpred[1] = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_dualpred[2] = (vp8_prob)vp8_read_literal(bc, 8); + } +#endif /* CONFIG_DUALPRED */ if (vp8_read_bit(bc)) { @@ -444,6 +455,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_top_edge -= LEFT_TOP_MARGIN; mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN; mbmi->need_to_clamp_mvs = 0; +#if CONFIG_DUALPRED + mbmi->second_ref_frame = 0; +#endif /* CONFIG_DUALPRED */ /* Distance of Mb to the various image edges. * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units */ @@ -666,6 +680,50 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_bottom_edge); propagate_mv: /* same MV throughout */ +#if CONFIG_DUALPRED + if (pbi->common.dual_pred_mode == DUAL_PREDICTION_ONLY || + (pbi->common.dual_pred_mode == HYBRID_PREDICTION && + vp8_read(bc, pbi->prob_dualpred[(mi[-1].mbmi.second_ref_frame != INTRA_FRAME) + + (mi[-mis].mbmi.second_ref_frame != INTRA_FRAME)]))) + { + mbmi->second_ref_frame = mbmi->ref_frame + 1; + if (mbmi->second_ref_frame == 4) + mbmi->second_ref_frame = 1; + } + if (mbmi->second_ref_frame) + { + vp8_find_near_mvs(xd, mi, &nearest, &nearby, &best_mv, rct, + mbmi->second_ref_frame, pbi->common.ref_frame_sign_bias); + switch (mbmi->mode) { + case ZEROMV: + mbmi->second_mv.as_int = 0; + break; + case NEARMV: + mbmi->second_mv.as_int = nearby.as_int; + vp8_clamp_mv(&mbmi->second_mv, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + break; + case NEARESTMV: + mbmi->second_mv.as_int = nearest.as_int; + vp8_clamp_mv(&mbmi->second_mv, mb_to_left_edge, mb_to_right_edge, + mb_to_top_edge, mb_to_bottom_edge); + break; + case NEWMV: + read_mv(bc, &mbmi->second_mv.as_mv, 
(const MV_CONTEXT *) mvc); + mbmi->second_mv.as_mv.row += best_mv.as_mv.row; + mbmi->second_mv.as_mv.col += best_mv.as_mv.col; + mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&mbmi->second_mv, + mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + break; + default: + break; + } + } +#endif /* CONFIG_DUALPRED */ + #if CONFIG_ERROR_CONCEALMENT if(pbi->ec_enabled) { @@ -854,4 +912,4 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) #endif -} \ No newline at end of file +} diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index f3da2d0b57..2ad5d1b87c 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -135,6 +135,14 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, xd->dst.uv_stride); +#if CONFIG_DUALPRED + if (xd->mode_info_context->mbmi.second_ref_frame) + { + vp8_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } +#endif /* CONFIG_DUALPRED */ } #ifdef DEC_DEBUG if (dec_debug) { @@ -605,6 +613,25 @@ decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd) xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; +#if CONFIG_DUALPRED + if (xd->mode_info_context->mbmi.second_ref_frame) + { + int second_ref_fb_idx; + + /* Select the appropriate reference frame for this MB */ + if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) + second_ref_fb_idx = pc->lst_fb_idx; + else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) + second_ref_fb_idx = pc->gld_fb_idx; + else + second_ref_fb_idx = pc->alt_fb_idx; + + xd->second_pre.y_buffer = pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset; + xd->second_pre.u_buffer = pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset; + 
xd->second_pre.v_buffer = pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset; + } +#endif /* CONFIG_DUALPRED */ + if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) { /* propagate errors from reference frames */ @@ -852,6 +879,10 @@ static void init_frame(VP8D_COMP *pbi) xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4); xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8); xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap16x16); +#if CONFIG_DUALPRED + xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap_avg8x8); + xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap_avg16x16); +#endif /* CONFIG_DUALPRED */ } else { @@ -859,6 +890,10 @@ static void init_frame(VP8D_COMP *pbi) xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4); xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8); xd->subpixel_predict16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear16x16); +#if CONFIG_DUALPRED + xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear_avg8x8); + xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear_avg16x16); +#endif /* CONFIG_DUALPRED */ } if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 14ac2f5d4a..cf686380a1 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -135,6 +135,9 @@ typedef struct VP8Decompressor vp8_prob prob_last; vp8_prob prob_gf; vp8_prob prob_skip_false; +#if CONFIG_DUALPRED + vp8_prob prob_dualpred[3]; +#endif /* CONFIG_DUALPRED */ #if CONFIG_ERROR_CONCEALMENT MB_OVERLAP *overlaps; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 09dffe2691..f5c916f348 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -50,6 +50,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D 
mbd->subpixel_predict8x4 = xd->subpixel_predict8x4; mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; +#if CONFIG_DUALPRED + mbd->subpixel_predict_avg8x8 = xd->subpixel_predict_avg8x8; + mbd->subpixel_predict_avg16x16 = xd->subpixel_predict_avg16x16; +#endif /* CONFIG_DUALPRED */ mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1); mbd->mode_info_stride = pc->mode_info_stride; diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index b19d58e3c7..d3e61699ff 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -959,6 +959,9 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) int prob_last_coded; int prob_gf_coded; int prob_skip_false = 0; +#if CONFIG_DUALPRED + int prob_dual_pred[3]; +#endif /* CONFIG_DUALPRED */ cpi->mb.partition_info = cpi->mb.pi; @@ -1012,6 +1015,39 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) vp8_write_literal(w, prob_last_coded, 8); vp8_write_literal(w, prob_gf_coded, 8); +#if CONFIG_DUALPRED + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + vp8_write(w, 1, 128); + vp8_write(w, 1, 128); + for (i = 0; i < 3; i++) { + if (cpi->single_pred_count[i] + cpi->dual_pred_count[i]) + { + prob_dual_pred[i] = cpi->single_pred_count[i] * 256 / + (cpi->single_pred_count[i] + cpi->dual_pred_count[i]); + if (prob_dual_pred[i] < 1) + prob_dual_pred[i] = 1; + else if (prob_dual_pred[i] > 255) + prob_dual_pred[i] = 255; + } + else + { + prob_dual_pred[i] = 128; + } + vp8_write_literal(w, prob_dual_pred[i], 8); + } + } + else if (cpi->common.dual_pred_mode == SINGLE_PREDICTION_ONLY) + { + vp8_write(w, 0, 128); + } + else /* dual prediction only */ + { + vp8_write(w, 1, 128); + vp8_write(w, 0, 128); + } +#endif /* CONFIG_DUALPRED */ + update_mbintra_mode_probs(cpi); vp8_write_mvprobs(cpi); @@ -1153,14 +1189,29 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) switch (mode) /* new, split require MVs */ { case NEWMV: - #ifdef ENTROPY_STATS 
active_section = 5; #endif write_mv(w, &mi->mv.as_mv, &best_mv, mvc); +#if CONFIG_DUALPRED + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + int t = m[-mis].mbmi.second_ref_frame != INTRA_FRAME; + int l = m[-1 ].mbmi.second_ref_frame != INTRA_FRAME; + vp8_write(w, mi->second_ref_frame != INTRA_FRAME, + prob_dual_pred[t + l]); + } + if (mi->second_ref_frame) + { + const int second_rf = mi->second_ref_frame; + int_mv n1, n2; + int ct[4]; + vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, second_rf, cpi->common.ref_frame_sign_bias); + write_mv(w, &mi->second_mv.as_mv, &best_mv, mvc); + } +#endif /* CONFIG_DUALPRED */ break; - case SPLITMV: { int j = 0; @@ -1207,6 +1258,15 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) } break; default: +#if CONFIG_DUALPRED + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + int t = m[-mis].mbmi.second_ref_frame != INTRA_FRAME; + int l = m[-1 ].mbmi.second_ref_frame != INTRA_FRAME; + vp8_write(w, mi->second_ref_frame != INTRA_FRAME, + prob_dual_pred[t + l]); + } +#endif /* CONFIG_DUALPRED */ break; } } @@ -1228,6 +1288,15 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) #endif cpi->mb.partition_info++; } + +#if CONFIG_DUALPRED + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + cpi->prob_dualpred[0] = (prob_dual_pred[0] + cpi->prob_dualpred[0] + 1) >> 1; + cpi->prob_dualpred[1] = (prob_dual_pred[1] + cpi->prob_dualpred[1] + 1) >> 1; + cpi->prob_dualpred[2] = (prob_dual_pred[2] + cpi->prob_dualpred[2] + 1) >> 1; + } +#endif /* CONFIG_DUALPRED */ } diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index a6a09247a2..ac3058106d 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -910,7 +910,7 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) xd->fullpixel_mask = 0xfffffff8; } -void vp8_encode_frame(VP8_COMP *cpi) +static void encode_frame_internal(VP8_COMP *cpi) { int mb_row; MACROBLOCK *const x = & cpi->mb; @@ -953,6 +953,12 @@ void 
vp8_encode_frame(VP8_COMP *cpi) &cpi->common.rtcd.subpix, sixtap8x8); xd->subpixel_predict16x16 = SUBPIX_INVOKE( &cpi->common.rtcd.subpix, sixtap16x16); +#if CONFIG_DUALPRED + xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE( + &cpi->common.rtcd.subpix, sixtap_avg8x8); + xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE( + &cpi->common.rtcd.subpix, sixtap_avg16x16); +#endif /* CONFIG_DUALPRED */ } else { @@ -964,6 +970,12 @@ void vp8_encode_frame(VP8_COMP *cpi) &cpi->common.rtcd.subpix, bilinear8x8); xd->subpixel_predict16x16 = SUBPIX_INVOKE( &cpi->common.rtcd.subpix, bilinear16x16); +#if CONFIG_DUALPRED + xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE( + &cpi->common.rtcd.subpix, bilinear_avg8x8); + xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE( + &cpi->common.rtcd.subpix, bilinear_avg16x16); +#endif /* CONFIG_DUALPRED */ } // Reset frame count of inter 0,0 motion vector usage. @@ -1006,6 +1018,11 @@ void vp8_encode_frame(VP8_COMP *cpi) // re-initencode frame context. init_encode_frame_mb_context(cpi); +#if CONFIG_DUALPRED + cpi->rd_single_diff = cpi->rd_dual_diff = cpi->rd_hybrid_diff = 0; + cpi->single_pred_count[0] = cpi->single_pred_count[1] = cpi->single_pred_count[2] = 0; + cpi->dual_pred_count[0] = cpi->dual_pred_count[1] = cpi->dual_pred_count[2] = 0; +#endif /* CONFIG_DUALPRED */ { struct vpx_usec_timer emr_timer; @@ -1189,6 +1206,121 @@ void vp8_encode_frame(VP8_COMP *cpi) #endif } + +void vp8_encode_frame(VP8_COMP *cpi) +{ +#if CONFIG_DUALPRED + if (cpi->sf.RD) + { + int frame_type, pred_type; + int redo = 0; + + /* + * This code does a single RD pass over the whole frame assuming + * either dual, single or hybrid prediction as per whatever has + * worked best for that type of frame in the past. + * It also predicts whether another coding mode would have worked + * better that this coding mode. If that is the case, it remembers + * that for subsequent frames. 
If the difference is above a certain + * threshold, it will actually re-encode the current frame using + * that different coding mode. + */ + if (cpi->common.frame_type == KEY_FRAME) + frame_type = 0; + else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame) + frame_type = 3; + else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame) + frame_type = 1; + else + frame_type = 2; + + if (cpi->rd_prediction_type_threshes[frame_type][1] > + cpi->rd_prediction_type_threshes[frame_type][0] && + cpi->rd_prediction_type_threshes[frame_type][1] > + cpi->rd_prediction_type_threshes[frame_type][2]) + pred_type = DUAL_PREDICTION_ONLY; + else if (cpi->rd_prediction_type_threshes[frame_type][0] > + cpi->rd_prediction_type_threshes[frame_type][1] && + cpi->rd_prediction_type_threshes[frame_type][0] > + cpi->rd_prediction_type_threshes[frame_type][2]) + pred_type = SINGLE_PREDICTION_ONLY; + else + pred_type = HYBRID_PREDICTION; + + cpi->common.dual_pred_mode = pred_type; + encode_frame_internal(cpi); + + cpi->rd_single_diff /= cpi->common.MBs; + cpi->rd_prediction_type_threshes[frame_type][0] += cpi->rd_single_diff; + cpi->rd_prediction_type_threshes[frame_type][0] >>= 1; + cpi->rd_dual_diff /= cpi->common.MBs; + cpi->rd_prediction_type_threshes[frame_type][1] += cpi->rd_dual_diff; + cpi->rd_prediction_type_threshes[frame_type][1] >>= 1; + cpi->rd_hybrid_diff /= cpi->common.MBs; + cpi->rd_prediction_type_threshes[frame_type][2] += cpi->rd_hybrid_diff; + cpi->rd_prediction_type_threshes[frame_type][2] >>= 1; + + /* FIXME make "100" (the threshold at which to re-encode the + * current frame) a commandline option. */ + if (cpi->common.dual_pred_mode == SINGLE_PREDICTION_ONLY && + (cpi->rd_dual_diff >= 100 || cpi->rd_hybrid_diff >= 100)) + { + redo = 1; + cpi->common.dual_pred_mode = cpi->rd_dual_diff > cpi->rd_hybrid_diff ? 
+ DUAL_PREDICTION_ONLY : HYBRID_PREDICTION; + } + else if (cpi->common.dual_pred_mode == DUAL_PREDICTION_ONLY && + (cpi->rd_single_diff >= 100 || cpi->rd_hybrid_diff >= 100)) + { + redo = 1; + cpi->common.dual_pred_mode = cpi->rd_single_diff > cpi->rd_hybrid_diff ? + SINGLE_PREDICTION_ONLY : HYBRID_PREDICTION; + } + else if (cpi->common.dual_pred_mode == HYBRID_PREDICTION && + (cpi->rd_single_diff >= 100 || cpi->rd_dual_diff >= 100)) + { + if (cpi->dual_pred_count[0] + cpi->dual_pred_count[1] + cpi->dual_pred_count[2] == 0) + { + cpi->common.dual_pred_mode = SINGLE_PREDICTION_ONLY; + } + else if (cpi->single_pred_count[0] + cpi->single_pred_count[1] + cpi->single_pred_count[2] == 0) + { + cpi->common.dual_pred_mode = DUAL_PREDICTION_ONLY; + } + else + { + redo = 1; + cpi->common.dual_pred_mode = cpi->rd_single_diff > cpi->rd_dual_diff ? + SINGLE_PREDICTION_ONLY : DUAL_PREDICTION_ONLY; + } + } + + + if (redo) + { + encode_frame_internal(cpi); + } + + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + if (cpi->dual_pred_count[0] + cpi->dual_pred_count[1] + cpi->dual_pred_count[2] == 0) + { + cpi->common.dual_pred_mode = SINGLE_PREDICTION_ONLY; + } + else if (cpi->single_pred_count[0] + cpi->single_pred_count[1] + cpi->single_pred_count[2] == 0) + { + cpi->common.dual_pred_mode = DUAL_PREDICTION_ONLY; + } + } + } + else +#endif /* CONFIG_DUALPRED */ + { + encode_frame_internal(cpi); + } + +} + void vp8_setup_block_ptrs(MACROBLOCK *x) { int r, c; @@ -1416,6 +1548,7 @@ int vp8cx_encode_inter_macroblock if (cpi->sf.RD) { int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled; + int single, dual, hybrid; /* Are we using the fast quantizer for the mode selection?
*/ if(cpi->sf.use_fastquant_for_pick) @@ -1430,7 +1563,23 @@ int vp8cx_encode_inter_macroblock cpi->zbin_mode_boost_enabled = 0; } vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, - &distortion, &intra_error); + &distortion, &intra_error, &single, &dual, &hybrid); +#if CONFIG_DUALPRED + cpi->rd_single_diff += single; + cpi->rd_dual_diff += dual; + cpi->rd_hybrid_diff += hybrid; + if (x->e_mbd.mode_info_context->mbmi.ref_frame && + x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) + { + MB_MODE_INFO *t = &x->e_mbd.mode_info_context[-cpi->common.mode_info_stride].mbmi; + MB_MODE_INFO *l = &x->e_mbd.mode_info_context[-1].mbmi; + int cnt = (t->second_ref_frame != INTRA_FRAME) + (l->second_ref_frame != INTRA_FRAME); + if (x->e_mbd.mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) + cpi->single_pred_count[cnt]++; + else + cpi->dual_pred_count[cnt]++; + } +#endif /* CONFIG_DUALPRED */ /* switch back to the regular quantizer for the encode */ if (cpi->sf.improved_quant) @@ -1581,6 +1730,27 @@ int vp8cx_encode_inter_macroblock xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; +#if CONFIG_DUALPRED + if (xd->mode_info_context->mbmi.second_ref_frame) { + int second_ref_fb_idx; + + cpi->mbs_dual_count++; + if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME) + second_ref_fb_idx = cpi->common.lst_fb_idx; + else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME) + second_ref_fb_idx = cpi->common.gld_fb_idx; + else + second_ref_fb_idx = cpi->common.alt_fb_idx; + + xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + + recon_yoffset; + xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + + recon_uvoffset; + xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + + recon_uvoffset; + } +#endif /* CONFIG_DUALPRED */ + if (!x->skip) { 
vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x); @@ -1591,10 +1761,11 @@ int vp8cx_encode_inter_macroblock } else + { vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.y_stride, xd->dst.uv_stride); - + } } #if CONFIG_T8X8 if ( get_seg_tx_type( xd, *segment_id ) == TX_8X8 ) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 64d7707f70..f2fa5b360e 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -398,6 +398,10 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) zd->subpixel_predict8x4 = xd->subpixel_predict8x4; zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; +#if CONFIG_DUALPRED + zd->subpixel_predict_avg8x8 = xd->subpixel_predict_avg8x8; + zd->subpixel_predict_avg16x16 = xd->subpixel_predict_avg16x16; +#endif /* CONFIG_DUALPRED */ zd->segmentation_enabled = xd->segmentation_enabled; zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; @@ -439,6 +443,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, mbd->subpixel_predict8x4 = xd->subpixel_predict8x4; mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; +#if CONFIG_DUALPRED + mbd->subpixel_predict_avg8x8 = xd->subpixel_predict_avg8x8; + mbd->subpixel_predict_avg16x16 = xd->subpixel_predict_avg16x16; +#endif /* CONFIG_DUALPRED */ #if CONFIG_RUNTIME_CPU_DETECT mbd->rtcd = xd->rtcd; #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 126a2db5e3..7b9f08fe63 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -802,6 +802,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) } cpi->mbs_tested_so_far = 0; + cpi->mbs_dual_count = 0; // best quality defaults sf->RD = 1; @@ -857,6 +858,21 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITG ] = 5000; sf->thresh_mult[THR_SPLITA ] = 5000; +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 0; + 
sf->thresh_mult[THR_DUAL_NEARESTLG] = 0; + sf->thresh_mult[THR_DUAL_NEARLG ] = 0; + sf->thresh_mult[THR_DUAL_ZEROLA ] = 0; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 0; + sf->thresh_mult[THR_DUAL_NEARLA ] = 0; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 0; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 0; + sf->thresh_mult[THR_DUAL_NEARGA ] = 0; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 1000; + sf->thresh_mult[THR_DUAL_NEWLA ] = 1000; + sf->thresh_mult[THR_DUAL_NEWGA ] = 1000; +#endif /* CONFIG_DUALPRED */ sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; @@ -908,6 +924,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITMV ] = 1700; sf->thresh_mult[THR_SPLITG ] = 4500; sf->thresh_mult[THR_SPLITA ] = 4500; + +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 0; + sf->thresh_mult[THR_DUAL_NEARESTLG] = 0; + sf->thresh_mult[THR_DUAL_NEARLG ] = 0; + sf->thresh_mult[THR_DUAL_ZEROLA ] = 0; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 0; + sf->thresh_mult[THR_DUAL_NEARLA ] = 0; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 0; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 0; + sf->thresh_mult[THR_DUAL_NEARGA ] = 0; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 1000; + sf->thresh_mult[THR_DUAL_NEWLA ] = 1000; + sf->thresh_mult[THR_DUAL_NEWGA ] = 1000; +#endif /* CONFIG_DUALPRED */ #else sf->thresh_mult[THR_NEWMV ] = 1500; sf->thresh_mult[THR_NEWG ] = 1500; @@ -968,6 +1000,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_NEWA ] = 2000; sf->thresh_mult[THR_SPLITA ] = 20000; } + +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 1500; + sf->thresh_mult[THR_DUAL_NEARESTLG] = 1500; + sf->thresh_mult[THR_DUAL_NEARLG ] = 1500; + sf->thresh_mult[THR_DUAL_ZEROLA ] = 1500; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 1500; + sf->thresh_mult[THR_DUAL_NEARLA ] = 1500; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 1500; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 1500; + sf->thresh_mult[THR_DUAL_NEARGA ] = 1500; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 2000; + 
sf->thresh_mult[THR_DUAL_NEWLA ] = 2000; + sf->thresh_mult[THR_DUAL_NEWGA ] = 2000; +#endif /* CONFIG_DUALPRED */ } if (Speed > 2) @@ -1008,6 +1056,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = 50000; } +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTLG] = 2000; + sf->thresh_mult[THR_DUAL_NEARLG ] = 2000; + sf->thresh_mult[THR_DUAL_ZEROLA ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 2000; + sf->thresh_mult[THR_DUAL_NEARLA ] = 2000; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 2000; + sf->thresh_mult[THR_DUAL_NEARGA ] = 2000; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 2500; + sf->thresh_mult[THR_DUAL_NEWLA ] = 2500; + sf->thresh_mult[THR_DUAL_NEWGA ] = 2500; +#endif /* CONFIG_DUALPRED */ + sf->improved_quant = 0; sf->improved_dct = 0; @@ -1065,6 +1129,15 @@ void vp8_set_speed_features(VP8_COMP *cpi) cpi->mode_check_freq[THR_NEWA] = 4; } +#if CONFIG_DUALPRED + cpi->mode_check_freq[THR_DUAL_NEARLG ] = 2; + cpi->mode_check_freq[THR_DUAL_NEARLA ] = 2; + cpi->mode_check_freq[THR_DUAL_NEARGA ] = 2; + cpi->mode_check_freq[THR_DUAL_NEWLG ] = 4; + cpi->mode_check_freq[THR_DUAL_NEWLA ] = 4; + cpi->mode_check_freq[THR_DUAL_NEWGA ] = 4; +#endif /* CONFIG_DUALPRED */ + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) { sf->thresh_mult[THR_NEARESTG ] = 2000; @@ -1080,6 +1153,12 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_NEARA ] = 2000; sf->thresh_mult[THR_NEWA ] = 4000; } + +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_NEWLG ] = 4000; + sf->thresh_mult[THR_DUAL_NEWLA ] = 4000; + sf->thresh_mult[THR_DUAL_NEWGA ] = 4000; +#endif /* CONFIG_DUALPRED */ } break; @@ -1114,6 +1193,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = 10000; sf->search_method = NSTEP; +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 1000; + sf->thresh_mult[THR_DUAL_NEARESTLG] = 1000; + sf->thresh_mult[THR_DUAL_NEARLG ] = 1000; + 
sf->thresh_mult[THR_DUAL_ZEROLA ] = 1000; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 1000; + sf->thresh_mult[THR_DUAL_NEARLA ] = 1000; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 1000; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 1000; + sf->thresh_mult[THR_DUAL_NEARGA ] = 1000; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 2000; + sf->thresh_mult[THR_DUAL_NEWLA ] = 2000; + sf->thresh_mult[THR_DUAL_NEWGA ] = 2000; +#endif /* CONFIG_DUALPRED */ + if (Speed > 0) { cpi->mode_check_freq[THR_SPLITG] = 4; @@ -1201,6 +1296,21 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = 50000; } +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTLG] = 2000; + sf->thresh_mult[THR_DUAL_NEARLG ] = 2000; + sf->thresh_mult[THR_DUAL_ZEROLA ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTLA] = 2000; + sf->thresh_mult[THR_DUAL_NEARLA ] = 2000; + sf->thresh_mult[THR_DUAL_ZEROGA ] = 2000; + sf->thresh_mult[THR_DUAL_NEARESTGA] = 2000; + sf->thresh_mult[THR_DUAL_NEARGA ] = 2000; + + sf->thresh_mult[THR_DUAL_NEWLG ] = 2500; + sf->thresh_mult[THR_DUAL_NEWLA ] = 2500; + sf->thresh_mult[THR_DUAL_NEWGA ] = 2500; +#endif /* CONFIG_DUALPRED */ } if (Speed > 2) @@ -1227,6 +1337,15 @@ void vp8_set_speed_features(VP8_COMP *cpi) cpi->mode_check_freq[THR_NEWA] = 4; } +#if CONFIG_DUALPRED + cpi->mode_check_freq[THR_DUAL_NEARLG ] = 2; + cpi->mode_check_freq[THR_DUAL_NEARLA ] = 2; + cpi->mode_check_freq[THR_DUAL_NEARGA ] = 2; + cpi->mode_check_freq[THR_DUAL_NEWLG ] = 4; + cpi->mode_check_freq[THR_DUAL_NEWLA ] = 4; + cpi->mode_check_freq[THR_DUAL_NEWGA ] = 4; +#endif /* CONFIG_DUALPRED */ + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; sf->thresh_mult[THR_SPLITG ] = INT_MAX; sf->thresh_mult[THR_SPLITA ] = INT_MAX; @@ -1289,6 +1408,12 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_NEARA ] = 2000; sf->thresh_mult[THR_NEWA ] = 4000; } + +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_NEWLG ] = 4000; + sf->thresh_mult[THR_DUAL_NEWLA ] = 4000; + 
sf->thresh_mult[THR_DUAL_NEWGA ] = 4000; +#endif /* CONFIG_DUALPRED */ } if (Speed > 5) @@ -1358,6 +1483,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_NEARA ] = thresh; } +#if CONFIG_DUALPRED + sf->thresh_mult[THR_DUAL_ZEROLG ] = thresh; + sf->thresh_mult[THR_DUAL_NEARESTLG] = thresh; + sf->thresh_mult[THR_DUAL_NEARLG ] = thresh; + sf->thresh_mult[THR_DUAL_ZEROLA ] = thresh; + sf->thresh_mult[THR_DUAL_NEARESTLA] = thresh; + sf->thresh_mult[THR_DUAL_NEARLA ] = thresh; + sf->thresh_mult[THR_DUAL_ZEROGA ] = thresh; + sf->thresh_mult[THR_DUAL_NEARESTGA] = thresh; + sf->thresh_mult[THR_DUAL_NEARGA ] = thresh; + + sf->thresh_mult[THR_DUAL_NEWLG ] = thresh << 1; + sf->thresh_mult[THR_DUAL_NEWLA ] = thresh << 1; + sf->thresh_mult[THR_DUAL_NEWGA ] = thresh << 1; +#endif /* CONFIG_DUALPRED */ + // Disable other intra prediction modes sf->thresh_mult[THR_TM] = INT_MAX; sf->thresh_mult[THR_V_PRED] = INT_MAX; @@ -1394,6 +1535,22 @@ void vp8_set_speed_features(VP8_COMP *cpi) cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1); } +#if CONFIG_DUALPRED + cpi->mode_check_freq[THR_DUAL_ZEROLG ] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARESTLG] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARLG ] = 1 << Tmp; + cpi->mode_check_freq[THR_DUAL_ZEROLA ] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARESTLA] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARLA ] = 1 << Tmp; + cpi->mode_check_freq[THR_DUAL_ZEROGA ] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARESTGA] = 1 << (Tmp - 1); + cpi->mode_check_freq[THR_DUAL_NEARGA ] = 1 << Tmp; + + cpi->mode_check_freq[THR_DUAL_NEWLG ] = 1 << (Tmp + 1); + cpi->mode_check_freq[THR_DUAL_NEWLA ] = 1 << (Tmp + 1); + cpi->mode_check_freq[THR_DUAL_NEWGA ] = 1 << (Tmp + 1); +#endif /* CONFIG_DUALPRED */ + cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1); } @@ -1439,6 +1596,31 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = INT_MAX; } +#if CONFIG_DUALPRED + if 
((cpi->ref_frame_flags & (VP8_LAST_FLAG | VP8_GOLD_FLAG)) != (VP8_LAST_FLAG | VP8_GOLD_FLAG)) + { + sf->thresh_mult[THR_DUAL_ZEROLG ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARESTLG] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARLG ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEWLG ] = INT_MAX; + } + + if ((cpi->ref_frame_flags & (VP8_LAST_FLAG | VP8_ALT_FLAG)) != (VP8_LAST_FLAG | VP8_ALT_FLAG)) + { + sf->thresh_mult[THR_DUAL_ZEROLA ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARESTLA] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARLA ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEWLA ] = INT_MAX; + } + + if ((cpi->ref_frame_flags & (VP8_GOLD_FLAG | VP8_ALT_FLAG)) != (VP8_GOLD_FLAG | VP8_ALT_FLAG)) + { + sf->thresh_mult[THR_DUAL_ZEROGA ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARESTGA] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEARGA ] = INT_MAX; + sf->thresh_mult[THR_DUAL_NEWGA ] = INT_MAX; + } +#endif /* CONFIG_DUALPRED */ // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -2132,6 +2314,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->prob_last_coded = 128; cpi->prob_gf_coded = 128; cpi->prob_intra_coded = 63; +#if CONFIG_DUALPRED + cpi->prob_dualpred[0] = 128; + cpi->prob_dualpred[1] = 128; + cpi->prob_dualpred[2] = 128; +#endif /* CONFIG_DUALPRED */ // Prime the recent reference frame useage counters. 
// Hereafter they will be maintained as a sort of moving average diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 38025cad5c..74c9876d0f 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -42,11 +42,12 @@ #define AF_THRESH 25 #define AF_THRESH2 100 #define ARF_DECAY_THRESH 12 -#if CONFIG_I8X8 -#define MAX_MODES 21 -#else -#define MAX_MODES 20 -#endif +#if CONFIG_DUALPRED +#define MAX_MODES (32 + CONFIG_I8X8) +#else /* CONFIG_DUALPRED */ +#define MAX_MODES (20 + CONFIG_I8X8) +#endif /* CONFIG_DUALPRED */ + #define MIN_THRESHMULT 32 #define MAX_THRESHMULT 512 @@ -192,6 +193,24 @@ typedef enum #if CONFIG_I8X8 THR_I8X8_PRED = 20, #endif + +#if CONFIG_DUALPRED + THR_DUAL_ZEROLG = 20, + THR_DUAL_NEARESTLG = 21, + THR_DUAL_NEARLG = 22, + + THR_DUAL_ZEROLA = 23, + THR_DUAL_NEARESTLA = 24, + THR_DUAL_NEARLA = 25, + + THR_DUAL_ZEROGA = 26, + THR_DUAL_NEARESTGA = 27, + THR_DUAL_NEARGA = 28, + + THR_DUAL_NEWLG = 29, + THR_DUAL_NEWLA = 30, + THR_DUAL_NEWGA = 31, +#endif /* CONFIG_DUALPRED */ } THR_MODES; @@ -339,10 +358,16 @@ typedef struct VP8_COMP unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; unsigned int mbs_tested_so_far; + unsigned int mbs_dual_count; int rd_thresh_mult[MAX_MODES]; int rd_baseline_thresh[MAX_MODES]; int rd_threshes[MAX_MODES]; +#if CONFIG_DUALPRED + int rd_single_diff, rd_dual_diff, rd_hybrid_diff; + int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; + int dual_pred_count[3], single_pred_count[3]; +#endif /* CONFIG_DUALPRED */ int RDMULT; int RDDIV ; @@ -491,6 +516,9 @@ typedef struct VP8_COMP int prob_skip_false; int last_skip_false_probs[3]; int last_skip_probs_q[3]; +#if CONFIG_DUALPRED + int prob_dualpred[3]; +#endif /* CONFIG_DUALPRED */ int recent_ref_frame_usage[MAX_REF_FRAMES]; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index d7a2058680..4b622d8bff 100644 --- a/vp8/encoder/pickinter.c +++ 
b/vp8/encoder/pickinter.c @@ -44,6 +44,7 @@ extern unsigned int cnt_pm; extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; +extern const MV_REFERENCE_FRAME vp8_second_ref_frame_order[MAX_MODES]; extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); @@ -528,6 +529,11 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, if (best_rd <= cpi->rd_threshes[mode_index]) continue; +#if CONFIG_DUALPRED + if (vp8_second_ref_frame_order[mode_index]) + continue; +#endif /* CONFIG_DUALPRED */ + x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame]) diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 8be284965f..fd5bd2e39e 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -106,6 +106,25 @@ const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] = #if CONFIG_I8X8 I8X8_PRED, #endif + +#if CONFIG_DUALPRED + /* dual prediction modes */ + ZEROMV, + NEARESTMV, + NEARMV, + + ZEROMV, + NEARESTMV, + NEARMV, + + ZEROMV, + NEARESTMV, + NEARMV, + + NEWMV, + NEWMV, + NEWMV, +#endif /* CONFIG_DUALPRED */ }; const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] = @@ -141,7 +160,54 @@ const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] = #if CONFIG_I8X8 INTRA_FRAME, #endif + +#if CONFIG_DUALPRED + /* dual prediction modes */ + LAST_FRAME, + LAST_FRAME, + LAST_FRAME, + + ALTREF_FRAME, + ALTREF_FRAME, + ALTREF_FRAME, + + GOLDEN_FRAME, + GOLDEN_FRAME, + GOLDEN_FRAME, + + LAST_FRAME, + ALTREF_FRAME, + GOLDEN_FRAME, +#endif /* CONFIG_DUALPRED */ +}; + +#if CONFIG_DUALPRED +const MV_REFERENCE_FRAME vp8_second_ref_frame_order[MAX_MODES] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +#if CONFIG_I8X8 + 0, +#endif + + /* dual prediction 
modes */ + GOLDEN_FRAME, + GOLDEN_FRAME, + GOLDEN_FRAME, + + LAST_FRAME, + LAST_FRAME, + LAST_FRAME, + + ALTREF_FRAME, + ALTREF_FRAME, + ALTREF_FRAME, + + GOLDEN_FRAME, + LAST_FRAME, + ALTREF_FRAME, }; +#endif /* CONFIG_DUALPRED */ static void fill_token_costs( unsigned int c [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS], @@ -997,7 +1063,6 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, x->e_mbd.mode_info_context->mbmi.segment_id); #endif - vp8_build_inter16x16_predictors_mbuv(&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); @@ -1970,7 +2035,10 @@ static void set_i8x8_block_modes(MACROBLOCK *x, int *modes) -void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion, int *returnintra, + int *best_single_rd_diff, int *best_dual_rd_diff, + int *best_hybrid_rd_diff) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -1996,6 +2064,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int distortion; int best_rd = INT_MAX; int best_intra_rd = INT_MAX; +#if CONFIG_DUALPRED + int best_dual_rd = INT_MAX; + int best_single_rd = INT_MAX; + int best_hybrid_rd = INT_MAX; +#endif /* CONFIG_DUALPRED */ int rate2, distortion2; int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly; int rate_y, UNINITIALIZED_IS_SAFE(rate_uv); @@ -2016,6 +2089,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int_mv frame_nearest_mv[4]; int_mv frame_near_mv[4]; int_mv frame_best_ref_mv[4]; +#if CONFIG_DUALPRED + int_mv mc_search_result[4]; +#endif /* CONFIG_DUALPRED */ int frame_mdcounts[4][4]; unsigned char *y_buffer[4]; unsigned 
char *u_buffer[4]; @@ -2023,6 +2099,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); vpx_memset(&best_bmodes, 0, sizeof(best_bmodes)); +#if CONFIG_DUALPRED + for (i = 0; i < 4; i++) + { +#define INVALID_MV 0x80008000 + mc_search_result[i].as_int = INVALID_MV; + } +#endif /* CONFIG_DUALPRED */ if (cpi->ref_frame_flags & VP8_LAST_FLAG) { @@ -2088,6 +2171,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int this_rd = INT_MAX; int disable_skip = 0; int other_cost = 0; +#if CONFIG_DUALPRED + int dualmode_cost = 0; + int mode_excluded = 0; +#endif /* CONFIG_DUALPRED */ // Experimental debug code. // Record of rd values recorded for this MB. -1 indicates not measured @@ -2109,6 +2196,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mode = this_mode; x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; +#if CONFIG_DUALPRED + x->e_mbd.mode_info_context->mbmi.second_ref_frame = vp8_second_ref_frame_order[mode_index]; +#endif /* CONFIG_DUALPRED */ //#if CONFIG_SEGFEATURES // If the segment reference frame feature is enabled.... 
@@ -2189,6 +2279,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int vp8_update_zbin_extra(cpi, x); } +#if CONFIG_DUALPRED + if (!x->e_mbd.mode_info_context->mbmi.second_ref_frame) +#endif /* CONFIG_DUALPRED */ switch (this_mode) { case B_PRED: @@ -2430,6 +2523,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse); } +#if CONFIG_DUALPRED + mc_search_result[x->e_mbd.mode_info_context->mbmi.ref_frame].as_int = d->bmi.mv.as_int; +#endif /* CONFIG_DUALPRED */ mode_mv[NEWMV].as_int = d->bmi.mv.as_int; @@ -2458,6 +2554,13 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); vp8_build_inter16x16_predictors_mby(&x->e_mbd); +#if CONFIG_DUALPRED + MB_MODE_INFO *t = &x->e_mbd.mode_info_context[-cpi->common.mode_info_stride].mbmi; + MB_MODE_INFO *l = &x->e_mbd.mode_info_context[-1].mbmi; + int cnt = (t->second_ref_frame != INTRA_FRAME) + (l->second_ref_frame != INTRA_FRAME); + dualmode_cost = vp8_cost_bit(cpi->prob_dualpred[cnt], 0); +#endif /* CONFIG_DUALPRED */ + if (cpi->active_map_enabled && x->active_ptr[0] == 0) { x->skip = 1; } @@ -2516,24 +2619,124 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int distortion2 += distortion; // UV cost and distortion + vp8_build_inter16x16_predictors_mbuv(&x->e_mbd); rd_inter16x16_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel); rate2 += rate_uv; distortion2 += distortion_uv; +#if CONFIG_DUALPRED + mode_excluded = cpi->common.dual_pred_mode == DUAL_PREDICTION_ONLY; +#endif /* CONFIG_DUALPRED */ break; default: break; } +#if CONFIG_DUALPRED + else /* x->e_mbd.mode_info_context->mbmi.second_ref_frame != 0 */ + { + int ref1 = x->e_mbd.mode_info_context->mbmi.ref_frame; + int ref2 = x->e_mbd.mode_info_context->mbmi.second_ref_frame; + + mode_excluded = cpi->common.dual_pred_mode == 
SINGLE_PREDICTION_ONLY; + switch (this_mode) + { + case NEWMV: + if (mc_search_result[ref1].as_int == INVALID_MV || + mc_search_result[ref2].as_int == INVALID_MV) + continue; + x->e_mbd.mode_info_context->mbmi.mv.as_int = mc_search_result[ref1].as_int; + x->e_mbd.mode_info_context->mbmi.second_mv.as_int = mc_search_result[ref2].as_int; + rate2 += vp8_mv_bit_cost(&mc_search_result[ref1], + &frame_best_ref_mv[ref1], x->mvcost, 96); + rate2 += vp8_mv_bit_cost(&mc_search_result[ref2], + &frame_best_ref_mv[ref2], x->mvcost, 96); + break; + case ZEROMV: + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + x->e_mbd.mode_info_context->mbmi.second_mv.as_int = 0; + break; + case NEARMV: + if (frame_near_mv[ref1].as_int == 0 || frame_near_mv[ref2].as_int == 0) + continue; + x->e_mbd.mode_info_context->mbmi.mv.as_int = frame_near_mv[ref1].as_int; + x->e_mbd.mode_info_context->mbmi.second_mv.as_int = frame_near_mv[ref2].as_int; + break; + case NEARESTMV: + if (frame_nearest_mv[ref1].as_int == 0 || frame_nearest_mv[ref2].as_int == 0) + continue; + x->e_mbd.mode_info_context->mbmi.mv.as_int = frame_nearest_mv[ref1].as_int; + x->e_mbd.mode_info_context->mbmi.second_mv.as_int = frame_nearest_mv[ref2].as_int; + break; + default: + break; + } + + /* Add in the Mv/mode cost */ + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + + vp8_clamp_mv2(&x->e_mbd.mode_info_context->mbmi.mv, xd); + vp8_clamp_mv2(&x->e_mbd.mode_info_context->mbmi.second_mv, xd); + if (((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row >> 3) < x->mv_row_min) || + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row >> 3) > x->mv_row_max) || + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col >> 3) < x->mv_col_min) || + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col >> 3) > x->mv_col_max) || + ((x->e_mbd.mode_info_context->mbmi.second_mv.as_mv.row >> 3) < x->mv_row_min) || + ((x->e_mbd.mode_info_context->mbmi.second_mv.as_mv.row >> 3) > x->mv_row_max) || + ((x->e_mbd.mode_info_context->mbmi.second_mv.as_mv.col >> 3) < 
x->mv_col_min) || + ((x->e_mbd.mode_info_context->mbmi.second_mv.as_mv.col >> 3) > x->mv_col_max)) + continue; + + /* build first and second prediction */ + vp8_build_inter16x16_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mbuv(&x->e_mbd); + /* do second round and average the results */ + x->e_mbd.second_pre.y_buffer = y_buffer[ref2]; + x->e_mbd.second_pre.u_buffer = u_buffer[ref2]; + x->e_mbd.second_pre.v_buffer = v_buffer[ref2]; + vp8_build_2nd_inter16x16_predictors_mb(&x->e_mbd, x->e_mbd.predictor, + &x->e_mbd.predictor[256], + &x->e_mbd.predictor[320], 16, 8); + + /* Y cost and distortion */ + macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)); + rate2 += rate_y; + distortion2 += distortion; + + /* UV cost and distortion */ + rd_inter16x16_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel); + rate2 += rate_uv; + distortion2 += distortion_uv; + + /* don't bother w/ skip, we would never have come here if skip were enabled */ + x->e_mbd.mode_info_context->mbmi.mode = this_mode; + + /* We don't include the cost of the second reference here, because there are only + * three options: Last/Golden, ARF/Last or Golden/ARF, or in other words if you + * present them in that order, the second one is always known if the first is known */ + MB_MODE_INFO *t = &x->e_mbd.mode_info_context[-cpi->common.mode_info_stride].mbmi; + MB_MODE_INFO *l = &x->e_mbd.mode_info_context[-1].mbmi; + int cnt = (t->second_ref_frame != INTRA_FRAME) + (l->second_ref_frame != INTRA_FRAME); + dualmode_cost = vp8_cost_bit(cpi->prob_dualpred[cnt], 1); + } +#endif /* CONFIG_DUALPRED */ // Where skip is allowable add in the default per mb cost for the no skip case. 
// where we then decide to skip we have to delete this and replace it with the // cost of signallying a skip if (cpi->common.mb_no_coeff_skip) { - other_cost += vp8_cost_bit(cpi->prob_skip_false, 0); - rate2 += other_cost; + int prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 0); + other_cost += prob_skip_cost; + rate2 += prob_skip_cost; } +#if CONFIG_DUALPRED + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + rate2 += dualmode_cost; + } +#endif /* CONFIG_DUALPRED */ + /* Estimate the reference frame signaling cost and add it * to the rolling cost variable. */ @@ -2589,9 +2792,26 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int *returnintra = distortion2 ; } +#if CONFIG_DUALPRED + if (!disable_skip && + (this_mode == SPLITMV || x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)) + { + if (this_rd < best_dual_rd) + best_dual_rd = this_rd; + if (this_rd < best_single_rd) + best_single_rd = this_rd; + if (this_rd < best_hybrid_rd) + best_hybrid_rd = this_rd; + } +#endif /* CONFIG_DUALPRED */ + // Did this mode help.. i.i is it the new best mode if (this_rd < best_rd || x->skip) { +#if CONFIG_DUALPRED + if (!mode_excluded) + { +#endif /* CONFIG_DUALPRED */ // Note index of best mode so far best_mode_index = mode_index; @@ -2624,7 +2844,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { best_bmodes[i] = x->e_mbd.block[i].bmi; } - +#if CONFIG_DUALPRED + } +#endif /* CONFIG_DUALPRED */ // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? 
cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; @@ -2642,6 +2864,48 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; } +#if CONFIG_DUALPRED + /* keep record of best dual/single-only prediction */ + if (!disable_skip && + x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME && + this_mode != SPLITMV) + { + int single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cpi->common.dual_pred_mode == HYBRID_PREDICTION) + { + single_rate = rate2 - dualmode_cost; + hybrid_rate = rate2; + } + else + { + single_rate = rate2; + hybrid_rate = rate2 + dualmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (x->e_mbd.mode_info_context->mbmi.second_ref_frame == INTRA_FRAME && + single_rd < best_single_rd) + { + best_single_rd = single_rd; + if (0) printf("single rd [DMC: %d]: %d\n", dualmode_cost, single_rd); + } + else if (x->e_mbd.mode_info_context->mbmi.second_ref_frame != INTRA_FRAME && + single_rd < best_dual_rd) + { + best_dual_rd = single_rd; + if (0) printf("dual rd [DMC: %d]: %d\n", dualmode_cost, single_rd); + } + if (hybrid_rd < best_hybrid_rd) + { + best_hybrid_rd = hybrid_rd; + if (0) printf("hybrid rd [DMC: %d]: %d\n", best_hybrid_rd, hybrid_rd); + } + } +#endif /* CONFIG_DUALPRED */ + if (x->skip) break; @@ -2694,6 +2958,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int (cpi->common.mb_no_coeff_skip) ? 
1 : 0; x->e_mbd.mode_info_context->mbmi.partitioning = 0; +#if CONFIG_DUALPRED + *best_single_rd_diff = *best_dual_rd_diff = *best_hybrid_rd_diff = 0; +#endif /* CONFIG_DUALPRED */ + return; } @@ -2730,8 +2998,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]); - - +#if CONFIG_DUALPRED + *best_single_rd_diff = best_rd - best_single_rd; + *best_dual_rd_diff = best_rd - best_dual_rd; + *best_hybrid_rd_diff = best_rd - best_hybrid_rd; +#endif /* CONFIG_DUALPRED */ } void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_) diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 20fe4b5bd1..a22abd12e5 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -16,7 +16,9 @@ #define RDCOST_8x8(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); +extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion, int *returnintra, + int *best_single_rd_diff, int *best_dual_rd_diff, int *best_hybrid_rd_diff); extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); extern void vp8_mv_pred -- GitLab