diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c index b8e403419dd989a75812df405738bccf9a9fdb85..78cc6fa379d66901b19963cbf81fce1afaa80f26 100644 --- a/vp8/encoder/arm/neon/denoising_neon.c +++ b/vp8/encoder/arm/neon/denoising_neon.c @@ -68,8 +68,8 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int64x2_t v_sum_diff_total = vdupq_n_s64(0); /* Go over lines. */ - int i; - for (i = 0; i < 16; ++i) { + int r; + for (r = 0; r < 16; ++r) { /* Load inputs. */ const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); @@ -145,14 +145,91 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, /* Too much adjustments => copy block. */ { - const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), vget_low_s64(v_sum_diff_total)); - const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); int sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; - if (s0 > sum_diff_thresh) + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. 
+ if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_running_avg_y_stride * 16; + running_avg_y -= running_avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + { + // Update the sum of all pixel differences of this MB. 
+ x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { return COPY_BLOCK; + } + } } /* Tell above level that block was filtered. */