Commit 80465dae authored by JackyChen's avatar JackyChen

Add SSE2 code and unit test for VP9 denoiser.

This SSE2 is based on VP8 denoiser's SSE2 code. In VP8, there are
only 16x16 blocks in denoiser, while in VP9, there are 13 different
block sizes.

By adding this SSE2 code, the improvement of encoder speed is around
20%(using C code vs using SSE2 code), vary for different clips.

The unit test for VP9 denoiser is to confirm that the SSE2 code is
bit-exact with the C code. The unit test covers all block size.

Change-Id: Ic8d8ac26db4ea40a5f146b5678a065af07eaaa3d
parent 63e49be3
......@@ -135,13 +135,16 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
endif
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
LIBVPX_TEST_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += vp9_denoiser_sse2_test.cc
endif
endif # VP9
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
endif # CONFIG_SHARED
##
## TEST DATA
##
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
using libvpx_test::ACMRandom;
namespace {
const int kNumPixels = 64 * 64;
class VP9DenoiserTest
: public ::testing::TestWithParam<int> {
public:
virtual ~VP9DenoiserTest() {}
virtual void SetUp() {
bs = (BLOCK_SIZE)GetParam();
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
BLOCK_SIZE bs;
};
TEST_P(VP9DenoiserTest, BitexactCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 4000;
// Allocate the space for input and output,
// where sig_block is the block to be denoised,
// mc_avg_block is the denoised reference block,
// avg_block_c is the denoised result from C code,
// avg_block_sse2 is the denoised result from SSE2 code.
DECLARE_ALIGNED_ARRAY(16, uint8_t, sig_block, kNumPixels);
DECLARE_ALIGNED_ARRAY(16, uint8_t, mc_avg_block, kNumPixels);
DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_c, kNumPixels);
DECLARE_ALIGNED_ARRAY(16, uint8_t, avg_block_sse2, kNumPixels);
for (int i = 0; i < count_test_block; ++i) {
// Generate random motion magnitude, 20% of which exceed the threshold.
uint8_t motion_magnitude_random
= rnd.Rand8() % (uint8_t)(MOTION_MAGNITUDE_THRESHOLD * 1.2);
// Initialize a test block with random number in range [0, 255].
for (int j = 0; j < kNumPixels; ++j) {
int temp = 0;
sig_block[j] = rnd.Rand8();
// The pixels in mc_avg_block are generated by adding a random
// number in range [-19, 19] to corresponding pixels in sig_block.
temp = sig_block[j] + (rnd.Rand8() % 2 == 0? -1 : 1) *
(rnd.Rand8()%20);
// Clip.
mc_avg_block[j] = (temp < 0? 0 : (temp > 255? 255 : temp));
}
ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(sig_block, 64,
mc_avg_block, 64, avg_block_c, 64,
0, bs, motion_magnitude_random));
ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(sig_block, 64,
mc_avg_block, 64, avg_block_sse2, 64,
0, bs, motion_magnitude_random));
// Test bitexactness.
for (int h = 0; h < (4 << b_height_log2_lookup[bs]); ++h) {
for (int w = 0; w < (4 << b_width_log2_lookup[bs]); ++w) {
EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]);
}
}
}
}
// Test for all block size.
INSTANTIATE_TEST_CASE_P(
SSE2, VP9DenoiserTest,
::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64));
} // namespace
......@@ -36,7 +36,6 @@ const int size_group_lookup[BLOCK_SIZES] =
const int num_pels_log2_lookup[BLOCK_SIZES] =
{4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
{ // 4X4
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
......
......@@ -1057,6 +1057,14 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
#
# Denoiser
#
if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
specialize qw/vp9_denoiser_filter sse2/;
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# the transform coefficients are held in 32-bit
# values, so the assembler code for vp9_block_error can no longer be used.
......
......@@ -31,9 +31,6 @@
static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
#endif
static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64};
static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64};
static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
(void)bs;
return 3 + (increase_denoising ? 1 : 0);
......@@ -52,7 +49,9 @@ static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
}
static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40);
return (4 << b_width_log2_lookup[bs]) *
(4 << b_height_log2_lookup[bs]) *
(increase_denoising ? 60 : 40);
}
static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
......@@ -61,25 +60,31 @@ static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
noise_motion_thresh(bs, increase_denoising)) {
return 0;
} else {
return widths[bs] * heights[bs] * 20;
return (4 << b_width_log2_lookup[bs]) *
(4 << b_height_log2_lookup[bs]) * 20;
}
}
static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
return (4 << b_width_log2_lookup[bs]) *
(4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2);
return (4 << b_width_log2_lookup[bs]) *
(4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
const uint8_t *mc_avg,
int mc_avg_stride,
uint8_t *avg, int avg_stride,
int increase_denoising,
BLOCK_SIZE bs,
int motion_magnitude) {
// TODO(jackychen): If increase_denoising is enabled in the future,
// we might need to update the code for calculating 'total_adj' in
// case the C code is not bit-exact with corresponding sse2 code.
int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
const uint8_t *mc_avg,
int mc_avg_stride,
uint8_t *avg, int avg_stride,
int increase_denoising,
BLOCK_SIZE bs,
int motion_magnitude) {
int r, c;
const uint8_t *sig_start = sig;
const uint8_t *mc_avg_start = mc_avg;
......@@ -102,8 +107,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
}
// First attempt to apply a strong temporal denoising filter.
for (r = 0; r < heights[bs]; ++r) {
for (c = 0; c < widths[bs]; ++c) {
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
diff = mc_avg[c] - sig[c];
absdiff = abs(diff);
......@@ -152,8 +157,8 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
mc_avg = mc_avg_start;
avg = avg_start;
sig = sig_start;
for (r = 0; r < heights[bs]; ++r) {
for (c = 0; c < widths[bs]; ++c) {
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
diff = mc_avg[c] - sig[c];
adj = abs(diff);
if (adj > delta) {
......@@ -193,8 +198,8 @@ static uint8_t *block_start(uint8_t *framebuf, int stride,
static void copy_block(uint8_t *dest, int dest_stride,
const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
int r;
for (r = 0; r < heights[bs]; ++r) {
vpx_memcpy(dest, src, widths[bs]);
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
dest += dest_stride;
src += src_stride;
}
......@@ -336,10 +341,10 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
&motion_magnitude);
if (decision == FILTER_BLOCK) {
decision = denoiser_filter(src.buf, src.stride,
mc_avg_start, mc_avg.y_stride,
avg_start, avg.y_stride,
0, bs, motion_magnitude);
decision = vp9_denoiser_filter(src.buf, src.stride,
mc_avg_start, mc_avg.y_stride,
avg_start, avg.y_stride,
0, bs, motion_magnitude);
}
if (decision == FILTER_BLOCK) {
......
......@@ -55,6 +55,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#endif
int border);
#if CONFIG_VP9_TEMPORAL_DENOISING
int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising);
#endif
void vp9_denoiser_free(VP9_DENOISER *denoiser);
#ifdef __cplusplus
......
This diff is collapsed.
......@@ -120,6 +120,10 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment