Commit 94ebdba7 authored by Johann Koenig's avatar Johann Koenig Committed by Gerrit Code Review
Browse files

Merge "vp9 temporal filter: sse4 implementation"

parents 0e8fea6c 6dfeea65
......@@ -141,53 +141,60 @@ TEST_P(TemporalFilterTest, SizeCombinations) {
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength, filter_weight,
&accum_ref, &count_ref);
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
ASM_REGISTER_STATE_CHECK(
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Width: %d Height: %d\n", width, height);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
ASSERT_TRUE(false);
return;
}
}
}
}
TEST_P(TemporalFilterTest, CompareReferenceRandom) {
const int width = 16;
const int height = 16;
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
count_ref.Set(rnd_.Rand8());
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength, filter_weight,
&accum_ref, &count_ref);
filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
height, filter_strength, filter_weight,
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Weight: %d Strength: %d\n", filter_weight, filter_strength);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
ASSERT_TRUE(false);
for (int width = 8; width <= 16; width += 8) {
for (int height = 8; height <= 16; height += 8) {
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
for (int repeat = 0; repeat < 10; ++repeat) {
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
accum_ref.Set(rnd_.Rand8());
accum_chk.CopyFrom(accum_ref);
count_ref.Set(rnd_.Rand8());
count_chk.CopyFrom(count_ref);
reference_filter(a, b, width, height, filter_strength,
filter_weight, &accum_ref, &count_ref);
ASM_REGISTER_STATE_CHECK(filter_func_(
a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
filter_strength, filter_weight, accum_chk.TopLeftPixel(),
count_chk.TopLeftPixel()));
EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
EXPECT_TRUE(count_chk.CheckValues(count_ref));
if (HasFailure()) {
printf("Weight: %d Strength: %d\n", filter_weight,
filter_strength);
count_chk.PrintDifference(count_ref);
accum_chk.PrintDifference(accum_ref);
return;
}
}
}
}
}
}
......@@ -222,9 +229,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
}
vpx_usec_timer_mark(&timer);
const int elapsed_time =
static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
printf("Temporal filter %dx%d time: %5d ms\n", width, height,
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
printf("Temporal filter %dx%d time: %5d us\n", width, height,
elapsed_time);
}
}
......@@ -233,10 +239,8 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_c));
/* TODO(johannkoenig): https://bugs.chromium.org/p/webm/issues/detail?id=1378
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
::testing::Values(&vp9_temporal_filter_apply_sse4_1));
#endif // HAVE_SSE4_1
*/
} // namespace
......@@ -198,7 +198,7 @@ add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const str
specialize qw/vp9_diamond_search_sad avx/;
add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse2 msa/;
specialize qw/vp9_temporal_filter_apply sse4_1/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
......
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp9_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
static void temporal_filter_apply_8size_msa(const uint8_t *frm1_ptr,
uint32_t stride,
const uint8_t *frm2_ptr,
int32_t filt_sth, int32_t filt_wgt,
uint32_t *acc, uint16_t *cnt) {
uint32_t row;
uint64_t f0, f1, f2, f3;
v16i8 frm2, frm1 = { 0 };
v16i8 frm4, frm3 = { 0 };
v16u8 frm_r, frm_l;
v8i16 frm2_r, frm2_l;
v8i16 diff0, diff1, mod0_h, mod1_h;
v4i32 cnst3, cnst16, filt_wt, strength;
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
v4i32 acc0, acc1, acc2, acc3;
v8i16 cnt0, cnt1;
filt_wt = __msa_fill_w(filt_wgt);
strength = __msa_fill_w(filt_sth);
cnst3 = __msa_ldi_w(3);
cnst16 = __msa_ldi_w(16);
for (row = 2; row--;) {
LD4(frm1_ptr, stride, f0, f1, f2, f3);
frm1_ptr += (4 * stride);
LD_SB2(frm2_ptr, 16, frm2, frm4);
frm2_ptr += 32;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
INSERT_D2_SB(f0, f1, frm1);
INSERT_D2_SB(f2, f3, frm3);
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
UNPCK_UB_SH(frm2, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
UNPCK_UB_SH(frm4, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
}
}
static void temporal_filter_apply_16size_msa(const uint8_t *frm1_ptr,
uint32_t stride,
const uint8_t *frm2_ptr,
int32_t filt_sth, int32_t filt_wgt,
uint32_t *acc, uint16_t *cnt) {
uint32_t row;
v16i8 frm1, frm2, frm3, frm4;
v16u8 frm_r, frm_l;
v16i8 zero = { 0 };
v8u16 frm2_r, frm2_l;
v8i16 diff0, diff1, mod0_h, mod1_h;
v4i32 cnst3, cnst16, filt_wt, strength;
v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
v4i32 acc0, acc1, acc2, acc3;
v8i16 cnt0, cnt1;
filt_wt = __msa_fill_w(filt_wgt);
strength = __msa_fill_w(filt_sth);
cnst3 = __msa_ldi_w(3);
cnst16 = __msa_ldi_w(16);
for (row = 8; row--;) {
LD_SB2(frm1_ptr, stride, frm1, frm3);
frm1_ptr += stride;
LD_SB2(frm2_ptr, 16, frm2, frm4);
frm2_ptr += 16;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
LD_SW2(acc, 4, acc0, acc1);
LD_SW2(acc + 8, 4, acc2, acc3);
LD_SH2(cnt, 8, cnt0, cnt1);
ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
UNPCK_SH_SW(diff0, diff0_r, diff0_l);
UNPCK_SH_SW(diff1, diff1_r, diff1_l);
MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
mod0_w, mod1_w, mod2_w, mod3_w);
MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
mod1_w, mod2_w, mod3_w);
SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
diff0_r = (mod0_w < cnst16);
diff0_l = (mod1_w < cnst16);
diff1_r = (mod2_w < cnst16);
diff1_l = (mod3_w < cnst16);
SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
mod1_w, mod2_w, mod3_w);
mod0_w = diff0_r & mod0_w;
mod1_w = diff0_l & mod1_w;
mod2_w = diff1_r & mod2_w;
mod3_w = diff1_l & mod3_w;
MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
mod0_w, mod1_w, mod2_w, mod3_w);
PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
ST_SH2(mod0_h, mod1_h, cnt, 8);
cnt += 16;
ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
mod0_w, mod1_w, mod2_w, mod3_w);
ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
mod2_w, mod3_w);
ST_SW2(mod0_w, mod1_w, acc, 4);
acc += 8;
ST_SW2(mod2_w, mod3_w, acc, 4);
acc += 8;
frm1_ptr += stride;
frm2_ptr += 16;
}
}
void vp9_temporal_filter_apply_msa(const uint8_t *frame1_ptr, uint32_t stride,
const uint8_t *frame2_ptr, uint32_t blk_w,
uint32_t blk_h, int32_t strength,
int32_t filt_wgt, uint32_t *accu,
uint16_t *cnt) {
if (8 == (blk_w * blk_h)) {
temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
filt_wgt, accu, cnt);
} else if (16 == (blk_w * blk_h)) {
temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
filt_wgt, accu, cnt);
} else {
vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
strength, filt_wgt, accu, cnt);
}
}
......@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <math.h>
#include <limits.h>
......@@ -100,6 +101,12 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride,
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
assert(strength >= 0);
assert(strength <= 6);
assert(filter_weight >= 0);
assert(filter_weight <= 2);
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
......@@ -376,45 +383,44 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
adj_strength, filter_weight, accumulator, count);
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 256, count + 256);
vp9_highbd_temporal_filter_apply_c(
vp9_highbd_temporal_filter_apply(
f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength,
filter_weight, accumulator, count);
vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width,
mb_uv_height, strength, filter_weight,
accumulator + 256, count + 256);
vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width,
mb_uv_height, strength, filter_weight,
accumulator + 512, count + 512);
}
#else
// Apply the filter (YUV)
// TODO(jingning): Need SIMD optimization for this.
vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
}
#else
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16, strength, filter_weight,
accumulator, count);
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 256,
count + 256);
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512, mb_uv_width, mb_uv_height,
strength, filter_weight, accumulator + 512,
count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
......
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <smmintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
// Division using multiplication and shifting. The C implementation does:
// modifier *= 3;
// modifier /= index;
// where 'modifier' is a set of summed values and 'index' is the number of
// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
// which may be bound by the edges of the block being filtered.
//
// This equation works out to (m * 3) / i which reduces to:
// m * 3/4
// m * 1/2
// m * 1/3
//
// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
// m * C / 65536
// we can create a C to replicate the division.
//
// m * 49152 / 65536 = m * 3/4
// m * 32758 / 65536 = m * 1/2
// m * 21846 / 65536 = m * 0.3333
//
// These are loaded using an instruction expecting int16_t values but are used
// with _mm_mulhi_epu16(), which treats them as unsigned.
#define NEIGHBOR_CONSTANT_4 (int16_t)49152
#define NEIGHBOR_CONSTANT_6 (int16_t)32768
#define NEIGHBOR_CONSTANT_9 (int16_t)21846
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);
const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);
const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);
// Shift all the values one place to the left/right so we can efficiently sum
// diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);
// It becomes necessary to treat the values as unsigned at this point. The
// 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
// forward since the filter is only applied to smooth small pixel changes.
// Once the value has saturated to uint16_t it is well outside the useful
// range.
__m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
*sum = sum_u16;
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
__m128i *sum_1) {
const __m128i zero = _mm_setzero_si128();
const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);