Commit 0ede9f52 authored by Jingning Han

Unify subtract function used in VP8/9

This commit replaces the vp8_-prefixed subtract functions with the
common vpx_subtract_block function. It removes the now-redundant SIMD
optimization code and unit tests.

Change-Id: I42e086c32c93c6125e452dcaa6ed04337fe028d9
parent 9cb3a134
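For orientation, here is a minimal sketch of the unified path this change adopts: the per-block residual is computed by the shared vpx_dsp routine rather than a codec-specific kernel. The sketch assumes the vpx_subtract_block prototype from vpx_dsp (rows, cols, then diff/src/pred pointers with their strides); the helper name and buffers are hypothetical and only illustrate the parameter mapping used throughout the change.

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"  /* declares vpx_subtract_block() */

/* Hypothetical example: compute a 4x4 residual block, diff = src - pred.
 * The residual is written with a pitch of 16 int16_t entries, matching the
 * layout VP8 uses for its per-block src_diff buffers. */
static void example_subtract_4x4(int16_t *diff, const uint8_t *src,
                                 int src_stride, const uint8_t *pred,
                                 int pred_stride) {
  vpx_subtract_block(4, 4,              /* rows, cols */
                     diff, 16,          /* destination and its stride */
                     src, src_stride,   /* source pixels */
                     pred, pred_stride  /* prediction pixels */);
}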
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vp8/encoder/block.h"
#include "vpx_mem/vpx_mem.h"
typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
namespace {
class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
 public:
  virtual void TearDown() {
    libvpx_test::ClearSystemState();
  }
};
using libvpx_test::ACMRandom;
TEST_P(SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  BLOCK be;
  BLOCKD bd;
  // in libvpx, this stride is always 16
  const int kDiffPredStride = 16;
  const int kSrcStride[] = {32, 16, 8, 4, 0};
  const int kBlockWidth = 4;
  const int kBlockHeight = 4;

  // Allocate... align to 16 for mmx/sse tests
  uint8_t *source = reinterpret_cast<uint8_t*>(
      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
  be.src_diff = reinterpret_cast<int16_t*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

  for (int i = 0; kSrcStride[i] > 0; ++i) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
    be.src_stride = kSrcStride[i];

    // set difference
    int16_t *src_diff = be.src_diff;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        src_diff[c] = static_cast<int16_t>(0xa5a5u);
      }
      src_diff += kDiffPredStride;
    }

    // set destination
    uint8_t *base_src = *be.base_src;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        base_src[c] = rnd.Rand8();
      }
      base_src += be.src_stride;
    }

    // set predictor
    uint8_t *predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        predictor[c] = rnd.Rand8();
      }
      predictor += kDiffPredStride;
    }

    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));

    base_src = *be.base_src;
    src_diff = be.src_diff;
    predictor = bd.predictor;
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
                                                             << ", c = " << c;
      }
      src_diff += kDiffPredStride;
      predictor += kDiffPredStride;
      base_src += be.src_stride;
    }
  }
  vpx_free(be.src_diff);
  vpx_free(source);
  vpx_free(bd.predictor);
}
INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_c));

#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_neon));
#endif

#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_mmx));
#endif

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
                        ::testing::Values(vp8_subtract_b_sse2));
#endif
} // namespace
@@ -104,7 +104,6 @@ endif
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
......
@@ -343,15 +343,6 @@ add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
specialize qw/vp8_mbuverror mmx sse2/;
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
-add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 neon/;
-add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 neon/;
-add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
#
# Motion search
#
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/encoder/block.h"
void vp8_subtract_b_neon(
        BLOCK *be,
        BLOCKD *bd,
        int pitch) {
    unsigned char *src_ptr, *predictor;
    int src_stride;
    int16_t *src_diff;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    uint16x8_t q10u16, q11u16, q12u16, q13u16;

    src_ptr = *be->base_src + be->src;
    src_stride = be->src_stride;
    predictor = bd->predictor;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d4u8 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d6u8 = vld1_u8(src_ptr);

    d1u8 = vld1_u8(predictor);
    predictor += pitch;
    d3u8 = vld1_u8(predictor);
    predictor += pitch;
    d5u8 = vld1_u8(predictor);
    predictor += pitch;
    d7u8 = vld1_u8(predictor);

    q10u16 = vsubl_u8(d0u8, d1u8);
    q11u16 = vsubl_u8(d2u8, d3u8);
    q12u16 = vsubl_u8(d4u8, d5u8);
    q13u16 = vsubl_u8(d6u8, d7u8);

    src_diff = be->src_diff;
    vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
    src_diff += pitch;
    vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
    src_diff += pitch;
    vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
    src_diff += pitch;
    vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
    return;
}

void vp8_subtract_mby_neon(
        int16_t *diff,
        unsigned char *src,
        int src_stride,
        unsigned char *pred,
        int pred_stride) {
    int i;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;

    for (i = 0; i < 8; i++) {  // subtract_mby_loop
        q0u8 = vld1q_u8(src);
        src += src_stride;
        q2u8 = vld1q_u8(src);
        src += src_stride;
        q1u8 = vld1q_u8(pred);
        pred += pred_stride;
        q3u8 = vld1q_u8(pred);
        pred += pred_stride;

        q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
        q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
        q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
        q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));

        vst1q_u16((uint16_t *)diff, q8u16);
        diff += 8;
        vst1q_u16((uint16_t *)diff, q9u16);
        diff += 8;
        vst1q_u16((uint16_t *)diff, q10u16);
        diff += 8;
        vst1q_u16((uint16_t *)diff, q11u16);
        diff += 8;
    }
    return;
}

void vp8_subtract_mbuv_neon(
        int16_t *diff,
        unsigned char *usrc,
        unsigned char *vsrc,
        int src_stride,
        unsigned char *upred,
        unsigned char *vpred,
        int pred_stride) {
    int i, j;
    unsigned char *src_ptr, *pred_ptr;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;

    diff += 256;
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            src_ptr = usrc;
            pred_ptr = upred;
        } else if (i == 1) {
            src_ptr = vsrc;
            pred_ptr = vpred;
        }

        for (j = 0; j < 2; j++) {
            d0u8 = vld1_u8(src_ptr);
            src_ptr += src_stride;
            d1u8 = vld1_u8(pred_ptr);
            pred_ptr += pred_stride;
            d2u8 = vld1_u8(src_ptr);
            src_ptr += src_stride;
            d3u8 = vld1_u8(pred_ptr);
            pred_ptr += pred_stride;
            d4u8 = vld1_u8(src_ptr);
            src_ptr += src_stride;
            d5u8 = vld1_u8(pred_ptr);
            pred_ptr += pred_stride;
            d6u8 = vld1_u8(src_ptr);
            src_ptr += src_stride;
            d7u8 = vld1_u8(pred_ptr);
            pred_ptr += pred_stride;

            q8u16 = vsubl_u8(d0u8, d1u8);
            q9u16 = vsubl_u8(d2u8, d3u8);
            q10u16 = vsubl_u8(d4u8, d5u8);
            q11u16 = vsubl_u8(d6u8, d7u8);

            vst1q_u16((uint16_t *)diff, q8u16);
            diff += 8;
            vst1q_u16((uint16_t *)diff, q9u16);
            diff += 8;
            vst1q_u16((uint16_t *)diff, q10u16);
            diff += 8;
            vst1q_u16((uint16_t *)diff, q11u16);
            diff += 8;
        }
    }
    return;
}
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
@@ -19,80 +20,29 @@
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
-// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
-// codec specified vp9_subtract_ functions.
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
-    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
-    int src_stride = be->src_stride;
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-        }
-        diff_ptr += pitch;
-        pred_ptr += pitch;
-        src_ptr += src_stride;
-    }
-}
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
+
+  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
+                     pred_ptr, pitch);
+}

-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
-                         int src_stride, unsigned char *upred,
-                         unsigned char *vpred, int pred_stride)
-{
-    short *udiff = diff + 256;
-    short *vdiff = diff + 320;
-    int r, c;
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            udiff[c] = usrc[c] - upred[c];
-        }
-        udiff += 8;
-        upred += pred_stride;
-        usrc += src_stride;
-    }
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            vdiff[c] = vsrc[c] - vpred[c];
-        }
-        vdiff += 8;
-        vpred += pred_stride;
-        vsrc += src_stride;
-    }
-}
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                       int src_stride, unsigned char *upred,
+                       unsigned char *vpred, int pred_stride) {
+  short *udiff = diff + 256;
+  short *vdiff = diff + 320;
+
+  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
+  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
+}

-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
-                        unsigned char *pred, int pred_stride)
-{
-    int r, c;
-
-    for (r = 0; r < 16; r++)
-    {
-        for (c = 0; c < 16; c++)
-        {
-            diff[c] = src[c] - pred[c];
-        }
-        diff += 16;
-        pred += pred_stride;
-        src += src_stride;
-    }
-}
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride) {
+  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
+}

static void vp8_subtract_mb(MACROBLOCK *x)
......
@@ -19,6 +19,13 @@ extern "C" {
#endif
void vp8_encode_inter16x16(MACROBLOCK *x);
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                       int src_stride, unsigned char *upred,
+                       unsigned char *vpred, int pred_stride);
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride);
void vp8_build_dcblock(MACROBLOCK *b);
void vp8_transform_mb(MACROBLOCK *mb);
void vp8_transform_mbuv(MACROBLOCK *x);
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
pxor mm7, mm7
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi], mm0
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2],mm0
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*4], mm0
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;unsigned char *pred, int pred_stride)
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;src
movsxd rdx, dword ptr arg(2);src_stride
mov rax, arg(3) ;pred
push rbx
movsxd rbx, dword ptr arg(4);pred_stride
pxor mm0, mm0
mov rcx, 16
.submby_loop:
movq mm1, [rsi]
movq mm3, [rax]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi], mm1
movq [rdi+8], mm2
movq mm1, [rsi+8]
movq mm3, [rax+8]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi+16], mm1
movq [rdi+24], mm2
add rdi, 32
lea rax, [rax+rbx]
lea rsi, [rsi+rdx]
dec rcx
jnz .submby_loop
pop rbx
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
; int src_stride, unsigned char *upred,
; unsigned char *vpred, int pred_stride)
global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;usrc
movsxd rdx, dword ptr arg(3);src_stride;
mov rax, arg(4) ;upred
add rdi, 256*2 ;diff = diff + 256 (shorts)
mov rcx, 8
push rbx
movsxd rbx, dword ptr arg(6);pred_stride
pxor mm7, mm7
.submbu_loop:
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
add rdi, 16
add rsi, rdx
add rax, rbx
dec rcx
jnz .submbu_loop
mov rsi, arg(2) ;vsrc
mov rax, arg(5) ;vpred
mov rcx, 8
.submbv_loop:
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
add rdi, 16
add rsi, rdx
add rax, rbx
dec rcx
jnz .submbv_loop
pop rbx
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
global sym(vp8_subtract_b_sse2_impl) PRIVATE
sym(vp8_subtract_b_sse2_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
pxor mm7, mm7
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi], mm0
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*4], mm0
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
;unsigned char *pred, int pred_stride)
global sym(vp8_subtract_mby_sse2) PRIVATE
sym(vp8_subtract_mby_sse2):
push rbp