Commit 7756e989 authored by Ronald S. Bultje's avatar Ronald S. Bultje Committed by Gerrit Code Review
Browse files

Merge "Add subtract_block SSE2 version and unit test."

Showing with 223 additions and 843 deletions
......@@ -66,6 +66,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
......
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
extern "C" {
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_blockd.h"
}
// Pointer to a block-subtraction routine. The test below passes arguments in
// this order: rows, cols, the diff buffer and its stride, the source buffer
// and its stride, then the prediction buffer and its stride.
typedef void (*subtract_fn_t)(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
const uint8_t *src_ptr, ptrdiff_t src_stride,
const uint8_t *pred_ptr, ptrdiff_t pred_stride);
namespace vp9 {

// Alignment, in bytes, given to every buffer handed to the function under
// test. Optimized implementations may use aligned 16-byte loads and stores,
// and plain new[] does not promise that alignment on every platform.
const size_t kTestBufferAlignment = 16;

// Returns |ptr| rounded up to the next kTestBufferAlignment-byte boundary.
// The allocation behind |ptr| must carry at least kTestBufferAlignment spare
// bytes so that the rounded pointer still addresses a full-sized buffer.
template <typename T>
T *AlignTestBuffer(T *ptr) {
  const size_t mask = kTestBufferAlignment - 1;
  const size_t address = reinterpret_cast<size_t>(ptr);
  return reinterpret_cast<T *>((address + mask) & ~mask);
}

// Fixture parameterized on the subtract implementation to exercise.
class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> {
 public:
  virtual void TearDown() {
    libvpx_test::ClearSystemState();
  }
};

using libvpx_test::ACMRandom;

// For every block size, fills src/pred with random bytes and checks that the
// implementation writes diff == src - pred, once with all strides equal to
// the block width and once with all strides equal to twice the block width.
TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // FIXME(rbultje) split in its own file
  for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES;
       bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
    const int block_width = 4 << b_width_log2(bsize);
    const int block_height = 4 << b_height_log2(bsize);
    const size_t buffer_size =
        static_cast<size_t>(block_width) * block_height * 2;

    // Over-allocate by one alignment unit, then test through aligned views.
    // The *_alloc pointers are kept only so they can be passed to delete[].
    int16_t *const diff_alloc =
        new int16_t[buffer_size + kTestBufferAlignment / sizeof(int16_t)];
    uint8_t *const pred_alloc = new uint8_t[buffer_size + kTestBufferAlignment];
    uint8_t *const src_alloc = new uint8_t[buffer_size + kTestBufferAlignment];
    int16_t *const diff = AlignTestBuffer(diff_alloc);
    uint8_t *const pred = AlignTestBuffer(pred_alloc);
    uint8_t *const src = AlignTestBuffer(src_alloc);

    for (int n = 0; n < 100; n++) {
      // Randomize the whole double-width area so both stride layouts below
      // read fully initialized data.
      for (int r = 0; r < block_height; ++r) {
        for (int c = 0; c < block_width * 2; ++c) {
          src[r * block_width * 2 + c] = rnd.Rand8();
          pred[r * block_width * 2 + c] = rnd.Rand8();
        }
      }

      // Pass 1: tightly packed rows (stride == block width).
      GetParam()(block_height, block_width, diff, block_width,
                 src, block_width, pred, block_width);

      for (int r = 0; r < block_height; ++r) {
        for (int c = 0; c < block_width; ++c) {
          EXPECT_EQ(diff[r * block_width + c],
                    (src[r * block_width + c] -
                     pred[r * block_width + c])) << "r = " << r
                                                 << ", c = " << c
                                                 << ", bs = " << bsize;
        }
      }

      // Pass 2: rows spaced two block widths apart.
      GetParam()(block_height, block_width, diff, block_width * 2,
                 src, block_width * 2, pred, block_width * 2);

      for (int r = 0; r < block_height; ++r) {
        for (int c = 0; c < block_width; ++c) {
          EXPECT_EQ(diff[r * block_width * 2 + c],
                    (src[r * block_width * 2 + c] -
                     pred[r * block_width * 2 + c])) << "r = " << r
                                                     << ", c = " << c
                                                     << ", bs = " << bsize;
        }
      }
    }

    delete[] diff_alloc;
    delete[] pred_alloc;
    delete[] src_alloc;
  }
}

INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_c));

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
                        ::testing::Values(vp9_subtract_block_sse2));
#endif

}  // namespace vp9
......@@ -533,6 +533,9 @@ prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
specialize vp9_block_error mmx sse2
vp9_block_error_sse2=vp9_block_error_xmm
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block sse2
#
# Structured Similarity (SSIM)
#
......
......@@ -22,10 +22,10 @@
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
void vp9_subtract_block(int rows, int cols,
int16_t *diff_ptr, int diff_stride,
const uint8_t *src_ptr, int src_stride,
const uint8_t *pred_ptr, int pred_stride) {
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
const uint8_t *src_ptr, ptrdiff_t src_stride,
const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
int r, c;
for (r = 0; r < rows; r++) {
......
......@@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_block(int rows, int cols,
int16_t *diff_ptr, int diff_stride,
const uint8_t *src_ptr, int src_stride,
const uint8_t *pred_ptr, int pred_stride);
void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
;
; Subtracts a 4x4 block: for each of four rows, four bytes of z and four
; bytes of Predictor are widened to 16 bits and diff = z - Predictor is
; stored as four words.
; Row spacing: z rows are src_stride bytes apart, Predictor rows are pitch
; bytes apart, and diff rows are pitch*2 bytes (pitch words) apart.
; MMX registers mm0, mm1 and mm7 are used; no emms is executed here.
global sym(vp9_subtract_b_mmx_impl) PRIVATE
sym(vp9_subtract_b_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
; mm7 = 0, used to zero-extend bytes to words
pxor mm7, mm7
; row 0
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi], mm0
; row 1
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2],mm0
; row 2
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*4], mm0
; row 3: advance z by two rows and turn rcx into 3*pitch so that
; [rsi+rdx], [rax+rcx] and [rdi+rcx*2] all address the fourth row
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Subtracts a 16x16 block: the loop runs 16 times and each pass handles one
; row of 16 pixels as two 8-byte halves, storing diff = src - pred as words.
; src advances by stride bytes per row, pred by 16 bytes per row and diff by
; 32 bytes (16 words) per row.
; MMX registers mm0-mm4 are used; no emms is executed here.
global sym(vp9_subtract_mby_mmx) PRIVATE
sym(vp9_subtract_mby_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(1) ;src
mov rdi, arg(0) ;diff
mov rax, arg(2) ;pred
movsxd rdx, dword ptr arg(3) ;stride
; rcx = number of rows left
mov rcx, 16
; mm0 = 0, used to zero-extend bytes to words
pxor mm0, mm0
.submby_loop:
; pixels 0..7 of the row
movq mm1, [rsi]
movq mm3, [rax]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi], mm1
movq [rdi+8], mm2
; pixels 8..15 of the row
movq mm1, [rsi+8]
movq mm3, [rax+8]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi+16], mm1
movq [rdi+24], mm2
; step all three pointers to the next row
add rdi, 32
add rax, 16
lea rsi, [rsi+rdx]
sub rcx, 1
jnz .submby_loop
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Subtracts two 8x8 blocks, fully unrolled (no loop):
;   U: diff[256 + i] = usrc - pred[256 + i]
;   V: diff[320 + i] = vsrc - pred[320 + i]
; usrc/vsrc rows are stride bytes apart; pred rows are 8 bytes apart and diff
; rows are 16 bytes (8 words) apart.
; MMX registers mm0, mm1, mm3, mm4 and mm7 are used; no emms is executed here.
global sym(vp9_subtract_mbuv_mmx) PRIVATE
sym(vp9_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
;short *udiff = diff + 256;
;short *vdiff = diff + 320;
;unsigned char *upred = pred + 256;
;unsigned char *vpred = pred + 320;
;unsigned char *z = usrc;
;unsigned short *diff = udiff;
;unsigned char *Predictor= upred;
mov rdi, arg(0) ;diff
mov rax, arg(3) ;pred
mov rsi, arg(1) ;z = usrc
add rdi, 256*2 ;diff = diff + 256 (shorts)
add rax, 256 ;Predictor = pred + 256
movsxd rdx, dword ptr arg(4) ;stride;
; mm7 = 0, used to zero-extend bytes to words
pxor mm7, mm7
; U row 0
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
; U row 1
movq mm0, [rsi+rdx]
movq mm1, [rax+8]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+16], mm0
movq [rdi+24], mm3
; U row 2
movq mm0, [rsi+rdx*2]
movq mm1, [rax+16]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+32], mm0
movq [rdi+40], mm3
; U row 3 (rsi now points at row 2, so [rsi+rdx] is row 3)
lea rsi, [rsi+rdx*2]
movq mm0, [rsi+rdx]
movq mm1, [rax+24]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+48], mm0
movq [rdi+56], mm3
; move diff, pred and src forward to U row 4
add rdi, 64
add rax, 32
lea rsi, [rsi+rdx*2]
; U row 4
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
; U row 5
movq mm0, [rsi+rdx]
movq mm1, [rax+8]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+16], mm0
movq [rdi+24], mm3
; U row 6
movq mm0, [rsi+rdx*2]
movq mm1, [rax+16]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+32], mm0
movq [rdi+40], mm3
; U row 7
lea rsi, [rsi+rdx*2]
movq mm0, [rsi+rdx]
movq mm1, [rax+24]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+48], mm0
movq [rdi+56], mm3
;unsigned char *z = vsrc;
;unsigned short *diff = vdiff;
;unsigned char *Predictor= vpred;
mov rdi, arg(0) ;diff
mov rax, arg(3) ;pred
mov rsi, arg(2) ;z = vsrc
add rdi, 320*2 ;diff = diff + 320 (shorts)
add rax, 320 ;Predictor = pred + 320
movsxd rdx, dword ptr arg(4) ;stride;
pxor mm7, mm7
; V row 0
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
; V row 1
movq mm0, [rsi+rdx]
movq mm1, [rax+8]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+16], mm0
movq [rdi+24], mm3
; V row 2
movq mm0, [rsi+rdx*2]
movq mm1, [rax+16]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+32], mm0
movq [rdi+40], mm3
; V row 3
lea rsi, [rsi+rdx*2]
movq mm0, [rsi+rdx]
movq mm1, [rax+24]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+48], mm0
movq [rdi+56], mm3
; move diff, pred and src forward to V row 4
add rdi, 64
add rax, 32
lea rsi, [rsi+rdx*2]
; V row 4
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
; V row 5
movq mm0, [rsi+rdx]
movq mm1, [rax+8]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+16], mm0
movq [rdi+24], mm3
; V row 6
movq mm0, [rsi+rdx*2]
movq mm1, [rax+16]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+32], mm0
movq [rdi+40], mm3
; V row 7
lea rsi, [rsi+rdx*2]
movq mm0, [rsi+rdx]
movq mm1, [rax+24]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi+48], mm0
movq [rdi+56], mm3
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
......@@ -8,349 +8,121 @@
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
;
; Subtracts a 4x4 block: for each of four rows, four bytes of z and four
; bytes of Predictor are widened to 16 bits and diff = z - Predictor is
; stored as four words.
; Row spacing: z rows are src_stride bytes apart, Predictor rows are pitch
; bytes apart, and diff rows are pitch*2 bytes (pitch words) apart.
; NOTE: despite the _sse2 name, the body only touches MMX registers
; (mm0, mm1, mm7), and no emms is executed here.
global sym(vp9_subtract_b_sse2_impl) PRIVATE
sym(vp9_subtract_b_sse2_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
; mm7 = 0, used to zero-extend bytes to words
pxor mm7, mm7
; row 0
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi], mm0
; row 1
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
; row 2
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*4], mm0
; row 3: advance z by two rows and turn rcx into 3*pitch so that
; [rsi+rdx], [rax+rcx] and [rdi+rcx*2] all address the fourth row
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Subtracts a 16x16 block, two rows per loop pass (8 passes). src rows are
; stride bytes apart, pred rows are 16 bytes apart and diff rows are 32 bytes
; (16 words) apart. All loads and stores are movdqa, so src, pred and diff
; rows must be 16-byte aligned.
;
; Technique: psubb yields the low 8 bits of src - pred. Both inputs are then
; XORed with 0x80 so that the signed compare pcmpgtb behaves as an unsigned
; "pred > src" test, producing 0xFF exactly where the true difference is
; negative. Interleaving the low byte with that mask gives the sign-extended
; 16-bit difference.
global sym(vp9_subtract_mby_sse2) PRIVATE
sym(vp9_subtract_mby_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rsi, arg(1) ;src
mov rdi, arg(0) ;diff
mov rax, arg(2) ;pred
movsxd rdx, dword ptr arg(3) ;stride
mov rcx, 8 ; do two lines at one time
.submby_loop:
; first row of the pair
movdqa xmm0, XMMWORD PTR [rsi] ; src
movdqa xmm1, XMMWORD PTR [rax] ; pred
movdqa xmm2, xmm0
psubb xmm0, xmm1
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi +16], xmm2
; second row of the pair
movdqa xmm4, XMMWORD PTR [rsi + rdx]
movdqa xmm5, XMMWORD PTR [rax + 16]
movdqa xmm6, xmm4
psubb xmm4, xmm5
pxor xmm5, [GLOBAL(t80)] ;convert to signed values
pxor xmm6, [GLOBAL(t80)]
pcmpgtb xmm5, xmm6 ; obtain sign information
movdqa xmm6, xmm4
movdqa xmm7, xmm5
punpcklbw xmm4, xmm5 ; put sign back to subtraction
punpckhbw xmm6, xmm7 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi +32], xmm4
movdqa XMMWORD PTR [rdi +48], xmm6
; step all three pointers forward by two rows
add rdi, 64
add rax, 32
lea rsi, [rsi+rdx*2]
sub rcx, 1
jnz .submby_loop
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Subtracts two 8x8 blocks, fully unrolled, two rows per step:
;   U: diff[256 + i] = usrc - pred[256 + i]
;   V: diff[320 + i] = vsrc - pred[320 + i]
; usrc/vsrc rows are stride bytes apart; each pair of 8-byte source rows is
; packed into one xmm register with punpcklqdq. pred is read 16 bytes at a
; time (two 8-byte rows) and diff is written 32 bytes at a time, both with
; movdqa, so pred and diff must be 16-byte aligned.
; The sign of each byte difference is recovered with the t80 / pcmpgtb
; sequence annotated inline.
global sym(vp9_subtract_mbuv_sse2) PRIVATE
sym(vp9_subtract_mbuv_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rax, arg(3) ;pred
mov rsi, arg(1) ;z = usrc
add rdi, 256*2 ;diff = diff + 256 (shorts)
add rax, 256 ;Predictor = pred + 256
movsxd rdx, dword ptr arg(4) ;stride;
; rcx = 3 * stride, used to address the odd row of each second pair
lea rcx, [rdx + rdx*2]
;u
;line 0 1
movq xmm0, MMWORD PTR [rsi] ; src
movq xmm2, MMWORD PTR [rsi+rdx]
movdqa xmm1, XMMWORD PTR [rax] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi +16], xmm2
;line 2 3
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
movq xmm2, MMWORD PTR [rsi+rcx]
movdqa xmm1, XMMWORD PTR [rax+16] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 32], xmm0
movdqa XMMWORD PTR [rdi + 48], xmm2
;line 4 5
lea rsi, [rsi + rdx*4]
movq xmm0, MMWORD PTR [rsi] ; src
movq xmm2, MMWORD PTR [rsi+rdx]
movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 64], xmm0
movdqa XMMWORD PTR [rdi + 80], xmm2
;line 6 7
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
movq xmm2, MMWORD PTR [rsi+rcx]
movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 96], xmm0
movdqa XMMWORD PTR [rdi + 112], xmm2
;v
; rdi and rax still hold the U offsets (256); adding 64 elements to each
; reaches offset 320
mov rsi, arg(2) ;z = vsrc
add rdi, 64*2 ;diff = diff + 320 (shorts)
add rax, 64 ;Predictor = pred + 320
;line 0 1
movq xmm0, MMWORD PTR [rsi] ; src
movq xmm2, MMWORD PTR [rsi+rdx]
movdqa xmm1, XMMWORD PTR [rax] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi +16], xmm2
;line 2 3
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
movq xmm2, MMWORD PTR [rsi+rcx]
movdqa xmm1, XMMWORD PTR [rax+16] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 32], xmm0
movdqa XMMWORD PTR [rdi + 48], xmm2
;line 4 5
lea rsi, [rsi + rdx*4]
movq xmm0, MMWORD PTR [rsi] ; src
movq xmm2, MMWORD PTR [rsi+rdx]
movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 64], xmm0
movdqa XMMWORD PTR [rdi + 80], xmm2
;line 6 7
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
movq xmm2, MMWORD PTR [rsi+rcx]
movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
punpcklqdq xmm0, xmm2
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
pxor xmm2, [GLOBAL(t80)]
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa XMMWORD PTR [rdi + 96], xmm0
movdqa XMMWORD PTR [rdi + 112], xmm2
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; t80: sixteen 0x80 bytes, 16-byte aligned. The subtract routines XOR this
; into pixel bytes before pcmpgtb, which flips each byte's top bit so that
; the signed byte compare orders the original unsigned values.
align 16
t80:
times 16 db 0x80
%include "third_party/x86inc/x86inc.asm"
SECTION .text
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Writes diff = src - pred as 16-bit words for a rows x cols block.
;
; cols selects the code path: 4, 8, 16 and 32 branch to dedicated loops and
; any other value falls through to the 64-wide loop.
; diff_stride is counted in 16-bit elements (it is scaled by 2 wherever it is
; used as a byte offset); src_stride and pred_stride are byte offsets.
;
; The 16-, 32- and 64-wide paths access src, pred and diff with mova, and the
; 8-wide path stores diff with mova, so those rows must be 16-byte aligned.
; The 4-, 8- and 16-wide paths consume two rows per iteration.
;
; Only seven arguments are loaded into registers; pred_stride is fetched from
; its argument slot into the cols register once cols has been inspected.

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
; Subtracts two 16-pixel groups. Group k reads src+src_offk and
; pred+pred_offk and writes 16 words (two registers) at diff+diff_offk.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

; 64 wide: one row per iteration, four 16-pixel groups.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

; 32 wide: one row per iteration, two 16-pixel groups.
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

; 16 wide: two rows per iteration; the second group is the next row.
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts two rows of half a register each (movh loads, mova
; stores). It is expanded under both INIT_XMM and INIT_MMX below, so the
; register width it operates on depends on the active mode.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

; 8 wide: two rows per iteration using xmm registers.
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

; 4 wide: two rows per iteration using mm registers.
INIT_MMX
.case_4:
  ; From INIT_MMX onwards m7 names mm7, not the xmm7 cleared on entry, so
  ; the zero register has to be established again for this path.
  pxor                  m7, m7
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  emms
  RET
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vpx_ports/x86.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/x86/vp9_dct_mmx.h"
// TODO(jimbankoski) Consider rewriting the c to take the same values rather
// than going through these pointer conversions
#if 0 && HAVE_MMX
// Runs vp9_short_fdct4x4_mmx twice with the same pitch: once on
// (input, output) and once on the pair offset by 4 input elements and 16
// output elements.
void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
  short *const second_input = input + 4;
  short *const second_output = output + 16;

  vp9_short_fdct4x4_mmx(input, output, pitch);
  vp9_short_fdct4x4_mmx(second_input, second_output, pitch);
}
// Worker routine defined outside this file.
void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);

// Unpacks the source pointer, source stride, residual buffer and predictor
// pointer from the BLOCK/BLOCKD pair and forwards them, together with pitch,
// to vp9_subtract_b_mmx_impl.
void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
  unsigned char *predictor = *(bd->base_dst) + bd->dst;
  short *diff = &be->src_diff[0];
  unsigned int src_stride = be->src_stride;
  unsigned char *z = *(be->base_src) + be->src;

  // TODO(jingning): The prototype function in c has been changed. Need to
  // modify the mmx and sse versions.
  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}
#endif
#if 0 && HAVE_SSE2
// Worker routine defined outside this file.
void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);

// Unpacks the source pointer, source stride, residual buffer and predictor
// pointer from the BLOCK/BLOCKD pair and forwards them, together with pitch,
// to vp9_subtract_b_sse2_impl.
void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
  unsigned char *predictor = *(bd->base_dst) + bd->dst;
  short *diff = &be->src_diff[0];
  unsigned int src_stride = be->src_stride;
  unsigned char *z = *(be->base_src) + be->src;

  // TODO(jingning): The prototype function in c has been changed. Need to
  // modify the mmx and sse versions.
  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}
#endif
......@@ -73,13 +73,11 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment