Commit ef887974 authored by Johann

vp8 fast quantizer with intrinsics

Reduce the dependency on the asm offsets file by using intrinsics. Disassembly
shows improvements over the previous assembly, specifically in register
management, preloading, and the {pro,epi}log. The speed change is within the
margin of error.

Change-Id: I8131b4b4d62bc092407fe847bfaa8f2c0e1384ff
Showing with 110 additions and 142 deletions
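For reference, the removed assembly and the new intrinsics below compute the same fast quantizer. The following is a simplified scalar sketch of that computation (hypothetical, not part of this commit; the field and table names mirror those in the diff, and 16-bit wraparound corner cases of the SIMD code are ignored):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference for one 4x4 block (16 coefficients). */
static void fast_quantize_b_scalar(const int16_t coeff[16],
                                   const int16_t round[16],
                                   const int16_t quant_fast[16],
                                   const int16_t dequant[16],
                                   const int16_t inv_zig_zag[16],
                                   int16_t qcoeff[16], int16_t dqcoeff[16],
                                   char *eob)
{
    int i, last = 0;
    for (i = 0; i < 16; i++)
    {
        int z = coeff[i];
        int x = abs(z) + round[i];              /* x = abs(z) + round   */
        int y = (x * quant_fast[i]) >> 16;      /* y = x * quant >> 16  */
        if (z < 0) y = -y;                      /* restore the sign of z */
        qcoeff[i] = (int16_t)y;
        dqcoeff[i] = (int16_t)(y * dequant[i]); /* dqcoeff = qcoeff * dequant */
        if (y != 0 && inv_zig_zag[i] > last)
            last = inv_zig_zag[i];
    }
    *eob = (char)last;  /* 1-based zig zag index of the last nonzero coeff */
}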
vp8/encoder/x86/quantize_sse2.asm
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
    pop rbp
    ret

; void vp8_fast_quantize_b_sse2 | arg
;  (BLOCK *b,                   |  0
;   BLOCKD *d)                  |  1

global sym(vp8_fast_quantize_b_sse2) PRIVATE
sym(vp8_fast_quantize_b_sse2):
    push rbp
    mov rbp, rsp
    GET_GOT rbx

%if ABI_IS_32BIT
    push rdi
    push rsi
%else
  %if LIBVPX_YASM_WIN64
    push rdi
    push rsi
  %else
    ; these registers are used for passing arguments
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov rdi, arg(0)                 ; BLOCK *b
    mov rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov rdi, rcx                    ; BLOCK *b
    mov rsi, rdx                    ; BLOCKD *d
  %else
    ;mov rdi, rdi                   ; BLOCK *b
    ;mov rsi, rsi                   ; BLOCKD *d
  %endif
%endif

    mov rax, [rdi + vp8_block_coeff]
    mov rcx, [rdi + vp8_block_round]
    mov rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff
    movdqa xmm0, [rax]
    movdqa xmm4, [rax + 16]

    ; dup z so we can save sz
    movdqa xmm1, xmm0
    movdqa xmm5, xmm4

    ; sz = z >> 15
    psraw xmm0, 15
    psraw xmm4, 15

    ; x = abs(z) = (z ^ sz) - sz
    pxor xmm1, xmm0
    pxor xmm5, xmm4
    psubw xmm1, xmm0
    psubw xmm5, xmm4

    ; x += round
    paddw xmm1, [rcx]
    paddw xmm5, [rcx + 16]

    mov rax, [rsi + vp8_blockd_qcoeff]
    mov rcx, [rsi + vp8_blockd_dequant]
    mov rdi, [rsi + vp8_blockd_dqcoeff]

    ; y = x * quant >> 16
    pmulhw xmm1, [rdx]
    pmulhw xmm5, [rdx + 16]

    ; x = (y ^ sz) - sz
    pxor xmm1, xmm0
    pxor xmm5, xmm4
    psubw xmm1, xmm0
    psubw xmm5, xmm4

    ; qcoeff = x
    movdqa [rax], xmm1
    movdqa [rax + 16], xmm5

    ; x * dequant
    movdqa xmm2, xmm1
    movdqa xmm3, xmm5
    pmullw xmm2, [rcx]
    pmullw xmm3, [rcx + 16]

    ; dqcoeff = x * dequant
    movdqa [rdi], xmm2
    movdqa [rdi + 16], xmm3

    pxor xmm4, xmm4                 ; clear all bits
    pcmpeqw xmm1, xmm4
    pcmpeqw xmm5, xmm4

    pcmpeqw xmm4, xmm4              ; set all bits
    pxor xmm1, xmm4
    pxor xmm5, xmm4

    pand xmm1, [GLOBAL(inv_zig_zag)]
    pand xmm5, [GLOBAL(inv_zig_zag + 16)]

    pmaxsw xmm1, xmm5

    mov rcx, [rsi + vp8_blockd_eob]

    ; now down to 8
    pshufd xmm5, xmm1, 00001110b
    pmaxsw xmm1, xmm5

    ; only 4 left
    pshuflw xmm5, xmm1, 00001110b
    pmaxsw xmm1, xmm5

    ; okay, just 2!
    pshuflw xmm5, xmm1, 00000001b
    pmaxsw xmm1, xmm5

    movd eax, xmm1
    and eax, 0xff

    mov BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop rsi
    pop rdi
%else
  %if LIBVPX_YASM_WIN64
    pop rsi
    pop rdi
  %endif
%endif

    RESTORE_GOT
    pop rbp
    ret

SECTION_RODATA
align 16
inv_zig_zag:
vp8/encoder/x86/quantize_sse2.c
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8/common/blockd.h"
#include "vp8/common/entropy.h"
#include "vp8/encoder/block.h"
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
    __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
    __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

    __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

    /* sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* x += round */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    /* y = (x * quant) >> 16 */
    y0 = _mm_mulhi_epi16(x0, quant_fast0);
    y1 = _mm_mulhi_epi16(x1, quant_fast1);

    /* x = y with the sign of z restored: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    x0 = _mm_sub_epi16(y0, sz0);
    x1 = _mm_sub_epi16(y1, sz1);

    /* qcoeff = x */
    _mm_store_si128((__m128i *)(d->qcoeff), x0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

    /* x * dequant */
    xdq0 = _mm_mullo_epi16(x0, dequant0);
    xdq1 = _mm_mullo_epi16(x1, dequant1);

    /* dqcoeff = x * dequant */
    _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

    /* build a mask of the nonzero coefficients and keep their
     * inverse zig zag indices */
    zeros = _mm_setzero_si128();

    x0 = _mm_cmpeq_epi16(x0, zeros);
    x1 = _mm_cmpeq_epi16(x1, zeros);

    ones = _mm_cmpeq_epi16(zeros, zeros);

    x0 = _mm_xor_si128(x0, ones);
    x1 = _mm_xor_si128(x1, ones);

    x0 = _mm_and_si128(x0, inv_zig_zag0);
    x1 = _mm_and_si128(x1, inv_zig_zag1);

    x0 = _mm_max_epi16(x0, x1);

    /* now down to 8 */
    x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
    x0 = _mm_max_epi16(x0, x1);

    /* only 4 left */
    x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
    x0 = _mm_max_epi16(x0, x1);

    /* okay, just 2! */
    x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
    x0 = _mm_max_epi16(x0, x1);

    *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}
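The tail of the function is a horizontal maximum: the compare/XOR pair turns each nonzero lane into an all-ones mask, the AND keeps that lane's inverse zig zag index (zeroed lanes contribute 0), and the three shuffle/max steps reduce the remaining eight lanes to the largest index, which is the eob. Isolated as a stand-alone helper (hypothetical, for illustration only, not part of this commit), the reduction would look like:

#include <emmintrin.h>

/* Horizontal maximum of eight signed 16-bit lanes (hypothetical helper). */
static int16_t hmax_epi16(__m128i v)
{
    v = _mm_max_epi16(v, _mm_shuffle_epi32(v, 0xE));   /* 8 lanes -> 4 */
    v = _mm_max_epi16(v, _mm_shufflelo_epi16(v, 0xE)); /* 4 lanes -> 2 */
    v = _mm_max_epi16(v, _mm_shufflelo_epi16(v, 0x1)); /* 2 lanes -> 1 */
    return (int16_t)_mm_cvtsi128_si32(v);              /* lane 0 holds the max */
}

With such a helper, the final store above would be equivalent to *d->eob = 0xFF & hmax_epi16(x0) once the 16-to-8 reduction with _mm_max_epi16(x0, x1) has been done.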
vp8/vp8cx.mk
@@ -89,8 +89,15 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
# TODO(johann) make this generic
ifeq ($(HAVE_SSE2),yes)
vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
endif
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
ifeq ($(HAVE_SSE2),yes)
@@ -112,7 +119,6 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
endif
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
$(eval $(call asm_offsets_template,\
......