diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index be9f26c7fcee495ef842498f3e653857eff09bde..4a2329fc1b1a4dc856fce6e8a15ae230c7113a3c 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -129,9 +129,6 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) rc = vp8_default_zig_zag1d[i]; z = coeff_ptr[rc]; - //if ( i == 0 ) - // zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2; - //else zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; zbin_boost_ptr ++; @@ -144,13 +141,13 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc]; // quantize (x) x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value if (y) { eob = i; // last nonzero coeffs - zbin_boost_ptr = &b->zrun_zbin_boost[0]; // reset zero runlength + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength } } } diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 57bf3c93a158a5402296532b835c68d0c11c0cfd..45e1a2ad3387278728633840ad239460a7185afc 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -11,220 +11,169 @@ %include "vpx_ports/x86_abi_support.asm" -;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; const int *default_zig_zag, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr, +;int vp8_regular_quantize_b_impl_sse2( +; short *coeff_ptr, +; short *zbin_ptr, +; short *qcoeff_ptr, +; short *dequant_ptr, +; const int *default_zig_zag, +; short *round_ptr, +; short *quant_ptr, +; short *dqcoeff_ptr, ; unsigned short zbin_oq_value, -; short *zbin_boost_ptr); +; short *zbin_boost_ptr, +; short *quant_shift); ; global sym(vp8_regular_quantize_b_impl_sse2) 
sym(vp8_regular_quantize_b_impl_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 10 + SHADOW_ARGS_TO_STACK 11 + SAVE_XMM push rsi push rdi push rbx - ; end prolog - ALIGN_STACK 16, rax + %define abs_minus_zbin 0 + %define temp_qcoeff 32 + %define qcoeff 64 + %define eob_tmp 96 + %define stack_size 112 + sub rsp, stack_size + ; end prolog - %define abs_minus_zbin_lo 0 - %define abs_minus_zbin_hi 16 - %define temp_qcoeff_lo 32 - %define temp_qcoeff_hi 48 - %define save_xmm6 64 - %define save_xmm7 80 - %define eob 96 - - %define vp8_regularquantizeb_stack_size eob + 16 - - sub rsp, vp8_regularquantizeb_stack_size - - movdqa OWORD PTR[rsp + save_xmm6], xmm6 - movdqa OWORD PTR[rsp + save_xmm7], xmm7 - - mov rdx, arg(0) ;coeff_ptr - mov eax, arg(8) ;zbin_oq_value - - mov rcx, arg(1) ;zbin_ptr - movd xmm7, eax + mov rdx, arg(0) ; coeff_ptr + mov rcx, arg(1) ; zbin_ptr + movd xmm7, arg(8) ; zbin_oq_value + mov rdi, arg(5) ; round_ptr + mov rsi, arg(6) ; quant_ptr + ; z movdqa xmm0, OWORD PTR[rdx] movdqa xmm4, OWORD PTR[rdx + 16] + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value + movdqa xmm1, xmm0 movdqa xmm5, xmm4 - psraw xmm0, 15 ;sign of z (aka sz) - psraw xmm4, 15 ;sign of z (aka sz) + ; sz + psraw xmm0, 15 + psraw xmm4, 15 + ; (z ^ sz) pxor xmm1, xmm0 pxor xmm5, xmm4 - movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr - movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr - - pshuflw xmm7, xmm7, 0 - psubw xmm1, xmm0 ;x = abs(z) + ; x = abs(z) + psubw xmm1, xmm0 + psubw xmm5, xmm4 - punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value - psubw xmm5, xmm4 ;x = abs(z) + movdqa xmm2, OWORD PTR[rcx] + movdqa xmm3, OWORD PTR[rcx + 16] + ; *zbin_ptr + zbin_oq_value paddw xmm2, xmm7 paddw xmm3, xmm7 - psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value) - psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value) - - mov rdi, arg(5) ;round_ptr - mov rsi, arg(6) ;quant_ptr + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm1, xmm2 + psubw xmm5, xmm3 + movdqa OWORD PTR[rsp + 
abs_minus_zbin], xmm1 + movdqa OWORD PTR[rsp + abs_minus_zbin + 16], xmm5 - movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1 - movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5 - - paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back - paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back + ; add (zbin_ptr + zbin_oq_value) back + paddw xmm1, xmm2 + paddw xmm5, xmm3 movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rsi] - movdqa xmm6, OWORD PTR[rdi + 16] + + movdqa xmm3, OWORD PTR[rsi] movdqa xmm7, OWORD PTR[rsi + 16] + ; x + round paddw xmm1, xmm2 paddw xmm5, xmm6 - pmulhw xmm1, xmm3 - pmulhw xmm5, xmm7 - - mov rsi, arg(2) ;qcoeff_ptr - pxor xmm6, xmm6 - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1 - movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5 - - movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff - movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff - - xor rax, rax - mov rcx, -1 - - mov [rsp + eob], rcx - mov rsi, arg(9) ;zbin_boost_ptr - - mov rbx, arg(4) ;default_zig_zag + ; y = x * quant_ptr >> 16 + pmulhw xmm3, xmm1 + pmulhw xmm7, xmm5 -rq_zigzag_loop: - movsxd rcx, DWORD PTR[rbx + rax*4] ;now we have rc - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ + ; y += x + paddw xmm1, xmm3 + paddw xmm5, xmm7 - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] + movdqa OWORD PTR[rsp + temp_qcoeff], xmm1 + movdqa OWORD PTR[rsp + temp_qcoeff + 16], xmm5 - sub edx, edi ;x - zbin - jl rq_zigzag_1 - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1 - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea 
rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1a - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1a - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1a: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1b - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1b - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1b: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1c - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1c - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1c: - lea rax, [rax + 1] - - cmp rax, 16 - jl rq_zigzag_loop - - mov rdi, arg(2) ;qcoeff_ptr - mov rcx, arg(3) ;dequant_ptr - mov rsi, arg(7) ;dqcoeff_ptr - - movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rdi + 16] + pxor xmm6, xmm6 + ; zero qcoeff + movdqa OWORD PTR[rsp + qcoeff], xmm6 + movdqa OWORD PTR[rsp + qcoeff + 16], xmm6 + + mov [rsp + eob_tmp], DWORD -1 ; eob + mov rsi, arg(9) ; zbin_boost_ptr + mov rdi, arg(4) ; default_zig_zag + mov rax, arg(10) ; quant_shift_ptr + +%macro ZIGZAG_LOOP 2 +rq_zigzag_loop_%1: + movsxd rdx, DWORD PTR[rdi + (%1 * 4)] ; rc + movsx ebx, WORD PTR 
[rsi] ; *zbin_boost_ptr + lea rsi, [rsi + 2] ; zbin_boost_ptr++ + + ; x + movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] + + ; if (x >= zbin) + sub ecx, ebx ; x - zbin + jl rq_zigzag_loop_%2 ; x < zbin + + movsx ebx, WORD PTR[rsp + temp_qcoeff + rdx *2] + + ; downshift by quant_shift[rc] + movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc] + sar ebx, cl ; also sets ZF; flags are left unchanged if cl == 0 (assumes quant_shift[rc] != 0 - TODO confirm) + je rq_zigzag_loop_%2 ; !y + mov WORD PTR[rsp + qcoeff + rdx * 2], bx ; qcoeff[rc] = temp_qcoeff[rc] >> quant_shift[rc] (stack copy, written to qcoeff_ptr after the loop) + + mov rsi, arg(9) ; reset to b->zrun_zbin_boost + mov [rsp + eob_tmp], DWORD %1 ; eob = i +%endmacro +ZIGZAG_LOOP 0, 1 +ZIGZAG_LOOP 1, 2 +ZIGZAG_LOOP 2, 3 +ZIGZAG_LOOP 3, 4 +ZIGZAG_LOOP 4, 5 +ZIGZAG_LOOP 5, 6 +ZIGZAG_LOOP 6, 7 +ZIGZAG_LOOP 7, 8 +ZIGZAG_LOOP 8, 9 +ZIGZAG_LOOP 9, 10 +ZIGZAG_LOOP 10, 11 +ZIGZAG_LOOP 11, 12 +ZIGZAG_LOOP 12, 13 +ZIGZAG_LOOP 13, 14 +ZIGZAG_LOOP 14, 15 +ZIGZAG_LOOP 15, end +rq_zigzag_loop_end: + + mov rbx, arg(2) ; qcoeff_ptr + mov rcx, arg(3) ; dequant_ptr + mov rsi, arg(7) ; dqcoeff_ptr + mov rax, [rsp + eob_tmp] ; eob + + movdqa xmm2, OWORD PTR[rsp + qcoeff] + movdqa xmm3, OWORD PTR[rsp + qcoeff + 16] + + ; y ^ sz + pxor xmm2, xmm0 + pxor xmm3, xmm4 + ; x = (y ^ sz) - sz + psubw xmm2, xmm0 + psubw xmm3, xmm4 movdqa xmm0, OWORD PTR[rcx] movdqa xmm1, OWORD PTR[rcx + 16] @@ -232,23 +181,20 @@ rq_zigzag_1c: pmullw xmm0, xmm2 pmullw xmm1, xmm3 - movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff - movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff - - mov rax, [rsp + eob] - - movdqa xmm6, OWORD PTR[rsp + save_xmm6] - movdqa xmm7, OWORD PTR[rsp + save_xmm7] + movdqa OWORD PTR[rbx], xmm2 + movdqa OWORD PTR[rbx + 16], xmm3 + movdqa OWORD PTR[rsi], xmm0 ; store dqcoeff + movdqa OWORD PTR[rsi + 16], xmm1 ; store dqcoeff add rax, 1 - add rsp, vp8_regularquantizeb_stack_size - pop rsp - ; begin epilog + add rsp, stack_size + pop rsp pop rbx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/quantize_ssse3.asm 
b/vp8/encoder/x86/quantize_ssse3.asm old mode 100755 new mode 100644 diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h index b5b22c022ec2742c6af19409b7bdbfea9bba024d..266efb446afcdb7884f78135e4cf9654be54f80d 100644 --- a/vp8/encoder/x86/quantize_x86.h +++ b/vp8/encoder/x86/quantize_x86.h @@ -27,11 +27,11 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -/* The sse2 quantizer has not been updated to match the new exact - * quantizer introduced in commit e04e2935 - *#undef vp8_quantize_quantb - *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 - */ +// Currently, vp8_regular_quantize_b_sse2 realizes a gain on x86 and a loss on x86_64, so it is only selected when ARCH_X86 is set +#if ARCH_X86 +#undef vp8_quantize_quantb +#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 +#endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index f9b3ea1d887e96f5ea5b32af92364a0c674251ee..31438f91662ce4b05f0eff93ebf1bd2979aeff12 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -108,37 +108,26 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr,short *dequant_ptr, - const int *default_zig_zag, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr, - unsigned short zbin_oq_value, - short *zbin_boost_ptr); + short *qcoeff_ptr,short *dequant_ptr, + const int *default_zig_zag, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr, + unsigned short zbin_oq_value, + short *zbin_boost_ptr, + short *quant_shift_ptr); void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d) { - short *zbin_boost_ptr = b->zrun_zbin_boost; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - short zbin_oq_value = 
b->zbin_extra; - - d->eob = vp8_regular_quantize_b_impl_sse2( - coeff_ptr, - zbin_ptr, - qcoeff_ptr, - dequant_ptr, - vp8_default_zig_zag1d, - - round_ptr, - quant_ptr, - dqcoeff_ptr, - zbin_oq_value, - zbin_boost_ptr - ); + d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff, + b->zbin, + d->qcoeff, + d->dequant, + vp8_default_zig_zag1d, + b->round, + b->quant, + d->dqcoeff, + b->zbin_extra, + b->zrun_zbin_boost, + b->quant_shift); } int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); @@ -307,7 +296,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/ +#if ARCH_X86 + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2; +#endif cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;