diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c index 869f6e20f2dd0106b61cd3a119a237e3ad162c73..fb9aa2e2504c3f708b0f54a2508a7ca21fc963a2 100644 --- a/vp8/common/alloccommon.c +++ b/vp8/common/alloccommon.c @@ -188,7 +188,7 @@ void vp8_setup_version(VP8_COMMON *cm) void vp8_create_common(VP8_COMMON *oci) { vp8_machine_specific_config(oci); - vp8_default_coef_probs(oci); + vp8_init_mbmode_probs(oci); vp8_default_bmode_probs(oci->fc.bmode_prob); diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h new file mode 100644 index 0000000000000000000000000000000000000000..0d195636bcb70b8b2f372ffa91cf5c35b7b9ecca --- /dev/null +++ b/vp8/common/default_coef_probs.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+*/ + + +/*Generated file, included by entropy.c*/ + + +static const vp8_prob default_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = +{ + { /* Block Type ( 0 ) */ + { /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 1 ) */ + { /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + 
{ 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 2 ) */ + { /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 
255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 3 ) */ + { /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 
255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + } + } +}; diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h deleted file mode 100644 index 293e74269f4479a0b1c695b783c306a07e4c14a6..0000000000000000000000000000000000000000 --- a/vp8/common/defaultcoefcounts.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef __DEFAULTCOEFCOUNTS_H -#define __DEFAULTCOEFCOUNTS_H - -#include "entropy.h" - -extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; - -extern const unsigned int vp8_default_coef_counts_8x8[BLOCK_TYPES] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; - -#endif \ No newline at end of file diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c index ca37aab22abd44b54245dbc4795c3f7333601eb0..34b54b9a77139ec92589b8b281cea2e0c0f2fe3d 100644 --- a/vp8/common/entropy.c +++ b/vp8/common/entropy.c @@ -15,6 +15,7 @@ #include "string.h" #include "blockd.h" #include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" #define uchar unsigned char /* typedefs can clash */ #define uint unsigned int @@ -193,36 +194,13 @@ vp8_extra_bit_struct vp8_extra_bits[12] = #endif { 0, 0, 0, 0} }; -#include "defaultcoefcounts.h" + +#include "default_coef_probs.h" void vp8_default_coef_probs(VP8_COMMON *pc) { - int h = 0; - - do - { - int i = 0; - - do - { - int k = 0; - - do - { - unsigned int branch_ct [ENTROPY_NODES] [2]; - vp8_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree, - pc->fc.coef_probs[h][i][k], - branch_ct, - vp8_default_coef_counts[h][i][k], - 256, 1); - - } - while (++k < PREV_COEF_CONTEXTS); - } - while (++i < COEF_BANDS); - } - while (++h < BLOCK_TYPES); + vpx_memcpy(pc->fc.coef_probs, default_coef_probs, + sizeof(default_coef_probs)); #if CONFIG_T8X8 h = 0; do @@ -250,7 +228,6 @@ void vp8_default_coef_probs(VP8_COMMON *pc) #endif } - void vp8_coef_tree_initialize() { init_bit_trees(); diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm index 34a7e18aea727ec7175f57de50d3832d4a13df2a..83d3765ff8e1ec51de76f7c124c59488530515d7 100644 --- a/vp8/common/x86/idctllm_sse2.asm +++ b/vp8/common/x86/idctllm_sse2.asm @@ -11,7 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" -;void idct_dequant_0_2x_sse2 +;void 
vp8_idct_dequant_0_2x_sse2 ; ( ; short *qcoeff - 0 ; short *dequant - 1 @@ -21,8 +21,8 @@ ; int blk_stride - 5 ; ) -global sym(idct_dequant_0_2x_sse2) -sym(idct_dequant_0_2x_sse2): +global sym(vp8_idct_dequant_0_2x_sse2) +sym(vp8_idct_dequant_0_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -97,8 +97,8 @@ sym(idct_dequant_0_2x_sse2): pop rbp ret -global sym(idct_dequant_full_2x_sse2) -sym(idct_dequant_full_2x_sse2): +global sym(vp8_idct_dequant_full_2x_sse2) +sym(vp8_idct_dequant_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -353,7 +353,7 @@ sym(idct_dequant_full_2x_sse2): pop rbp ret -;void idct_dequant_dc_0_2x_sse2 +;void vp8_idct_dequant_dc_0_2x_sse2 ; ( ; short *qcoeff - 0 ; short *dequant - 1 @@ -362,8 +362,8 @@ sym(idct_dequant_full_2x_sse2): ; int dst_stride - 4 ; short *dc - 5 ; ) -global sym(idct_dequant_dc_0_2x_sse2) -sym(idct_dequant_dc_0_2x_sse2): +global sym(vp8_idct_dequant_dc_0_2x_sse2) +sym(vp8_idct_dequant_dc_0_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -438,8 +438,8 @@ sym(idct_dequant_dc_0_2x_sse2): pop rbp ret -global sym(idct_dequant_dc_full_2x_sse2) -sym(idct_dequant_dc_full_2x_sse2): +global sym(vp8_idct_dequant_dc_full_2x_sse2) +sym(vp8_idct_dequant_dc_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 4efff7eb584d227a16b9dabe66094ffbf0814b68..295609c58167a1cb6018b0f3366bd6f467217ee0 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -1395,8 +1395,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): neg rax ; calculate mask - movdqu xmm1, [rsi+2*rax] ; p1 - movdqu xmm0, [rdi] ; q1 + movdqa xmm1, [rsi+2*rax] ; p1 + movdqa xmm0, [rdi] ; q1 movdqa xmm2, xmm1 movdqa xmm7, xmm0 movdqa xmm4, xmm0 @@ -1406,8 +1406,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm1, 1 ; abs(p1-q1)/2 - movdqu xmm5, 
[rsi+rax] ; p0 - movdqu xmm4, [rsi] ; q0 + movdqa xmm5, [rsi+rax] ; p0 + movdqa xmm4, [rsi] ; q0 movdqa xmm0, xmm4 ; q0 movdqa xmm6, xmm5 ; p0 psubusb xmm5, xmm4 ; p0-=q0 @@ -1449,7 +1449,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): psubsb xmm3, xmm0 ; q0-= q0 add pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqu [rsi], xmm3 ; write back + movdqa [rsi], xmm3 ; write back ; now do +3 side psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 @@ -1465,7 +1465,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddsb xmm6, xmm0 ; p0+= p0 add pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqu [rsi+rax], xmm6 ; write back + movdqa [rsi+rax], xmm6 ; write back ; begin epilog pop rdi @@ -1507,17 +1507,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): lea rdx, [rsi + rax*4] lea rcx, [rdx + rax] - movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movdqu xmm2, [rdi] ; 13 12 11 10 - movdqu xmm3, [rcx] ; 53 52 51 50 + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - movdqu xmm4, [rsi + rax*2] ; 23 22 21 20 - movdqu xmm5, [rdx + rax*2] ; 63 62 61 60 - movdqu xmm6, [rdi + rax*2] ; 33 32 31 30 - movdqu xmm7, [rcx + rax*2] ; 73 72 71 70 + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 @@ -1540,17 +1540,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): lea rdx, [rsi + rax*4] lea rcx, [rdx + rax] - movdqu xmm4, [rsi] ; 83 82 81 80 - movdqu xmm1, [rdx] ; c3 c2 c1 c0 - movdqu xmm6, [rdi] ; 93 92 91 90 - movdqu xmm3, [rcx] ; d3 d2 d1 d0 + movd xmm4, [rsi] ; 83 82 81 80 + movd 
xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c deleted file mode 100644 index 6b6321ace3d672617105085cfb36956262e7135b..0000000000000000000000000000000000000000 --- a/vp8/common/x86/postproc_mmx.c +++ /dev/null @@ -1,1508 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include <math.h> -#include <stdlib.h> -#include "vpx_scale/yv12config.h" -#include "pragmas.h" - -#define VP8_FILTER_WEIGHT 128 -#define VP8_FILTER_SHIFT 7 - - - -/* static constants */ -__declspec(align(16)) -const static short Blur[48] = -{ - - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 64, 64, 64, 64, 64, 64, 64, 64, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 0, 0, 0, 0, 0, 0, 0, 0, - -}; -#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040; -#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004}; - -#ifndef RELOCATEABLE -const static RD; -const static R4D2; -#endif - - -/* external references */ -extern double vp8_gaussian(double sigma, double mu, double x); -extern short vp8_rv[]; -extern int vp8_q2mbl(int x) ; - - - -void vp8_post_proc_down_and_across_mmx -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -) -{ -#ifdef RELOCATEABLE - RD - R4D2 -#endif - - __asm - { - push ebx - lea ebx, Blur - movd mm2, flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov esi, src_ptr - mov edi, dst_ptr - - mov ecx, DWORD PTR rows - mov eax, src_pixels_per_line ; - destination pitch? 
- pxor mm0, mm0 ; - mm0 = 00000000 - - nextrow: - - xor edx, edx ; - - clear out edx for use as loop counter - nextcol: - - pxor mm7, mm7 ; - - mm7 = 00000000 - movq mm6, [ebx + 32 ] ; - mm6 = kernel 2 taps - movq mm3, [esi] ; - mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; - mm3 = p0..p3 - movq mm1, mm3 ; - mm1 = p0..p3 - pmullw mm3, mm6 ; - mm3 *= kernel 2 modifiers - - movq mm6, [ebx + 48] ; - mm6 = kernel 3 taps - movq mm5, [esi + eax] ; - mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r1 p0..p3 - pmullw mm6, mm5 ; - mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; - mm3 += mm6 - - ; - thresholding - movq mm7, mm1 ; - mm7 = r0 p0..p3 - psubusw mm7, mm5 ; - mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; - mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; - mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [ebx + 64 ] ; - mm6 = kernel 4 modifiers - movq mm5, [esi + 2*eax] ; - mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r2 p0..p3 - pmullw mm6, mm5 ; - mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm5 ; - mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; - mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - - neg eax - movq mm6, [ebx ] ; - kernel 0 taps - movq mm5, [esi+2*eax] ; - mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; - mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; - mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - movq mm6, [ebx + 16] ; - kernel 1 taps - movq mm4, [esi+eax] ; - mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; - mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; - mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm4 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; - mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; - mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - - paddusw mm3, rd ; - mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; - mm3 /= 128 - - pand mm1, mm7 ; - mm1 select vals > thresh from source - pandn mm7, mm3 ; - mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; - combination - - packuswb mm1, mm0 ; - pack to bytes - - movd [edi], mm1 ; - neg eax ; - pitch is positive - - - add esi, 4 - add edi, 4 - add edx, 4 - - cmp edx, cols - jl nextcol - // done with the all cols, start the across filtering in place - sub esi, edx - sub edi, edx - - - push eax - xor edx, edx - mov eax, [edi-4]; - - acrossnextcol: - pxor mm7, mm7 ; - mm7 = 00000000 - movq mm6, [ebx + 32 ] ; - movq mm4, [edi+edx] ; - mm4 = p0..p7 - movq mm3, mm4 ; - mm3 = p0..p7 - punpcklbw mm3, mm0 ; - mm3 = p0..p3 - movq mm1, mm3 ; - mm1 = p0..p3 - pmullw mm3, mm6 ; - mm3 *= kernel 2 modifiers - - movq mm6, [ebx + 48] - psrlq mm4, 8 ; - mm4 = p1..p7 - movq mm5, mm4 ; - mm5 = p1..p7 - punpcklbw mm5, mm0 ; - mm5 = p1..p4 - pmullw mm6, mm5 ; - mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; - mm3 += mm6 - - ; - thresholding - movq mm7, mm1 ; - mm7 = p0..p3 - psubusw mm7, mm5 ; - mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; - mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [ebx + 64 ] - psrlq mm4, 8 ; - mm4 = p2..p7 - movq mm5, mm4 ; - mm5 = p2..p7 - punpcklbw mm5, mm0 ; - mm5 = p2..p5 - pmullw mm6, mm5 ; - mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - 
por mm7, mm6 ; - accumulate thresholds - - - movq mm6, [ebx ] - movq mm4, [edi+edx-2] ; - mm4 = p-2..p5 - movq mm5, mm4 ; - mm5 = p-2..p5 - punpcklbw mm5, mm0 ; - mm5 = p-2..p1 - pmullw mm6, mm5 ; - mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - movq mm6, [ebx + 16] - psrlq mm4, 8 ; - mm4 = p-1..p5 - punpcklbw mm4, mm0 ; - mm4 = p-1..p2 - pmullw mm6, mm4 ; - mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm4 ; - mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - paddusw mm3, rd ; - mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; - mm3 /= 128 - - pand mm1, mm7 ; - mm1 select vals > thresh from source - pandn mm7, mm3 ; - mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; - combination - - packuswb mm1, mm0 ; - pack to bytes - mov DWORD PTR [edi+edx-4], eax ; - store previous four bytes - movd eax, mm1 - - add edx, 4 - cmp edx, cols - jl acrossnextcol; - - mov DWORD PTR [edi+edx-4], eax - pop eax - - // done with this rwo - add esi, eax ; - next line - mov eax, dst_pixels_per_line ; - destination pitch? - add edi, eax ; - next destination - mov eax, src_pixels_per_line ; - destination pitch? 
- - dec ecx ; - decrement count - jnz nextrow ; - next row - pop ebx - - } -} - - - -void vp8_post_proc_down_and_across_xmm -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -) -{ -#ifdef RELOCATEABLE - R4D2 -#endif - - __asm - { - movd xmm2, flimit - punpcklwd xmm2, xmm2 - punpckldq xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - mov esi, src_ptr - mov edi, dst_ptr - - mov ecx, DWORD PTR rows - mov eax, src_pixels_per_line ; - destination pitch? - pxor xmm0, xmm0 ; - mm0 = 00000000 - - nextrow: - - xor edx, edx ; - - clear out edx for use as loop counter - nextcol: - movq xmm3, QWORD PTR [esi] ; - - mm4 = r0 p0..p7 - punpcklbw xmm3, xmm0 ; - mm3 = p0..p3 - movdqa xmm1, xmm3 ; - mm1 = p0..p3 - psllw xmm3, 2 ; - - movq xmm5, QWORD PTR [esi + eax] ; - mm4 = r1 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r1 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm6 - - ; - thresholding - movdqa xmm7, xmm1 ; - mm7 = r0 p0..p3 - psubusw xmm7, xmm5 ; - mm7 = r0 p0..p3 - r1 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r1 p0..p3 - r0 p0..p3 - paddusw xmm7, xmm5 ; - mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw xmm7, xmm2 - - movq xmm5, QWORD PTR [esi + 2*eax] ; - mm4 = r2 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r2 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; - mm6 = r0 p0..p3 - r2 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r2 p0..p3 - r2 p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - - neg eax - movq xmm5, QWORD PTR [esi+2*eax] ; - mm4 = r-2 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r-2 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r-2 p0..p3 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw 
xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - movq xmm4, QWORD PTR [esi+eax] ; - mm4 = r-1 p0..p7 - punpcklbw xmm4, xmm0 ; - mm4 = r-1 p0..p3 - paddusw xmm3, xmm4 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm4 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm4, xmm1 ; - mm5 = r-1 p0..p3 - p0..p3 - paddusw xmm6, xmm4 ; - mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - - paddusw xmm3, rd42 ; - mm3 += round value - psraw xmm3, 3 ; - mm3 /= 8 - - pand xmm1, xmm7 ; - mm1 select vals > thresh from source - pandn xmm7, xmm3 ; - mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; - combination - - packuswb xmm1, xmm0 ; - pack to bytes - movq QWORD PTR [edi], xmm1 ; - - neg eax ; - pitch is positive - add esi, 8 - add edi, 8 - - add edx, 8 - cmp edx, cols - - jl nextcol - - // done with the all cols, start the across filtering in place - sub esi, edx - sub edi, edx - - xor edx, edx - movq mm0, QWORD PTR [edi-8]; - - acrossnextcol: - movq xmm7, QWORD PTR [edi +edx -2] - movd xmm4, DWORD PTR [edi +edx +6] - - pslldq xmm4, 8 - por xmm4, xmm7 - - movdqa xmm3, xmm4 - psrldq xmm3, 2 - punpcklbw xmm3, xmm0 ; - mm3 = p0..p3 - movdqa xmm1, xmm3 ; - mm1 = p0..p3 - psllw xmm3, 2 - - - movdqa xmm5, xmm4 - psrldq xmm5, 3 - punpcklbw xmm5, xmm0 ; - mm5 = p1..p4 - paddusw xmm3, xmm5 ; - mm3 += mm6 - - ; - thresholding - movdqa xmm7, xmm1 ; - mm7 = p0..p3 - psubusw xmm7, xmm5 ; - mm7 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm7, xmm5 ; - mm7 = abs(p0..p3 - p1..p4) - pcmpgtw xmm7, xmm2 - - movdqa xmm5, xmm4 - psrldq xmm5, 4 - punpcklbw xmm5, xmm0 ; - mm5 = p2..p5 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, 
xmm6 ; - accumulate thresholds - - - movdqa xmm5, xmm4 ; - mm5 = p-2..p5 - punpcklbw xmm5, xmm0 ; - mm5 = p-2..p1 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - psrldq xmm4, 1 ; - mm4 = p-1..p5 - punpcklbw xmm4, xmm0 ; - mm4 = p-1..p2 - paddusw xmm3, xmm4 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm4 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm4, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm4 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - paddusw xmm3, rd42 ; - mm3 += round value - psraw xmm3, 3 ; - mm3 /= 8 - - pand xmm1, xmm7 ; - mm1 select vals > thresh from source - pandn xmm7, xmm3 ; - mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; - combination - - packuswb xmm1, xmm0 ; - pack to bytes - movq QWORD PTR [edi+edx-8], mm0 ; - store previous four bytes - movdq2q mm0, xmm1 - - add edx, 8 - cmp edx, cols - jl acrossnextcol; - - // last 8 pixels - movq QWORD PTR [edi+edx-8], mm0 - - // done with this rwo - add esi, eax ; - next line - mov eax, dst_pixels_per_line ; - destination pitch? - add edi, eax ; - next destination - mov eax, src_pixels_per_line ; - destination pitch? 
- - dec ecx ; - decrement count - jnz nextrow ; - next row - } -} - - -void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) -{ - int c, i; - __declspec(align(16)) - int flimit2[2]; - __declspec(align(16)) - unsigned char d[16][8]; - - flimit = vp8_q2mbl(flimit); - - for (i = 0; i < 2; i++) - flimit2[i] = flimit; - - rows += 8; - - for (c = 0; c < cols; c += 4) - { - unsigned char *s = &dst[c]; - - __asm - { - mov esi, s ; - pxor mm0, mm0 ; - - mov eax, pitch ; - neg eax // eax = -pitch - - lea esi, [esi + eax*8]; // edi = s[-pitch*8] - neg eax - - - pxor mm5, mm5 - pxor mm6, mm6 ; - - pxor mm7, mm7 ; - mov edi, esi - - mov ecx, 15 ; - - loop_initvar: - movd mm1, DWORD PTR [edi]; - punpcklbw mm1, mm0 ; - - paddw mm5, mm1 ; - pmullw mm1, mm1 ; - - movq mm2, mm1 ; - punpcklwd mm1, mm0 ; - - punpckhwd mm2, mm0 ; - paddd mm6, mm1 ; - - paddd mm7, mm2 ; - lea edi, [edi+eax] ; - - dec ecx - jne loop_initvar - //save the var and sum - xor edx, edx - loop_row: - movd mm1, DWORD PTR [esi] // [s-pitch*8] - movd mm2, DWORD PTR [edi] // [s+pitch*7] - - punpcklbw mm1, mm0 - punpcklbw mm2, mm0 - - paddw mm5, mm2 - psubw mm5, mm1 - - pmullw mm2, mm2 - movq mm4, mm2 - - punpcklwd mm2, mm0 - punpckhwd mm4, mm0 - - paddd mm6, mm2 - paddd mm7, mm4 - - pmullw mm1, mm1 - movq mm2, mm1 - - punpcklwd mm1, mm0 - psubd mm6, mm1 - - punpckhwd mm2, mm0 - psubd mm7, mm2 - - - movq mm3, mm6 - pslld mm3, 4 - - psubd mm3, mm6 - movq mm1, mm5 - - movq mm4, mm5 - pmullw mm1, mm1 - - pmulhw mm4, mm4 - movq mm2, mm1 - - punpcklwd mm1, mm4 - punpckhwd mm2, mm4 - - movq mm4, mm7 - pslld mm4, 4 - - psubd mm4, mm7 - - psubd mm3, mm1 - psubd mm4, mm2 - - psubd mm3, flimit2 - psubd mm4, flimit2 - - psrad mm3, 31 - psrad mm4, 31 - - packssdw mm3, mm4 - packsswb mm3, mm0 - - movd mm1, DWORD PTR [esi+eax*8] - - movq mm2, mm1 - punpcklbw mm1, mm0 - - paddw mm1, mm5 - mov ecx, edx - - and ecx, 127 - movq mm4, vp8_rv[ecx*2] - - paddw mm1, mm4 - //paddw xmm1, eight8s - 
psraw mm1, 4 - - packuswb mm1, mm0 - pand mm1, mm3 - - pandn mm3, mm2 - por mm1, mm3 - - and ecx, 15 - movd DWORD PTR d[ecx*4], mm1 - - mov ecx, edx - sub ecx, 8 - - and ecx, 15 - movd mm1, DWORD PTR d[ecx*4] - - movd [esi], mm1 - lea esi, [esi+eax] - - lea edi, [edi+eax] - add edx, 1 - - cmp edx, rows - jl loop_row - - } - - } -} - -void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit) -{ - int c, i; - __declspec(align(16)) - int flimit4[4]; - __declspec(align(16)) - unsigned char d[16][8]; - - flimit = vp8_q2mbl(flimit); - - for (i = 0; i < 4; i++) - flimit4[i] = flimit; - - rows += 8; - - for (c = 0; c < cols; c += 8) - { - unsigned char *s = &dst[c]; - - __asm - { - mov esi, s ; - pxor xmm0, xmm0 ; - - mov eax, pitch ; - neg eax // eax = -pitch - - lea esi, [esi + eax*8]; // edi = s[-pitch*8] - neg eax - - - pxor xmm5, xmm5 - pxor xmm6, xmm6 ; - - pxor xmm7, xmm7 ; - mov edi, esi - - mov ecx, 15 ; - - loop_initvar: - movq xmm1, QWORD PTR [edi]; - punpcklbw xmm1, xmm0 ; - - paddw xmm5, xmm1 ; - pmullw xmm1, xmm1 ; - - movdqa xmm2, xmm1 ; - punpcklwd xmm1, xmm0 ; - - punpckhwd xmm2, xmm0 ; - paddd xmm6, xmm1 ; - - paddd xmm7, xmm2 ; - lea edi, [edi+eax] ; - - dec ecx - jne loop_initvar - //save the var and sum - xor edx, edx - loop_row: - movq xmm1, QWORD PTR [esi] // [s-pitch*8] - movq xmm2, QWORD PTR [edi] // [s+pitch*7] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - paddw xmm5, xmm2 - psubw xmm5, xmm1 - - pmullw xmm2, xmm2 - movdqa xmm4, xmm2 - - punpcklwd xmm2, xmm0 - punpckhwd xmm4, xmm0 - - paddd xmm6, xmm2 - paddd xmm7, xmm4 - - pmullw xmm1, xmm1 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm0 - psubd xmm6, xmm1 - - punpckhwd xmm2, xmm0 - psubd xmm7, xmm2 - - - movdqa xmm3, xmm6 - pslld xmm3, 4 - - psubd xmm3, xmm6 - movdqa xmm1, xmm5 - - movdqa xmm4, xmm5 - pmullw xmm1, xmm1 - - pmulhw xmm4, xmm4 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm4 - punpckhwd xmm2, xmm4 - - movdqa xmm4, xmm7 - pslld xmm4, 4 - - psubd 
xmm4, xmm7 - - psubd xmm3, xmm1 - psubd xmm4, xmm2 - - psubd xmm3, flimit4 - psubd xmm4, flimit4 - - psrad xmm3, 31 - psrad xmm4, 31 - - packssdw xmm3, xmm4 - packsswb xmm3, xmm0 - - movq xmm1, QWORD PTR [esi+eax*8] - - movq xmm2, xmm1 - punpcklbw xmm1, xmm0 - - paddw xmm1, xmm5 - mov ecx, edx - - and ecx, 127 - movdqu xmm4, vp8_rv[ecx*2] - - paddw xmm1, xmm4 - //paddw xmm1, eight8s - psraw xmm1, 4 - - packuswb xmm1, xmm0 - pand xmm1, xmm3 - - pandn xmm3, xmm2 - por xmm1, xmm3 - - and ecx, 15 - movq QWORD PTR d[ecx*8], xmm1 - - mov ecx, edx - sub ecx, 8 - - and ecx, 15 - movq mm0, d[ecx*8] - - movq [esi], mm0 - lea esi, [esi+eax] - - lea edi, [edi+eax] - add edx, 1 - - cmp edx, rows - jl loop_row - - } - - } -} -#if 0 -/**************************************************************************** - * - * ROUTINE : plane_add_noise_wmt - * - * INPUTS : unsigned char *Start starting address of buffer to add gaussian - * noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - char char_dist[300]; - char Rand[2048]; - double sigma; -// return; - __asm emms - sigma = a + .5 + .6 * (63 - q) / 63.0; - - // set up a lookup table of 256 entries that matches - // a gaussian distribution with sigma determined by q. 
- // - { - double i; - int next, j; - - next = 0; - - for (i = -32; i < 32; i++) - { - double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i); - int a = (int)(g + .5); - - if (a) - { - for (j = 0; j < a; j++) - { - char_dist[next+j] = (char) i; - } - - next = next + j; - } - - } - - for (next = next; next < 256; next++) - char_dist[next] = 0; - - } - - for (i = 0; i < 2048; i++) - { - Rand[i] = char_dist[rand() & 0xff]; - } - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -char_dist[0]; - whiteclamp[i] = -char_dist[0]; - bothclamp[i] = -2 * char_dist[0]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = Rand + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movdqu xmm1, [esi+eax] // get the source - - psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb xmm1, bothclamp - psubusb xmm1, whiteclamp - - movdqu xmm2, [edi+eax] // get the noise for this line - paddb xmm1, xmm2 // add it in - movdqu [esi+eax], xmm1 // store the result - - add eax, 16 // move to the next line - - cmp eax, ecx - jl nextset - - - } - - } -} -#endif -__declspec(align(16)) -static const int four8s[4] = { 8, 8, 8, 8}; -void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit) -{ - int r, i; - __declspec(align(16)) - int flimit4[4]; - unsigned char *s = src; - int sumsq; - int sum; - - - flimit = vp8_q2mbl(flimit); - flimit4[0] = - flimit4[1] = - flimit4[2] = - flimit4[3] = flimit; - - for (r = 0; r < rows; r++) - { - - - sumsq = 0; - sum = 0; - - for (i = -8; i <= 6; i++) - { - sumsq += s[i] * s[i]; - sum += s[i]; - } - - __asm - { - mov eax, sumsq - movd xmm7, eax - - mov eax, sum - movd xmm6, eax - - mov esi, s - xor ecx, ecx - - mov edx, cols - add edx, 8 - pxor mm0, mm0 - pxor mm1, mm1 - - pxor xmm0, xmm0 - nextcol4: - - movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5 - movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 
+9 +10 - - punpcklbw xmm1, xmm0 // expanding - punpcklbw xmm2, xmm0 // expanding - - punpcklwd xmm1, xmm0 // expanding to dwords - punpcklwd xmm2, xmm0 // expanding to dwords - - psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5 - paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2 - - paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5 - pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5 - - paddd xmm6, xmm2 - paddd xmm7, xmm1 - - pshufd xmm6, xmm6, 0 // duplicate the last ones - pshufd xmm7, xmm7, 0 // duplicate the last ones - - psrldq xmm1, 4 // 8--7 9--6 10--5 0000 - psrldq xmm2, 4 // 8--7 9--6 10--5 0000 - - pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared - pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared - - paddd xmm6, xmm4 - paddd xmm7, xmm3 - - pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared - pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared - pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - movdqa xmm3, xmm6 - pmaddwd xmm3, xmm3 - - movdqa xmm5, xmm7 - pslld xmm5, 4 - - psubd xmm5, xmm7 - psubd xmm5, xmm3 - - psubd xmm5, flimit4 - psrad xmm5, 31 - - packssdw xmm5, xmm0 - packsswb xmm5, xmm0 - - movd xmm1, DWORD PTR [esi+ecx] - movq xmm2, xmm1 - - punpcklbw xmm1, xmm0 - punpcklwd xmm1, xmm0 - - paddd xmm1, xmm6 - paddd xmm1, four8s - - psrad xmm1, 4 - packssdw xmm1, xmm0 - - packuswb xmm1, xmm0 - pand xmm1, xmm5 - - pandn xmm5, xmm2 - por xmm5, xmm1 - - movd [esi+ecx-8], mm0 - movq mm0, mm1 - - movdq2q mm1, xmm5 - psrldq xmm7, 12 - - psrldq xmm6, 12 - add ecx, 4 - - cmp ecx, edx - jl nextcol4 - - } - s += pitch; - } -} - -#if 0 - -/**************************************************************************** - * - * ROUTINE : plane_add_noise_mmx - * - * INPUTS : unsigned char *Start starting address of buffer to add gaussian - * noise to - * unsigned int Width width of plane - * unsigned int Height 
height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - int Pitch4 = Pitch * 4; - const int noise_amount = 2; - const int noise_adder = 2 * noise_amount + 1; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - char char_dist[300]; - char Rand[2048]; - - double sigma; - __asm emms - sigma = a + .5 + .6 * (63 - q) / 63.0; - - // set up a lookup table of 256 entries that matches - // a gaussian distribution with sigma determined by q. - // - { - double i, sum = 0; - int next, j; - - next = 0; - - for (i = -32; i < 32; i++) - { - int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); - - if (a) - { - for (j = 0; j < a; j++) - { - char_dist[next+j] = (char) i; - } - - next = next + j; - } - - } - - for (next = next; next < 256; next++) - char_dist[next] = 0; - - } - - for (i = 0; i < 2048; i++) - { - Rand[i] = char_dist[rand() & 0xff]; - } - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -char_dist[0]; - whiteclamp[i] = -char_dist[0]; - bothclamp[i] = -2 * char_dist[0]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = Rand + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movq mm1, [esi+eax] // get the source - - psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb mm1, bothclamp - psubusb mm1, whiteclamp - - movq mm2, [edi+eax] // get the noise for this line - paddb mm1, mm2 // 
add it in - movq [esi+eax], mm1 // store the result - - add eax, 8 // move to the next line - - cmp eax, ecx - jl nextset - - - } - - } -} -#else -extern char an[8][64][3072]; -extern int cd[8][64]; - -void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - - __asm emms - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -cd[a][q]; - whiteclamp[i] = -cd[a][q]; - bothclamp[i] = -2 * cd[a][q]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = an[a][q] + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movq mm1, [esi+eax] // get the source - - psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb mm1, bothclamp - psubusb mm1, whiteclamp - - movq mm2, [edi+eax] // get the noise for this line - paddb mm1, mm2 // add it in - movq [esi+eax], mm1 // store the result - - add eax, 8 // move to the next line - - cmp eax, ecx - jl nextset - } - } -} - - -void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - __asm emms - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -cd[a][q]; - whiteclamp[i] = -cd[a][q]; - bothclamp[i] = -2 * cd[a][q]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = an[a][q] + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movdqu xmm1, [esi+eax] // get the source - - psubusb xmm1, blackclamp // clamp both sides so we 
don't outrange adding noise - paddusb xmm1, bothclamp - psubusb xmm1, whiteclamp - - movdqu xmm2, [edi+eax] // get the noise for this line - paddb xmm1, xmm2 // add it in - movdqu [esi+eax], xmm1 // store the result - - add eax, 16 // move to the next line - - cmp eax, ecx - jl nextset - } - } -} - -#endif diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c index 4c88db474b75a9843f681e09dfcdf5b5af04a48a..3a48068622af43ea99c6f5fd0f368d4462c4a9ba 100644 --- a/vp8/decoder/x86/idct_blk_sse2.c +++ b/vp8/decoder/x86/idct_blk_sse2.c @@ -12,17 +12,17 @@ #include "vp8/common/idct.h" #include "vp8/decoder/dequantize.h" -void idct_dequant_dc_0_2x_sse2 +void vp8_idct_dequant_dc_0_2x_sse2 (short *q, short *dq, unsigned char *pre, unsigned char *dst, int dst_stride, short *dc); -void idct_dequant_dc_full_2x_sse2 +void vp8_idct_dequant_dc_full_2x_sse2 (short *q, short *dq, unsigned char *pre, unsigned char *dst, int dst_stride, short *dc); -void idct_dequant_0_2x_sse2 +void vp8_idct_dequant_0_2x_sse2 (short *q, short *dq ,unsigned char *pre, unsigned char *dst, int dst_stride, int blk_stride); -void idct_dequant_full_2x_sse2 +void vp8_idct_dequant_full_2x_sse2 (short *q, short *dq ,unsigned char *pre, unsigned char *dst, int dst_stride, int blk_stride); @@ -35,14 +35,14 @@ void vp8_dequant_dc_idct_add_y_block_sse2 for (i = 0; i < 4; i++) { if (((short *)(eobs))[0] & 0xfefe) - idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc); + vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc); else - idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc); + vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc); if (((short *)(eobs))[1] & 0xfefe) - idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); + vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); else - idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); + vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); q += 64; 
dc += 4; @@ -61,14 +61,14 @@ void vp8_dequant_idct_add_y_block_sse2 for (i = 0; i < 4; i++) { if (((short *)(eobs))[0] & 0xfefe) - idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16); + vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16); else - idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16); + vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16); if (((short *)(eobs))[1] & 0xfefe) - idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); + vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); else - idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); + vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); q += 64; pre += 64; @@ -82,33 +82,33 @@ void vp8_dequant_idct_add_uv_block_sse2 unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) { if (((short *)(eobs))[0] & 0xfefe) - idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); + vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); else - idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); + vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); q += 32; pre += 32; dstu += stride*4; if (((short *)(eobs))[1] & 0xfefe) - idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); + vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); else - idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); + vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); q += 32; pre += 32; if (((short *)(eobs))[2] & 0xfefe) - idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); + vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); else - idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); + vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); q += 32; pre += 32; dstv += stride*4; if (((short *)(eobs))[3] & 0xfefe) - idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); + vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); else - idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); + 
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); } diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 64d1c93042c18734afcf4df04ad28868925bc433..e65a7f9d4dda64184400004940d8b2a1fb5a6f96 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -22,7 +22,8 @@ #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" #include "bitstream.h" -#include "vp8/common/defaultcoefcounts.h" + +#include "defaultcoefcounts.h" #if CONFIG_SEGMENTATION static int segment_cost = 0; #endif @@ -1295,7 +1296,7 @@ static int independent_coef_context_savings(VP8_COMP *cpi) if (cpi->common.frame_type == KEY_FRAME) { /* Reset to default probabilities at key frames */ - sum_probs_over_prev_coef_context(vp8_default_coef_counts[i][j], + sum_probs_over_prev_coef_context(default_coef_counts[i][j], prev_coef_count_sum); } else diff --git a/vp8/common/defaultcoefcounts.c b/vp8/encoder/defaultcoefcounts.h similarity index 98% rename from vp8/common/defaultcoefcounts.c rename to vp8/encoder/defaultcoefcounts.h index 34d1fb1d510b16dd8c558f5e19cf3d90343f153d..3b54c823cb29970765e1168c832fa2a0485c048f 100644 --- a/vp8/common/defaultcoefcounts.c +++ b/vp8/encoder/defaultcoefcounts.h @@ -8,14 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "defaultcoefcounts.h" - /* Generated file, included by entropy.c */ -const unsigned int vp8_default_coef_counts[BLOCK_TYPES] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS] = +static const unsigned int default_coef_counts[BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS] = { { @@ -401,4 +399,4 @@ const unsigned int vp8_default_coef_counts_8x8[BLOCK_TYPES] } } }; -#endif \ No newline at end of file +#endif diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 1a883b8ab2eaf8e2cf90da7556234427921bd134..b6e5a41c2b1cb3b3ed9f275b88a7ffd739167aa6 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -19,8 +19,6 @@ VP8_COMMON_SRCS-yes += common/asm_com_offsets.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h VP8_COMMON_SRCS-yes += common/debugmodes.c -VP8_COMMON_SRCS-yes += common/defaultcoefcounts.h -VP8_COMMON_SRCS-yes += common/defaultcoefcounts.c VP8_COMMON_SRCS-yes += common/entropy.c VP8_COMMON_SRCS-yes += common/entropymode.c VP8_COMMON_SRCS-yes += common/entropymv.c diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index d46d99df6626188f2ff2956d52a8f48232db6915..b71a54aea938d906839e1aef1b99aa50c6366a07 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -34,6 +34,7 @@ VP8_CX_SRCS-yes += vp8_cx_iface.c #INCLUDES += encoder VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c +VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h VP8_CX_SRCS-yes += encoder/bitstream.c VP8_CX_SRCS-yes += encoder/boolhuff.c VP8_CX_SRCS-yes += encoder/dct.c