Commit b6da40ad authored by Yaowu Xu

Merge branch 'master' into nextgenv2

Change-Id: I0e4030a37354bb23b3aa8be5cc1473770b9e7b06
parents 236623cf dc9d36c0
@@ -67,12 +67,22 @@ TEST_P(ErrorBlockTest, OperationCheck) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      coeff[j] = rnd(2 << 20) - (1 << 20);
-      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j] = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j] = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                   bit_depth_);
@@ -85,7 +95,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
      << "First failed at test case " << first_failure;
 }
@@ -100,23 +110,36 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
-  int max_val = ((1 << 20) - 1);
+  const int msb = bit_depth_ + 8 - 1;
+  int max_val = ((1 << msb) - 1);
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
-    int k = (i / 9) % 5;
+    int k = (i / 9) % 9;
     // Change the maximum coeff value, to test different bit boundaries
-    if ( k == 4 && (i % 9) == 0 ) {
+    if ( k == 8 && (i % 9) == 0 ) {
      max_val >>= 1;
    }
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      if (k < 4) {  // Test at maximum values
-        coeff[j] = k % 2 ? max_val : -max_val;
-        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      if (k < 4) {
+        // Test at positive maximum values
+        coeff[j] = k % 2 ? max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+      } else if (k < 8) {
+        // Test at negative maximum values
+        coeff[j] = k % 2 ? -max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
       } else {
-        coeff[j] = rnd(2 << 14) - (1 << 14);
-        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+        if (rnd(2)) {
+          // Positive number
+          coeff[j] = rnd(1 << 14);
+          dqcoeff[j] = rnd(1 << 14);
+        } else {
+          // Negative number
+          coeff[j] = -rnd(1 << 14);
+          dqcoeff[j] = -rnd(1 << 14);
+        }
       }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,21 +153,13 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
      << "First failed at test case " << first_failure;
 }

 using std::tr1::make_tuple;

-#if CONFIG_USE_X86INC && HAVE_SSE2
-int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
-                                              const tran_low_t *dqcoeff,
-                                              intptr_t block_size,
-                                              int64_t *ssz, int bps) {
-  assert(bps == 8);
-  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
-}
-
+#if CONFIG_USE_X86INC
 int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
@@ -153,6 +168,15 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }

+#if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
     ::testing::Values(
@@ -165,5 +189,23 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
                    &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+                                             const tran_low_t *dqcoeff,
+                                             intptr_t block_size,
+                                             int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AVX, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_AVX
+#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
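
For clarity on the updated tests: OperationCheck now draws coeff/dqcoeff pairs with matching signs up to ±(2^msb − 1), where msb = bit_depth + 7, and ExtremeValues walks nine input patterns per block instead of five. A minimal sketch in C (hypothetical helper, not part of the patch) of the nine (coeff, dqcoeff) patterns, assuming max_val = (1 << msb) - 1 as in the test:

#include <stdio.h>

/* Hypothetical illustration: the nine (coeff, dqcoeff) patterns that
 * ExtremeValues exercises for each block. */
static void print_extreme_patterns(int max_val) {
  for (int k = 0; k < 9; ++k) {
    if (k < 4) {
      /* positive extremes: (0,0), (max,0), (0,max), (max,max) */
      printf("k=%d: (%d, %d)\n", k, (k % 2) ? max_val : 0,
             ((k >> 1) % 2) ? max_val : 0);
    } else if (k < 8) {
      /* the same lattice at the negative extreme */
      printf("k=%d: (%d, %d)\n", k, (k % 2) ? -max_val : 0,
             ((k >> 1) % 2) ? -max_val : 0);
    } else {
      printf("k=8: sign-matched random values below 1 << 14\n");
    }
  }
}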
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
......
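
On the vpx_memalign change above: the coefficient, quantized-coefficient, and eob buffers move from 16-byte to 32-byte alignment. 16 bytes satisfies SSE's aligned 128-bit loads; 32 bytes is what aligned 256-bit (ymm) accesses in AVX code paths require. A minimal sketch of the resulting guarantee, using only the vpx_memalign/vpx_free API (the element type and size here are arbitrary):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "vpx_mem/vpx_mem.h"

/* Sketch: allocations aligned to 32 bytes are safe targets for aligned
 * 256-bit loads/stores (e.g. vmovdqa on a ymm register). */
static int16_t *alloc_coeff_buf(size_t num_pix) {
  int16_t *buf = (int16_t *)vpx_memalign(32, num_pix * sizeof(*buf));
  if (buf) assert(((uintptr_t)buf & 31) == 0);
  return buf;  /* caller releases with vpx_free(buf) */
}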
@@ -39,12 +39,20 @@ void vp10_calc_indices(const double *data, const double *centroids,
   }
 }

+// Generate a random number in the range [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+  *state = *state * 1103515245 + 12345;
+  return *state / 65536 % 32768;
+}
+
 static void calc_centroids(const double *data, double *centroids,
                            const uint8_t *indices, int n, int k, int dim) {
   int i, j, index;
   int count[PALETTE_MAX_SIZE];
+  unsigned int rand_state = data[0];

-  srand((unsigned int) data[0]);
+  assert(n <= 32768);
   memset(count, 0, sizeof(count[0]) * k);
   memset(centroids, 0, sizeof(centroids[0]) * k * dim);
@@ -59,8 +67,7 @@ static void calc_centroids(const double *data, double *centroids,
   for (i = 0; i < k; ++i) {
     if (count[i] == 0) {
-      // TODO(huisu): replace rand() with something else.
-      memcpy(centroids + i * dim, data + (rand() % n) * dim,
+      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
              sizeof(centroids[0]) * dim);
     } else {
       const double norm = 1.0 / count[i];
......
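
The lcg_rand16 helper above replaces srand/rand so that k-means centroid reseeding is deterministic across platforms and C libraries: it is a linear congruential generator with the classic ANSI C constants (1103515245, 12345), and `*state / 65536 % 32768` keeps bits 16..30 of the state, the higher-quality bits of an LCG, yielding values in [0, 32768). A minimal standalone sketch of how calc_centroids now draws a replacement centroid (the seed and n here are made up):

#include <stdio.h>

/* Same generator as the patch: returns values in [0, 32768). */
static unsigned int lcg_rand16(unsigned int *state) {
  *state = *state * 1103515245 + 12345;
  return *state / 65536 % 32768;
}

int main(void) {
  unsigned int rand_state = 123;  /* in the patch this is seeded from data[0] */
  const int n = 100;              /* number of data points; must be <= 32768 */
  /* Pick a random data point to reseed an empty cluster, deterministically. */
  for (int i = 0; i < 5; ++i)
    printf("index = %u\n", lcg_rand16(&rand_state) % n);
  return 0;
}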
@@ -512,7 +512,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
       SKIP_TXFM_NONE) {
     // full forward transform and quantization
-    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+    vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                     plane_bsize, tx_size);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
              SKIP_TXFM_AC_ONLY) {
......
@@ -554,7 +554,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int16_t token;
   EXTRABIT extra;
-
   pt = get_entropy_context(tx_size, pd->above_context + blk_col,
                            pd->left_context + blk_row);
   scan = so->scan;
......
@@ -248,7 +248,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";

 add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";

 add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp/;
......
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
       CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
       ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
......
@@ -21,12 +21,7 @@
 #include "vp9/encoder/vp9_denoiser.h"
 #include "vp9/encoder/vp9_encoder.h"

-/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
- * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
- * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
- * make the calls to the vp9_denoiser_* functions when in speed 5).
- *
- * The implementation is very similar to that of the VP8 denoiser. While
+/* The VP9 denoiser is similar to that of the VP8 denoiser. While
  * choosing the motion vectors / reference frames, the denoiser is run, and if
  * it did not modify the signal too much, the denoised block is copied to the
  * signal.
@@ -328,7 +323,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;

-  if (bs <= BLOCK_16X16 && !denoiser->no_denoising) {
+  if (bs <= BLOCK_16X16 && denoiser->denoising_on) {
     // Take center pixel in block to determine is_skin.
     const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
     const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
@@ -345,7 +340,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
     is_skin = vp9_skin_pixel(ysource, usource, vsource);
   }

-  if (!denoiser->no_denoising)
+  if (denoiser->denoising_on)
     decision = perform_motion_compensation(denoiser, mb, bs,
                                            denoiser->increase_denoising,
                                            mi_row, mi_col, ctx,
@@ -528,8 +523,8 @@ void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
                                       int height) {
   // Denoiser is off by default, i.e., no denoising is performed.
   // Noise level is measured periodically, and if observed to be above
-  // thresh_noise_estimate, then denoising is performed, i.e., no_denoising = 0.
-  denoiser->no_denoising = 1;
+  // thresh_noise_estimate, then denoising is performed, i.e., denoising_on = 1.
+  denoiser->denoising_on = 0;
   denoiser->noise_estimate = 0;
   denoiser->noise_estimate_count = 0;
   denoiser->thresh_noise_estimate = 20;
@@ -657,9 +652,9 @@ void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
       // Reset counter and check noise level condition.
       cpi->denoiser.noise_estimate_count = 0;
       if (cpi->denoiser.noise_estimate > cpi->denoiser.thresh_noise_estimate)
-        cpi->denoiser.no_denoising = 0;
+        cpi->denoiser.denoising_on = 1;
       else
-        cpi->denoiser.no_denoising = 1;
+        cpi->denoiser.denoising_on = 0;
     }
   }
 }
......
@@ -32,7 +32,7 @@ typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG last_source;
   int increase_denoising;
   int frame_buffer_initialized;
-  int no_denoising;
+  int denoising_on;
   int noise_estimate;
   int thresh_noise_estimate;
   int noise_estimate_count;
......
@@ -1183,10 +1183,13 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
                                      double group_weight_factor) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  // Clamp the target rate to VBR min / max limits.
+  const int target_rate =
+      vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);

   inactive_zone = fclamp(inactive_zone, 0.0, 1.0);

-  if (section_target_bandwidth <= 0) {
+  if (target_rate <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
@@ -1195,7 +1198,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
     const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
-    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+    const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
     int q;
@@ -2444,7 +2447,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if ((i <= rc->max_gf_interval) ||
         ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
       const double frame_boost =
-          calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+          calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);

       // How fast is prediction quality decaying.
       if (!detect_flash(twopass, 0)) {
@@ -2737,11 +2740,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   }

   target_rate = gf_group->bit_allocation[gf_group->index];
-
-  if (cpi->common.frame_type == KEY_FRAME)
-    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
-  else
-    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
   rc->base_frame_target = target_rate;

   {
......
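
To make the normalization in get_twopass_worst_quality above concrete: once the section target is clamped, it is converted into normalized bits per macroblock by `(target_rate << BPER_MB_NORMBITS) / active_mbs`. A hypothetical worked example, assuming BPER_MB_NORMBITS is 9 as defined in vp9_ratectrl.h (the rate and MB count below are made up):

#include <stdint.h>
#include <stdio.h>

#define BPER_MB_NORMBITS 9  /* assumed, matching vp9_ratectrl.h */

int main(void) {
  const int target_rate = 480000;  /* hypothetical clamped section target, bits */
  const double active_mbs = 3600;  /* hypothetical active macroblock count */
  const int target_norm_bits_per_mb =
      (int)((double)((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs);
  /* (480000 * 512) / 3600 = 68266 normalized bits per MB */
  printf("%d\n", target_norm_bits_per_mb);
  return 0;
}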
@@ -1816,6 +1816,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   int target_rate = rc->base_frame_target;

+  if (cpi->common.frame_type == KEY_FRAME)
+    target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
+  else
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
     vbr_rate_correction(cpi, &target_rate);
......
@@ -296,30 +296,11 @@ int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                       const tran_low_t *dqcoeff,
                                       intptr_t block_size,
                                       int64_t *ssz) {
-  int i;
-  int32_t c, d;
-  int64_t error = 0, sqcoeff = 0;
-  int16_t diff;
-  const int32_t hi = 0x00007fff;
-  const int32_t lo = 0xffff8000;
-  for (i = 0; i < block_size; i++) {
-    c = coeff[i];
-    d = dqcoeff[i];
-    // Saturate to 16 bits
-    c = (c > hi) ? hi : ((c < lo) ? lo : c);
-    d = (d > hi) ? hi : ((d < lo) ? lo : d);
-    diff = d - c;
-    error += diff * diff;
-    sqcoeff += c * c;
-  }
-  assert(error >= 0 && sqcoeff >= 0);
-  *ssz = sqcoeff;
-  return error;
+  // Note that the C versions of these 2 functions (vp9_block_error and
+  // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
+  // routines are not compatible in the non high bitdepth configuration, so
+  // they still cannot share the same name.
+  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
 }

 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
......
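
For reference, the computation both C functions now share: vp9_block_error_c returns the sum of squared differences between coeff and dqcoeff and writes the sum of squared coefficients to *ssz. A sketch of that shared kernel (types simplified to int32_t in place of tran_low_t):

#include <stdint.h>

/* Sketch of the shared kernel: SSE of (coeff - dqcoeff), plus the sum of
 * squared coefficients, both accumulated in 64 bits. */
static int64_t block_error(const int32_t *coeff, const int32_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; i++) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}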
;
; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
SECTION .text
ALIGN 16
;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
; intptr_t block_size, int64_t *ssz)
;
INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
vzeroupper
; If only one iteration is required, then handle this as a special case.
; It is the most frequent case, so we can have a significant gain here
; by not setting up a loop and accumulators.
cmp sizeq, 16
jne .generic
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Common case of size == 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Load input vectors
mova xm0, [dqcq]
packssdw xm0, [dqcq+16]
mova xm2, [uqcq]
packssdw xm2, [uqcq+16]
mova xm1, [dqcq+32]
packssdw xm1, [dqcq+48]
mova xm3, [uqcq+32]
packssdw xm3, [uqcq+48]
; Compute the errors.
psubw xm0, xm2
psubw xm1, xm3
; Individual errors are max 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
pmaddwd xm2, xm2
pmaddwd xm3, xm3
pmaddwd xm0, xm0
pmaddwd xm1, xm1
; Squares are always positive, so we can use unsigned arithmetic after
; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
; fit in 32bits
paddd xm2, xm3
paddd xm0, xm1
; Accumulate horizontally in 64 bits; there is no chance of overflow here
pxor xm5, xm5
pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
paddq xm2, xm3
paddq xm0, xm1
psrldq xm3, xm2, 8
psrldq xm1, xm0, 8
paddq xm2, xm3
paddq xm0, xm1
; Store the return value
%if ARCH_X86_64
movq rax, xm0
movq [sszq], xm2
%else
movd eax, xm0
pextrd edx, xm0, 1
movq [sszd], xm2
%endif
RET
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, speculative low precision
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
.generic:
pxor xm4, xm4 ; sse accumulator
pxor xm5, xm5 ; overflow detection register for xm4
pxor xm6, xm6 ; ssz accumulator
pxor xm7, xm7 ; overflow detection register for xm6
lea uqcq, [uqcq+sizeq*4]
lea dqcq, [dqcq+sizeq*4]
neg sizeq
; Push the negative size as the high precision code might need it
push sizeq
.loop:
; Load input vectors
mova xm0, [dqcq+sizeq*4]
packssdw xm0, [dqcq+sizeq*4+16]
mova xm2, [uqcq+sizeq*4]
packssdw xm2, [uqcq+sizeq*4+16]
mova xm1, [dqcq+sizeq*4+32]
packssdw xm1, [dqcq+sizeq*4+48]
mova xm3, [uqcq+sizeq*4+32]
packssdw xm3, [uqcq+sizeq*4+48]
add sizeq, 16
; Compute the squared errors.
; Individual errors are max 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
psubw xm0, xm2
pmaddwd xm2, xm2
pmaddwd xm0, xm0
psubw xm1, xm3
pmaddwd xm3, xm3
pmaddwd xm1, xm1
; Squares are always positive, so we can use unsigned arithmetic after
; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
; fit in 32bits
paddd xm2, xm3
paddd xm0, xm1
; We accumulate using 32 bit arithmetic, but detect potential overflow
; by checking whether the MSB of either accumulator has ever been set.
; If it has, we redo the whole computation at the end at higher precision,
; but this happens extremely rarely, so we still achieve a net gain.
paddd xm4, xm0
paddd xm6, xm2
por xm5, xm4 ; OR in the accumulator for overflow detection
por xm7, xm6 ; OR in the accumulator for overflow detection
jnz .loop
; Add pairs horizontally (still only on 32 bits)
phaddd xm4, xm4
por xm5, xm4 ; OR in the accumulator for overflow detection
phaddd xm6, xm6
por xm7, xm6 ; OR in the accumulator for overflow detection
; Check for the possibility of overflow by testing whether bit 31 (the MSB)
; of each dword lane has ever been set. If it never was, there was no
; overflow and the final sum fits in 32 bits. If overflow happened, we redo
; the whole computation at higher precision.
por xm7, xm5
pmovmskb r4, xm7
test r4, 0x8888
jnz .highprec
phaddd xm4, xm4
phaddd xm6, xm6
pmovzxdq xm4, xm4
pmovzxdq xm6, xm6
; Restore stack
pop sizeq
; Store the return value
%if ARCH_X86_64
movq rax, xm4
movq [sszq], xm6
%else
movd eax, xm4
pextrd edx, xm4, 1
movq [sszd], xm6
%endif
RET
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, high precision case
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
pxor xm4, xm4 ; sse accumulator
pxor xm5, xm5 ; dedicated zero register
pxor xm6, xm6 ; ssz accumulator
pop sizeq
.loophp:
mova xm0, [dqcq+sizeq*4]
packssdw xm0, [dqcq+sizeq*4+16]
mova xm2, [uqcq+sizeq*4]
packssdw xm2, [uqcq+sizeq*4+16]
mova xm1, [dqcq+sizeq*4+32]
packssdw xm1, [dqcq+sizeq*4+48]
mova xm3, [uqcq+sizeq*4+32]
packssdw xm3, [uqcq+sizeq*4+48]
add sizeq, 16
; individual errors are max. 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
psubw xm0, xm2
pmaddwd xm2, xm2
pmaddwd xm0, xm0
psubw xm1, xm3
pmaddwd xm3, xm3
pmaddwd xm1, xm1
; accumulate in 64bit
punpckldq xm7, xm0, xm5
punpckhdq xm0, xm5
paddq xm4, xm7
punpckldq xm7, xm2, xm5
punpckhdq xm2, xm5
paddq xm6, xm7
punpckldq xm7, xm1, xm5
punpckhdq xm1, xm5
paddq xm4, xm7
punpckldq xm7, xm3, xm5
punpckhdq xm3, xm5
paddq xm6, xm7
paddq xm4, xm0
paddq xm4, xm1
paddq xm6, xm2
paddq xm6, xm3
jnz .loophp
; Accumulate horizontally
movhlps xm5, xm4
movhlps xm7, xm6
paddq xm4, xm5
paddq xm6, xm7
; Store the return value
%if ARCH_X86_64
movq rax, xm4
movq [sszq], xm6
%else
movd eax, xm4
pextrd edx, xm4, 1
movq [sszd], xm6
%endif
RET
END
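
The speculative strategy in the .generic path above can be summarized in scalar C: accumulate the squares in 32 bits, OR every intermediate sum into a tracker register, and only if the tracker ever saw the sign bit (what the pmovmskb/test 0x8888 sequence detects per dword lane) rerun the whole computation at 64-bit precision. A minimal scalar model of the idea (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Scalar model of the asm's overflow speculation: a 32-bit fast path with
 * MSB tracking, and a 64-bit fallback taken only when an intermediate sum
 * may have overflowed. */
static int64_t sse_speculative(const int16_t *a, const int16_t *b, int n) {
  uint32_t acc = 0, track = 0;
  for (int i = 0; i < n; i++) {
    const int32_t d = a[i] - b[i];
    acc += (uint32_t)(d * d);  /* squares are non-negative and <= 2^30 */
    track |= acc;              /* remembers whether bit 31 was ever set */
  }
  if (!(track & 0x80000000u))  /* MSB never set: no wraparound occurred */
    return acc;
  /* High-precision fallback, mirroring the .highprec loop. */
  int64_t acc64 = 0;
  for (int i = 0; i < n; i++) {
    const int64_t d = (int64_t)a[i] - b[i];
    acc64 += d * d;
  }
  return acc64;
}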
@@ -102,6 +102,7 @@ ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
 else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif
......
@@ -248,7 +248,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
 endif
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX)   += x86/quantize_avx_x86_64.asm
 endif
 endif
 endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
......
@@ -913,25 +913,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # Quantization
 #
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";

   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-
-  add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b sse2/;
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";

-  add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-  } else {
-    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b sse2/;

-    add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-  } # CONFIG_VP9_HIGHBITDEPTH
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+  } # CONFIG_VP9_HIGHBITDEPTH
 } # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER