Commit 969957f9 authored by Jingning Han's avatar Jingning Han Committed by James Zern
Browse files

Fix real-time compression regression in hbd mode

This commit resolves the compression performance regression in
real-time encoding setting when high bit-depth mode is enabled.

The current solution temporarily disables the SIMD implementations
of vpx_satd, hadamard8x8, and hadamard16x16 in high bit-depth mode.

The commit makes the coding results bit-wise identical between
regular coding pipeline and high bit-depth at profile 0.

BUG=webm:1365

Change-Id: Icfb900821733749685370460a1a5a7e07f76f4bf
parent 9efc42f4
......@@ -315,11 +315,13 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_c),
make_tuple(64, &vpx_satd_c),
make_tuple(256, &vpx_satd_c),
make_tuple(1024, &vpx_satd_c)));
#endif
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
......@@ -345,12 +347,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(64, &vpx_int_pro_col_sse2,
&vpx_int_pro_col_c)));
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_sse2),
make_tuple(64, &vpx_satd_sse2),
make_tuple(256, &vpx_satd_sse2),
make_tuple(1024, &vpx_satd_sse2)));
#endif
#endif
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
......@@ -376,12 +380,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(64, &vpx_int_pro_col_neon,
&vpx_int_pro_col_c)));
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_neon),
make_tuple(64, &vpx_satd_neon),
make_tuple(256, &vpx_satd_neon),
make_tuple(1024, &vpx_satd_neon)));
#endif
#endif // !CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(
......@@ -407,11 +413,13 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(64, &vpx_int_pro_col_msa,
&vpx_int_pro_col_c)));
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),
make_tuple(256, &vpx_satd_msa),
make_tuple(1024, &vpx_satd_msa)));
#endif
#endif // !CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_MSA
} // namespace
......@@ -144,6 +144,7 @@ TEST_P(Hadamard8x8Test, VaryStride) {
}
}
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_c));
......@@ -166,6 +167,7 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_msa));
#endif // HAVE_MSA
#endif // !CONFIG_VP9_HIGHBITDEPTH
class Hadamard16x16Test : public HadamardTestBase {};
......@@ -210,6 +212,7 @@ TEST_P(Hadamard16x16Test, VaryStride) {
}
}
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_c));
......@@ -227,4 +230,5 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_msa));
#endif // HAVE_MSA
#endif // !CONFIG_VP9_HIGHBITDEPTH
} // namespace
......@@ -132,6 +132,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp/;
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
......
......@@ -1815,7 +1815,9 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
}
#if CONFIG_VP9_HIGHBITDEPTH
{
// TODO(jingning): Implement integral projection functions for high bit-depth
// setting and remove this part of code.
if (xd->bd != 8) {
unsigned int this_sad;
tmp_mv->row = 0;
tmp_mv->col = 0;
......
......@@ -590,21 +590,6 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
*out_dist_sum += dist << 4;
}
#if CONFIG_VP9_HIGHBITDEPTH
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
int *skippable, int64_t *sse, BLOCK_SIZE bsize,
TX_SIZE tx_size) {
MACROBLOCKD *xd = &x->e_mbd;
unsigned int var_y, sse_y;
(void)tx_size;
model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
&sse_y);
*sse = INT_MAX;
*skippable = 0;
return;
}
#else
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
int *skippable, int64_t *sse, BLOCK_SIZE bsize,
TX_SIZE tx_size) {
......@@ -624,6 +609,20 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
#if CONFIG_VP9_HIGHBITDEPTH
// TODO(jingning): Implement the high bit-depth Hadamard transforms and
// remove this check condition.
if (xd->bd != 8) {
unsigned int var_y, sse_y;
(void)tx_size;
model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
&var_y, &sse_y);
*sse = INT_MAX;
*skippable = 0;
return;
}
#endif
(void)cpi;
// The max tx_size passed in is TX_16X16.
......@@ -648,14 +647,14 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
switch (tx_size) {
case TX_16X16:
vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
vpx_hadamard_16x16(src_diff, diff_stride, coeff);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_8X8:
vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
vpx_hadamard_8x8(src_diff, diff_stride, coeff);
vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
......@@ -699,7 +698,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
if (*eob == 1)
this_rdc->rate += (int)abs(qcoeff[0]);
else if (*eob > 1)
this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
this_rdc->rate += vpx_satd(qcoeff, step << 4);
this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
}
......@@ -711,7 +710,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
}
#endif
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
......
......@@ -321,7 +321,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
return error;
}
int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
int block_size) {
int i;
int64_t error = 0;
......
......@@ -67,9 +67,10 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
// The order of the output coeff of the hadamard is not important. For
// optimization purposes the final transpose may be skipped.
void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
tran_low_t *coeff) {
int idx;
int16_t buffer[64];
int16_t buffer2[64];
int16_t *tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
......@@ -80,17 +81,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
// dynamic range [-2040, 2040]
coeff += 8; // coeff: 15 bit
// dynamic range [-16320, 16320]
hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
// dynamic range [-2040, 2040]
// buffer2: 15 bit
// dynamic range [-16320, 16320]
++tmp_buf;
}
for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
}
// In place 16x16 2D Hadamard transform
void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
......@@ -101,15 +104,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
// coeff: 15 bit, dynamic range [-16320, 16320]
for (idx = 0; idx < 64; ++idx) {
int16_t a0 = coeff[0];
int16_t a1 = coeff[64];
int16_t a2 = coeff[128];
int16_t a3 = coeff[192];
tran_low_t a0 = coeff[0];
tran_low_t a1 = coeff[64];
tran_low_t a2 = coeff[128];
tran_low_t a3 = coeff[192];
int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
int16_t b3 = (a2 - a3) >> 1;
tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
tran_low_t b3 = (a2 - a3) >> 1;
coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
coeff[64] = b1 + b3;
......@@ -122,7 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int vpx_satd_c(const int16_t *coeff, int length) {
int vpx_satd_c(const tran_low_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i) satd += abs(coeff[i]);
......
......@@ -885,14 +885,26 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_8x8/;
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_16x16/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;
}
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
specialize qw/vpx_int_pro_row sse2 neon msa/;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment