Commit e357b9ef authored by Jingning Han

Support measuring distortion in the pixel domain

Use the pixel domain distortion metric at speed 0. This improves
compression performance by 0.3% on both the low and high resolution
test sets.

Change-Id: I5b5b7115960de73f0b5e5d0c69db305e490e6f1d
parent 14011f03
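For context, the sketch below is not part of this commit; it contrasts the two distortion metrics that the new txfm_domain_distortion speed feature chooses between, on a single 4x4 block. The helper names are invented for illustration, pixels are widened to int16_t for simplicity, and the 16x RD scaling used by the encoder is omitted.

#include <stdint.h>

/* Illustration only -- not from this commit: sum of squared differences
 * over n samples. */
static int64_t sse_n(const int16_t *a, const int16_t *b, int n) {
  int64_t sum = 0;
  int i;
  for (i = 0; i < n; ++i) {
    const int d = a[i] - b[i];
    sum += (int64_t)d * d;
  }
  return sum;
}

/* Transform-domain distortion for a 4x4 block: error between the original
 * and dequantized coefficients, right-shifted by 2 to undo the forward
 * transform scaling (the same shift dist_block() uses for sizes below
 * 32x32). */
static int64_t txfm_domain_dist_4x4(const int16_t *coeff,
                                    const int16_t *dqcoeff) {
  return sse_n(coeff, dqcoeff, 16) >> 2;
}

/* Pixel-domain distortion for a 4x4 block: error between the source pixels
 * and the reconstruction, where recon is assumed to already hold the
 * prediction plus the inverse-transformed dqcoeff. */
static int64_t pixel_domain_dist_4x4(const int16_t *src,
                                     const int16_t *recon) {
  return sse_n(src, recon, 16);
}

The pixel-domain form measures error on the actual reconstruction rather than approximating it from the coefficients, which is the precision gain referred to by the speed feature comment further below.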
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
const int kNumIterations = 10000;
typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;
class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
public:
virtual ~SumSquaresTest() {}
virtual void SetUp() {
ref_func_ = GET_PARAM(0);
tst_func_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
SSI16Func ref_func_;
SSI16Func tst_func_;
};
TEST_P(SumSquaresTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
const int msb = 11; // Up to 12 bit input
const int limit = 1 << (msb + 1);
for (int k = 0; k < kNumIterations; k++) {
const int size = 4 << rnd(6); // Up to 128x128
int stride = 4 << rnd(7); // Up to 256 stride
while (stride < size) { // Make sure it's valid
stride = 4 << rnd(7);
}
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
src[i * stride + j] = rnd(2) ? rnd(limit) : -rnd(limit);
}
}
const uint64_t res_ref = ref_func_(src, stride, size);
uint64_t res_tst;
ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));
ASSERT_EQ(res_ref, res_tst)
<< "Error: Sum Squares Test"
<< " C output does not match optimized output.";
}
}
TEST_P(SumSquaresTest, ExtremeValues) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
const int msb = 11; // Up to 12 bit input
const int limit = 1 << (msb + 1);
for (int k = 0; k < kNumIterations; k++) {
const int size = 4 << rnd(6); // Up to 128x128
int stride = 4 << rnd(7); // Up to 256 stride
while (stride < size) { // Make sure it's valid
stride = 4 << rnd(7);
}
const int val = rnd(2) ? limit - 1 : -(limit - 1);
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
src[i * stride + j] = val;
}
}
const uint64_t res_ref = ref_func_(src, stride, size);
uint64_t res_tst;
ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));
ASSERT_EQ(res_ref, res_tst)
<< "Error: Sum Squares Test"
<< " C output does not match optimized output.";
}
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, SumSquaresTest,
::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
&vpx_sum_squares_2d_i16_sse2)));
#endif // HAVE_SSE2
} // namespace
@@ -170,6 +170,7 @@ endif # VP9
## Multi-codec / unconditional whitebox tests.
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
@@ -64,6 +64,7 @@ typedef struct {
} REF_DEFINITION;
struct rdcost_block_args {
const VP9_COMP *cpi;
MACROBLOCK *x;
ENTROPY_CONTEXT t_above[16];
ENTROPY_CONTEXT t_left[16];
@@ -463,38 +464,123 @@ static int cost_coeffs(MACROBLOCK *x,
return cost;
}
static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, int block,
int blk_row, int blk_col, TX_SIZE tx_size,
int64_t *out_dist, int64_t *out_sse) {
const int ss_txfrm_size = tx_size << 1;
MACROBLOCKD* const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
int shift = tx_size == TX_32X32 ? 0 : 2;
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
if (cpi->sf.txfm_domain_distortion) {
const int ss_txfrm_size = tx_size << 1;
int64_t this_sse;
const int shift = tx_size == TX_32X32 ? 0 : 2;
const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
*out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
16 << ss_txfrm_size,
&this_sse, bd) >> shift;
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
*out_dist = vp9_highbd_block_error_dispatch(
coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >>
shift;
#else
*out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
*out_dist =
vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >>
shift;
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_sse = this_sse >> shift;
*out_sse = this_sse >> shift;
if (x->skip_encode && !is_inter_block(xd->mi[0])) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >>
if (x->skip_encode && !is_inter_block(xd->mi[0])) {
// TODO(jingning): tune the model to better capture the distortion.
const int64_t p =
(pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
#else
(shift + 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_dist += (p >> 4);
*out_sse += p;
}
} else {
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
const int src_idx = 4 * (blk_row * src_stride + blk_col);
const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
const uint8_t *src = &p->src.buf[src_idx];
const uint8_t *dst = &pd->dst.buf[dst_idx];
const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const uint16_t *eob = &p->eobs[block];
unsigned int tmp;
cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
*out_sse = (int64_t)tmp * 16;
if (*eob) {
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
uint8_t *recon = (uint8_t *)recon16;
#else
(shift + 2);
DECLARE_ALIGNED(16, uint8_t, recon[1024]);
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
recon = CONVERT_TO_BYTEPTR(recon);
vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,
bs, bs, xd->bd);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_8X8:
vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_16X16:
vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
default:
assert(0 && "Invalid transform size");
}
}
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
switch (tx_size) {
case TX_32X32:
vp9_idct32x32_add(dqcoeff, recon, 32, *eob);
break;
case TX_16X16:
vp9_idct16x16_add(dqcoeff, recon, 32, *eob);
break;
case TX_8X8:
vp9_idct8x8_add(dqcoeff, recon, 32, *eob);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1, which is significant (not just an optimization) for
// the lossless case.
x->itxm_add(dqcoeff, recon, 32, *eob);
break;
default:
assert(0 && "Invalid transform size");
break;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_dist += (p >> 4);
*out_sse += p;
cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, 32, &tmp);
}
*out_dist = (int64_t)tmp * 16;
}
}
@@ -506,9 +592,8 @@ static int rate_block(int plane, int block, int row, int col,
args->use_fast_coef_costing);
}
static void block_rd_txfm(int plane, int block, int row, int col,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -523,20 +608,47 @@ static void block_rd_txfm(int plane, int block, int row, int col,
if (!is_inter_block(mi)) {
struct encode_b_args arg = {x, NULL, &mi->skip};
vp9_encode_block_intra(plane, block, row, col, plane_bsize, tx_size, &arg);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
&arg);
if (args->cpi->sf.txfm_domain_distortion) {
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
} else {
const int bs = 4 << tx_size;
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
unsigned int tmp;
sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
#if CONFIG_VP9_HIGHBITDEPTH
if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
sse = sse * 16;
variance(src, src_stride, dst, dst_stride, &tmp);
dist = (int64_t)tmp * 16;
}
} else if (max_txsize_lookup[plane_bsize] == tx_size) {
if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_NONE) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_AC_ONLY) {
// compute DC coefficient
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);
vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize,
tx_size);
sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
dist = sse;
if (x->plane[plane].eobs[block]) {
@@ -560,8 +672,9 @@ static void block_rd_txfm(int plane, int block, int row, int col,
}
} else {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
}
rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
@@ -570,7 +683,7 @@ static void block_rd_txfm(int plane, int block, int row, int col,
return;
}
rate = rate_block(plane, block, row, col, tx_size, args);
rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
@@ -593,16 +706,15 @@ static void block_rd_txfm(int plane, int block, int row, int col,
args->skippable &= !x->plane[plane].eobs[block];
}
static void txfm_rd_in_plane(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting) {
static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int64_t *distortion, int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
TX_SIZE tx_size, int use_fast_coef_casting) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
struct rdcost_block_args args;
vp9_zero(args);
args.cpi = cpi;
args.x = x;
args.best_rd = ref_best_rd;
args.use_fast_coef_costing = use_fast_coef_casting;
@@ -643,8 +755,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
txfm_rd_in_plane(x, rate, distortion, skip,
sse, ref_best_rd, 0, bs,
txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
mi->tx_size, cpi->sf.use_fast_coef_costing);
}
@@ -695,9 +806,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
else
r_tx_size += vp9_cost_one(tx_probs[m]);
}
txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
&sse[n], ref_best_rd, 0, bs, n,
cpi->sf.use_fast_coef_costing);
txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0,
bs, n, cpi->sf.use_fast_coef_costing);
r[n][1] = r[n][0];
if (r[n][0] < INT_MAX) {
r[n][1] += r_tx_size;
@@ -1172,9 +1282,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
*skippable = 1;
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
ref_best_rd, plane, bsize, uv_tx_size,
cpi->sf.use_fast_coef_costing);
txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing);
if (pnrate == INT_MAX) {
is_cost_valid = 0;
break;
@@ -162,6 +162,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->txfm_domain_distortion = 1;
}
if (speed >= 2) {
@@ -279,6 +280,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->exhaustive_searches_thresh = INT_MAX;
if (speed >= 1) {
sf->txfm_domain_distortion = 1;
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
@@ -541,6 +543,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
sf->disable_filter_search_var_thresh = 0;
sf->adaptive_interp_filter_search = 0;
sf->allow_partition_search_skip = 0;
sf->txfm_domain_distortion = 0;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -246,6 +246,11 @@ typedef struct SPEED_FEATURES {
// Coefficient probability model approximation step size
int coeff_prob_appx_step;
// Use transform domain distortion. Use pixel domain distortion when
// this flag is set to zero. The pixel domain distortion computation
// improves the precision of the distortion metric.
int txfm_domain_distortion;
// The threshold determines how slow the motion is; it is used when
// use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
MOTION_THRESHOLD lf_motion_threshold;
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
int size) {
int r, c;
uint64_t ss = 0;
for (r = 0; r < size; r++) {
for (c = 0; c < size; c++) {
const int16_t v = src[c];
ss += v * v;
}
src += src_stride;
}
return ss;
}
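A quick usage sketch for the reference function above; the values and the main() harness are hypothetical and not part of the commit. A 4x4 block read from a stride-8 buffer whose samples are all +3 or -3 sums to 16 * 9 = 144.

#include <stdint.h>
#include <stdio.h>

/* The prototype normally comes from ./vpx_dsp_rtcd.h. */
uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int size);

int main(void) {
  int16_t diff[4 * 8];
  int i;
  /* Four rows with stride 8, every sample +3 or -3. */
  for (i = 0; i < 4 * 8; ++i) diff[i] = (i & 1) ? 3 : -3;
  /* Only the leading 4 columns of each row are summed (size = 4):
   * 16 samples * 9 = 144. */
  printf("%llu\n", (unsigned long long)vpx_sum_squares_2d_i16_c(diff, 8, 4));
  return 0;
}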
@@ -277,6 +277,8 @@ endif # CONFIG_VP9_ENCODER
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
@@ -1169,6 +1169,9 @@ specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
specialize qw/vpx_sum_squares_2d_i16 sse2/;
#
# Structured Similarity (SSIM)
#
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
int stride) {
const __m128i v_val_0_w =
_mm_loadl_epi64((const __m128i *)(src + 0 * stride));
const __m128i v_val_1_w =
_mm_loadl_epi64((const __m128i *)(src + 1 * stride));
const __m128i v_val_2_w =
_mm_loadl_epi64((const __m128i *)(src + 2 * stride));
const __m128i v_val_3_w =
_mm_loadl_epi64((const __m128i *)(src + 3 * stride));
const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
const __m128i v_sum_d =
_mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
// TODO(jingning): Evaluate the performance impact here.
#ifdef __GNUC__
// This prevents GCC/Clang from inlining this function into
// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
// maintenance instructions in the common case of 4x4.
__attribute__((noinline))
#endif
static uint64_t
vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
int r, c;
const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
__m128i v_acc_q = _mm_setzero_si128();
for (r = 0; r < size; r += 8) {
__m128i v_acc_d = _mm_setzero_si128();
for (c = 0; c < size; c += 8) {
const int16_t *b = src + c;
const __m128i v_val_0_w =
_mm_load_si128((const __m128i *)(b + 0 * stride));
const __m128i v_val_1_w =
_mm_load_si128((const __m128i *)(b + 1 * stride));
const __m128i v_val_2_w =
_mm_load_si128((const __m128i *)(b + 2 * stride));
const __m128i v_val_3_w =
_mm_load_si128((const __m128i *)(b + 3 * stride));
const __m128i v_val_4_w =
_mm_load_si128((const __m128i *)(b + 4 * stride));
const __m128i v_val_5_w =
_mm_load_si128((const __m128i *)(b + 5 * stride));
const __m128i v_val_6_w =
_mm_load_si128((const __m128i *)(b + 6 * stride));
const __m128i v_val_7_w =
_mm_load_si128((const __m128i *)(b + 7 * stride));
const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);