Commit 93c32a55 authored by Fritz Koenig

Rework idct calling structure.

Moving the eob structure allows for a non-struct based
function to handle decoding an entire mb of
idct/dequant/recon data.  This allows for SIMD functions
to idct/dequant/recon multiple blocks at once.

SSE2 implementation gives 3% gain on Atom.

Change-Id: I8a8f3efd546ea4e0535f517d94f347cfb737c9c2
parent b0660457
......@@ -218,6 +218,7 @@ typedef struct
//not used DECLARE_ALIGNED(16, short, reference[384]);
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
// 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries.
BLOCKD block[25];
......
This diff is collapsed.
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "idct.h"
#include "dequantize.h"
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *   dc     one externally supplied DC value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_dc_idct_add_v6; every other block is handed to
 * vp8_dc_only_idct_add_v6 with its dc value alone.
 */
void vp8_dequant_dc_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *blk_q = q + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_v6 (blk_q, dq, blk_pre, blk_dst,
                                            16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_v6 (dc[col], blk_pre, blk_dst,
                                         16, stride);
        }

        /* step to the next row of four blocks */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_v6.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_v6 and the
 * leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_y_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *blk_q = q + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
            {
                vp8_dequant_idct_add_v6 (blk_q, dq, blk_pre, blk_dst,
                                         16, stride);
            }
            else
            {
                vp8_dc_only_idct_add_v6 (blk_q[0] * dq[0], blk_pre, blk_dst,
                                         16, stride);

                /* Clear the first two coefficients with plain short stores.
                 * This covers the same bytes as the former int-sized store
                 * (with 32-bit int) without writing a short array through
                 * an int lvalue. */
                blk_q[0] = 0;
                blk_q[1] = 0;
            }
        }

        /* step to the next row of four blocks */
        q    += 64;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
/*
 * Reconstruct the chroma blocks of a macroblock: four 4x4 blocks (2 rows
 * of 2) written to dstu, followed by four more written to dstv.
 *
 *   q      coefficient buffer, 16 shorts per block, U blocks first
 *   dq     dequantization factors, shared by all 8 blocks
 *   pre    predictor buffer, addressed with a pitch of 8, U then V
 *   dstu   destination for the first four blocks
 *   dstv   destination for the last four blocks
 *   stride stride used for both destinations
 *   eobs   one end-of-block value per block, U blocks first
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_v6.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_v6 and the
 * leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_uv_block_v6
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    int plane, row, col;

    for (plane = 0; plane < 2; plane++)
    {
        /* q, pre and eobs simply keep advancing from the U data into the
         * V data; only the destination pointer switches. */
        unsigned char *dst = (plane == 0) ? dstu : dstv;

        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short *blk_q = q + 16 * col;
                unsigned char *blk_pre = pre + 4 * col;
                unsigned char *blk_dst = dst + 4 * col;

                if (eobs[col] > 1)
                {
                    vp8_dequant_idct_add_v6 (blk_q, dq, blk_pre, blk_dst,
                                             8, stride);
                }
                else
                {
                    vp8_dc_only_idct_add_v6 (blk_q[0] * dq[0], blk_pre,
                                             blk_dst, 8, stride);

                    /* Clear the first two coefficients with plain short
                     * stores.  This covers the same bytes as the former
                     * int-sized store (with 32-bit int) without writing a
                     * short array through an int lvalue. */
                    blk_q[0] = 0;
                    blk_q[1] = 0;
                }
            }

            /* step to the next row of two blocks */
            q    += 32;
            pre  += 32;
            dst  += 4 * stride;
            eobs += 2;
        }
    }
}
......@@ -16,6 +16,9 @@
extern prototype_dequant_block(vp8_dequantize_b_v6);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
......@@ -25,12 +28,24 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
#undef vp8_dequant_dc_idct_add
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
#undef vp8_dequant_dc_idct_add_y_block
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
#endif
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
......@@ -40,6 +55,15 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
#undef vp8_dequant_dc_idct_add
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
#undef vp8_dequant_dc_idct_add_y_block
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
#undef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif
#endif
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "idct.h"
#include "dequantize.h"
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *   dc     one externally supplied DC value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_dc_idct_add_neon; every other block is handed to
 * vp8_dc_only_idct_add_neon with its dc value alone.
 */
void vp8_dequant_dc_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *blk_q = q + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
                vp8_dequant_dc_idct_add_neon (blk_q, dq, blk_pre, blk_dst,
                                              16, stride, dc[col]);
            else
                vp8_dc_only_idct_add_neon (dc[col], blk_pre, blk_dst,
                                           16, stride);
        }

        /* step to the next row of four blocks */
        q    += 64;
        dc   += 4;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_neon.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_neon and
 * the leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_y_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            short *blk_q = q + 16 * col;
            unsigned char *blk_pre = pre + 4 * col;
            unsigned char *blk_dst = dst + 4 * col;

            if (eobs[col] > 1)
            {
                vp8_dequant_idct_add_neon (blk_q, dq, blk_pre, blk_dst,
                                           16, stride);
            }
            else
            {
                vp8_dc_only_idct_add_neon (blk_q[0] * dq[0], blk_pre,
                                           blk_dst, 16, stride);

                /* Clear the first two coefficients with plain short stores.
                 * This covers the same bytes as the former int-sized store
                 * (with 32-bit int) without writing a short array through
                 * an int lvalue. */
                blk_q[0] = 0;
                blk_q[1] = 0;
            }
        }

        /* step to the next row of four blocks */
        q    += 64;
        pre  += 64;
        dst  += 4 * stride;
        eobs += 4;
    }
}
/*
 * Reconstruct the chroma blocks of a macroblock: four 4x4 blocks (2 rows
 * of 2) written to dstu, followed by four more written to dstv.
 *
 *   q      coefficient buffer, 16 shorts per block, U blocks first
 *   dq     dequantization factors, shared by all 8 blocks
 *   pre    predictor buffer, addressed with a pitch of 8, U then V
 *   dstu   destination for the first four blocks
 *   dstv   destination for the last four blocks
 *   stride stride used for both destinations
 *   eobs   one end-of-block value per block, U blocks first
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_neon.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_neon and
 * the leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_uv_block_neon
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    int plane, row, col;

    for (plane = 0; plane < 2; plane++)
    {
        /* q, pre and eobs simply keep advancing from the U data into the
         * V data; only the destination pointer switches. */
        unsigned char *dst = (plane == 0) ? dstu : dstv;

        for (row = 0; row < 2; row++)
        {
            for (col = 0; col < 2; col++)
            {
                short *blk_q = q + 16 * col;
                unsigned char *blk_pre = pre + 4 * col;
                unsigned char *blk_dst = dst + 4 * col;

                if (eobs[col] > 1)
                {
                    vp8_dequant_idct_add_neon (blk_q, dq, blk_pre, blk_dst,
                                               8, stride);
                }
                else
                {
                    vp8_dc_only_idct_add_neon (blk_q[0] * dq[0], blk_pre,
                                               blk_dst, 8, stride);

                    /* Clear the first two coefficients with plain short
                     * stores.  This covers the same bytes as the former
                     * int-sized store (with 32-bit int) without writing a
                     * short array through an int lvalue. */
                    blk_q[0] = 0;
                    blk_q[1] = 0;
                }
            }

            /* step to the next row of two blocks */
            q    += 32;
            pre  += 32;
            dst  += 4 * stride;
            eobs += 2;
        }
    }
}
......@@ -237,7 +237,7 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
DEQUANT_INVOKE(&pbi->dequant, block)(b);
// do 2nd order transform on the dc block
if (b->eob > 1)
if (xd->eobs[24] > 1)
{
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
((int *)b->qcoeff)[0] = 0;
......@@ -255,24 +255,10 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
((int *)b->qcoeff)[0] = 0;
}
for (i = 0; i < 16; i++)
{
b = &xd->block[i];
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, dc_idct_add)
(b->qcoeff, &b->dequant[0][0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride,
xd->block[24].diff[i]);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(xd->block[24].diff[i], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
}
}
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, &xd->block[0].dequant[0][0],
xd->predictor, xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
{
......@@ -282,13 +268,17 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
BLOCKD *b = &xd->block[i];
vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
if (b->eob > 1)
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
DEQUANT_INVOKE(&pbi->dequant, idct_add)
(b->qcoeff, &b->dequant[0][0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
(b->qcoeff[0] * b->dequant[0][0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
......@@ -296,37 +286,16 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
}
else
{
for (i = 0; i < 16; i++)
{
BLOCKD *b = &xd->block[i];
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, &xd->block[0].dequant[0][0],
xd->predictor, xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
for (i = 16; i < 24; i++)
{
BLOCKD *b = &xd->block[i];
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, &xd->block[16].dequant[0][0],
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
......
......@@ -27,6 +27,21 @@
int pitch, int stride, \
int dc)
/* Declares `sym` with the signature shared by the dc_idct_add_y_block
 * implementations: coefficients `q`, dequantization factors `dq`,
 * predictor `pre`, destination `dst` with its `stride`, a per-block
 * `eobs` array and a per-block `dc` array. */
#define prototype_dequant_dc_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst, \
int stride, char *eobs, short *dc)
/* Same as above but without the `dc` array: the implementations derive
 * the DC term from q[0] * dq[0] instead. */
#define prototype_dequant_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst, \
int stride, char *eobs)
/* Chroma variant: takes two destinations (`dst_u`, `dst_v`) that share a
 * single `stride`. */
#define prototype_dequant_idct_add_uv_block(sym) \
void sym(short *q, short *dq, \
unsigned char *pre, unsigned char *dst_u, \
unsigned char *dst_v, int stride, char *eobs)
#if ARCH_X86 || ARCH_X86_64
#include "x86/dequantize_x86.h"
#endif
......@@ -50,16 +65,42 @@ extern prototype_dequant_idct_add(vp8_dequant_idct_add);
#endif
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
/* For each whole-macroblock entry point: fall back to the plain C
 * implementation when the macro has not been defined already, then
 * declare whichever function the macro now names. */
#ifndef vp8_dequant_dc_idct_add_y_block
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
#endif
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
#ifndef vp8_dequant_idct_add_y_block
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
#endif
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block);
#ifndef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
#endif
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block);
/* Function-pointer types, one per prototype macro; these are the member
 * types of vp8_dequant_rtcd_vtable_t. */
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
typedef struct
{
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
vp8_dequant_idct_add_y_block_fn_t idct_add_y_block;
vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
} vp8_dequant_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
......
......@@ -266,6 +266,8 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
BOOL_DECODER *bc = x->current_bc;
char *eobs = x->eobs;
ENTROPY_CONTEXT *a;
ENTROPY_CONTEXT *l;
int i;
......@@ -416,8 +418,8 @@ ONE_CONTEXT_NODE_0_:
qcoeff_ptr [ scan[15] ] = (INT16) v;
BLOCK_FINISHED:
t = ((x->block[i].eob = c) != !type); // any nonzero data?
eobtotal += x->block[i].eob;
t = ((eobs[i] = c) != !type); // any nonzero data?
eobtotal += c;
*a = *l = t;
qcoeff_ptr += 16;
......
......@@ -19,12 +19,15 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
{
// Pure C:
#if CONFIG_RUNTIME_CPU_DETECT
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
#if 0 //For use with RTCD, when implemented
pbi->dboolhuff.debool = vp8dx_decode_bool_c;
pbi->dboolhuff.devalue = vp8dx_decode_value_c;
......
/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "idct.h"
#include "dequantize.h"
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *   dc     one externally supplied DC value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_dc_idct_add_c; every other block is handed to
 * vp8_dc_only_idct_add_c with its dc value alone.
 */
void vp8_dequant_dc_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs, short *dc)
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        for (col = 0; col < 4; col++)
        {
            /* raster index of this block within the macroblock */
            const int blk = 4 * row + col;
            short *blk_q = q + 16 * blk;
            /* four predictor lines of pitch 16 per block row */
            unsigned char *blk_pre = pre + 64 * row + 4 * col;
            unsigned char *blk_dst = dst + 4 * stride * row + 4 * col;

            if (eobs[blk] > 1)
                vp8_dequant_dc_idct_add_c (blk_q, dq, blk_pre, blk_dst,
                                           16, stride, dc[blk]);
            else
                vp8_dc_only_idct_add_c (dc[blk], blk_pre, blk_dst,
                                        16, stride);
        }
    }
}
/*
 * Walk the 16 luma 4x4 blocks of a macroblock in raster order (4 rows of
 * 4 blocks) and reconstruct each one into dst.
 *
 *   q      coefficient buffer, 16 shorts per block
 *   dq     dequantization factors, shared by all 16 blocks
 *   pre    predictor buffer, addressed with a pitch of 16
 *   dst    destination pixels, addressed with `stride`
 *   eobs   one end-of-block value per block
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_c.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_c and the
 * leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_y_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dst, int stride, char *eobs)
{
    int i, j;

    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            if (*eobs++ > 1)
            {
                vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
            }
            else
            {
                vp8_dc_only_idct_add_c (q[0] * dq[0], pre, dst, 16, stride);

                /* Clear the first two coefficients with plain short stores.
                 * This covers the same bytes as the former int-sized store
                 * (with 32-bit int) without writing a short array through
                 * an int lvalue. */
                q[0] = 0;
                q[1] = 0;
            }

            q   += 16;
            pre += 4;
            dst += 4;
        }

        /* rewind the 16 columns just covered and drop down four lines */
        pre += 64 - 16;
        dst += 4 * stride - 16;
    }
}
/*
 * Reconstruct the chroma blocks of a macroblock: four 4x4 blocks (2 rows
 * of 2) written to dstu, followed by four more written to dstv.
 *
 *   q      coefficient buffer, 16 shorts per block, U blocks first
 *   dq     dequantization factors, shared by all 8 blocks
 *   pre    predictor buffer, addressed with a pitch of 8, U then V
 *   dstu   destination for the first four blocks
 *   dstv   destination for the last four blocks
 *   stride stride used for both destinations
 *   eobs   one end-of-block value per block, U blocks first
 *
 * A block whose eob is greater than 1 is handed to
 * vp8_dequant_idct_add_c.  Otherwise only the dequantized first
 * coefficient (q[0] * dq[0]) is passed to vp8_dc_only_idct_add_c and the
 * leading coefficients of the block are cleared here.
 */
void vp8_dequant_idct_add_uv_block_c
            (short *q, short *dq, unsigned char *pre,
             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
    int plane, i, j;

    for (plane = 0; plane < 2; plane++)
    {
        /* q, pre and eobs simply keep advancing from the U data into the
         * V data; only the destination pointer switches. */
        unsigned char *dst = (plane == 0) ? dstu : dstv;

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (*eobs++ > 1)
                {
                    vp8_dequant_idct_add_c (q, dq, pre, dst, 8, stride);
                }
                else
                {
                    vp8_dc_only_idct_add_c (q[0] * dq[0], pre, dst,
                                            8, stride);

                    /* Clear the first two coefficients with plain short
                     * stores.  This covers the same bytes as the former
                     * int-sized store (with 32-bit int) without writing a
                     * short array through an int lvalue. */
                    q[0] = 0;
                    q[1] = 0;
                }

                q   += 16;
                pre += 4;
                dst += 4;
            }

            /* rewind the 8 columns just covered and drop down four lines */
            pre += 32 - 8;
            dst += 4 * stride - 8;
        }
    }
}
......@@ -23,7 +23,9 @@
extern prototype_dequant_block(vp8_dequantize_b_mmx);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
......@@ -35,6 +37,33 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);