dsputil.h 31.1 KB
Newer Older
Fabrice Bellard's avatar
Fabrice Bellard committed
1 2
/*
 * DSP utils
3
 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
Fabrice Bellard's avatar
Fabrice Bellard committed
5
 *
6
 * This file is part of Libav.
7
 *
8
 * Libav is free software; you can redistribute it and/or
Fabrice Bellard's avatar
Fabrice Bellard committed
9 10
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
Fabrice Bellard's avatar
Fabrice Bellard committed
12
 *
13
 * Libav is distributed in the hope that it will be useful,
Fabrice Bellard's avatar
Fabrice Bellard committed
14 15 16 17 18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with Libav; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Fabrice Bellard's avatar
Fabrice Bellard committed
21
 */
Michael Niedermayer's avatar
Michael Niedermayer committed
22 23

/**
24
 * @file
Michael Niedermayer's avatar
Michael Niedermayer committed
25
 * DSP utils.
26 27
 * note, many functions in here may use MMX which trashes the FPU state, it is
 * absolutely necessary to call emms_c() between dsp & float/double code
Michael Niedermayer's avatar
Michael Niedermayer committed
28 29
 */

30 31
#ifndef AVCODEC_DSPUTIL_H
#define AVCODEC_DSPUTIL_H
Fabrice Bellard's avatar
Fabrice Bellard committed
32

33
#include "libavutil/intreadwrite.h"
34
#include "avcodec.h"
Fabrice Bellard's avatar
Fabrice Bellard committed
35

Michael Niedermayer's avatar
Michael Niedermayer committed
36

Michael Niedermayer's avatar
Michael Niedermayer committed
37
//#define DEBUG
Fabrice Bellard's avatar
Fabrice Bellard committed
38 39 40
/* dct code */

/* Element type of a DCT coefficient block. */
typedef short DCTELEM;

/* Forward DCT prototypes. */
void fdct_ifast(DCTELEM *data);
void fdct_ifast248(DCTELEM *data);
void ff_jpeg_fdct_islow(DCTELEM *data);
void ff_fdct248_islow(DCTELEM *data);

/* Inverse DCT prototypes. */
void j_rev_dct(DCTELEM *data);
void j_rev_dct4(DCTELEM *data);
void j_rev_dct2(DCTELEM *data);
void j_rev_dct1(DCTELEM *data);
void ff_wmv2_idct_c(DCTELEM *data);

void ff_fdct_mmx(DCTELEM *block);
void ff_fdct_mmx2(DCTELEM *block);
void ff_fdct_sse2(DCTELEM *block);

/*
 * Declare the ff_h264_*_<depth>_c prototypes for one bit depth.
 * Note that the two lowres variants take (dst, stride, block) while the
 * other add functions take (dst, block, stride).
 */
#define H264_IDCT(depth) \
void ff_h264_idct8_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct8_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_lowres_idct_add_ ## depth ## _c(uint8_t *dst, int stride, DCTELEM *block);\
void ff_h264_lowres_idct_put_ ## depth ## _c(uint8_t *dst, int stride, DCTELEM *block);\
void ff_h264_idct_add16_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add16intra_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct8_add4_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add8_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_luma_dc_dequant_idct_ ## depth ## _c(DCTELEM *output, DCTELEM *input, int qmul);\
void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);

H264_IDCT( 8)
H264_IDCT( 9)
H264_IDCT(10)

void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);

77
/* encoding scans: 64-entry scan-order tables, defined elsewhere */
extern const uint8_t ff_alternate_horizontal_scan[64];
extern const uint8_t ff_alternate_vertical_scan[64];
extern const uint8_t ff_zigzag_direct[64];
extern const uint8_t ff_zigzag248_direct[64];
82

Fabrice Bellard's avatar
Fabrice Bellard committed
83
/* pixel operations */
#define MAX_NEG_CROP 1024

/* temporary */
extern uint32_t ff_squareTbl[512];
/* 256 entries plus MAX_NEG_CROP extra entries on each side. */
extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
Fabrice Bellard's avatar
Fabrice Bellard committed
89

90 91 92 93 94 95 96 97 98 99 100 101 102 103
/*
 * Declare the 8x8 and 16x16 put/avg pixel prototypes for one bit depth:
 * PUTAVG_PIXELS(N) yields ff_{put,avg}_pixels{8x8,16x16}_N_c, each taking
 * (dst, src, stride).
 */
#define PUTAVG_PIXELS(bits)                                                           \
    void ff_put_pixels8x8_   ## bits ## _c(uint8_t *dst, uint8_t *src, int stride);   \
    void ff_avg_pixels8x8_   ## bits ## _c(uint8_t *dst, uint8_t *src, int stride);   \
    void ff_put_pixels16x16_ ## bits ## _c(uint8_t *dst, uint8_t *src, int stride);   \
    void ff_avg_pixels16x16_ ## bits ## _c(uint8_t *dst, uint8_t *src, int stride);

PUTAVG_PIXELS(8)
PUTAVG_PIXELS(9)
PUTAVG_PIXELS(10)

/* The unsuffixed names refer to the 8-bit variants. */
#define ff_put_pixels8x8_c   ff_put_pixels8x8_8_c
#define ff_avg_pixels8x8_c   ff_avg_pixels8x8_8_c
#define ff_put_pixels16x16_c ff_put_pixels16x16_8_c
#define ff_avg_pixels16x16_c ff_avg_pixels16x16_8_c
104

105
/* VP3 DSP functions */
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);

void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);

/* Bink functions */
void ff_bink_idct_c    (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* EA functions */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

122 123 124 125
/* 1/2^n downscaling functions from imgconvert.c */
void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

/* Same parameter list as the DSPContext `gmc` function pointer. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
129

Michael Niedermayer's avatar
Michael Niedermayer committed
130
/* minimum alignment rules ;)
Diego Biurrun's avatar
Diego Biurrun committed
131 132
If you notice errors in the align stuff, need more alignment for some ASM code
for some CPU or need to use a function with less aligned data then send a mail
133
to the libav-devel mailing list, ...
Diego Biurrun's avatar
Diego Biurrun committed
134 135 136

!warning These alignments might not match reality, (missing attribute((align))
stuff somewhere possible).
Diego Biurrun's avatar
Diego Biurrun committed
137
I (Michael) did not check them, these are just the alignments which I think
Diego Biurrun's avatar
Diego Biurrun committed
138
could be reached easily ...
Fabrice Bellard's avatar
Fabrice Bellard committed
139

Michael Niedermayer's avatar
Michael Niedermayer committed
140 141 142
!future video codecs might need functions with less strict alignment
*/

143
/*
144 145 146 147
void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
148
void clear_blocks_c(DCTELEM *blocks);
149
*/
Fabrice Bellard's avatar
Fabrice Bellard committed
150 151

/* add and put pixel (decoding) */
// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);

typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);

Michael Niedermayer's avatar
Michael Niedermayer committed
161
/*
 * Declare the ff_put_, ff_put_no_rnd_ and ff_avg_ prototypes for one qpel
 * function name; each takes (dst, src, stride) like qpel_mc_func.
 */
#define DEF_OLD_QPEL(name)\
void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);

DEF_OLD_QPEL(qpel16_mc11_old_c)
DEF_OLD_QPEL(qpel16_mc31_old_c)
DEF_OLD_QPEL(qpel16_mc12_old_c)
DEF_OLD_QPEL(qpel16_mc32_old_c)
DEF_OLD_QPEL(qpel16_mc13_old_c)
DEF_OLD_QPEL(qpel16_mc33_old_c)
DEF_OLD_QPEL(qpel8_mc11_old_c)
DEF_OLD_QPEL(qpel8_mc31_old_c)
DEF_OLD_QPEL(qpel8_mc12_old_c)
DEF_OLD_QPEL(qpel8_mc32_old_c)
DEF_OLD_QPEL(qpel8_mc13_old_c)
DEF_OLD_QPEL(qpel8_mc33_old_c)
Michael Niedermayer's avatar
Michael Niedermayer committed
178 179 180 181 182 183

/*
 * Define a static function `wide_fn` with the op_pixels_func parameter list
 * that calls `half_fn` twice: first on block/pixels as given, then on both
 * pointers advanced by `offset`.
 */
#define CALL_2X_PIXELS(wide_fn, half_fn, offset)                        \
static void wide_fn(uint8_t *block, const uint8_t *pixels,              \
                    int line_size, int h)                               \
{                                                                       \
    half_fn(block, pixels, line_size, h);                               \
    half_fn(block + offset, pixels + offset, line_size, h);             \
}
Michael Niedermayer's avatar
Michael Niedermayer committed
184

Fabrice Bellard's avatar
Fabrice Bellard committed
185
/* motion estimation */
// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
// although currently h<4 is not used as functions with width <8 are neither used nor implemented
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
Michael Niedermayer's avatar
Michael Niedermayer committed
189

190 191 192 193 194 195 196
/**
 * Scantable.
 *
 * Holds a pointer to a source scan table plus two 64-entry byte tables;
 * filled in by ff_init_scantable().
 */
typedef struct ScanTable{
    const uint8_t *scantable;
    uint8_t permutated[64];
    uint8_t raster_end[64];
#if ARCH_PPC
    /** Used by dct_quantize_altivec to find last-non-zero */
    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
#endif
} ScanTable;

void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);

205 206 207
/*
 * Declare ff_emulated_edge_mc_<depth>; the parameter list matches the
 * DSPContext `emulated_edge_mc` function pointer.
 */
#define EMULATED_EDGE(depth) \
void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, int linesize,\
                         int block_w, int block_h,\
                         int src_x, int src_y, int w, int h);

EMULATED_EDGE(8)
EMULATED_EDGE(9)
EMULATED_EDGE(10)

/* The unsuffixed name refers to the 8-bit variant. */
#define ff_emulated_edge_mc ff_emulated_edge_mc_8

216 217 218 219
/*
 * Clamped pixel add/put prototypes. Each takes (block, dest, linesize), the
 * same parameter types as the *_pixels_clamped members of DSPContext.
 */
void ff_put_pixels_clamped_c(const DCTELEM *block,
                             uint8_t *dest, int linesize);
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *dest, int linesize);
void ff_add_pixels_clamped_c(const DCTELEM *block,
                             uint8_t *dest, int linesize);

Michael Niedermayer's avatar
Michael Niedermayer committed
220 221 222
/**
 * DSPContext.
 */
223 224
typedef struct DSPContext {
    /* pixel ops : interface with DCT */
225 226 227
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
228
    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
Kostya Shishkov's avatar
Kostya Shishkov committed
229
    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
230
    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
Loren Merritt's avatar
Loren Merritt committed
231 232
    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
233
    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
    /**
     * Motion estimation with emulated edge values.
     * @param buf pointer to destination buffer (unaligned)
     * @param src pointer to pixel source (unaligned)
     * @param linesize width (in pixels) for src/buf
     * @param block_w number of pixels (per row) to copy to buf
     * @param block_h nummber of pixel rows to copy to buf
     * @param src_x offset of src to start of row - this may be negative
     * @param src_y offset of src to top of image - this may be negative
     * @param w width of src in pixels
     * @param h height of src in pixels
     */
    void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src, int linesize,
                             int block_w, int block_h,
                             int src_x, int src_y, int w, int h);
Michael Niedermayer's avatar
Michael Niedermayer committed
249 250 251
    /**
     * translational global motion compensation.
     */
252
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
Michael Niedermayer's avatar
Michael Niedermayer committed
253 254 255
    /**
     * global motion compensation.
     */
256
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
257
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
Loren Merritt's avatar
Loren Merritt committed
258
    void (*clear_block)(DCTELEM *block/*align 16*/);
259
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
260 261
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
262
// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
263

264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func w53[6];
    me_cmp_func w97[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];

    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; //only width 16 used
    me_cmp_func frame_skip_cmp[6]; //only width 8 used
285

286 287
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
288

Michael Niedermayer's avatar
Michael Niedermayer committed
289 290
    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
291
     * this is an array[4][4] of motion compensation functions for 4
292
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
293
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
294 295 296 297 298
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
299
    op_pixels_func put_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
300 301 302

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
303
     * This is an array[4][4] of motion compensation functions for 4
304
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
305
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
306 307 308 309 310
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
311
    op_pixels_func avg_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
312 313 314

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
Luca Barbato's avatar
Luca Barbato committed
315
     * this is an array[2][4] of motion compensation functions for 2
Michael Niedermayer's avatar
doxy  
Michael Niedermayer committed
316
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
317
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
318 319 320 321 322
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
Michael Niedermayer's avatar
Michael Niedermayer committed
323
    op_pixels_func put_no_rnd_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
324 325 326

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
Luca Barbato's avatar
Luca Barbato committed
327
     * this is an array[2][4] of motion compensation functions for 2
Michael Niedermayer's avatar
doxy  
Michael Niedermayer committed
328
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
329
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
330 331 332 333 334
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
Michael Niedermayer's avatar
Michael Niedermayer committed
335
    op_pixels_func avg_no_rnd_pixels_tab[4][4];
336

337
    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
338

339 340
    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
341 342
     * this is an array[12] of motion compensation functions for the 9 thirdpe
     * positions<br>
343 344 345 346 347 348 349
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
350 351
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?

352 353 354 355
    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
Michael Niedermayer's avatar
Michael Niedermayer committed
356
    qpel_mc_func put_mspel_pixels_tab[8];
357

Michael Niedermayer's avatar
Michael Niedermayer committed
358
    /**
Luca Barbato's avatar
Luca Barbato committed
359
     * h264 Chroma MC
Michael Niedermayer's avatar
Michael Niedermayer committed
360 361 362
     */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
363

364 365
    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
366

367 368 369
    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];

370
    me_cmp_func pix_abs[2][4];
371

Michael Niedermayer's avatar
huffyuv  
Michael Niedermayer committed
372 373
    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
Loren Merritt's avatar
Loren Merritt committed
374
    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
Michael Niedermayer's avatar
Michael Niedermayer committed
375
    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
376 377 378 379
    /**
     * subtract huffyuv's variant of median prediction
     * note, this might read from src1[-1], src2[-1]
     */
380 381
    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
Loren Merritt's avatar
Loren Merritt committed
382
    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
383
    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
Loren Merritt's avatar
Loren Merritt committed
384 385
    /* this might write to dst[w] */
    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
Michael Niedermayer's avatar
Michael Niedermayer committed
386
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
Mans Rullgard's avatar
Mans Rullgard committed
387
    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);
388

Michael Niedermayer's avatar
Michael Niedermayer committed
389 390 391
    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);

Michael Niedermayer's avatar
Michael Niedermayer committed
392
    void (*h261_loop_filter)(uint8_t *src, int stride);
393

394 395 396
    void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);

David Conrad's avatar
David Conrad committed
397
    void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
398 399 400
    void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
    void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);

401
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
402
    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
Loren Merritt's avatar
Loren Merritt committed
403
    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
404
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
405
    void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
406 407
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
408
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
Loren Merritt's avatar
Loren Merritt committed
409
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
410
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
Loren Merritt's avatar
Loren Merritt committed
411
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
412
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
    /**
     * Multiply a vector of floats by a scalar float.  Source and
     * destination vectors must overlap exactly or not at all.
     * @param dst result vector, 16-byte aligned
     * @param src input vector, 16-byte aligned
     * @param mul scalar value
     * @param len length of vector, multiple of 4
     */
    void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
                               int len);
    /**
     * Multiply a vector of floats by concatenated short vectors of
     * floats and by a scalar float.  Source and destination vectors
     * must overlap exactly or not at all.
     * [0]: short vectors of length 2, 8-byte aligned
     * [1]: short vectors of length 4, 16-byte aligned
     * @param dst output vector, 16-byte aligned
     * @param src input vector, 16-byte aligned
     * @param sv  array of pointers to short vectors
     * @param mul scalar value
     * @param len number of elements in src and dst, multiple of 4
     */
    void (*vector_fmul_sv_scalar[2])(float *dst, const float *src,
                                     const float **sv, float mul, int len);
    /**
     * Multiply short vectors of floats by a scalar float, store
     * concatenated result.
     * [0]: short vectors of length 2, 8-byte aligned
     * [1]: short vectors of length 4, 16-byte aligned
     * @param dst output vector, 16-byte aligned
     * @param sv  array of pointers to short vectors
     * @param mul scalar value
     * @param len number of output elements, multiple of 4
     */
    void (*sv_fmul_scalar[2])(float *dst, const float **sv,
                              float mul, int len);
    /**
     * Calculate the scalar product of two vectors of floats.
     * @param v1  first vector, 16-byte aligned
     * @param v2  second vector, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
    /**
     * Calculate the sum and difference of two vectors of floats.
     * @param v1  first input vector, sum output, 16-byte aligned
     * @param v2  second input vector, difference output, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
463

464 465
    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
466
    void (*fdct248)(DCTELEM *block/* align 16*/);
467

468 469
    /* IDCT really*/
    void (*idct)(DCTELEM *block/* align 16*/);
470

Michael Niedermayer's avatar
Michael Niedermayer committed
471
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
472
     * block -> idct -> clip to unsigned 8 bit -> dest.
Michael Niedermayer's avatar
Michael Niedermayer committed
473
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
Panagiotis Issaris's avatar
Panagiotis Issaris committed
474
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
475
     */
476
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
477

Michael Niedermayer's avatar
Michael Niedermayer committed
478 479
    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
Panagiotis Issaris's avatar
Panagiotis Issaris committed
480
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
481
     */
482
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
483

Michael Niedermayer's avatar
Michael Niedermayer committed
484
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
485
     * idct input permutation.
Michael Niedermayer's avatar
Michael Niedermayer committed
486 487 488 489
     * several optimized IDCTs need a permutated input (relative to the normal order of the reference
     * IDCT)
     * this permutation must be performed before the idct_put/add, note, normally this can be merged
     * with the zigzag/alternate scan<br>
Michael Niedermayer's avatar
Michael Niedermayer committed
490 491 492 493 494 495
     * an example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
     * - (x -> referece dct -> reference idct -> x)
     * - (x -> referece dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
496 497 498 499 500 501
    uint8_t idct_permutation[64];
    int idct_permutation_type;
#define FF_NO_IDCT_PERM 1
#define FF_LIBMPEG2_IDCT_PERM 2
#define FF_SIMPLE_IDCT_PERM 3
#define FF_TRANSPOSE_IDCT_PERM 4
502
#define FF_PARTTRANS_IDCT_PERM 5
503
#define FF_SSE2_IDCT_PERM 6
504

505 506 507 508
    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
509

510
    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int sides);
511
#define EDGE_WIDTH 16
512 513
#define EDGE_TOP    1
#define EDGE_BOTTOM 2
514

515
    void (*prefetch)(void *mem, int stride, int h);
516 517

    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
518

519
    /* mlp/truehd functions */
520 521
    void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
                               int firorder, int iirorder,
522 523 524
                               unsigned int filter_shift, int32_t mask, int blocksize,
                               int32_t *sample_buffer);

525
    /* intrax8 functions */
526 527
    void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
528 529
           int * range, int * sum,  int edges);

530 531
    /**
     * Calculate scalar product of two vectors.
532
     * @param len length of vectors, should be multiple of 16
533 534
     * @param shift number of bits to discard from product
     */
535
    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len, int shift);
536 537 538 539 540 541
    /* ape functions */
    /**
     * Calculate scalar product of v1 and v2,
     * and v1[i] += v3[i] * mul
     * @param len length of vectors, should be multiple of 16
     */
542
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
Kostya Shishkov's avatar
Kostya Shishkov committed
543

544 545 546 547 548 549 550 551 552 553 554 555 556 557
    /**
     * Apply symmetric window in 16-bit fixed-point.
     * @param output destination array
     *               constraints: 16-byte aligned
     * @param input  source array
     *               constraints: 16-byte aligned
     * @param window window array
     *               constraints: 16-byte aligned, at least len/2 elements
     * @param len    full window length
     *               constraints: multiple of ? greater than zero
     */
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
                               const int16_t *window, unsigned int len);

Kostya Shishkov's avatar
Kostya Shishkov committed
558 559 560 561
    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];

Kostya Shishkov's avatar
Kostya Shishkov committed
562 563 564 565 566
    /* rv40 functions */
    qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
    qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
    h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];
Kostya Shishkov's avatar
Kostya Shishkov committed
567 568 569 570

    /* bink functions */
    op_fill_func fill_block_tab[2];
    void (*scale_block)(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize);
571 572
} DSPContext;

Måns Rullgård's avatar
Måns Rullgård committed
573
void dsputil_static_init(void);
574
void dsputil_init(DSPContext* p, AVCodecContext *avctx);
Fabrice Bellard's avatar
Fabrice Bellard committed
575

576 577
int ff_check_alignment(void);

Michael Niedermayer's avatar
Michael Niedermayer committed
578 579 580 581
/**
 * Permute block according to permutation.
 * @param last last non zero element in scantable order
 */
582
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
583

584 585
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);

586
/* Replicate c into each of the four bytes of a 32-bit word by multiplying
 * with 0x01010101; c is expected to fit in a single byte. */
#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
587
/* Replicate c into each of the four 16-bit lanes of a 64-bit word.
 * NOTE: despite the "BYTE" in the name, the multiplier 0x0001000100010001
 * has one set bit per 16 bits, so the lanes are 16 bits wide, not 8. */
#define         BYTE_VEC64(c)   ((c)*0x0001000100010001UL)
Michael Niedermayer's avatar
cleanup  
Michael Niedermayer committed
588 589 590 591 592 593 594 595 596 597 598

/**
 * Average the four bytes packed in a with the four bytes packed in b,
 * rounding each per-byte result up: (x + y + 1) >> 1 in every byte lane.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    /* Half of the bitwise difference. The mask clears the low bit of every
     * byte first so the shift cannot move a bit into the neighbouring byte. */
    const uint32_t half_diff = ((a ^ b) & ~BYTE_VEC32(0x01)) >> 1;

    return (a | b) - half_diff;
}

/**
 * Average the four bytes packed in a with the four bytes packed in b,
 * rounding each per-byte result down: (x + y) >> 1 in every byte lane.
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    /* Half of the bitwise difference. The mask clears the low bit of every
     * byte first so the shift cannot move a bit into the neighbouring byte. */
    const uint32_t half_diff = ((a ^ b) & ~BYTE_VEC32(0x01)) >> 1;

    return (a & b) + half_diff;
}

599 600 601 602 603 604 605 606 607 608
/**
 * Average the lanes packed in a with the lanes packed in b, rounding each
 * per-lane result up. The lane width is whatever BYTE_VEC64 replicates
 * into (four 16-bit lanes with the multiplier used by that macro).
 */
static inline uint64_t rnd_avg64(uint64_t a, uint64_t b)
{
    /* Half of the bitwise difference. The mask clears the low bit of every
     * lane first so the shift cannot move a bit into the neighbouring lane. */
    const uint64_t half_diff = ((a ^ b) & ~BYTE_VEC64(0x01)) >> 1;

    return (a | b) - half_diff;
}

/**
 * Average the lanes packed in a with the lanes packed in b, rounding each
 * per-lane result down. The lane width is whatever BYTE_VEC64 replicates
 * into (four 16-bit lanes with the multiplier used by that macro).
 */
static inline uint64_t no_rnd_avg64(uint64_t a, uint64_t b)
{
    /* Half of the bitwise difference. The mask clears the low bit of every
     * lane first so the shift cannot move a bit into the neighbouring lane. */
    const uint64_t half_diff = ((a ^ b) & ~BYTE_VEC64(0x01)) >> 1;

    return (a & b) + half_diff;
}

Michael Niedermayer's avatar
Michael Niedermayer committed
609 610 611 612 613 614 615 616 617 618 619 620
static inline int get_penalty_factor(int lambda, int lambda2, int type){
    switch(type&0xFF){
    default:
    case FF_CMP_SAD:
        return lambda>>FF_LAMBDA_SHIFT;
    case FF_CMP_DCT:
        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
    case FF_CMP_W53:
        return (4*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_W97:
        return (2*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_SATD:
621
    case FF_CMP_DCT264:
Michael Niedermayer's avatar
Michael Niedermayer committed
622 623 624 625 626 627 628 629 630 631 632
        return (2*lambda)>>FF_LAMBDA_SHIFT;
    case FF_CMP_RD:
    case FF_CMP_PSNR:
    case FF_CMP_SSE:
    case FF_CMP_NSSE:
        return lambda2>>FF_LAMBDA_SHIFT;
    case FF_CMP_BIT:
        return 1;
    }
}

633
void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
634
void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
635 636 637 638 639 640 641 642
void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);

643
void ff_dsputil_init_dwt(DSPContext *c);
644 645 646 647 648 649
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);

650
/*
 * STRIDE_ALIGN selection:
 *   - ARM: 16 when NEON is available, otherwise the fallback below
 *   - PPC: 16
 *   - MMI available: 16
 *   - everything else: 8
 */
#if ARCH_ARM
#   if HAVE_NEON
#       define STRIDE_ALIGN 16
#   endif
#elif ARCH_PPC
#   define STRIDE_ALIGN 16
#elif HAVE_MMI
#   define STRIDE_ALIGN 16
#endif

/* Fallback for every configuration that did not pick a value above. */
#ifndef STRIDE_ALIGN
#   define STRIDE_ALIGN 8
#endif

670 671 672 673 674 675 676
/*
 * LOCAL_ALIGNED_A: manual alignment. Declares a byte buffer la_<v> that is
 * a bytes larger than the object type "t s o", then declares v as a pointer
 * to the address FFALIGN() computes from the start of that buffer.
 * s and o are optional declarator suffixes (e.g. array dimensions); the
 * trailing "..." only swallows the padding arguments added by the callers.
 */
#define LOCAL_ALIGNED_A(a, t, v, s, o, ...)             \
    uint8_t la_##v[sizeof(t s o) + (a)];                \
    t (*v) o = (void *)FFALIGN((uintptr_t)la_##v, a)

/*
 * LOCAL_ALIGNED_D: direct variant. Hands the alignment to DECLARE_ALIGNED()
 * and appends the declarator suffixes s and o.
 */
#define LOCAL_ALIGNED_D(a, t, v, s, o, ...) DECLARE_ALIGNED(a, t, v) s o

/*
 * LOCAL_ALIGNED: front end for the manual variant. The two empty arguments
 * appended after __VA_ARGS__ guarantee that s and o are always supplied,
 * even when the caller passes fewer suffixes.
 */
#define LOCAL_ALIGNED(a, t, v, ...) LOCAL_ALIGNED_A(a, t, v, __VA_ARGS__,,)
677 678

/*
 * LOCAL_ALIGNED_8(t, v, ...): declare a local v of type t with 8-byte
 * alignment. Uses the direct declaration macro when HAVE_LOCAL_ALIGNED_8
 * is set, and the manually aligned buffer otherwise.
 */
#if HAVE_LOCAL_ALIGNED_8
#   define LOCAL_ALIGNED_8(t, v, ...) LOCAL_ALIGNED_D(8, t, v, __VA_ARGS__,,)
#else
#   define LOCAL_ALIGNED_8(t, v, ...) LOCAL_ALIGNED(8, t, v, __VA_ARGS__)
#endif

/*
 * LOCAL_ALIGNED_16(t, v, ...): declare a local v of type t with 16-byte
 * alignment. Uses the direct declaration macro when HAVE_LOCAL_ALIGNED_16
 * is set, and the manually aligned buffer otherwise.
 */
#if HAVE_LOCAL_ALIGNED_16
#   define LOCAL_ALIGNED_16(t, v, ...) LOCAL_ALIGNED_D(16, t, v, __VA_ARGS__,,)
#else
#   define LOCAL_ALIGNED_16(t, v, ...) LOCAL_ALIGNED(16, t, v, __VA_ARGS__)
#endif

690
/* PSNR */
691
/*
 * Declaration only; the definition is not part of this header.
 * NOTE(review): orig_linesize is a 3-element array while coded_linesize is
 * a single int - presumably one stride per plane versus a shared stride;
 * confirm against the implementation.
 */
void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
              int orig_linesize[3], int coded_linesize,
              AVCodecContext *avctx);
694

695
/*
 * WRAPPER8_16(name8, name16): define a static function name16 that calls
 * name8 twice - once at dst/src and once 8 bytes further right - with the
 * same stride and height, and returns the sum of both results.
 */
#define WRAPPER8_16(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst           , src           , stride, h)\
          +name8(s, dst+8         , src+8         , stride, h);\
}

701
/*
 * WRAPPER8_16_SQ(name8, name16): define a static function name16 that
 * sums name8 over 8-row tiles. The two tiles of the top 8 rows (at offset
 * 0 and offset 8) are always evaluated; when h == 16 the two tiles of the
 * next 8 rows are added as well. name8 is always called with a height of 8,
 * so any h other than 16 is treated as a single row of tiles.
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

715

716
/**
 * Copy h rows of 2 bytes each from src to dst.
 *
 * Each row is moved with one AV_RN16/AV_WN16 pair (accessors from
 * libavutil/intreadwrite.h); afterwards dst and src advance by their
 * respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN16(dst   , AV_RN16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}

727
/**
 * Copy h rows of 4 bytes each from src to dst.
 *
 * Each row is moved with one AV_RN32/AV_WN32 pair (accessors from
 * libavutil/intreadwrite.h); afterwards dst and src advance by their
 * respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}

738
/**
 * Copy h rows of 8 bytes each from src to dst.
 *
 * Each row is moved as two 32-bit words with AV_RN32/AV_WN32 (accessors
 * from libavutil/intreadwrite.h); afterwards dst and src advance by their
 * respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}

750
/**
 * Copy h rows of 9 bytes each from src to dst.
 *
 * The first 8 bytes of a row are moved as two 32-bit words with
 * AV_RN32/AV_WN32 (accessors from libavutil/intreadwrite.h), the ninth
 * byte is copied on its own; afterwards dst and src advance by their
 * respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}

763
/**
 * Copy h rows of 16 bytes each from src to dst.
 *
 * Each row is moved as four 32-bit words with AV_RN32/AV_WN32 (accessors
 * from libavutil/intreadwrite.h); afterwards dst and src advance by their
 * respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}

777
/**
 * Copy h rows of 17 bytes each from src to dst.
 *
 * The first 16 bytes of a row are moved as four 32-bit words with
 * AV_RN32/AV_WN32 (accessors from libavutil/intreadwrite.h), the
 * seventeenth byte is copied on its own; afterwards dst and src advance
 * by their respective strides.
 *
 * @param dst       destination of the first row
 * @param src       source of the first row
 * @param dstStride byte distance between consecutive destination rows
 * @param srcStride byte distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_WN32(dst   , AV_RN32(src   ));
        AV_WN32(dst+4 , AV_RN32(src+4 ));
        AV_WN32(dst+8 , AV_RN32(src+8 ));
        AV_WN32(dst+12, AV_RN32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}

792
#endif /* AVCODEC_DSPUTIL_H */