/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

/***********************************/
/* IDCT */

/* Declare a yasm single-block IDCT-and-add routine:
 * ff_h264_idct<NUM>_add_<DEPTH>_<OPT>(dst, block, stride). */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int16_t *block,  \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmx2)
IDCT_ADD_FUNC(_dc, 10, mmx2)
IDCT_ADD_FUNC(8_dc, 8, mmx2)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#endif

/* Declare a yasm multi-block IDCT-and-add routine; it takes a table of
 * block offsets into dst and the per-block non-zero-coefficient counts
 * (nnzc) so it can skip empty blocks. */
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
    (uint8_t *dst, const int *block_offset,                             \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#endif

/* Same as IDCT_ADD_REP_FUNC, but dst is an array of plane pointers
 * (these back the h264_idct_add8 chroma entry points). */
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                      \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT     \
    (uint8_t **dst, const int *block_offset,                          \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
#endif

86
void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul);
87
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
88

/***********************************/
/* deblocking */

/* Computes the boundary strength (bS) table for the in-loop deblocking
 * filter from the non-zero counts, reference indices and motion vectors
 * (yasm implementation). */
void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
                                       int8_t ref[2][40], int16_t mv[2][40][2],
                                       int bidir, int edges, int step,
                                       int mask_mv0, int mask_mv1, int field);

/* Declare a yasm deblock filter: DIR is h/v, TYPE is luma/chroma.
 * Non-intra filters take a per-edge tc0 clipping table. */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta,      \
                                                               int8_t *tc0);

/* Intra variant of LF_FUNC: same signature minus the tc0 table. */
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta);

/* Declare the full set of deblock filters for one bit depth. */
#define LF_FUNCS(type, depth)                   \
LF_FUNC(h,  chroma,       depth, mmx2)          \
LF_IFUNC(h, chroma_intra, depth, mmx2)          \
LF_FUNC(v,  chroma,       depth, mmx2)          \
LF_IFUNC(v, chroma_intra, depth, mmx2)          \
LF_FUNC(h,  luma,         depth, mmx2)          \
LF_IFUNC(h, luma_intra,   depth, mmx2)          \
LF_FUNC(h,  luma,         depth, sse2)          \
LF_IFUNC(h, luma_intra,   depth, sse2)          \
LF_FUNC(v,  luma,         depth, sse2)          \
LF_IFUNC(v, luma_intra,   depth, sse2)          \
LF_FUNC(h,  chroma,       depth, sse2)          \
LF_IFUNC(h, chroma_intra, depth, sse2)          \
LF_FUNC(v,  chroma,       depth, sse2)          \
LF_IFUNC(v, chroma_intra, depth, sse2)          \
LF_FUNC(h,  luma,         depth, avx)           \
LF_IFUNC(h, luma_intra,   depth, avx)           \
LF_FUNC(v,  luma,         depth, avx)           \
LF_IFUNC(v, luma_intra,   depth, avx)           \
LF_FUNC(h,  chroma,       depth, avx)           \
LF_IFUNC(h, chroma_intra, depth, avx)           \
LF_FUNC(v,  chroma,       depth, avx)           \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNCS(uint8_t,   8)
LF_FUNCS(uint16_t, 10)

136
#if ARCH_X86_32
137 138 139
LF_FUNC(v8, luma, 8, mmx2)
static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha,
                                     int beta, int8_t *tc0)
140
{
141 142 143 144
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2);
145
}
146 147 148 149

LF_IFUNC(v8, luma_intra, 8, mmx2)
static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride,
                                           int alpha, int beta)
150
{
151 152
    ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta);
153
}
154
#endif /* ARCH_X86_32 */
155

156 157
LF_FUNC(v,  luma,       10, mmx2)
LF_IFUNC(v, luma_intra, 10, mmx2)
158

/***********************************/
/* weighted prediction */

/* Declare an 8-bit unidirectional weighted-prediction routine for a
 * block of width W. */
#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,         \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

/* Declare an 8-bit bidirectional (biweight) routine for width W. */
#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        int stride, int height,         \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W)                    \
    H264_WEIGHT(W, mmx2)                        \
    H264_BIWEIGHT(W, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W)                \
    H264_BIWEIGHT_MMX(W)                        \
    H264_WEIGHT(W, sse2)                        \
    H264_BIWEIGHT(W, sse2)                      \
    H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)

/* 10-bit variants of the weighted-prediction prototypes; dst/src are
 * still uint8_t* byte pointers (the asm handles the 16-bit pixels). */
#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    int stride,         \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      int stride,       \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
    H264_WEIGHT_10(W, DEPTH, sse2)              \
    H264_WEIGHT_10(W, DEPTH, sse4)              \
    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8,  10)
H264_BIWEIGHT_10_SSE(4,  10)

215 216
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                         const int chroma_format_idc)
217
{
218
#if HAVE_YASM
219
    int mm_flags = av_get_cpu_flags();
220

221
    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMXEXT)
222
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2;
223 224

    if (bit_depth == 8) {
225 226 227 228 229 230 231 232
        if (mm_flags & AV_CPU_FLAG_MMX) {
            c->h264_idct_dc_add   =
            c->h264_idct_add      = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add  =
            c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
233
            if (chroma_format_idc == 1)
234 235 236 237 238
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            if (mm_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;

239
            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
240 241 242 243 244 245 246 247 248 249 250 251 252 253
                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmx2;
                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2;
                c->h264_idct_add16   = ff_h264_idct_add16_8_mmx2;
                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmx2;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2;
                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2;

                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmx2;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2;
                if (chroma_format_idc == 1) {
                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmx2;
                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2;
                }
254
#if ARCH_X86_32
255 256 257 258 259 260 261 262
                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmx2;
                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmx2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2;
#endif /* ARCH_X86_32 */
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2;
263

264 265 266
                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2;
267

268 269
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->h264_idct8_add  = ff_h264_idct8_add_8_sse2;
270

271 272 273 274 275 276
                    c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
                    c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
                    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
277

278 279
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
280

281 282
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
283

284
#if HAVE_ALIGNED_STACK
285 286 287 288 289 290 291 292 293 294 295
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif /* HAVE_ALIGNED_STACK */
                }
                if (mm_flags & AV_CPU_FLAG_SSSE3) {
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
                }
                if (mm_flags & AV_CPU_FLAG_AVX) {
296
#if HAVE_ALIGNED_STACK
297 298 299 300 301 302
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#endif /* HAVE_ALIGNED_STACK */
                }
303 304 305
            }
        }
    } else if (bit_depth == 10) {
306
        if (mm_flags & AV_CPU_FLAG_MMX) {
307
            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
308
#if ARCH_X86_32
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmx2;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2;
                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmx2;
                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmx2;
                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmx2;
                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmx2;
#endif /* ARCH_X86_32 */
                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2;
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->h264_idct_add     = ff_h264_idct_add_10_sse2;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

                    c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
325
#if HAVE_ALIGNED_STACK
326 327 328
                    c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
329

330 331 332
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
333

334 335 336
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
337

338 339
                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
340
#if HAVE_ALIGNED_STACK
341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
                }
                if (mm_flags & AV_CPU_FLAG_SSE4) {
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
                }
356
#if HAVE_AVX
357 358 359 360 361 362 363 364 365
                if (mm_flags & AV_CPU_FLAG_AVX) {
                    c->h264_idct_dc_add  =
                    c->h264_idct_add     = ff_h264_idct_add_10_avx;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

                    c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
366
#if HAVE_ALIGNED_STACK
367 368 369
                    c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
370

371 372
                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
373
#if HAVE_ALIGNED_STACK
374 375 376 377 378 379
                    c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
                    c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
                    c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_avx;
                    c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
                }
380
#endif /* HAVE_AVX */
381
            }
382 383
        }
    }
384
#endif /* HAVE_YASM */
385
}