Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
ffmpeg
Commits
5917d17c
Commit
5917d17c
authored
Oct 03, 2002
by
Leon van Stuivenberg
Committed by
Michael Niedermayer
Oct 03, 2002
Browse files
ps2 optimizations update patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
Originally committed as revision 996 to
svn://svn.ffmpeg.org/ffmpeg/trunk
parent
a46a3ce4
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
533 additions
and
258 deletions
+533
-258
libavcodec/Makefile
libavcodec/Makefile
+1
-1
libavcodec/mpegvideo.c
libavcodec/mpegvideo.c
+3
-0
libavcodec/mpegvideo.h
libavcodec/mpegvideo.h
+3
-0
libavcodec/ps2/dsputil_mmi.c
libavcodec/ps2/dsputil_mmi.c
+86
-69
libavcodec/ps2/idct_mmi.c
libavcodec/ps2/idct_mmi.c
+308
-188
libavcodec/ps2/mmi.h
libavcodec/ps2/mmi.h
+32
-0
libavcodec/ps2/mpegvideo_mmi.c
libavcodec/ps2/mpegvideo_mmi.c
+100
-0
No files found.
libavcodec/Makefile
View file @
5917d17c
...
...
@@ -77,7 +77,7 @@ OBJS += ppc/dsputil_ppc.o
endif
ifeq
($(TARGET_MMI),yes)
OBJS
+=
ps2/dsputil_mmi.o ps2/idct_mmi.o
OBJS
+=
ps2/dsputil_mmi.o ps2/idct_mmi.o
ps2/mpegvideo_mmi.o
endif
ifeq
($(TARGET_ALTIVEC),yes)
...
...
libavcodec/mpegvideo.c
View file @
5917d17c
...
...
@@ -216,6 +216,9 @@ int MPV_common_init(MpegEncContext *s)
#ifdef HAVE_MLIB
MPV_common_init_mlib
(
s
);
#endif
#ifdef HAVE_MMI
MPV_common_init_mmi
(
s
);
#endif
/* load & permutate scantables
...
...
libavcodec/mpegvideo.h
View file @
5917d17c
...
...
@@ -503,6 +503,9 @@ void MPV_common_init_axp(MpegEncContext *s);
#ifdef HAVE_MLIB
void
MPV_common_init_mlib
(
MpegEncContext
*
s
);
#endif
#ifdef HAVE_MMI
void
MPV_common_init_mmi
(
MpegEncContext
*
s
);
#endif
extern
void
(
*
draw_edges
)(
UINT8
*
buf
,
int
wrap
,
int
width
,
int
height
,
int
w
);
void
ff_conceal_past_errors
(
MpegEncContext
*
s
,
int
conceal_all
);
void
ff_copy_bits
(
PutBitContext
*
pb
,
UINT8
*
src
,
int
length
);
...
...
libavcodec/ps2/dsputil_mmi.c
View file @
5917d17c
...
...
@@ -20,96 +20,113 @@
*/
#include "../dsputil.h"
void
ff_mmi_idct
(
DCTELEM
*
block
);
#include "mmi.h"
/* the provided 'as' in binutils 2.9EE doesn't support
the EE's mips3 instructions properly */
#define AS_BUGGY
/**
 * Zero six consecutive DCT coefficient blocks.
 *
 * Each loop iteration issues eight quadword stores of register $0 at byte
 * offsets 0, 16, ..., 112 from the current block pointer and then advances
 * the pointer by 64 coefficients.
 *
 * The stores go through an asm operand (%0) instead of a hard-coded $4, so
 * the routine no longer depends on which register the compiler keeps
 * `blocks` in, and the pointer is advanced in C where the compiler can see
 * it. The "memory" clobber tells the compiler the asm writes memory.
 *
 * NOTE(review): assumes DCTELEM is 16 bits wide, so 64 coefficients equal
 * the 128 bytes cleared per iteration, and that `blocks` is 16-byte aligned
 * as quadword stores presumably require -- confirm against the callers.
 *
 * @param blocks start of the coefficient area to clear
 */
static void clear_blocks_mmi(DCTELEM *blocks)
{
    int i;

    for (i = 0; i < 6; i++) {
        asm volatile(
            "sq $0, 0(%0) \n\t"
            "sq $0, 16(%0) \n\t"
            "sq $0, 32(%0) \n\t"
            "sq $0, 48(%0) \n\t"
            "sq $0, 64(%0) \n\t"
            "sq $0, 80(%0) \n\t"
            "sq $0, 96(%0) \n\t"
            "sq $0, 112(%0) \n\t"
            :: "r" (blocks) : "memory");
        blocks += 64;
    }
}
static
void
put_pixels_clamped_mmi
(
const
DCTELEM
*
block
,
UINT8
*
pixels
,
int
line_size
)
static
void
get_pixels_mmi
(
DCTELEM
*
block
,
const
UINT8
*
pixels
,
int
line_size
)
{
/* $4 = block, $5 = pixels, $6 = line_size */
__asm__
__volatile__
(
"li $11, 255"
:::
"$11"
);
lq
(
$
4
,
0
,
$
12
);
pcpyld
(
$
11
,
$
11
,
$
11
);
pcpyh
(
$
11
,
$
11
);
#define PUT(rs) \
ppacb($0, $##rs, $##rs); \
sd3(rs, 0, 5); \
__asm__ __volatile__ ("add $5, $5, $6");
pminh
(
$
12
,
$
11
,
$
12
);
pmaxh
(
$
12
,
$
0
,
$
12
);
lq
(
$
4
,
16
,
$
13
);
PUT
(
12
);
pminh
(
$
13
,
$
11
,
$
13
);
pmaxh
(
$
13
,
$
0
,
$
13
);
lq
(
$
4
,
32
,
$
12
);
PUT
(
13
);
pminh
(
$
12
,
$
11
,
$
12
);
pmaxh
(
$
12
,
$
0
,
$
12
);
lq
(
$
4
,
48
,
$
13
);
PUT
(
12
);
pminh
(
$
13
,
$
11
,
$
13
);
pmaxh
(
$
13
,
$
0
,
$
13
);
lq
(
$
4
,
64
,
$
12
);
PUT
(
13
);
pminh
(
$
12
,
$
11
,
$
12
);
pmaxh
(
$
12
,
$
0
,
$
12
);
lq
(
$
4
,
80
,
$
13
);
PUT
(
12
);
pminh
(
$
13
,
$
11
,
$
13
);
pmaxh
(
$
13
,
$
0
,
$
13
);
lq
(
$
4
,
96
,
$
12
);
PUT
(
13
);
int
i
;
for
(
i
=
0
;
i
<
8
;
i
++
)
{
#ifdef AS_BUGGY
ld3
(
5
,
0
,
8
);
asm
volatile
(
"add %1, %1, %2
\n\t
"
"pextlb $8, $0, $8
\n\t
"
"sq $8, 0(%0)
\n\t
"
::
"r"
(
block
),
"r"
(
pixels
),
"r"
(
line_size
)
:
"$8"
,
"memory"
);
#else
asm
volatile
(
"ld $8, 0(%1)
\n\t
"
"add %1, %1, %2
\n\t
"
"pextlb $8, $0, $8
\n\t
"
"sq $8, 0(%0)
\n\t
"
::
"r"
(
block
),
"r"
(
pixels
),
"r"
(
line_size
)
:
"$8"
,
"memory"
);
#endif
block
+=
8
;
}
}
pminh
(
$
12
,
$
11
,
$
12
);
pmaxh
(
$
12
,
$
0
,
$
12
);
lq
(
$
4
,
112
,
$
13
);
PUT
(
12
);
pminh
(
$
13
,
$
11
,
$
13
);
pmaxh
(
$
13
,
$
0
,
$
13
);
PUT
(
13
);
/**
 * Copy an 8-pixel-wide column of h rows from pixels to block.
 *
 * Each row is read with an ldr/ldl pair at offsets 0 and 7 and written
 * with one 64-bit store; both pointers then advance by line_size.
 *
 * In the operand-based path the asm itself increments the block and
 * pixels registers, so both are declared as read-write ("+r") operands.
 * Declaring them as plain inputs while modifying them is invalid for GCC
 * extended asm: the compiler may assume the registers are unchanged.
 *
 * NOTE(review): the AS_BUGGY path addresses $4/$5/$6 directly through the
 * ldr3/ldl3/sd3 helpers, i.e. it presumably relies on block, pixels and
 * line_size still living in their incoming argument registers -- confirm.
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride applied to both pointers
 * @param h         number of rows to copy
 */
static void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
#ifdef AS_BUGGY
        ldr3(5, 0, 8);
        ldl3(5, 7, 8);
        asm volatile ("add $5, $5, $6 \n\t");
        sd3(8, 0, 4);
        asm volatile ("add $4, $4, $6 \n\t");
#else
        asm volatile (
            "ldr $8, 0(%1) \n\t"
            "ldl $8, 7(%1) \n\t"
            "add %1, %1, %2 \n\t"
            "sd $8, 0(%0) \n\t"
            "add %0, %0, %2 \n\t"
            : "+r" (block), "+r" (pixels)   /* both are updated by the asm */
            : "r" (line_size)
            : "$8", "memory");
#endif
    }
}
/* todo
static void add_pixels_clamped_mmi(const DCTELEM * block, UINT8 * pixels,
int line_size)
/**
 * Copy a 16-pixel-wide column of h rows from pixels to block.
 *
 * Each row is read as two 8-byte halves (ldr/ldl pairs at offsets 0/7 and
 * 8/15), merged into one register with pcpyld and written with a single
 * quadword store; both pointers then advance by line_size.
 *
 * In the operand-based path the asm itself increments the block and
 * pixels registers, so both are declared as read-write ("+r") operands.
 * Declaring them as plain inputs while modifying them is invalid for GCC
 * extended asm: the compiler may assume the registers are unchanged.
 *
 * NOTE(review): the AS_BUGGY path addresses $4/$5/$6 directly through the
 * ldr3/ldl3/sq helpers, i.e. it presumably relies on block, pixels and
 * line_size still living in their incoming argument registers -- confirm.
 * NOTE(review): the quadword store presumably needs block (and therefore
 * line_size) to be 16-byte aligned -- confirm against the callers.
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source, advanced by line_size per row
 * @param line_size byte stride applied to both pointers
 * @param h         number of rows to copy
 */
static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
#ifdef AS_BUGGY
        ldr3(5, 0, 8);
        ldl3(5, 7, 8);
        ldr3(5, 8, 9);
        ldl3(5, 15, 9);
        asm volatile ("add $5, $5, $6 \n\t");
        pcpyld($9, $8, $8);
        sq($8, 0, $4);
        asm volatile ("add $4, $4, $6 \n\t");
#else
        asm volatile (
            "ldr $8, 0(%1) \n\t"
            "ldl $8, 7(%1) \n\t"
            "ldr $9, 8(%1) \n\t"
            "ldl $9, 15(%1) \n\t"
            "add %1, %1, %2 \n\t"
            "pcpyld $8, $9, $8 \n\t"
            "sq $8, 0(%0) \n\t"
            "add %0, %0, %2 \n\t"
            : "+r" (block), "+r" (pixels)   /* both are updated by the asm */
            : "r" (line_size)
            : "$8", "$9", "memory");
#endif
    }
}
*/
void
dsputil_init_mmi
(
void
)
{
put_pixels_clamped
=
put_pixels_clamped_mmi
;
//add_pixels_clamped = add_pixels_clamped_mmi;
clear_blocks
=
clear_blocks_mmi
;
ff_idct
=
ff_mmi_idct
;
put_pixels_tab
[
1
][
0
]
=
put_pixels8_mmi
;
put_no_rnd_pixels_tab
[
1
][
0
]
=
put_pixels8_mmi
;
put_pixels_tab
[
0
][
0
]
=
put_pixels16_mmi
;
put_no_rnd_pixels_tab
[
0
][
0
]
=
put_pixels16_mmi
;
get_pixels
=
get_pixels_mmi
;
}
libavcodec/ps2/idct_mmi.c
View file @
5917d17c
...
...
@@ -12,217 +12,337 @@
mmi port by leonvs@iae.nl
*/
#include "../common.h"
#include "../dsputil.h"
#include "mmi.h"
#define BITS_INV_ACC 5 // 4 or 5 for IEEE
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
// assume SHIFT_INV_ROW == 11
static
int
roundertable
[
8
][
4
]
align16
=
{
{
0x103ff
,
0x103ff
,
0x103ff
,
0x103ff
},
{
1023
,
1023
,
1023
,
1023
}
};
#define TAB_i_04 0
#define TAB_i_17 64
#define TAB_i_26 128
#define TAB_i_35 192
static
short
rowtable
[
4
][
32
]
align16
=
{
{
16384
,
16384
,
22725
,
12873
,
21407
,
8867
,
19266
,
4520
,
16384
,
-
16383
,
19266
,
-
22724
,
8867
,
-
21406
,
-
4519
,
-
12872
,
16384
,
-
16383
,
12873
,
4520
,
-
8866
,
21407
,
-
22724
,
19266
,
16384
,
16384
,
4520
,
19266
,
-
21406
,
-
8866
,
-
12872
,
-
22724
},
{
22725
,
22725
,
31521
,
17855
,
29692
,
12299
,
26722
,
6270
,
22725
,
-
22724
,
26722
,
-
31520
,
12299
,
-
29691
,
-
6269
,
-
17854
,
22725
,
-
22724
,
17855
,
6270
,
-
12298
,
29692
,
-
31520
,
26722
,
22725
,
22725
,
6270
,
26722
,
-
29691
,
-
12298
,
-
17854
,
-
31520
},
{
21407
,
21407
,
29692
,
16819
,
27969
,
11585
,
25172
,
5906
,
21407
,
-
21406
,
25172
,
-
29691
,
11585
,
-
27968
,
-
5905
,
-
16818
,
21407
,
-
21406
,
16819
,
5906
,
-
11584
,
27969
,
-
29691
,
25172
,
21407
,
21407
,
5906
,
25172
,
-
27968
,
-
11584
,
-
16818
,
-
29691
},
{
19266
,
19266
,
26722
,
15137
,
25172
,
10426
,
22654
,
5315
,
19266
,
-
19265
,
22654
,
-
26721
,
10426
,
-
25171
,
-
5314
,
-
15136
,
19266
,
-
19265
,
15137
,
5315
,
-
10425
,
25172
,
-
26721
,
22654
,
19266
,
19266
,
5315
,
22654
,
-
25171
,
-
10425
,
-
15136
,
-
26721
}
};
#define TG_3_16_minus_one 0
#define ONE_plus_tg_3_16 16
#define ONE_plus_tg_1_16 32
#define TG_1_16_minus_one 48
#define TG_2_16_minus_one 64
#define ONE_plus_tg_2_16 80
#define ZERO_ocos_4_16 96
#define TG1 6518
#define TG2 13573
#define TG3 21895
#define MN1 -32768
#define PL1 32768
#define CS4 23170
static
short
coltable
[
7
][
8
]
align16
=
{
{
MN1
,
TG3
,
MN1
,
TG3
,
MN1
,
TG3
,
MN1
,
TG3
},
{
-
TG3
,
-
PL1
,
-
TG3
,
-
PL1
,
-
TG3
,
-
PL1
,
-
TG3
,
-
PL1
},
{
-
TG1
,
-
PL1
,
-
TG1
,
-
PL1
,
-
TG1
,
-
PL1
,
-
TG1
,
-
PL1
},
{
MN1
,
TG1
,
MN1
,
TG1
,
MN1
,
TG1
,
MN1
,
TG1
},
{
MN1
,
TG2
,
MN1
,
TG2
,
MN1
,
TG2
,
MN1
,
TG2
},
{
-
TG2
,
-
PL1
,
-
TG2
,
-
PL1
,
-
TG2
,
-
PL1
,
-
TG2
,
-
PL1
},
{
CS4
,
0
,
CS4
,
0
,
CS4
,
0
,
CS4
,
0
}
#define ROUNDER_0 0
#define ROUNDER_1 16
#define TAB_i_04 (32+0)
#define TAB_i_17 (32+64)
#define TAB_i_26 (32+128)
#define TAB_i_35 (32+192)
#define TG_1_16 (32+256+0)
#define TG_2_16 (32+256+16)
#define TG_3_16 (32+256+32)
#define COS_4_16 (32+256+48)
#define CLIPMAX (32+256+64+0)
/*
 * Constant pool for the MMI IDCT, laid out as consecutive groups of eight
 * shorts. The byte offsets noted on each group are the values of the
 * ROUNDER_x, TAB_i_xx, TG_x_16, COS_4_16 and CLIPMAX macros, assuming a
 * 16-bit short (eight entries == 16 bytes).
 */
static short consttable[] align16 = {
    /* ROUNDER_0 (offset 0) -- assume SHIFT_INV_ROW == 11 */
    0x3ff, 1, 0x3ff, 1, 0x3ff, 1, 0x3ff, 1,

    /* ROUNDER_1 (offset 16) */
    0x3ff, 0, 0x3ff, 0, 0x3ff, 0, 0x3ff, 0,

    /* TAB_i_04 (offset 32+0): coefficients for rows 0 and 4 */
    16384,  21407, -16384, -21407,  22725,  19266, -22725, -12873,
     8867,  16384,   8867,  16384,   4520,  12873,  -4520,  19266,
    16384,  -8867,  16384,  -8867,  12873, -22725,  19266, -22725,
    21407, -16384, -21407,  16384,  19266,   4520, -12873,   4520,

    /* TAB_i_17 (offset 32+64): coefficients for rows 1 and 7 */
    22725,  29692, -22725, -29692,  31521,  26722, -31521, -17855,
    12299,  22725,  12299,  22725,   6270,  17855,  -6270,  26722,
    22725, -12299,  22725, -12299,  17855, -31521,  26722, -31521,
    29692, -22725, -29692,  22725,  26722,   6270, -17855,   6270,

    /* TAB_i_26 (offset 32+128): coefficients for rows 2 and 6 */
    21407,  27969, -21407, -27969,  29692,  25172, -29692, -16819,
    11585,  21407,  11585,  21407,   5906,  16819,  -5906,  25172,
    21407, -11585,  21407, -11585,  16819, -29692,  25172, -29692,
    27969, -21407, -27969,  21407,  25172,   5906, -16819,   5906,

    /* TAB_i_35 (offset 32+192): coefficients for rows 3 and 5 */
    19266,  25172, -19266, -25172,  26722,  22654, -26722, -15137,
    10426,  19266,  10426,  19266,   5315,  15137,  -5315,  22654,
    19266, -10426,  19266, -10426,  15137, -26722,  22654, -26722,
    25172, -19266, -25172,  19266,  22654,   5315, -15137,   5315,

    /* column constants: TG_1_16, TG_2_16, TG_3_16, COS_4_16 (offset 32+256+...) */
    TG1, TG1, TG1, TG1, TG1, TG1, TG1, TG1,
    TG2, TG2, TG2, TG2, TG2, TG2, TG2, TG2,
    TG3, TG3, TG3, TG3, TG3, TG3, TG3, TG3,
    CS4, CS4, CS4, CS4, CS4, CS4, CS4, CS4,

    /* CLIPMAX (offset 32+256+64): clamp ceiling */
    255, 255, 255, 255, 255, 255, 255, 255
};
#define noprevh(rt, rd)
#define DCT_8_INV_ROW1(rowoff, taboff, rnd, outreg) { \
\
lq($4, rowoff, $16);
/* r16 = x7 x6 x5 x4 x3 x2 x1 x0 */
\
lq($24, 0+taboff, $17);
/* r17 = w19 w17 w3 w1 w18 w16 w2 w0 */
\
pinth($16, $16, $16);
/* r16 = x7 x3 x6 x2 x5 x1 x4 x0 */
\
phmadh($17, $16, $17);
/* r17 = (x7*w19+x3*w17)b0'' (x6*w3+x2*w1)a0'' (x5*w18+x1*w16)b0' (x4*w2+x0*w0)a0' */
\
lq($24, 16+taboff, $18);
/* r18 = w23 w21 w7 w5 w22 w20 w6 w4 */
\
lq($24, 32+taboff, $19);
/* r19 = w27 w25 w11 w9 w26 w24 w10 w8 */
\
lq($24, 48+taboff, $20);
/* r20 = w31 w29 w15 w13 w30 w28 w14 w12 */
\
phmadh($18, $16, $18);
/* r18 = (b1'')(a1'')(b1')(a1') */
\
pcpyud($17, $17, $21);
/* r21 = (b0'')(a0'')(b0'')(a0'') */
\
paddw($17, $21, $17);
/* r17 = (--)(--)(b0)(a0) */
\
phmadh($19, $16, $19);
/* r19 = (b2'')(a2'')(b2')(a2') */
\
pcpyud($18, $18, $21);
/* r21 = (b1'')(a1'')(b1'')(a1'') */
\
paddw($18, $21, $18);
/* r18 = (--)(--)(b1)(a1) */
\
pcpyud($19, $19, $21); \
phmadh($20, $16, $20);
/* r12 = (b3'')(a3'')(b3')(a3') */
\
paddw($19, $21, $19);
/* r19 = (--)(--)(b2)(a2) */
\
pextlw($19, $17, $16);
/* r16 = (b2)(b0)(a2)(a0) */
\
pcpyud($20, $20, $21); \
paddw($20, $21, $20);
/* r20 = (--)(--)(b3)(a3) */
\
pextlw($20, $18, $17);
/* r17 = (b3)(b1)(a3)(a1) */
\
pextlw($17, $16, $20);
/* r20 = (a3)(a2)(a1)(a0)" */
\
pextuw($17, $16, $21);
/* r21 = (b3)(b2)(b1)(b0) */
\
paddw($20, rnd, $20);
/* r20 = (a3)(a2)(a1)(a0) */
\
paddw($20, $21, $17);
/* r17 = ()()()(a0+b0) */
\
psubw($20, $21, $18);
/* r18 = ()()()(a0-b0) */
\
#define DCT_8_INV_ROW1(blk, rowoff, taboff, rnd, outreg) { \
lq(blk, rowoff, $16);
/* r16 = x7 x5 x3 x1 x6 x4 x2 x0 */
\
/*slot*/
\
lq($24, 0+taboff, $17);
/* r17 = w */
\
/*delay slot $16*/
\
lq($24, 16+taboff, $18);
/* r18 = w */
\
prevh($16, $2);
/* r2 = x1 x3 x5 x7 x0 x2 x4 x6 */
\
lq($24, 32+taboff, $19);
/* r19 = w */
\
phmadh($17, $16, $17);
/* r17 = b1"b0'a1"a0' */
\
lq($24, 48+taboff, $20);
/* r20 = w */
\
phmadh($18, $2, $18);
/* r18 = b1'b0"a1'a0" */
\
phmadh($19, $16, $19);
/* r19 = b3"b2'a3"a2' */
\
phmadh($20, $2, $20);
/* r20 = b3'b2"a3'a2" */
\
paddw($17, $18, $17);
/* r17 = (b1)(b0)(a1)(a0) */
\
paddw($19, $20, $19);
/* r19 = (b3)(b2)(a3)(a2) */
\
pcpyld($19, $17, $18);
/* r18 = (a3)(a2)(a1)(a0) */
\
pcpyud($17, $19, $20);
/* r20 = (b3)(b2)(b1)(b0) */
\
paddw($18, rnd, $18);
/* r18 = (a3)(a2)(a1)(a0) */
\
paddw($18, $20, $17);
/* r17 = ()()()(a0+b0) */
\
psubw($18, $20, $20);
/* r20 = ()()()(a0-b0) */
\
psraw($17, SHIFT_INV_ROW, $17);
/* r17 = (y3 y2 y1 y0) */
\
psraw($18, SHIFT_INV_ROW, $18);
/* r18 = (y4 y5 y6 y7) */
\
ppach($18, $17, outreg);
/* out = y4 y5 y6 y7 y3 y2 y1 y0 Note order */
\
psraw($20, SHIFT_INV_ROW, $20);
/* r20 = (y4 y5 y6 y7) */
\
ppach($20, $17, outreg);
/* out = y4 y5 y6 y7 y3 y2 y1 y0 Note order */
\
\
prevh(outreg, $2); \
pcpyud($2, $2, $2); \
pcpyld($2, outreg, outreg); \
}
#define DCT_8_INV_COL4(pextop, blkoff, revop) { \
lq($24, TG_3_16_minus_one, $2);
/* r2 = (tn3)(-1) x 4 */
\
pextop($11, $13, $3);
/* r3 = (x3)(x5) x 4 */
\
lq($24, ONE_plus_tg_3_16, $16);
/* r16 = -((+1)(tn3)) x 4 */
\
phmadh($3, $2, $17);
/* r17 = (tm35) x 4 */
\
lq($24, ONE_plus_tg_1_16, $2);
/* r2 = -((+1)(tn1)) x 4 */
\
phmadh($3, $16, $18);
/* r18 = -(tp35) x 4 */
\
lq($24, TG_1_16_minus_one, $16);
/* r16 = (tn1)(-1) x 4 */
\
pextop($9, $15, $3);
/* r3 = (x1)(x7) x 4 */
\
phmadh($3, $2, $19); \
lq($24, ZERO_ocos_4_16, $2);
/* r2 = (0)(cos4) x 4 */
\
phmadh($3, $16, $20);
/* r20 = (tm17) x 4 */
\
psubw($0, $19, $19);
/* r19 = (tp17) x 4 */
\
paddw($19, $18, $3);
/* r3 = t1 */
\
paddw($20, $17, $16);
/* r16 = t2 */
\
psubw($20, $17, $23);
/* r23 = b3 */
\
psubw($19, $18, $20);
/* r20 = b0 */
\
paddw($3, $16, $17);
/* (t1+t2) */
\
psubw($3, $16, $18);
/* (t1-t2) */
\
psraw($17, 15, $17); \
lq($24, TG_2_16_minus_one, $3);
/* r3 = (tn2)(-1) x 4 */
\
pmulth($17, $2, $21);
/* r21 = b1 */
\
psraw($18, 15, $18); \
lq($24, ONE_plus_tg_2_16, $16);
/* r16 = -((+1)(tn2)) x 4 */
\
pmulth($18, $2, $22);
/* r22 = b2 */
\
\
pextop($10, $14, $2);
/* r2 = (x2)(x6) x 4 */
\
phmadh($2, $3, $18);
/* r18 = (tm26) x 4 */
\
phmadh($2, $16, $19);
/* r19 = -(tp26) x 4 */
\
pextop($8, $0, $17);
/* r17 = (x0)(0) x 4 */
\
psraw($17, 1, $17); \
pextop($12, $0, $16);
/* r16 = (x4)(0) x 4 */
\
psraw($16, 1, $16); \
paddw($17, $16, $2);
/* r2 = tp04 */
\
psubw($17, $16, $3);
/* r3 = tm04 */
\
psubw($2, $19, $16);
/* r16 = a0 */
\
paddw($3, $18, $17);
/* r17 = a1 */
\
psubw($3, $18, $18);
/* r18 = a2 */
\
paddw($2, $19, $19);
/* r19 = a3 */
\
\
paddw($16, $20, $2);
/* y0 a0+b0 */
\
psubw($16, $20, $16);
/* y7 a0-b0 */
\
psraw($2, SHIFT_INV_COL+15, $2); \
psraw($16, SHIFT_INV_COL+15, $16); \
ppach($0, $2, $2); \
ppach($0, $16, $16); \
revop($2, $2); \
revop($16, $16); \
sd3(2, 0+blkoff, 4); \
sd3(16, 112+blkoff, 4); \
\
paddw($17, $21, $3);
/* y1 a1+b1 */
\
psubw($17, $21, $17);
/* y6 a1-b1 */
\
psraw($3, SHIFT_INV_COL+15, $3); \
psraw($17, SHIFT_INV_COL+15, $17); \
ppach($0, $3, $3); \
ppach($0, $17, $17); \
revop($3, $3); \
revop($17, $17); \
sd3(3, 16+blkoff, 4); \
sd3(17, 96+blkoff, 4); \
\
paddw($18, $22, $2);
/* y2 a2+b2 */
\
psubw($18, $22, $18);
/* y5 a2-b2 */
\
psraw($2, SHIFT_INV_COL+15, $2); \
psraw($18, SHIFT_INV_COL+15, $18); \
ppach($0, $2, $2); \
ppach($0, $18, $18); \
revop($2, $2); \
revop($18, $18); \
sd3(2, 32+blkoff, 4); \
sd3(18, 80+blkoff, 4); \
\
paddw($19, $23, $3);
/* y3 a3+b3 */
\
psubw($19, $23, $19);
/* y4 a3-b3 */
\
psraw($3, SHIFT_INV_COL+15, $3); \
psraw($19, SHIFT_INV_COL+15, $19); \
ppach($0, $3, $3); \
ppach($0, $19, $19); \
revop($3, $3); \
revop($19, $19); \
sd3(3, 48+blkoff, 4); \
sd3(19, 64+blkoff, 4); \
}
/*
 * Column pass of the 8x8 inverse DCT on all eight columns at once.
 *
 * Per the step comments below: input rows x0..x7 are expected in $8..$15,
 * constants are loaded relative to $24, and $2/$3 are scratch. On exit
 * $16..$19 hold a0..a3 and $20..$23 hold b0..b3, ready for one of the
 * DCT_8_INV_COL8_* output macros.
 *
 * Each "multiply by constant" step is a pmulth followed by pmfhl_uw,
 * two psraw by 15 and a pinteh; the comments label the two partial
 * results 6420 and 7531.
 *
 * NOTE(review): $24 is presumably the base address of consttable -- confirm
 * at the point of use.
 *
 * Every comment sits before the continuation backslash so that the whole
 * sequence stays part of the macro body.
 */
#define DCT_8_INV_COL8() \
\
        lq($24, TG_3_16, $2);       /* r2  = tn3 */ \
\
        pmulth($11, $2, $17);       /* r17 = x3 * tn3 (6420) */ \
        psraw($17, 15, $17); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $17, $17);       /* r17 = x3 * tn3 */ \
        psubh($17, $13, $17);       /* r17 = tm35 */ \
\
        pmulth($13, $2, $18);       /* r18 = x5 * tn3 (6420) */ \
        psraw($18, 15, $18); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $18, $18);       /* r18 = x5 * tn3 */ \
        paddh($18, $11, $18);       /* r18 = tp35 */ \
\
        lq($24, TG_1_16, $2);       /* r2  = tn1 */ \
\
        pmulth($15, $2, $19);       /* r19 = x7 * tn1 (6420) */ \
        psraw($19, 15, $19); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $19, $19);       /* r19 = x7 * tn1 */ \
        paddh($19, $9, $19);        /* r19 = tp17 */ \
\
        pmulth($9, $2, $20);        /* r20 = x1 * tn1 (6420) */ \
        psraw($20, 15, $20); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $20, $20);       /* r20 = x1 * tn1 */ \
        psubh($20, $15, $20);       /* r20 = tm17 */ \
\
        psubh($19, $18, $3);        /* r3  = t1 */ \
        paddh($20, $17, $16);       /* r16 = t2 */ \
        psubh($20, $17, $23);       /* r23 = b3 */ \
        paddh($19, $18, $20);       /* r20 = b0 */ \
\
        lq($24, COS_4_16, $2);      /* r2  = cs4 */ \
\
        paddh($3, $16, $21);        /* r21 = t1+t2 */ \
        psubh($3, $16, $22);        /* r22 = t1-t2 */ \
\
        pmulth($21, $2, $21);       /* r21 = cs4 * (t1+t2) 6420 */ \
        psraw($21, 15, $21); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $21, $21);       /* r21 = b1 */ \
\
        pmulth($22, $2, $22);       /* r22 = cs4 * (t1-t2) 6420 */ \
        psraw($22, 15, $22); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $22, $22);       /* r22 = b2 */ \
\
        lq($24, TG_2_16, $2);       /* r2  = tn2 */ \
\
        pmulth($10, $2, $17);       /* r17 = x2 * tn2 (6420) */ \
        psraw($17, 15, $17); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $17, $17);       /* r17 = x2 * tn2 */ \
        psubh($17, $14, $17);       /* r17 = tm26 */ \
\
        pmulth($14, $2, $18);       /* r18 = x6 * tn2 (6420) */ \
        psraw($18, 15, $18); \
        pmfhl_uw($3);               /* r3  = 7531 */ \
        psraw($3, 15, $3); \
        pinteh($3, $18, $18);       /* r18 = x6 * tn2 */ \
        paddh($18, $10, $18);       /* r18 = tp26 */ \
\
        paddh($8, $12, $2);         /* r2  = tp04 */ \
        psubh($8, $12, $3);         /* r3  = tm04 */ \
\
        paddh($2, $18, $16);        /* r16 = a0 */ \
        psubh($2, $18, $19);        /* r19 = a3 */ \
        psubh($3, $17, $18);        /* r18 = a2 */ \
        paddh($3, $17, $17);        /* r17 = a1 */
/*
 * Final butterfly of the column IDCT plus store of all eight output rows.
 *
 * Expects a0..a3 in $16..$19 and b0..b3 in $20..$23 (the layout the step
 * comments of DCT_8_INV_COL8 describe). For k = 0..3 it forms a_k + b_k
 * (row k) and a_k - b_k (row 7-k), shifts both right by SHIFT_INV_COL and
 * stores them at byte offsets 16*k and 112 - 16*k from blk.
 * $2 and $3 are used as scratch; $16..$19 are overwritten.
 *
 * blk is the register naming the base address of the output block.
 *
 * Every comment sits before the continuation backslash so that the whole
 * sequence stays part of the macro body.
 */
#define DCT_8_INV_COL8_STORE(blk) \
\
        paddh($16, $20, $2);        /* y0 a0+b0 */ \
        psubh($16, $20, $16);       /* y7 a0-b0 */ \
        psrah($2, SHIFT_INV_COL, $2); \
        psrah($16, SHIFT_INV_COL, $16); \
        sq($2, 0, blk); \
        sq($16, 112, blk); \
\
        paddh($17, $21, $3);        /* y1 a1+b1 */ \
        psubh($17, $21, $17);       /* y6 a1-b1 */ \
        psrah($3, SHIFT_INV_COL, $3); \
        psrah($17, SHIFT_INV_COL, $17); \
        sq($3, 16, blk); \
        sq($17, 96, blk); \
\
        paddh($18, $22, $2);        /* y2 a2+b2 */ \
        psubh($18, $22, $18);       /* y5 a2-b2 */ \
        psrah($2, SHIFT_INV_COL, $2); \
        psrah($18, SHIFT_INV_COL, $18); \
        sq($2, 32, blk); \
        sq($18, 80, blk); \
\
        paddh($19, $23, $3);        /* y3 a3+b3 */ \
        psubh($19, $23, $19);       /* y4 a3-b3 */ \
        psrah($3, SHIFT_INV_COL, $3); \
        psrah($19, SHIFT_INV_COL, $19); \
        sq($3, 48, blk); \
        sq($19, 64, blk);
#define DCT_8_INV_COL8_PMS() \
paddh($16, $20, $2);
/* y0 a0+b0 */
\
psubh($16, $20, $20);
/* y7 a0-b0 */
\
psrah($2, SHIFT_INV_COL, $16); \
psrah($20, SHIFT_INV_COL, $20); \
\
paddh($17, $21, $3);
/* y1 a1+b1 */
\
psubh($17, $21, $21);
/* y6 a1-b1 */
\
psrah($3, SHIFT_INV_COL, $17); \
psrah($21, SHIFT_INV_COL, $21); \
\