libvpx · Commits · Commit 8099220e

Authored Dec 01, 2017 by Kaustubh Raste
Committed by Gerrit Code Review, Dec 01, 2017

Merge "mips msa optimize vpx_scaled_2d function"
Parents: c22ab8ab, 339f4dca
Showing 4 changed files with 619 additions and 2 deletions.
test/convolve_test.cc              +1   -1
vpx_dsp/mips/macros_msa.h          +21  -0
vpx_dsp/mips/vpx_convolve8_msa.c   +596 -0
vpx_dsp/vpx_dsp_rtcd_defs.pl       +1   -1
test/convolve_test.cc

@@ -1359,7 +1359,7 @@ const ConvolveFunctions convolve8_msa(
     vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa,
     vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
-    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0);

 const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
vpx_dsp/mips/macros_msa.h

@@ -555,6 +555,7 @@
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                 out3)                                                          \
@@ -1182,6 +1183,7 @@
   out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
 }
 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
@@ -1595,6 +1597,25 @@
   out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
 }

+/* Description : Sign extend byte elements from input vector and return
+                 halfword results in pair of vectors
+   Arguments   : Input   - in          (byte vector)
+                 Outputs - out0, out1  (sign extended halfword vectors)
+                 Return Type - signed halfword
+   Details     : Sign bit of byte elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 8 signed halfword elements in 'out0'.
+                 Then interleaved left with the same vector 'in' to
+                 generate 8 signed halfword elements in 'out1'.
+*/
+#define UNPCK_SB_SH(in, out0, out1)       \
+  {                                       \
+    v16i8 tmp_m;                          \
+                                          \
+    tmp_m = __msa_clti_s_b((v16i8)in, 0); \
+    ILVRL_B2_SH(tmp_m, in, out0, out1);   \
+  }
+
 /* Description : Zero extend unsigned byte elements to halfword elements
    Arguments   : Input   - in          (unsigned byte vector)
                  Outputs - out0, out1  (unsigned halfword vectors)
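
The new UNPCK_SB_SH macro is what the scaled-convolve code below uses to widen pixel data to 16 bits before the multiply-accumulate steps. As a rough scalar picture of its per-element effect (an editorial sketch, not part of the patch): pairing each byte with the all-ones/all-zeros mask produced by __msa_clti_s_b(in, 0) is exactly sign extension.

#include <stdint.h>

/* Editorial sketch: per-element effect of UNPCK_SB_SH, assuming the
 * little-endian lane layout (low byte = pixel, high byte = sign mask). */
static int16_t sign_extend_byte(int8_t b) {
  const uint8_t mask = (b < 0) ? 0xFF : 0x00; /* __msa_clti_s_b(in, 0), per lane */
  return (int16_t)((uint8_t)b | ((uint16_t)mask << 8)); /* e.g. -3 -> 0xFFFD */
}
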
vpx_dsp/mips/vpx_convolve8_msa.c

@@ -629,3 +629,599 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     }
   }
 }

(the 596 lines below are all additions)

static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  uint32_t res;
  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
  v16i8 out0, out1;
  v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
  v16i8 shf2 = shf1 + 2;
  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
  v16i8 filt_shf1 = filt_shf0 + 2;
  v16i8 filt_shf2 = filt_shf0 + 4;
  v16i8 filt_shf3 = filt_shf0 + 6;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
  XORI_B2_128_SB(out0, out1);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  filt = LD_SH(x_filter);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
  src0_h *= filt0;
  src0_h += src1_h * filt1;
  src0_h += src2_h * filt2;
  src0_h += src3_h * filt3;
  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
  src0_h = __msa_adds_s_h(src0_h, src1_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  res = __msa_copy_u_w((v4i32)dst0, 0);
  SW(res, dst);
}

static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
  v16i8 out0, out1, out2, out3;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src2);
  INSERT_D2_UB(srcd2, srcd3, src3);
  filt = LD_SH(x_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // transpose
  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
  ILVRL_W2_SB(tmp3, tmp1, out2, out3);

  XORI_B4_128_SB(out0, out1, out2, out3);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  UNPCK_SB_SH(out2, src4_h, src5_h);
  UNPCK_SB_SH(out3, src6_h, src7_h);
  src0_h *= filt0;
  src4_h *= filt4;
  src0_h += src1_h * filt1;
  src4_h += src5_h * filt5;
  src0_h += src2_h * filt2;
  src4_h += src6_h * filt6;
  src0_h += src3_h * filt3;
  src4_h += src7_h * filt7;
  src0_h = __msa_adds_s_h(src0_h, src4_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  ST8x1_UB(dst0, dst);
}

static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *x_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
  v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
  v8i16 dst0_h, dst1_h, dst2_h, dst3_h;

  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src0);
  INSERT_D2_UB(srcd2, srcd3, src1);
  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src2);
  INSERT_D2_UB(srcd2, srcd3, src3);
  LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src4);
  INSERT_D2_UB(srcd2, srcd3, src5);
  LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_UB(srcd0, srcd1, src6);
  INSERT_D2_UB(srcd2, srcd3, src7);
  filt = LD_SH(x_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  // transpose
  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
  XORI_B4_128_SB(out0, out1, out2, out3);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  UNPCK_SB_SH(out2, src4_h, src5_h);
  UNPCK_SB_SH(out3, src6_h, src7_h);
  VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_SB(tmp2, tmp0, out4, out5);
  ILVRL_W2_SB(tmp3, tmp1, out6, out7);
  XORI_B4_128_SB(out4, out5, out6, out7);
  dst0_h = src0_h * filt0;
  dst1_h = src4_h * filt4;
  dst0_h += src1_h * filt1;
  dst1_h += src5_h * filt5;
  dst0_h += src2_h * filt2;
  dst1_h += src6_h * filt6;
  dst0_h += src3_h * filt3;
  dst1_h += src7_h * filt7;
  UNPCK_SB_SH(out4, src0_h, src1_h);
  UNPCK_SB_SH(out5, src2_h, src3_h);
  UNPCK_SB_SH(out6, src4_h, src5_h);
  UNPCK_SB_SH(out7, src6_h, src7_h);
  dst2_h = src0_h * filt0;
  dst3_h = src4_h * filt4;
  dst2_h += src1_h * filt1;
  dst3_h += src5_h * filt5;
  dst2_h += src2_h * filt2;
  dst3_h += src6_h * filt6;
  dst2_h += src3_h * filt3;
  dst3_h += src7_h * filt7;
  ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
  SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
  SAT_SH2_SH(dst0_h, dst2_h, 7);
  dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
  ST_UB(dst0, dst);
}

static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
                                ptrdiff_t dst_stride) {
  v16u8 in0;
  v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

  in0 = LD_UB(src);
  out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
}

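The vshf control vector in transpose4x4_to_dst is just the index map of a 4x4 byte transpose; a plain-C equivalent for reference (editorial sketch, not part of the patch):

#include <stdint.h>

/* Editorial sketch: the control vector { 0, 4, 8, 12, 1, 5, ... } realizes
 * out[4 * r + c] = in[4 * c + r], turning temp's rows back into columns. */
static void transpose4x4_ref(const uint8_t in[16], uint8_t out[16]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) out[4 * r + c] = in[4 * c + r];
  }
}
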
static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
                                ptrdiff_t dst_stride) {
  v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
  v16i8 shf2 = shf1 + 4;

  LD_UB4(src, 16, in0, in1, in2, in3);
  VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
  VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
  ILVRL_W2_UB(tmp2, tmp0, out0, out1);
  ILVRL_W2_UB(tmp3, tmp1, out2, out3);
  ST8x4_UB(out0, out1, dst, dst_stride);
  ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
}

static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
                                  ptrdiff_t dst_stride) {
  v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
  v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
  v16u8 out9, out10, out11, out12, out13, out14, out15;

  LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
                      out4, out5, out6, out7);
  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
  dst += 8 * dst_stride;
  SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
  SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
  SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
  SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                      in11, in12, in13, in14, in15, out8, out9, out10, out11,
                      out12, out13, out14, out15);
  ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst,
         dst_stride);
}

static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (z = 0; z < 4; ++z) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

      if (x_q4 & SUBPEL_MASK) {
        filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
      } else {
        for (i = 0; i < 4; ++i) {
          temp[z * 4 + i] = src_x[i * src_stride + 3];
        }
      }

      x_q4 += x_step_q4;
    }

    transpose4x4_to_dst(temp, dst, dst_stride);

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

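The x_q4 bookkeeping is libvpx's usual 1/16-pel stepping (SUBPEL_BITS == 4 and SUBPEL_MASK == 15 in vpx_dsp/vpx_filter.h): the high bits select the source column and the low four bits select one of the 16 interpolation kernels. The +3 in the copy path undoes the earlier src -= SUBPEL_TAPS / 2 - 1 adjustment, since a phase of 0 lands exactly on a source pixel and needs no filtering. An editorial sketch:

/* Editorial sketch of the Q4 decode used in the loop above. */
static void decode_x_q4(int x_q4, int *col, int *phase) {
  *col = x_q4 >> 4;   /* x_q4 >> SUBPEL_BITS: integer source column */
  *phase = x_q4 & 15; /* x_q4 & SUBPEL_MASK: which InterpKernel row  */
}
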
static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
                                    int x_step_q4, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (z = 0; z < 8; ++z) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

      if (x_q4 & SUBPEL_MASK) {
        filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
      } else {
        for (i = 0; i < 8; ++i) {
          temp[z * 8 + i] = src_x[3 + i * src_stride];
        }
      }

      x_q4 += x_step_q4;
    }

    transpose8x8_to_dst(temp, dst, dst_stride);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static void scaledconvolve_horiz_mul16(const uint8_t *src,
                                       ptrdiff_t src_stride, uint8_t *dst,
                                       ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters,
                                       int x0_q4, int x_step_q4, int w,
                                       int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
  int x, y, z, i;

  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 16x16 areas. The intermediate height is not
  // always a multiple of 16, so force it to be a multiple of 16 here.
  y = h + (16 - (h & 0xF));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 16) {
      for (z = 0; z < 16; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];

        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
        } else {
          for (i = 0; i < 16; ++i) {
            temp[z * 16 + i] = src_x[3 + i * src_stride];
          }
        }

        x_q4 += x_step_q4;
      }

      transpose16x16_to_dst(temp, dst + x, dst_stride);
    }

    src += src_stride * 16;
    dst += dst_stride * 16;
  } while (y -= 16);
}

static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
  uint32_t res;
  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
  v16i8 out0, out1;
  v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
  v16i8 shf2 = shf1 + 8;
  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
  v16i8 filt_shf1 = filt_shf0 + 2;
  v16i8 filt_shf2 = filt_shf0 + 4;
  v16i8 filt_shf3 = filt_shf0 + 6;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h;
  v8i16 filt0, filt1, filt2, filt3;

  LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
  LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
  INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
  INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
  XORI_B2_128_SB(out0, out1);
  UNPCK_SB_SH(out0, src0_h, src1_h);
  UNPCK_SB_SH(out1, src2_h, src3_h);
  filt = LD_SH(y_filter);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
  src0_h *= filt0;
  src0_h += src1_h * filt1;
  src0_h += src2_h * filt2;
  src0_h += src3_h * filt3;
  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
  src0_h = __msa_adds_s_h(src0_h, src1_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  res = __msa_copy_u_w((v4i32)dst0, 0);
  SW(res, dst);
}

static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                               uint8_t *dst, const int16_t *y_filter) {
  uint64_t srcd0, srcd1, srcd2, srcd3;
  v16u8 dst0;
  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src0);
  INSERT_D2_SB(srcd2, srcd3, src1);
  LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
  INSERT_D2_SB(srcd0, srcd1, src2);
  INSERT_D2_SB(srcd2, srcd3, src3);
  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
  XORI_B4_128_SB(src0, src1, src2, src3);
  UNPCK_SB_SH(src0, src0_h, src1_h);
  UNPCK_SB_SH(src1, src2_h, src3_h);
  UNPCK_SB_SH(src2, src4_h, src5_h);
  UNPCK_SB_SH(src3, src6_h, src7_h);
  src0_h *= filt0;
  src4_h *= filt4;
  src0_h += src1_h * filt1;
  src4_h += src5_h * filt5;
  src0_h += src2_h * filt2;
  src4_h += src6_h * filt6;
  src0_h += src3_h * filt3;
  src4_h += src7_h * filt7;
  src0_h = __msa_adds_s_h(src0_h, src4_h);
  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
  src0_h = __msa_sat_s_h(src0_h, 7);
  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
  ST8x1_UB(dst0, dst);
}

static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
                                    uint8_t *dst, const int16_t *y_filter,
                                    int w) {
  int x;
  v16u8 dst0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
  v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;

  filt = LD_SH(y_filter);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);

  for (x = 0; x < w; x += 16) {
    LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
    src_y += 16;

    XORI_B4_128_SB(src0, src1, src2, src3);
    XORI_B4_128_SB(src4, src5, src6, src7);
    UNPCK_SB_SH(src0, src0_h, src1_h);
    UNPCK_SB_SH(src1, src2_h, src3_h);
    UNPCK_SB_SH(src2, src4_h, src5_h);
    UNPCK_SB_SH(src3, src6_h, src7_h);
    UNPCK_SB_SH(src4, src8_h, src9_h);
    UNPCK_SB_SH(src5, src10_h, src11_h);
    UNPCK_SB_SH(src6, src12_h, src13_h);
    UNPCK_SB_SH(src7, src14_h, src15_h);
    src0_h *= filt0;
    src1_h *= filt0;
    src8_h *= filt4;
    src9_h *= filt4;
    src0_h += src2_h * filt1;
    src1_h += src3_h * filt1;
    src8_h += src10_h * filt5;
    src9_h += src11_h * filt5;
    src0_h += src4_h * filt2;
    src1_h += src5_h * filt2;
    src8_h += src12_h * filt6;
    src9_h += src13_h * filt6;
    src0_h += src6_h * filt3;
    src1_h += src7_h * filt3;
    src8_h += src14_h * filt7;
    src9_h += src15_h * filt7;
    ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
    SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
    SAT_SH2_SH(src0_h, src1_h, 7);
    dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
    ST_UB(dst0, dst);
    dst += 16;
  }
}

static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (y = 0; y < h; ++y) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      uint32_t srcd = LW(src_y + 3 * src_stride);
      SW(srcd, dst + y * dst_stride);
    }

    y_q4 += y_step_q4;
  }
}

static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters, int y0_q4,
                                   int y_step_q4, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (y = 0; y < h; ++y) {
    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      uint64_t srcd = LD(src_y + 3 * src_stride);
      SD(srcd, dst + y * dst_stride);
    }

    y_q4 += y_step_q4;
  }
}

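As in the horizontal pass, a vertical phase of 0 means the output row coincides with a source row, so scaledconvolve_vert_w4/w8 copy the row at offset 3 * src_stride, re-centering after the src -= src_stride * (SUBPEL_TAPS / 2 - 1) adjustment. For orientation, an editorial sketch of how these helpers compose into the full two-pass vpx_scaled_2d, following scaled_convolve_2d in vpx_dsp/vpx_convolve.c; the actual MSA dispatch over w = 4 / 8 / multiples of 16 appears later in this commit and is not shown here, and the name scaled_2d_w4_sketch is hypothetical:

/* Editorial sketch, assuming a 4-wide block: horizontal pass into a tall
 * intermediate buffer, then vertical pass back out. */
static void scaled_2d_w4_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4,
                                int h) {
  /* Enough intermediate rows that every vertical tap has source data. */
  const int im_h =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);

  scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                          temp, 64, filter, x0_q4, x_step_q4, im_h);
  scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                         dst_stride, filter, y0_q4, y_step_q4, h);
}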