Commit 7c5f00f8 authored by Parag Salasakar

mips msa vp9 idct 8x8 optimization

average improvement ~4x-6x

Change-Id: I5edf713721b9e24c7e0ce2e69d8fc3ecab625d91
parent a8a9c2bb
......@@ -777,4 +777,18 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
VPX_BITS_8)));
#endif
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// Register the MSA inverse 8x8 transforms with the FwdTrans8x8DCT and
// FwdTrans8x8HT parameterized suites. Each tuple pairs the C forward
// transform with an MSA inverse routine, followed by an integer (0 for the
// DCT case, 0..3 for the HT case -- presumably the transform type; confirm
// against the fixture definition) and the bit depth VPX_BITS_8.
// The guard restricts these cases to builds that have MSA and are neither
// high-bitdepth nor hardware-emulation builds.
INSTANTIATE_TEST_CASE_P(
MSA, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_msa, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
MSA, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_msa, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
......@@ -324,7 +324,15 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fdct16x16_c,
&vp9_idct16x16_256_add_c,
&vp9_idct16x16_1_add_msa,
TX_16X16, 1)));
TX_16X16, 1),
make_tuple(&vp9_fdct8x8_c,
&vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_msa,
TX_8X8, 10),
make_tuple(&vp9_fdct8x8_c,
&vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_msa,
TX_8X8, 1)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
This diff is collapsed.
......@@ -388,6 +388,44 @@
out_m; \
})
/* Halfword transpose taking eight input vectors and producing eight outputs.
 *
 * in0..in7   - input vectors (each is cast to v8i16). Only the right-half
 *              interleave (__msa_ilvr_h) of each pair (in0,in1), (in2,in3),
 *              (in4,in5), (in6,in7) is used, so the other half of every
 *              input does not contribute to the result.
 * out0..out3 - receive the transposed data, built from a word interleave
 *              (ILV_W_LRLR_SH) followed by doubleword interleaves.
 * out4..out7 - are always set to an all-zero vector.
 *
 * Notes:
 *  - This expands to a plain `{ ... }` statement block that declares its own
 *    temporaries (tmp*_m, tmp*_n, zero_m); it is not an expression.
 *  - All inputs are consumed into temporaries before any output is written,
 *    so an output may name the same variable as an input.
 *  - ILV_W_LRLR_SH writes left-interleave results to its 1st/3rd outputs and
 *    right-interleave results to its 2nd/4th, hence the tmp2_m, tmp0_m,
 *    tmp3_m, tmp1_m argument order below.
 */
#define TRANSPOSE4X8_H(in0, in1, in2, in3, \
in4, in5, in6, in7, \
out0, out1, out2, out3, \
out4, out5, out6, out7) { \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
v8i16 zero_m = { 0 }; \
\
tmp0_n = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
tmp1_n = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \
tmp2_n = __msa_ilvr_h((v8i16)(in5), (v8i16)(in4)); \
tmp3_n = __msa_ilvr_h((v8i16)(in7), (v8i16)(in6)); \
\
ILV_W_LRLR_SH((tmp0_n), (tmp1_n), (tmp2_n), (tmp3_n), \
tmp2_m, tmp0_m, tmp3_m, tmp1_m); \
\
out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
\
out4 = zero_m; \
out5 = zero_m; \
out6 = zero_m; \
out7 = zero_m; \
}
/* Halfword transpose taking four input vectors and producing four outputs.
 *
 * in0..in3   - input vectors.
 * out0..out3 - result vectors.
 *
 * Two interleave stages are applied: a halfword stage (ILV_H_LRLR_SH) into
 * local temporaries, then a word stage (ILV_W_LRLR_SH) into the outputs.
 * ILV_W_LRLR_SH writes left-interleave results to its 1st/3rd outputs and
 * right-interleave results to its 2nd/4th, so passing (out1, out0, out3,
 * out2) places the right-interleave results in out0/out2 and the
 * left-interleave results in out1/out3.
 *
 * Expands to a plain `{ ... }` statement block with its own temporaries.
 * NOTE(review): the halfword stage is assumed to follow the same
 * left/right output convention as ILV_W_LRLR_SH -- confirm against the
 * ILV_H_LRLR_SH definition.
 */
#define TRANSPOSE8X4_H(in0, in1, in2, in3, \
out0, out1, out2, out3) { \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
ILV_H_LRLR_SH((in0), (in1), (in2), (in3), \
tmp2_m, tmp0_m, tmp3_m, tmp1_m); \
\
ILV_W_LRLR_SH(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
out1, out0, out3, out2); \
}
/* halfword 8x8 transpose macro */
#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
in4, in5, in6, in7, \
......@@ -445,6 +483,14 @@
out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \
}
/* Word interleave of two vector pairs, left and right halves.
 *
 * in0, in1 - first pair;  out0 = ilvl_w(in1, in0), out1 = ilvr_w(in1, in0)
 * in2, in3 - second pair; out2 = ilvl_w(in3, in2), out3 = ilvr_w(in3, in2)
 *
 * Inputs are reinterpreted as v4i32 for the interleave and the results are
 * cast back to v8i16.
 *
 * The four assignments run in order and write directly to the caller's
 * lvalues, so an output that names the same variable as an input read by a
 * later statement will feed the already-overwritten value into it.
 */
#define ILV_W_LRLR_SH(in0, in1, in2, in3, \
out0, out1, out2, out3) { \
out0 = (v8i16)__msa_ilvl_w((v4i32)(in1), (v4i32)(in0)); \
out1 = (v8i16)__msa_ilvr_w((v4i32)(in1), (v4i32)(in0)); \
out2 = (v8i16)__msa_ilvl_w((v4i32)(in3), (v4i32)(in2)); \
out3 = (v8i16)__msa_ilvr_w((v4i32)(in3), (v4i32)(in2)); \
}
#define ILV_H_LR_SH(in0, in1, out0, out1) { \
out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
......@@ -572,12 +618,29 @@
out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \
}
/* Produce four vectors from one coefficient vector via __msa_splati_h.
 *
 * coeff      - source vector (cast to v8i16).
 * val0..val3 - element selectors passed as the second argument of
 *              __msa_splati_h (presumably the halfword index to replicate,
 *              and required to be a compile-time constant -- confirm against
 *              the MSA intrinsic documentation).
 * out0..out3 - outN = __msa_splati_h(coeff, valN).
 *
 * Assignments run in order on the caller's lvalues; do not pass `coeff`
 * itself as one of the earlier outputs.
 */
#define SPLATI_H_4VECS_SH(coeff, val0, val1, val2, val3, \
out0, out1, out2, out3) { \
out0 = __msa_splati_h((v8i16)(coeff), (val0)); \
out1 = __msa_splati_h((v8i16)(coeff), (val1)); \
out2 = __msa_splati_h((v8i16)(coeff), (val2)); \
out3 = __msa_splati_h((v8i16)(coeff), (val3)); \
}
/* Apply __msa_pckev_h to two (left, right) vector pairs.
 *
 * inN_l, inN_r - the pair for output N; both are cast to v8i16 and passed
 *                to the intrinsic in (left, right) order.
 * out0, out1   - outN = __msa_pckev_h(inN_l, inN_r).
 *
 * Assignments run in order on the caller's lvalues, so out0 must not name
 * a variable that is also used as in1_l or in1_r.
 */
#define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \
out0, out1) { \
out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \
out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \
}
/* Apply __msa_pckev_h to four (left, right) vector pairs.
 *
 * inN_l, inN_r - the pair for output N; both are cast to v8i16 and passed
 *                to the intrinsic in (left, right) order.
 * out0..out3   - outN = __msa_pckev_h(inN_l, inN_r).
 *
 * This is the four-pair counterpart of PCKEV_H_2VECS_SH. Assignments run in
 * order on the caller's lvalues, so an earlier output must not name a
 * variable that a later pair still reads.
 */
#define PCKEV_H_4VECS_SH(in0_l, in0_r, in1_l, in1_r, \
in2_l, in2_r, in3_l, in3_r, \
out0, out1, out2, out3) { \
out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \
out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \
out2 = __msa_pckev_h((v8i16)(in2_l), (v8i16)(in2_r)); \
out3 = __msa_pckev_h((v8i16)(in3_l), (v8i16)(in3_r)); \
}
#define XORI_B_2VECS_UB(val0, val1, \
out0, out1, xor_val) { \
out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \
......
......@@ -425,13 +425,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
specialize qw/vp9_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
......@@ -457,7 +457,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
......
......@@ -137,6 +137,7 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment