Commit a5d9416f authored by Parag Salasakar's avatar Parag Salasakar

mips msa vp8 post proc optimization

average improvement ~2x-4x

Change-Id: I93abc15389649c169bb8b69127c0b95407d34692
parent ce4c4b96
......@@ -110,4 +110,9 @@ INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
#endif
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
#endif
} // namespace
This diff is collapsed.
......@@ -435,6 +435,19 @@
ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
}
/* Description : Store 8x1 byte block to destination memory from input vector
Arguments : Inputs - in, pdst
Details : Index 0 double word element from 'in' vector is copied to the
GP register and stored to (pdst)
*/
#define ST8x1_UB(in, pdst) \
{ \
uint64_t out0_m; \
\
out0_m = __msa_copy_u_d((v2i64)in, 0); \
SD(out0_m, pdst); \
}
/* Description : Store 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Details : Index 0 double word element from 'in' vector is copied to the
......@@ -479,6 +492,22 @@
SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
}
/* Description : Immediate number of elements to slide with zero
Arguments : Inputs - in0, in1, slide_val
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'zero_m' vector are slid into 'in0' by
value specified in the 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
{ \
v16i8 zero_m = { 0 }; \
\
out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
/* Description : Immediate number of elements to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
......@@ -642,6 +671,76 @@
out_m; \
})
/* Description : Horizontal addition of 4 signed word elements of input vector
Arguments : Input - in (signed word vector)
Output - sum_m (i32 sum)
Return Type - signed word (GP)
Details : 4 signed word elements of 'in' vector are added together and
the resulting integer sum is returned
*/
#define HADD_SW_S32(in) \
({ \
v2i64 res0_m, res1_m; \
int32_t sum_m; \
\
res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
res1_m = __msa_splati_d(res0_m, 1); \
res0_m = res0_m + res1_m; \
sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
sum_m; \
})
/* Description : Horizontal addition of 8 unsigned halfword elements
Arguments : Inputs - in (unsigned halfword vector)
Outputs - sum_m (u32 sum)
Return Type - unsigned word
Details : 8 unsigned halfword elements of input vector are added
together and the resulting integer sum is returned
*/
#define HADD_UH_U32(in) \
({ \
v4u32 res_m; \
v2u64 res0_m, res1_m; \
uint32_t sum_m; \
\
res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
res0_m = __msa_hadd_u_d(res_m, res_m); \
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
res0_m = res0_m + res1_m; \
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
sum_m; \
})
/* Description : Horizontal addition of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each unsigned odd byte element from 'in0' is added to
even unsigned byte element from 'in0' (pairwise) and the
halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each unsigned odd byte element from 'in0' is subtracted from
even unsigned byte element from 'in0' (pairwise) and the
halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : Set element n input vector to GPR value
Arguments : Inputs - in0, in1, in2, in3
Output - out
......@@ -864,6 +963,9 @@
out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
......@@ -1195,6 +1297,28 @@
out3 = in0 - in3; \
}
/* Description : Transpose input 8x8 byte block
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
in8, in9, in10, in11, in12, in13, in14, in15
......
......@@ -167,18 +167,18 @@ $vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6;
#
if (vpx_config("CONFIG_POSTPROC") eq "yes") {
add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_down mmx sse2/;
specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
$vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_across_ip sse2/;
specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
$vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
specialize qw/vp8_post_proc_down_and_across_mb_row sse2/;
specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
specialize qw/vp8_plane_add_noise mmx sse2/;
specialize qw/vp8_plane_add_noise mmx sse2 msa/;
$vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
......
......@@ -124,6 +124,7 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
endif
# common (c)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment